From be844a76f4d4ed3927fcb240a36c6d52bb3908a3 Mon Sep 17 00:00:00 2001 From: Kal Conley Date: Fri, 4 Aug 2023 03:17:31 +0200 Subject: [PATCH] Initial commit --- .bazelrc | 52 + .bazelversion | 1 + .clang-format | 9 + .clang-tidy | 15 + .github/workflows/main.yml | 256 +++++ .gitignore | 1 + BUILD.bazel | 43 + LICENSE-APACHE | 202 ++++ LICENSE-MIT | 19 + README.md | 134 +++ WORKSPACE | 23 + src/bitshuffle.c | 1709 +++++++++++++++++++++++++++++++++ src/bitshuffle.h | 68 ++ tests/bitshuffle_benchmark.cc | 75 ++ tests/bitshuffle_test.cc | 146 +++ third_party/BUILD.bazel | 0 third_party/bitshuffle.BUILD | 16 + 17 files changed, 2769 insertions(+) create mode 100644 .bazelrc create mode 100644 .bazelversion create mode 100644 .clang-format create mode 100644 .clang-tidy create mode 100644 .github/workflows/main.yml create mode 100644 .gitignore create mode 100644 BUILD.bazel create mode 100644 LICENSE-APACHE create mode 100644 LICENSE-MIT create mode 100644 README.md create mode 100644 WORKSPACE create mode 100644 src/bitshuffle.c create mode 100644 src/bitshuffle.h create mode 100644 tests/bitshuffle_benchmark.cc create mode 100644 tests/bitshuffle_test.cc create mode 100644 third_party/BUILD.bazel create mode 100644 third_party/bitshuffle.BUILD diff --git a/.bazelrc b/.bazelrc new file mode 100644 index 0000000..875f2e3 --- /dev/null +++ b/.bazelrc @@ -0,0 +1,52 @@ +common --enable_platform_specific_config + +build --features=layering_check +build --repo_env=BAZEL_CXXOPTS=-std=c++14 + +build:ci --keep_going +build:ci --test_output=all + +build:generic_gcc --copt=-Wall +build:generic_gcc --copt=-Wextra +build:generic_gcc --copt=-Wpointer-arith +build:generic_gcc --per_file_copt=^//@-Wundef,-Werror +build:generic_gcc --copt=-Wno-unused-parameter + +build:clang --config=generic_gcc +build:clang --repo_env=BAZEL_COMPILER=clang +build:clang --repo_env=CC=clang +build:clang --repo_env=CXX=clang++ +# https://github.com/bazelbuild/bazel/issues/11122 +# https://github.com/bazelbuild/bazel/issues/12797 +build:clang --linkopt=-fsanitize-link-c++-runtime + +build:libc++ --cxxopt=-stdlib=libc++ +build:libc++ --linkopt=-stdlib=libc++ + +build:libc++-static --config=libc++ +build:libc++-static --repo_env=BAZEL_LINKLIBS=-l%:libc++.a:-l%:libc++abi.a:-lm +build:libc++-static --repo_env=BAZEL_LINKOPTS=-pthread + +build:macos --cxxopt=-std=c++14 + +build:macos-x86_64 --copt=-arch --copt=x86_64 +build:macos-x86_64 --linkopt=-arch --linkopt=x86_64 + +build:msvc --per_file_copt=^external/@/W2 +build:msvc --per_file_copt=^//@/W4,/WX +build:msvc --conlyopt=/Za +build:msvc --cxxopt=/std:c++14 + +build:asan --features=asan +build:asan --test_env=ASAN_OPTIONS=check_initialization_order=1:detect_invalid_pointer_pairs=1:detect_stack_use_after_return=1:strict_init_order=1:strict_string_checks=1 + +build:asan-msvc --config=asan +build:asan-msvc --copt=/fsanitize=address +build:asan-msvc --features=frame_pointer +build:asan-msvc --features=static_link_msvcrt + +build:ubsan --features=ubsan +build:ubsan --test_env=UBSAN_OPTIONS=halt_on_error=1:print_stacktrace=1 + +build:ubsan-extra --config=ubsan +build:ubsan-extra --copt=-fsanitize=implicit-signed-integer-truncation diff --git a/.bazelversion b/.bazelversion new file mode 100644 index 0000000..dc0208a --- /dev/null +++ b/.bazelversion @@ -0,0 +1 @@ +6.3.1 diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..28afbba --- /dev/null +++ b/.clang-format @@ -0,0 +1,9 @@ +BasedOnStyle: Chromium +AccessModifierOffset: -4 +AlwaysBreakBeforeMultilineStrings: 
false +BraceWrapping: + AfterControlStatement: MultiLine +BreakBeforeBraces: Custom +ColumnLimit: 100 +IndentWidth: 4 +SpacesBeforeTrailingComments: 1 diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 0000000..e43f5a3 --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,15 @@ +--- +Checks: "\ +bugprone-*,\ +readability-inconsistent-declaration-parameter-name,\ +readability-redundant-*,\ +readability-uppercase-literal-suffix,\ +-bugprone-easily-swappable-parameters,\ +-bugprone-implicit-widening-of-multiplication-result,\ +-bugprone-narrowing-conversions,\ +-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling" +CheckOptions: + readability-inconsistent-declaration-parameter-name.IgnoreMacros: false + readability-inconsistent-declaration-parameter-name.Strict: true +WarningsAsErrors: '*' +... diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..7a376de --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,256 @@ +name: Main + +on: + push: + branches: [main, ci] + +jobs: + bazel-linux-arm64-clang: + runs-on: [self-hosted, linux, arm64] + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Configure + run: | + echo "build --config=ci --config=clang" | tee job.rc + - name: Build + run: | + bazel --bazelrc=job.rc build -c opt //... + - name: Build (libc++) + run: | + bazel --bazelrc=job.rc build -c opt --config=libc++ //... + - name: Test + run: | + bazel --bazelrc=job.rc test --copt=-O3 --config=asan --config=ubsan-extra //:bitshuffle_test --test_arg=--gtest_break_on_failure + - name: Bench + run: | + bazel --bazelrc=job.rc run -c opt --copt=-O3 //:bitshuffle_benchmark -- --benchmark_filter='BM_bitshuffle<.*>/.*/8192' + bazel-linux-arm64-gcc: + runs-on: [self-hosted, linux, arm64] + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Configure + run: | + echo "build --config=ci --config=generic_gcc" | tee job.rc + - name: Build + run: | + bazel --bazelrc=job.rc build -c opt //... + - name: Test + run: | + bazel --bazelrc=job.rc test --copt=-O3 --config=asan --config=ubsan //:bitshuffle_test --test_arg=--gtest_break_on_failure + - name: Bench + run: | + bazel --bazelrc=job.rc run -c opt --copt=-O3 //:bitshuffle_benchmark -- --benchmark_filter='BM_bitshuffle<.*>/.*/8192' + bazel-linux-x86_64-icelake-clang: + strategy: + matrix: + m: + - "sse2" + - "avx2" + - "avx512bw avx512vl" + - "avx512bw avx512vl avx512vbmi gfni" + use_ifunc: [0, 1] + runs-on: [self-hosted, linux, x64] + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Configure + run: | + m="${{ matrix.m }}" + echo "build --config=ci --config=clang --copt=-m${m// / --copt=-m} --copt=-DBITSHUF_USE_IFUNC=${{ matrix.use_ifunc }}" | tee job.rc + - name: Build + run: | + bazel --bazelrc=job.rc build -c opt //... + - name: Build (libc++) + run: | + bazel --bazelrc=job.rc build -c opt --config=libc++ //... 
+ - name: Test + run: | + bazel --bazelrc=job.rc test --copt=-O3 --config=asan --config=ubsan-extra //:bitshuffle_test --test_arg=--gtest_break_on_failure + - name: Bench + run: | + bazel --bazelrc=job.rc run -c opt --copt=-O3 //:bitshuffle_benchmark -- --benchmark_filter='BM_bitshuffle<.*>/.*/8192' + bazel-linux-x86_64-icelake-gcc: + strategy: + matrix: + m: + - "no-sse2" + - "sse2" + - "avx2" + - "avx512bw avx512vl" + - "avx512bw avx512vl avx512vbmi gfni" + use_ifunc: [0, 1] + runs-on: [self-hosted, linux, x64] + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Configure + run: | + m="${{ matrix.m }}" + echo "build --config=ci --config=generic_gcc --copt=-m${m// / --copt=-m} --copt=-DBITSHUF_USE_IFUNC=${{ matrix.use_ifunc }}" | tee job.rc + - name: Build + run: | + bazel --bazelrc=job.rc build -c opt //... + - name: Test + run: | + bazel --bazelrc=job.rc test --copt=-O3 --config=asan --config=ubsan //:bitshuffle_test --test_arg=--gtest_break_on_failure + - name: Bench + run: | + bazel --bazelrc=job.rc run -c opt --copt=-O3 //:bitshuffle_benchmark -- --benchmark_filter='BM_bitshuffle<.*>/.*/8192' + bazel-linux-x86_64-clang: + strategy: + matrix: + os: [ubuntu-20.04, ubuntu-latest] + m: + - "sse2" + - "avx2" + use_ifunc: [0, 1] + runs-on: ${{ matrix.os }} + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Configure + run: | + m="${{ matrix.m }}" + echo "build --config=ci --config=clang --copt=-m${m// / --copt=-m} --copt=-DBITSHUF_USE_IFUNC=${{ matrix.use_ifunc }}" | tee job.rc + - name: Build + run: | + bazel --bazelrc=job.rc build -c opt //... + - name: Test + run: | + bazel --bazelrc=job.rc test --copt=-O3 --config=asan --config=ubsan-extra //:bitshuffle_test --test_arg=--gtest_break_on_failure + bazel-linux-x86_64-gcc: + strategy: + matrix: + os: [ubuntu-20.04, ubuntu-latest] + m: + - "no-sse2" + - "sse2" + - "avx2" + use_ifunc: [0, 1] + runs-on: ${{ matrix.os }} + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Install + run: | + # Bazel prefers LLD to GNU gold. Install LLD to avoid gold relocation overflow bug on ubuntu-20.04. + # https://mail.gnu.org/archive/html/bug-binutils/2020-04/msg00329.html + sudo apt-get update + sudo apt-get install lld + - name: Configure + run: | + m="${{ matrix.m }}" + echo "build --config=ci --config=generic_gcc --copt=-m${m// / --copt=-m} --copt=-DBITSHUF_USE_IFUNC=${{ matrix.use_ifunc }}" | tee job.rc + - name: Build + run: | + bazel --bazelrc=job.rc build -c opt //... + - name: Test + run: | + bazel --bazelrc=job.rc test --copt=-O3 --config=asan --config=ubsan //:bitshuffle_test --test_arg=--gtest_break_on_failure + bazel-macos-x86_64: + strategy: + matrix: + os: [macos-11, macos-12, macos-13] + m: + - "sse2" + - "avx2" + exclude: + - {os: macos-11, m: "avx2"} + - {os: macos-12, m: "avx2"} + runs-on: ${{ matrix.os }} + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Configure + run: | + m="${{ matrix.m }}" + echo "build --config=ci --config=generic_gcc --copt=-m${m// / --copt=-m}" | tee job.rc + - name: Build + run: | + bazel --bazelrc=job.rc build -c opt //... + - name: Test + run: | + bazel --bazelrc=job.rc test --copt=-O3 --config=asan --config=ubsan-extra //:bitshuffle_test --test_arg=--gtest_break_on_failure + bazel-windows-msvc: + strategy: + matrix: + # FIXME(kal): Add windows-2022 once it works with bazel. 
+ # https://github.com/bazelbuild/bazel/issues/18592 + os: [windows-2019] + cpu: [x64, x64_x86] + runs-on: ${{ matrix.os }} + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Configure + run: | + "build --cpu=${{ matrix.cpu }}_windows --config=ci --config=msvc" | Out-File -FilePath job.rc + - name: Build + run: | + bazel --bazelrc=job.rc build -c opt //... + - name: Test + run: | + bazel --bazelrc=job.rc test --copt=/O2 --config=asan-msvc //:bitshuffle_test --test_arg=--gtest_break_on_failure + bazel-windows-msvc-arch: + strategy: + matrix: + include: + - {cpu: x64, arch: AVX} + - {cpu: x64, arch: AVX2} + - {cpu: x64_x86, arch: IA32} + - {cpu: x64_x86, arch: SSE} + - {cpu: x64_x86, arch: SSE2} + - {cpu: x64_x86, arch: AVX} + - {cpu: x64_x86, arch: AVX2} + runs-on: windows-2019 + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Configure + run: | + "build --cpu=${{ matrix.cpu }}_windows --config=ci --config=msvc --copt=/arch:${{ matrix.arch }}" | Out-File -FilePath job.rc + - name: Build + run: | + bazel --bazelrc=job.rc build -c opt //... + - name: Test + run: | + bazel --bazelrc=job.rc test --copt=/O2 --config=asan-msvc //:bitshuffle_test --test_arg=--gtest_break_on_failure + clang-format: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Clang-Format + run: | + set -eu + srcs=$(git ls-files -- '*.c' '*.cc' '*.h') + clang-format --dry-run --Werror --verbose -- ${srcs} + clang-tidy: + strategy: + matrix: + m: + - "no-sse2" + - "sse2" + - "avx2" + - "avx512bw avx512vl" + - "avx512bw avx512vl avx512vbmi gfni" + use_ifunc: [0, 1] + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Install + run: | + sudo apt-get update + sudo apt-get install clang-tidy-15 + - name: Configure + run: | + m="${{ matrix.m }}" + echo "CFLAGS=-m${m// / -m} -DBITSHUF_USE_IFUNC=${{ matrix.use_ifunc }}" >>"${GITHUB_ENV}" + - name: Clang-Tidy + run: | + set -eu + clang-tidy-15 --warnings-as-errors='*' src/bitshuffle.c -- ${CFLAGS} + clang-tidy-15 --warnings-as-errors='*' src/bitshuffle.c -- ${CFLAGS} -DNDEBUG diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a6ef824 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/bazel-* diff --git a/BUILD.bazel b/BUILD.bazel new file mode 100644 index 0000000..ed7c113 --- /dev/null +++ b/BUILD.bazel @@ -0,0 +1,43 @@ +load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test") + +package(default_visibility = ["//visibility:public"]) + +cc_library( + name = "bitshuffle", + srcs = [ + "src/bitshuffle.c", + ], + hdrs = [ + "src/bitshuffle.h", + ], +) + +cc_binary( + name = "bitshuffle_benchmark", + srcs = [ + "tests/bitshuffle_benchmark.cc", + ], + copts = [ + "-Iexternal", + ], + linkstatic = True, + deps = [ + ":bitshuffle", + "@benchmark", + "@benchmark//:benchmark_main", + "@bitshuffle//:bitshuffle_core", + ], +) + +cc_test( + name = "bitshuffle_test", + srcs = [ + "tests/bitshuffle_test.cc", + ], + linkstatic = True, + deps = [ + ":bitshuffle", + "@googletest//:gtest", + "@googletest//:gtest_main", + ], +) diff --git a/LICENSE-APACHE b/LICENSE-APACHE new file mode 100644 index 0000000..ab55200 --- /dev/null +++ b/LICENSE-APACHE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2023 Kal Conley + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/LICENSE-MIT b/LICENSE-MIT
new file mode 100644
index 0000000..0df1f0c
--- /dev/null
+++ b/LICENSE-MIT
@@ -0,0 +1,19 @@
+Copyright (c) 2023 Kal Conley
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8edef50
--- /dev/null
+++ b/README.md
@@ -0,0 +1,134 @@
+# Bitshuffle
+
+Transpose bits to improve data compression.
+
+[![Build Status][actions-badge]][actions-url]
+
+[actions-badge]: https://github.com/kalcutter/bitshuffle/actions/workflows/main.yml/badge.svg
+[actions-url]: https://github.com/kalcutter/bitshuffle/actions/workflows/main.yml?query=branch%3Amain
+
+## Overview
+
+Bitshuffle is a lossless filter for improving the compression of typed data. It works by
+transposing the bits of the data to reduce entropy.
+
+This repository contains a highly optimized Bitshuffle implementation for modern processors.
+
+For more information about Bitshuffle, refer to the classic implementation here:
+<https://github.com/kiyo-masui/bitshuffle>.
+
+## Scope
+
+This library only implements the core Bitshuffle transpose operation (exposed as a C interface).
+
+A Python interface is not provided.
+
+## Features
+
+* Implemented in C99.
+* Optimized with SIMD instructions including SSE2, AVX2, and AVX-512.
+* Runtime dispatch based on available CPU features (requires GNU IFUNC support).
+* Does not allocate memory.
+* Support for Clang, GCC, ICC, and MSVC.
+* Tested on Linux, macOS, and Windows.
+
+## Performance
+
+The performance of this implementation is excellent.
+
+Compared to [`bshuf_trans_bit_elem`][bshuf_trans_bit_elem] and
+[`bshuf_untrans_bit_elem`][bshuf_untrans_bit_elem] from the classic implementation, this code
+yields a typical speedup between **1.3x** and **10x** depending on the CPU architecture, function,
+and arguments.
+
+## API
+
+The public interface is declared in the header file [bitshuffle.h](src/bitshuffle.h). Two public
+functions are defined:
+
+```c++
+/* Transpose bits for compression.
+ *
+ * This function performs Bitshuffle transposition of a single block. The block
+ * size in bytes is given by the product of `size` and `elem_size`.
+ *
+ * If required, the `scratch` argument must point to a buffer that the function
+ * uses for scratch purposes. The size of this buffer is given by the block
+ * size.
+ *
+ * On success, the function returns 0; otherwise, -1 is returned to indicate an
+ * error. In case of error, the memory pointed to by `out` and `scratch` is left
+ * unmodified.
+ *
+ * Errors
+ * ------
+ * The function returns -1 to indicate an error if:
+ *
+ * - The `scratch` argument is `NULL` and a scratch buffer is required for the
+ *   specified element size.
+ * - The `size` argument is not a multiple of 8.
+ */
+int bitshuf_encode_block(char* restrict out,
+                         const char* restrict in,
+                         char* restrict scratch,
+                         size_t size,
+                         size_t elem_size);
+```
+
+```c++
+/* Untranspose bits after decompression.
+ *
+ * This function performs the inverse of `bitshuf_encode_block()`.
+ *
+ * If required, the `scratch` argument must point to a buffer that the function
+ * uses for scratch purposes. The size of this buffer is given by the block
+ * size.
+ *
+ * On success, the function returns 0; otherwise, -1 is returned to indicate an
+ * error. In case of error, the memory pointed to by `out` and `scratch` is left
+ * unmodified.
+ *
+ * Errors
+ * ------
+ * The function returns -1 to indicate an error if:
+ *
+ * - The `scratch` argument is `NULL` and a scratch buffer is required for the
+ *   specified element size.
+ * - The `size` argument is not a multiple of 8.
+ */
+int bitshuf_decode_block(char* restrict out,
+                         const char* restrict in,
+                         char* restrict scratch,
+                         size_t size,
+                         size_t elem_size);
+```
+
+These functions perform the same operations as [`bshuf_trans_bit_elem`][bshuf_trans_bit_elem] and
+[`bshuf_untrans_bit_elem`][bshuf_untrans_bit_elem], respectively.
+
+The header file is compatible with both C89 and C++.
+
+[bshuf_trans_bit_elem]: https://github.com/kiyo-masui/bitshuffle/blob/b9a1546133959298c56eee686932dbb18ff80f7a/src/bitshuffle_internals.h#L50
+[bshuf_untrans_bit_elem]: https://github.com/kiyo-masui/bitshuffle/blob/b9a1546133959298c56eee686932dbb18ff80f7a/src/bitshuffle_internals.h#L59
+
+## Caveats
+
+Only little-endian architectures are supported. Support for big-endian machines
+is not planned.
+
+## License
+
+This repository is licensed under either of
+
+* Apache License, Version 2.0
+  ([LICENSE-APACHE](LICENSE-APACHE) or <http://www.apache.org/licenses/LICENSE-2.0>)
+* MIT license
+  ([LICENSE-MIT](LICENSE-MIT) or <http://opensource.org/licenses/MIT>)
+
+at your option.
+
+## Contribution
+
+Unless you explicitly state otherwise, any contribution intentionally submitted
+for inclusion in the work by you, as defined in the Apache-2.0 license, shall be
+dual licensed as above, without any additional terms or conditions.
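A minimal round-trip sketch of the two functions documented in the API section above. The 64-byte block of 16 four-byte elements, the demo values, and the unconditionally provided scratch buffer are illustrative choices, not requirements of the library:

```c
#include <stdio.h>
#include <string.h>

#include "bitshuffle.h"

int main(void) {
    /* 16 elements of 4 bytes each: size = 16 (a multiple of 8, as required),
     * elem_size = 4, so the block size is 16 * 4 = 64 bytes. */
    char in[64], encoded[64], decoded[64], scratch[64];
    for (size_t i = 0; i < sizeof(in); i++)
        in[i] = (char)(i & 0x7f);

    /* A block-sized scratch buffer is passed unconditionally here; the library
     * only requires one for certain element sizes. */
    if (bitshuf_encode_block(encoded, in, scratch, 16, 4) != 0)
        return 1; /* size not a multiple of 8, or scratch missing when needed */

    /* `encoded` now holds the bit-transposed block, ready for a general-purpose
     * compressor. Undo the transform after decompression: */
    if (bitshuf_decode_block(decoded, encoded, scratch, 16, 4) != 0)
        return 1;

    printf("round trip %s\n", memcmp(in, decoded, sizeof(in)) == 0 ? "ok" : "failed");
    return 0;
}
```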
diff --git a/WORKSPACE b/WORKSPACE
new file mode 100644
index 0000000..322c2bc
--- /dev/null
+++ b/WORKSPACE
@@ -0,0 +1,23 @@
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+http_archive(
+    name = "benchmark",
+    sha256 = "6430e4092653380d9dc4ccb45a1e2dc9259d581f4866dc0759713126056bc1d7",
+    strip_prefix = "benchmark-1.7.1",
+    url = "https://github.com/google/benchmark/archive/refs/tags/v1.7.1.tar.gz",
+)
+
+http_archive(
+    name = "bitshuffle",
+    build_file = "//third_party:bitshuffle.BUILD",
+    sha256 = "2631aaa5d4c24e51415c7b1827d4f9dcf505ad8db03738210da9ce6dab8f5870",
+    strip_prefix = "bitshuffle-0.5.1",
+    url = "https://github.com/kiyo-masui/bitshuffle/archive/refs/tags/0.5.1.tar.gz",
+)
+
+http_archive(
+    name = "googletest",
+    sha256 = "ad7fdba11ea011c1d925b3289cf4af2c66a352e18d4c7264392fead75e919363",
+    strip_prefix = "googletest-1.13.0",
+    url = "https://github.com/google/googletest/archive/refs/tags/v1.13.0.tar.gz",
+)
diff --git a/src/bitshuffle.c b/src/bitshuffle.c
new file mode 100644
index 0000000..0ee0b9b
--- /dev/null
+++ b/src/bitshuffle.c
@@ -0,0 +1,1709 @@
+// SPDX-License-Identifier: MIT OR Apache-2.0
+// Copyright (c) 2023 Kal Conley
+#include "bitshuffle.h"
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+#include <immintrin.h>
+#endif
+
+#ifndef __has_attribute
+#define __has_attribute(x) 0
+#endif
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+#if defined(__INTEL_COMPILER)
+#pragma warning(disable : 177) // entity-kind "entity" was declared but never referenced
+#elif defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+
+#if defined(_MSC_VER)
+#pragma warning(disable : 4244) // conversion from 'type1' to 'type2', possible loss of data
+#endif
+
+#if defined(_MSC_VER)
+#if defined(_M_IX86) && _M_IX86_FP == 2 || defined(_M_X64)
+#ifndef __SSE2__
+#define __SSE2__ 1
+#endif
+#endif
+#endif
+
+#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
+    __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ || \
+    defined(__BIG_ENDIAN__)
+#error big endian not supported
+#endif
+
+#ifndef BITSHUF_USE_IFUNC
+#if defined(__INTEL_COMPILER) || defined(__clang__) && __clang_major__ < 8
+#define BITSHUF_USE_IFUNC 0 // GFNI not supported by compiler.
+#endif +#endif +#ifndef BITSHUF_USE_IFUNC +#if (__has_attribute(ifunc) && __has_attribute(target) && __has_builtin(__builtin_cpu_init) && \ + __has_builtin(__builtin_cpu_is) && __has_builtin(__builtin_cpu_supports)) || \ + (defined(__GNUC__) && __GNUC__ >= 8) +#define BITSHUF_USE_IFUNC 1 +#else +#define BITSHUF_USE_IFUNC 0 +#endif +#endif + +#define STRINGIZE(x) #x + +#if __has_attribute(target) && !defined(__INTEL_COMPILER) +#define ATTRIBUTE_TARGET(x) __attribute__((target(x))) +#else +#define ATTRIBUTE_TARGET(x) +#endif + +#if __has_attribute(always_inline) || defined(__GNUC__) +#define ALWAYS_INLINE inline __attribute__((always_inline)) +#elif defined(_MSC_VER) +#define ALWAYS_INLINE __forceinline +#else +#define ALWAYS_INLINE inline +#endif + +#if __has_attribute(noinline) || defined(__GNUC__) +#define NO_INLINE __attribute__((noinline)) +#elif defined(_MSC_VER) +#define NO_INLINE __declspec(noinline) +#else +#define NO_INLINE +#endif + +#if __has_attribute(no_sanitize_address) +#define NO_SANITIZE_ADDRESS __attribute__((no_sanitize_address)) +#else +#define NO_SANITIZE_ADDRESS +#endif + +#if __has_attribute(no_sanitize_memory) +#define NO_SANITIZE_MEMORY __attribute__((no_sanitize_memory)) +#else +#define NO_SANITIZE_MEMORY +#endif + +#if __has_attribute(no_sanitize_thread) +#define NO_SANITIZE_THREAD __attribute__((no_sanitize_thread)) +#else +#define NO_SANITIZE_THREAD +#endif + +#if __has_attribute(disable_sanitizer_instrumentation) +#define DISABLE_SANITIZER_INSTRUMENTATION __attribute__((disable_sanitizer_instrumentation)) +#else +#define DISABLE_SANITIZER_INSTRUMENTATION +#endif + +#if __has_attribute(fallthrough) +#define FALLTHROUGH __attribute__((fallthrough)) +#else +#define FALLTHROUGH +#endif + +#if __has_builtin(__builtin_expect) || defined(__GNUC__) +#define UNLIKELY(x) __builtin_expect(!!(x), 0) +#else +#define UNLIKELY(x) (x) +#endif + +// clang-format off +#define IMPLEMENT_IFUNC(NAME, PARAMS) \ + __attribute__((ifunc(STRINGIZE(NAME##_resolver)))) \ + static void NAME PARAMS; \ + \ + DISABLE_SANITIZER_INSTRUMENTATION \ + NO_SANITIZE_ADDRESS NO_SANITIZE_MEMORY NO_SANITIZE_THREAD \ + static void (*NAME##_resolver(void))PARAMS +// clang-format on + +#define IMPLEMENT_LOAD_FUNCTION(NAME, TYPE) \ + static ALWAYS_INLINE TYPE NAME(const void* mem_addr) { \ + TYPE a; \ + memcpy(&a, mem_addr, sizeof(a)); \ + return a; \ + } + +#define IMPLEMENT_STORE_FUNCTION(NAME, TYPE) \ + static ALWAYS_INLINE void NAME(void* mem_addr, const TYPE a) { \ + memcpy(mem_addr, &a, sizeof(a)); \ + } + +#if !defined(__SSE2__) +IMPLEMENT_LOAD_FUNCTION(LOAD_U64, uint64_t) +#endif +IMPLEMENT_STORE_FUNCTION(STORE_U8, uint8_t) +IMPLEMENT_STORE_FUNCTION(STORE_U64, uint64_t) + +// Computes the transpose of an 8x8 bit matrix. +// Ref: "Hacker's Delight" 7-3 by Henry Warren. 
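+// Byte k of the argument is row k of the matrix; the transpose moves bit j of
+// byte i to bit i of byte j, so byte k of the result collects bit k of all
+// eight input bytes. Each masked delta-swap step below exchanges sub-blocks
+// across the main diagonal: bits within 2x2 blocks, then 2x2 blocks within
+// 4x4 blocks, then the two off-diagonal 4x4 blocks.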
+static uint64_t transpose8(uint64_t x) { + uint64_t t; + t = (x ^ (x >> 7)) & 0x00aa00aa00aa00aa; + x = (x ^ t ^ (t << 7)); + t = (x ^ (x >> 14)) & 0x0000cccc0000cccc; + x = (x ^ t ^ (t << 14)); + t = (x ^ (x >> 28)) & 0x00000000f0f0f0f0; + x = (x ^ t ^ (t << 28)); + return x; +} + +#if !defined(__SSE2__) +NO_INLINE +static void bitshuf_trans_bit(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + size /= 8; + + for (size_t i = 0; i < size; i++) { + const uint64_t a = LOAD_U64(&in[i * 8]); + const uint64_t x = transpose8(a); + STORE_U8(&out[0 * size + i], x); + STORE_U8(&out[1 * size + i], x >> 8 * 1); + STORE_U8(&out[2 * size + i], x >> 8 * 2); + STORE_U8(&out[3 * size + i], x >> 8 * 3); + STORE_U8(&out[4 * size + i], x >> 8 * 4); + STORE_U8(&out[5 * size + i], x >> 8 * 5); + STORE_U8(&out[6 * size + i], x >> 8 * 6); + STORE_U8(&out[7 * size + i], x >> 8 * 7); + } +} + +NO_INLINE +static void bitshuf_trans_byte(char* restrict out, + const char* restrict in, + size_t size, + size_t elem_size) { + assert(size % 8 == 0); + + for (size_t i = 0; i < size; i += 8) { + for (size_t j = 0; j < elem_size; j++) { + for (size_t k = 0; k < 8; k++) { + out[j * size + (i + k)] = in[(i + k) * elem_size + j]; + } + } + } +} + +NO_INLINE +static void bitshuf_trans_byte_2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + size = size / 8 * 8; + + for (size_t i = 0; i < size; i++) { + out[0 * size + i] = in[i * 2 + 0]; + out[1 * size + i] = in[i * 2 + 1]; + } +} + +NO_INLINE +static void bitshuf_trans_byte_4(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + size = size / 8 * 8; + + for (size_t i = 0; i < size; i++) { + out[0 * size + i] = in[i * 4 + 0]; + out[1 * size + i] = in[i * 4 + 1]; + out[2 * size + i] = in[i * 4 + 2]; + out[3 * size + i] = in[i * 4 + 3]; + } +} + +static void bitshuf_trans_byte_8(char* restrict out, const char* restrict in, size_t size) { + bitshuf_trans_byte(out, in, size, 8); +} +#endif + +static void bitshuf_untrans_bit_tail(char* restrict out, + const char* restrict in, + size_t size, + size_t index) { + assert(size % 8 == 0); + size /= 8; + + for (size_t i = index; i < size; i++) { + const uint64_t a = (uint64_t)(uint8_t)in[0 * size + i] | + (uint64_t)(uint8_t)in[1 * size + i] << 8 * 1 | + (uint64_t)(uint8_t)in[2 * size + i] << 8 * 2 | + (uint64_t)(uint8_t)in[3 * size + i] << 8 * 3 | + (uint64_t)(uint8_t)in[4 * size + i] << 8 * 4 | + (uint64_t)(uint8_t)in[5 * size + i] << 8 * 5 | + (uint64_t)(uint8_t)in[6 * size + i] << 8 * 6 | + (uint64_t)(uint8_t)in[7 * size + i] << 8 * 7; + STORE_U64(&out[i * 8], transpose8(a)); + } +} + +#if !defined(__SSE2__) +NO_INLINE +static void bitshuf_untrans_bit(char* restrict out, const char* restrict in, size_t size) { + bitshuf_untrans_bit_tail(out, in, size, 0); +} + +NO_INLINE +static void bitshuf_untrans_byte(char* restrict out, + const char* restrict in, + size_t size, + size_t elem_size) { + assert(size % 8 == 0); + + for (size_t i = 0; i < size; i += 8) { + for (size_t j = 0; j < elem_size; j++) { + for (size_t k = 0; k < 8; k++) { + out[(i + k) * elem_size + j] = in[j * size + (i + k)]; + } + } + } +} + +NO_INLINE +static void bitshuf_untrans_byte_2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + size = size / 8 * 8; + + for (size_t i = 0; i < size; i++) { + out[i * 2 + 0] = in[0 * size + i]; + out[i * 2 + 1] = in[1 * size + i]; + } +} + +NO_INLINE +static void bitshuf_untrans_byte_4(char* 
restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + size = size / 8 * 8; + + for (size_t i = 0; i < size; i++) { + out[i * 4 + 0] = in[0 * size + i]; + out[i * 4 + 1] = in[1 * size + i]; + out[i * 4 + 2] = in[2 * size + i]; + out[i * 4 + 3] = in[3 * size + i]; + } +} + +static void bitshuf_untrans_byte_8(char* restrict out, const char* restrict in, size_t size) { + bitshuf_untrans_byte(out, in, size, 8); +} +#endif + +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + +IMPLEMENT_LOAD_FUNCTION(LOAD_I64, int64_t) +IMPLEMENT_STORE_FUNCTION(STORE_U16, uint16_t) +IMPLEMENT_STORE_FUNCTION(STORE_U32, uint32_t) + +#define MM256_SETR_M128I(lo, hi) _mm256_inserti128_si256(_mm256_castsi128_si256(lo), (hi), 1) + +#if defined(__clang__) +#define X(A) \ + ({ \ + __asm__("" : "+x"(A)); \ + (A); \ + }) +#else +#define X(A) (A) +#endif + +#if defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__GFNI__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("avx512vbmi,avx512vl,gfni") +static void bitshuf_trans_bit_avx512vbmi_gfni(char* restrict out, + const char* restrict in, + size_t size) { + assert(size % 8 == 0); + size /= 8; + + const __m512i BSWAP64 = _mm512_set_epi64( + 0x08090a0b0c0d0e0f, 0x0001020304050607, 0x08090a0b0c0d0e0f, 0x0001020304050607, + 0x08090a0b0c0d0e0f, 0x0001020304050607, 0x08090a0b0c0d0e0f, 0x0001020304050607); + const __m512i C0 = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0x3931292119110901, 0x3830282018100800); + const __m512i C1 = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0x3b332b231b130b03, 0x3a322a221a120a02); + const __m512i C2 = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0x3d352d251d150d05, 0x3c342c241c140c04); + const __m512i C3 = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0x3f372f271f170f07, 0x3e362e261e160e06); + const __m512i I8 = _mm512_set1_epi64(0x8040201008040201); + size_t i = 0; + for (; i + 8 <= size; i += 8) { + const __m512i a = _mm512_loadu_si512(&in[i * 8]); + const __m512i u = _mm512_gf2p8affine_epi64_epi8(I8, _mm512_shuffle_epi8(a, BSWAP64), 0x00); + const __m128i u0 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(C0, u)); + const __m128i u1 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(C1, u)); + const __m128i u2 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(C2, u)); + const __m128i u3 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(C3, u)); + _mm_storel_epi64((__m128i*)&out[0 * size + i], u0); + _mm_storel_epi64((__m128i*)&out[1 * size + i], _mm_srli_si128(u0, 8)); + _mm_storel_epi64((__m128i*)&out[2 * size + i], u1); + _mm_storel_epi64((__m128i*)&out[3 * size + i], _mm_srli_si128(u1, 8)); + _mm_storel_epi64((__m128i*)&out[4 * size + i], u2); + _mm_storel_epi64((__m128i*)&out[5 * size + i], _mm_srli_si128(u2, 8)); + _mm_storel_epi64((__m128i*)&out[6 * size + i], u3); + _mm_storel_epi64((__m128i*)&out[7 * size + i], _mm_srli_si128(u3, 8)); + } + if (i < size) { + const __mmask8 k = (1U << (size - i)) - 1; + const __m512i a = _mm512_maskz_loadu_epi64(k, &in[i * 8]); + const __m512i u = _mm512_gf2p8affine_epi64_epi8(I8, _mm512_shuffle_epi8(a, BSWAP64), 0x00); + const __m128i u0 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(C0, u)); + const __m128i u1 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(C1, u)); + const __m128i u2 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(C2, u)); + const __m128i u3 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(C3, u)); + _mm_mask_storeu_epi8(&out[0 * size + i], k, u0); + _mm_mask_storeu_epi8(&out[1 * size + i], k, _mm_srli_si128(u0, 8)); + _mm_mask_storeu_epi8(&out[2 * 
size + i], k, u1); + _mm_mask_storeu_epi8(&out[3 * size + i], k, _mm_srli_si128(u1, 8)); + _mm_mask_storeu_epi8(&out[4 * size + i], k, u2); + _mm_mask_storeu_epi8(&out[5 * size + i], k, _mm_srli_si128(u2, 8)); + _mm_mask_storeu_epi8(&out[6 * size + i], k, u3); + _mm_mask_storeu_epi8(&out[7 * size + i], k, _mm_srli_si128(u3, 8)); + } +} +#endif + +#if defined(__AVX512BW__) && defined(__AVX512VL__) || BITSHUF_USE_IFUNC +IMPLEMENT_STORE_FUNCTION(STORE_MASK64, __mmask64) + +NO_INLINE +ATTRIBUTE_TARGET("avx512bw,avx512vl") +static void bitshuf_trans_bit_avx512bw(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + size /= 8; + + const __m512i C0 = _mm512_set1_epi8(0x01); + const __m512i C1 = _mm512_set1_epi8(0x02); + const __m512i C2 = _mm512_set1_epi8(0x04); + const __m512i C3 = _mm512_set1_epi8(0x08); + const __m512i C4 = _mm512_set1_epi8(0x10); + const __m512i C5 = _mm512_set1_epi8(0x20); + const __m512i C6 = _mm512_set1_epi8(0x40); + size_t i = 0; + for (; i + 8 <= size; i += 8) { + const __m512i a = _mm512_loadu_si512(&in[i * 8]); + STORE_MASK64(&out[0 * size + i], _mm512_test_epi8_mask(a, C0)); + STORE_MASK64(&out[1 * size + i], _mm512_test_epi8_mask(a, C1)); + STORE_MASK64(&out[2 * size + i], _mm512_test_epi8_mask(a, C2)); + STORE_MASK64(&out[3 * size + i], _mm512_test_epi8_mask(a, C3)); + STORE_MASK64(&out[4 * size + i], _mm512_test_epi8_mask(a, C4)); + STORE_MASK64(&out[5 * size + i], _mm512_test_epi8_mask(a, C5)); + STORE_MASK64(&out[6 * size + i], _mm512_test_epi8_mask(a, C6)); + STORE_MASK64(&out[7 * size + i], _mm512_movepi8_mask(a)); + } + if (i < size) { + const __mmask8 k = (1U << (size - i)) - 1; + const __m512i a = _mm512_maskz_loadu_epi64(k, &in[i * 8]); + // clang-format off + _mm_mask_storeu_epi8(&out[0 * size + i], k, _mm_set_epi64x(0, _mm512_test_epi8_mask(a, C0))); + _mm_mask_storeu_epi8(&out[1 * size + i], k, _mm_set_epi64x(0, _mm512_test_epi8_mask(a, C1))); + _mm_mask_storeu_epi8(&out[2 * size + i], k, _mm_set_epi64x(0, _mm512_test_epi8_mask(a, C2))); + _mm_mask_storeu_epi8(&out[3 * size + i], k, _mm_set_epi64x(0, _mm512_test_epi8_mask(a, C3))); + _mm_mask_storeu_epi8(&out[4 * size + i], k, _mm_set_epi64x(0, _mm512_test_epi8_mask(a, C4))); + _mm_mask_storeu_epi8(&out[5 * size + i], k, _mm_set_epi64x(0, _mm512_test_epi8_mask(a, C5))); + _mm_mask_storeu_epi8(&out[6 * size + i], k, _mm_set_epi64x(0, _mm512_test_epi8_mask(a, C6))); + _mm_mask_storeu_epi8(&out[7 * size + i], k, _mm_set_epi64x(0, _mm512_movepi8_mask(a))); + // clang-format on + } +} +#endif + +#if defined(__AVX2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("avx2") +static void bitshuf_trans_bit_avx2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + size /= 8; + + size_t i = 0; + for (; i + 4 <= size; i += 4) { + const __m256i a = _mm256_loadu_si256((const __m256i*)&in[i * 8]); + __m256i u; + STORE_U32(&out[7 * size + i], _mm256_movemask_epi8(u = a)); + STORE_U32(&out[6 * size + i], _mm256_movemask_epi8(u = _mm256_add_epi8(X(u), u))); + STORE_U32(&out[5 * size + i], _mm256_movemask_epi8(u = _mm256_add_epi8(X(u), u))); + STORE_U32(&out[4 * size + i], _mm256_movemask_epi8(u = _mm256_add_epi8(X(u), u))); + STORE_U32(&out[3 * size + i], _mm256_movemask_epi8(u = _mm256_add_epi8(X(u), u))); + STORE_U32(&out[2 * size + i], _mm256_movemask_epi8(u = _mm256_add_epi8(X(u), u))); + STORE_U32(&out[1 * size + i], _mm256_movemask_epi8(u = _mm256_add_epi8(X(u), u))); + STORE_U32(&out[0 * size + i], _mm256_movemask_epi8(_mm256_add_epi8(X(u), u))); 
+ } + if (i + 2 <= size) { + const __m128i a = _mm_loadu_si128((const __m128i*)&in[i * 8]); + __m128i u; + STORE_U16(&out[7 * size + i], _mm_movemask_epi8(u = a)); + STORE_U16(&out[6 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U16(&out[5 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U16(&out[4 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U16(&out[3 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U16(&out[2 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U16(&out[1 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U16(&out[0 * size + i], _mm_movemask_epi8(_mm_add_epi8(X(u), u))); + i += 2; + } + if (i < size) { + const __m128i a = _mm_loadl_epi64((const __m128i*)&in[i * 8]); + __m128i u; + STORE_U8(&out[7 * size + i], _mm_movemask_epi8(u = a)); + STORE_U8(&out[6 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U8(&out[5 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U8(&out[4 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U8(&out[3 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U8(&out[2 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U8(&out[1 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U8(&out[0 * size + i], _mm_movemask_epi8(_mm_add_epi8(X(u), u))); + } +} +#endif + +#if defined(__SSE2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("sse2") +static void bitshuf_trans_bit_sse2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + size /= 8; + + size_t i = 0; + for (; i + 2 <= size; i += 2) { + const __m128i a = _mm_loadu_si128((const __m128i*)&in[i * 8]); + __m128i u; + STORE_U16(&out[7 * size + i], _mm_movemask_epi8(u = a)); + STORE_U16(&out[6 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U16(&out[5 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U16(&out[4 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U16(&out[3 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U16(&out[2 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U16(&out[1 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U16(&out[0 * size + i], _mm_movemask_epi8(_mm_add_epi8(X(u), u))); + } + if (i < size) { + const __m128i a = _mm_loadl_epi64((const __m128i*)&in[i * 8]); + __m128i u; + STORE_U8(&out[7 * size + i], _mm_movemask_epi8(u = a)); + STORE_U8(&out[6 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U8(&out[5 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U8(&out[4 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U8(&out[3 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U8(&out[2 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U8(&out[1 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U8(&out[0 * size + i], _mm_movemask_epi8(_mm_add_epi8(X(u), u))); + } +} +#endif + +#if defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__GFNI__) +#define bitshuf_trans_bit bitshuf_trans_bit_avx512vbmi_gfni +#elif BITSHUF_USE_IFUNC +IMPLEMENT_IFUNC(bitshuf_trans_bit_ifunc, + (char* restrict out, const char* restrict in, size_t size)) { + __builtin_cpu_init(); + + if (__builtin_cpu_supports("avx512vbmi") && __builtin_cpu_supports("avx512vl") && + __builtin_cpu_supports("gfni") && !__builtin_cpu_is("intel")) + { 
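+        // The ifunc resolver runs once at load time and returns the kernel to
+        // use. The VBMI+GFNI variant is chosen only on non-Intel CPUs that
+        // report these features; otherwise control falls through to the
+        // AVX-512BW, AVX2, SSE2, and scalar selections below.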
+ return bitshuf_trans_bit_avx512vbmi_gfni; + } +#if defined(__AVX512BW__) && defined(__AVX512VL__) + return bitshuf_trans_bit_avx512bw; +#else + if (__builtin_cpu_supports("avx512bw") && __builtin_cpu_supports("avx512vl")) + return bitshuf_trans_bit_avx512bw; +#if defined(__AVX2__) + return bitshuf_trans_bit_avx2; +#else + if (__builtin_cpu_supports("avx2")) + return bitshuf_trans_bit_avx2; +#if defined(__SSE2__) + return bitshuf_trans_bit_sse2; +#else + if (__builtin_cpu_supports("sse2")) + return bitshuf_trans_bit_sse2; + + return bitshuf_trans_bit; +#endif +#endif +#endif +} +#define bitshuf_trans_bit bitshuf_trans_bit_ifunc +#elif defined(__AVX512BW__) && defined(__AVX512VL__) +#define bitshuf_trans_bit bitshuf_trans_bit_avx512bw +#elif defined(__AVX2__) +#define bitshuf_trans_bit bitshuf_trans_bit_avx2 +#elif defined(__SSE2__) +#define bitshuf_trans_bit bitshuf_trans_bit_sse2 +#endif + +#if defined(__SSE2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("sse2") +static void bitshuf_trans_byte_sse2(char* restrict out, + const char* restrict in, + size_t size, + size_t elem_size) { + assert(size % 8 == 0); + + size_t j = 0; + for (; j + 8 <= elem_size; j += 8) { + for (size_t i = 0; i < size; i += 8) { + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[(i + 0) * elem_size + j]); + const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[(i + 1) * elem_size + j]); + const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[(i + 2) * elem_size + j]); + const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[(i + 3) * elem_size + j]); + const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[(i + 4) * elem_size + j]); + const __m128i a5 = _mm_loadl_epi64((const __m128i*)&in[(i + 5) * elem_size + j]); + const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[(i + 6) * elem_size + j]); + const __m128i a7 = _mm_loadl_epi64((const __m128i*)&in[(i + 7) * elem_size + j]); + __m128i u0 = _mm_unpacklo_epi8(a0, a1); + __m128i u1 = _mm_unpacklo_epi8(a2, a3); + __m128i u2 = _mm_unpacklo_epi8(a4, a5); + __m128i u3 = _mm_unpacklo_epi8(a6, a7); + const __m128i v0 = _mm_unpacklo_epi16(u0, u1); + const __m128i v1 = _mm_unpackhi_epi16(u0, u1); + const __m128i v2 = _mm_unpacklo_epi16(u2, u3); + const __m128i v3 = _mm_unpackhi_epi16(u2, u3); + u0 = _mm_unpacklo_epi32(v0, v2); + u1 = _mm_unpackhi_epi32(v0, v2); + u2 = _mm_unpacklo_epi32(v1, v3); + u3 = _mm_unpackhi_epi32(v1, v3); + _mm_storel_epi64((__m128i*)&out[(j + 0) * size + i], u0); + _mm_storel_epi64((__m128i*)&out[(j + 1) * size + i], _mm_srli_si128(u0, 8)); + _mm_storel_epi64((__m128i*)&out[(j + 2) * size + i], u1); + _mm_storel_epi64((__m128i*)&out[(j + 3) * size + i], _mm_srli_si128(u1, 8)); + _mm_storel_epi64((__m128i*)&out[(j + 4) * size + i], u2); + _mm_storel_epi64((__m128i*)&out[(j + 5) * size + i], _mm_srli_si128(u2, 8)); + _mm_storel_epi64((__m128i*)&out[(j + 6) * size + i], u3); + _mm_storel_epi64((__m128i*)&out[(j + 7) * size + i], _mm_srli_si128(u3, 8)); + } + } + if (j < elem_size) { + for (size_t i = 0; i + 8 < size; i += 8) { + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[(i + 0) * elem_size + j]); + const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[(i + 1) * elem_size + j]); + const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[(i + 2) * elem_size + j]); + const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[(i + 3) * elem_size + j]); + const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[(i + 4) * elem_size + j]); + const __m128i a5 = _mm_loadl_epi64((const __m128i*)&in[(i + 5) * elem_size + j]); + const 
__m128i a6 = _mm_loadl_epi64((const __m128i*)&in[(i + 6) * elem_size + j]); + const __m128i a7 = _mm_loadl_epi64((const __m128i*)&in[(i + 7) * elem_size + j]); + __m128i u0 = _mm_unpacklo_epi8(a0, a1); + __m128i u1 = _mm_unpacklo_epi8(a2, a3); + __m128i u2 = _mm_unpacklo_epi8(a4, a5); + __m128i u3 = _mm_unpacklo_epi8(a6, a7); + const __m128i v0 = _mm_unpacklo_epi16(u0, u1); + const __m128i v1 = _mm_unpackhi_epi16(u0, u1); + const __m128i v2 = _mm_unpacklo_epi16(u2, u3); + const __m128i v3 = _mm_unpackhi_epi16(u2, u3); + u0 = _mm_unpacklo_epi32(v0, v2); + u1 = _mm_unpackhi_epi32(v0, v2); + u2 = _mm_unpacklo_epi32(v1, v3); + u3 = _mm_unpackhi_epi32(v1, v3); + switch (elem_size - j) { + case 7: + _mm_storel_epi64((__m128i*)&out[(j + 6) * size + i], u3); + FALLTHROUGH; + case 6: + _mm_storel_epi64((__m128i*)&out[(j + 5) * size + i], _mm_srli_si128(u2, 8)); + FALLTHROUGH; + case 5: + _mm_storel_epi64((__m128i*)&out[(j + 4) * size + i], u2); + FALLTHROUGH; + case 4: + _mm_storel_epi64((__m128i*)&out[(j + 3) * size + i], _mm_srli_si128(u1, 8)); + FALLTHROUGH; + case 3: + _mm_storel_epi64((__m128i*)&out[(j + 2) * size + i], u1); + FALLTHROUGH; + case 2: + _mm_storel_epi64((__m128i*)&out[(j + 1) * size + i], _mm_srli_si128(u0, 8)); + FALLTHROUGH; + default: + _mm_storel_epi64((__m128i*)&out[(j + 0) * size + i], u0); + } + } + for (; j < elem_size; j++) { + for (size_t i = size - 8; i < size; i++) + out[j * size + i] = in[i * elem_size + j]; + } + } +} +#endif + +#if defined(__SSE2__) +#define bitshuf_trans_byte bitshuf_trans_byte_sse2 +#elif BITSHUF_USE_IFUNC +IMPLEMENT_IFUNC(bitshuf_trans_byte_ifunc, + (char* restrict out, const char* restrict in, size_t size, size_t elem_size)) { + __builtin_cpu_init(); + + if (__builtin_cpu_supports("sse2")) + return bitshuf_trans_byte_sse2; + + return bitshuf_trans_byte; +} +#define bitshuf_trans_byte bitshuf_trans_byte_ifunc +#endif + +#if defined(__AVX2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("avx2") +static void bitshuf_trans_byte_2_avx2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + + const __m256i MASK = _mm256_set1_epi16(0x00ff); + size_t i = 0; + for (; i + 32 <= size; i += 32) { + const __m256i a0 = _mm256_loadu_si256((const __m256i*)&in[i * 2]); + const __m256i a1 = _mm256_loadu_si256((const __m256i*)&in[i * 2 + 32]); + __m256i u0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a1), 1); + __m256i u1 = _mm256_permute2x128_si256(a0, a1, 0x31); + const __m256i v0 = _mm256_and_si256(u0, MASK); + const __m256i v1 = _mm256_and_si256(u1, MASK); + const __m256i v2 = _mm256_srli_epi16(u0, 8); + const __m256i v3 = _mm256_srli_epi16(u1, 8); + u0 = _mm256_packus_epi16(v0, v1); + u1 = _mm256_packus_epi16(v2, v3); + _mm256_storeu_si256((__m256i*)&out[0 * size + i], u0); + _mm256_storeu_si256((__m256i*)&out[1 * size + i], u1); + } + if (i + 16 <= size) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[i * 2]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[i * 2 + 16]); + const __m128i u0 = _mm_and_si128(a0, _mm256_castsi256_si128(MASK)); + const __m128i u1 = _mm_and_si128(a1, _mm256_castsi256_si128(MASK)); + const __m128i u2 = _mm_srli_epi16(a0, 8); + const __m128i u3 = _mm_srli_epi16(a1, 8); + const __m128i v0 = _mm_packus_epi16(u0, u1); + const __m128i v1 = _mm_packus_epi16(u2, u3); + _mm_storeu_si128((__m128i*)&out[0 * size + i], v0); + _mm_storeu_si128((__m128i*)&out[1 * size + i], v1); + i += 16; + } + if (i < size) { + const __m128i a = _mm_loadu_si128((const __m128i*)&in[i * 2]); + 
const __m128i u0 = _mm_and_si128(a, _mm256_castsi256_si128(MASK)); + const __m128i u1 = _mm_srli_epi16(a, 8); + const __m128i u = _mm_packus_epi16(u0, u1); + _mm_storel_epi64((__m128i*)&out[0 * size + i], u); + _mm_storel_epi64((__m128i*)&out[1 * size + i], _mm_srli_si128(u, 8)); + } +} +#endif + +#if defined(__SSE2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("sse2") +static void bitshuf_trans_byte_2_sse2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + + const __m128i MASK = _mm_set1_epi16(0x00ff); + size_t i = 0; + for (; i + 16 <= size; i += 16) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[i * 2]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[i * 2 + 16]); + const __m128i u0 = _mm_packus_epi16(_mm_and_si128(a0, MASK), _mm_and_si128(a1, MASK)); + const __m128i u1 = _mm_packus_epi16(_mm_srli_epi16(a0, 8), _mm_srli_epi16(a1, 8)); + _mm_storeu_si128((__m128i*)&out[0 * size + i], u0); + _mm_storeu_si128((__m128i*)&out[1 * size + i], u1); + } + if (i < size) { + const __m128i a = _mm_loadu_si128((const __m128i*)&in[i * 2]); + const __m128i u = _mm_packus_epi16(_mm_and_si128(a, MASK), _mm_srli_epi16(a, 8)); + _mm_storel_epi64((__m128i*)&out[0 * size + i], u); + _mm_storel_epi64((__m128i*)&out[1 * size + i], _mm_srli_si128(u, 8)); + } +} +#endif + +#if defined(__AVX2__) +#define bitshuf_trans_byte_2 bitshuf_trans_byte_2_avx2 +#elif BITSHUF_USE_IFUNC +IMPLEMENT_IFUNC(bitshuf_trans_byte_2_ifunc, + (char* restrict out, const char* restrict in, size_t size)) { + __builtin_cpu_init(); + + if (__builtin_cpu_supports("avx2")) + return bitshuf_trans_byte_2_avx2; +#if defined(__SSE2__) + return bitshuf_trans_byte_2_sse2; +#else + if (__builtin_cpu_supports("sse2")) + return bitshuf_trans_byte_2_sse2; + + return bitshuf_trans_byte_2; +#endif +} +#define bitshuf_trans_byte_2 bitshuf_trans_byte_2_ifunc +#elif defined(__SSE2__) +#define bitshuf_trans_byte_2 bitshuf_trans_byte_2_sse2 +#endif + +#if defined(__AVX2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("avx2") +static void bitshuf_trans_byte_4_avx2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + + const __m256i SHUF = _mm256_set_epi64x(0x0f0b07030e0a0602, 0x0d0905010c080400, + 0x0f0b07030e0a0602, 0x0d0905010c080400); + const __m256i PERM = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + size_t i = 0; + for (; i + 16 <= size; i += 16) { + const __m256i a0 = _mm256_loadu_si256((const __m256i*)&in[i * 4]); + const __m256i a1 = _mm256_loadu_si256((const __m256i*)&in[i * 4 + 32]); + const __m256i u0 = _mm256_shuffle_epi8(a0, SHUF); + const __m256i u1 = _mm256_shuffle_epi8(a1, SHUF); + const __m256i v0 = _mm256_permutevar8x32_epi32(_mm256_unpacklo_epi32(u0, u1), PERM); + const __m256i v1 = _mm256_permutevar8x32_epi32(_mm256_unpackhi_epi32(u0, u1), PERM); + _mm_storeu_si128((__m128i*)&out[0 * size + i], _mm256_castsi256_si128(v0)); + _mm_storeu_si128((__m128i*)&out[1 * size + i], _mm256_extracti128_si256(v0, 1)); + _mm_storeu_si128((__m128i*)&out[2 * size + i], _mm256_castsi256_si128(v1)); + _mm_storeu_si128((__m128i*)&out[3 * size + i], _mm256_extracti128_si256(v1, 1)); + } + if (i + 8 <= size) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[i * 4]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[i * 4 + 16]); + const __m128i u0 = _mm_shuffle_epi8(a0, _mm256_castsi256_si128(SHUF)); + const __m128i u1 = _mm_shuffle_epi8(a1, _mm256_castsi256_si128(SHUF)); + const __m128i v0 = _mm_unpacklo_epi32(u0, u1); + const __m128i v1 = 
_mm_unpackhi_epi32(u0, u1); + _mm_storel_epi64((__m128i*)&out[0 * size + i], v0); + _mm_storel_epi64((__m128i*)&out[1 * size + i], _mm_srli_si128(v0, 8)); + _mm_storel_epi64((__m128i*)&out[2 * size + i], v1); + _mm_storel_epi64((__m128i*)&out[3 * size + i], _mm_srli_si128(v1, 8)); + } +} +#endif + +#if defined(__SSE2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("sse2") +static void bitshuf_trans_byte_4_sse2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + + const __m128i MASK = _mm_set1_epi16(0x00ff); + size_t i = 0; + for (; i + 16 <= size; i += 16) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[i * 4 + 16 * 0]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[i * 4 + 16 * 1]); + const __m128i a2 = _mm_loadu_si128((const __m128i*)&in[i * 4 + 16 * 2]); + const __m128i a3 = _mm_loadu_si128((const __m128i*)&in[i * 4 + 16 * 3]); + const __m128i u0 = _mm_packus_epi16(_mm_and_si128(a0, MASK), _mm_and_si128(a1, MASK)); + const __m128i u1 = _mm_packus_epi16(_mm_and_si128(a2, MASK), _mm_and_si128(a3, MASK)); + const __m128i u2 = _mm_packus_epi16(_mm_srli_epi16(a0, 8), _mm_srli_epi16(a1, 8)); + const __m128i u3 = _mm_packus_epi16(_mm_srli_epi16(a2, 8), _mm_srli_epi16(a3, 8)); + const __m128i v0 = _mm_packus_epi16(_mm_and_si128(u0, MASK), _mm_and_si128(u1, MASK)); + const __m128i v1 = _mm_packus_epi16(_mm_and_si128(u2, MASK), _mm_and_si128(u3, MASK)); + const __m128i v2 = _mm_packus_epi16(_mm_srli_epi16(u0, 8), _mm_srli_epi16(u1, 8)); + const __m128i v3 = _mm_packus_epi16(_mm_srli_epi16(u2, 8), _mm_srli_epi16(u3, 8)); + _mm_storeu_si128((__m128i*)&out[0 * size + i], v0); + _mm_storeu_si128((__m128i*)&out[1 * size + i], v1); + _mm_storeu_si128((__m128i*)&out[2 * size + i], v2); + _mm_storeu_si128((__m128i*)&out[3 * size + i], v3); + } + if (i + 8 <= size) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[i * 4]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[i * 4 + 16]); + const __m128i u0 = _mm_packus_epi16(_mm_and_si128(a0, MASK), _mm_and_si128(a1, MASK)); + const __m128i u1 = _mm_packus_epi16(_mm_srli_epi16(a0, 8), _mm_srli_epi16(a1, 8)); + const __m128i v0 = _mm_packus_epi16(_mm_and_si128(u0, MASK), _mm_and_si128(u1, MASK)); + const __m128i v1 = _mm_packus_epi16(_mm_srli_epi16(u0, 8), _mm_srli_epi16(u1, 8)); + _mm_storel_epi64((__m128i*)&out[0 * size + i], v0); + _mm_storel_epi64((__m128i*)&out[1 * size + i], _mm_srli_si128(v0, 8)); + _mm_storel_epi64((__m128i*)&out[2 * size + i], v1); + _mm_storel_epi64((__m128i*)&out[3 * size + i], _mm_srli_si128(v1, 8)); + } +} +#endif + +#if defined(__AVX2__) +#define bitshuf_trans_byte_4 bitshuf_trans_byte_4_avx2 +#elif BITSHUF_USE_IFUNC +IMPLEMENT_IFUNC(bitshuf_trans_byte_4_ifunc, + (char* restrict out, const char* restrict in, size_t size)) { + __builtin_cpu_init(); + + if (__builtin_cpu_supports("avx2")) + return bitshuf_trans_byte_4_avx2; +#if defined(__SSE2__) + return bitshuf_trans_byte_4_sse2; +#else + if (__builtin_cpu_supports("sse2")) + return bitshuf_trans_byte_4_sse2; + + return bitshuf_trans_byte_4; +#endif +} +#define bitshuf_trans_byte_4 bitshuf_trans_byte_4_ifunc +#elif defined(__SSE2__) +#define bitshuf_trans_byte_4 bitshuf_trans_byte_4_sse2 +#endif + +#if defined(__SSE2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("sse2") +static void bitshuf_trans_byte_8_sse2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + + size_t i = 0; + for (; i + 16 <= size; i += 16) { + const __m128i a0 = _mm_loadu_si128((const 
__m128i*)&in[i * 8 + 16 * 0]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 1]); + const __m128i a2 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 2]); + const __m128i a3 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 3]); + const __m128i a4 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 4]); + const __m128i a5 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 5]); + const __m128i a6 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 6]); + const __m128i a7 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 7]); + __m128i u0 = _mm_unpacklo_epi8(a0, a1); + __m128i u1 = _mm_unpackhi_epi8(a0, a1); + __m128i u2 = _mm_unpacklo_epi8(a2, a3); + __m128i u3 = _mm_unpackhi_epi8(a2, a3); + __m128i u4 = _mm_unpacklo_epi8(a4, a5); + __m128i u5 = _mm_unpackhi_epi8(a4, a5); + __m128i u6 = _mm_unpacklo_epi8(a6, a7); + __m128i u7 = _mm_unpackhi_epi8(a6, a7); + __m128i v0 = _mm_unpacklo_epi8(u0, u1); + __m128i v1 = _mm_unpackhi_epi8(u0, u1); + __m128i v2 = _mm_unpacklo_epi8(u2, u3); + __m128i v3 = _mm_unpackhi_epi8(u2, u3); + __m128i v4 = _mm_unpacklo_epi8(u4, u5); + __m128i v5 = _mm_unpackhi_epi8(u4, u5); + __m128i v6 = _mm_unpacklo_epi8(u6, u7); + __m128i v7 = _mm_unpackhi_epi8(u6, u7); + u0 = _mm_unpacklo_epi32(v0, v2); + u1 = _mm_unpackhi_epi32(v0, v2); + u2 = _mm_unpacklo_epi32(v1, v3); + u3 = _mm_unpackhi_epi32(v1, v3); + u4 = _mm_unpacklo_epi32(v4, v6); + u5 = _mm_unpackhi_epi32(v4, v6); + u6 = _mm_unpacklo_epi32(v5, v7); + u7 = _mm_unpackhi_epi32(v5, v7); + v0 = _mm_unpacklo_epi64(u0, u4); + v1 = _mm_unpackhi_epi64(u0, u4); + v2 = _mm_unpacklo_epi64(u1, u5); + v3 = _mm_unpackhi_epi64(u1, u5); + v4 = _mm_unpacklo_epi64(u2, u6); + v5 = _mm_unpackhi_epi64(u2, u6); + v6 = _mm_unpacklo_epi64(u3, u7); + v7 = _mm_unpackhi_epi64(u3, u7); + _mm_storeu_si128((__m128i*)&out[0 * size + i], v0); + _mm_storeu_si128((__m128i*)&out[1 * size + i], v1); + _mm_storeu_si128((__m128i*)&out[2 * size + i], v2); + _mm_storeu_si128((__m128i*)&out[3 * size + i], v3); + _mm_storeu_si128((__m128i*)&out[4 * size + i], v4); + _mm_storeu_si128((__m128i*)&out[5 * size + i], v5); + _mm_storeu_si128((__m128i*)&out[6 * size + i], v6); + _mm_storeu_si128((__m128i*)&out[7 * size + i], v7); + } + if (i + 8 <= size) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 0]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 1]); + const __m128i a2 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 2]); + const __m128i a3 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 3]); + __m128i u0 = _mm_unpacklo_epi8(a0, a1); + __m128i u1 = _mm_unpackhi_epi8(a0, a1); + __m128i u2 = _mm_unpacklo_epi8(a2, a3); + __m128i u3 = _mm_unpackhi_epi8(a2, a3); + const __m128i v0 = _mm_unpacklo_epi8(u0, u1); + const __m128i v1 = _mm_unpackhi_epi8(u0, u1); + const __m128i v2 = _mm_unpacklo_epi8(u2, u3); + const __m128i v3 = _mm_unpackhi_epi8(u2, u3); + u0 = _mm_unpacklo_epi32(v0, v2); + u1 = _mm_unpackhi_epi32(v0, v2); + u2 = _mm_unpacklo_epi32(v1, v3); + u3 = _mm_unpackhi_epi32(v1, v3); + _mm_storel_epi64((__m128i*)&out[0 * size + i], u0); + _mm_storel_epi64((__m128i*)&out[1 * size + i], _mm_srli_si128(u0, 8)); + _mm_storel_epi64((__m128i*)&out[2 * size + i], u1); + _mm_storel_epi64((__m128i*)&out[3 * size + i], _mm_srli_si128(u1, 8)); + _mm_storel_epi64((__m128i*)&out[4 * size + i], u2); + _mm_storel_epi64((__m128i*)&out[5 * size + i], _mm_srli_si128(u2, 8)); + _mm_storel_epi64((__m128i*)&out[6 * size + i], u3); + _mm_storel_epi64((__m128i*)&out[7 * size + i], _mm_srli_si128(u3, 8)); 
+ } +} +#endif + +#if defined(__SSE2__) +#define bitshuf_trans_byte_8 bitshuf_trans_byte_8_sse2 +#elif BITSHUF_USE_IFUNC +IMPLEMENT_IFUNC(bitshuf_trans_byte_8_ifunc, + (char* restrict out, const char* restrict in, size_t size)) { + __builtin_cpu_init(); + + if (__builtin_cpu_supports("sse2")) + return bitshuf_trans_byte_8_sse2; + + return bitshuf_trans_byte_8; +} +#define bitshuf_trans_byte_8 bitshuf_trans_byte_8_ifunc +#endif + +#if defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__GFNI__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("avx512vbmi,avx512vl,gfni") +static void bitshuf_untrans_bit_avx512vbmi_gfni(char* restrict out, + const char* restrict in, + size_t size) { + assert(size % 8 == 0); + size /= 8; + + const __m512i C = _mm512_set_epi64(0x070f171f474f575f, 0x060e161e464e565e, 0x050d151d454d555d, + 0x040c141c444c545c, 0x030b131b434b535b, 0x020a121a424a525a, + 0x0109111941495159, 0x0008101840485058); + const __m512i I8 = _mm512_set1_epi64(0x8040201008040201); + size_t i = 0; + for (; i + 8 <= size; i += 8) { +#if defined(__x86_64__) || defined(_M_X64) + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]); + const int64_t a1 = LOAD_I64(&in[1 * size + i]); + const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[2 * size + i]); + const int64_t a3 = LOAD_I64(&in[3 * size + i]); + const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[4 * size + i]); + const int64_t a5 = LOAD_I64(&in[5 * size + i]); + const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[6 * size + i]); + const int64_t a7 = LOAD_I64(&in[7 * size + i]); + const __m128i u0 = _mm_insert_epi64(a0, a1, 1); + const __m128i u1 = _mm_insert_epi64(a2, a3, 1); + const __m128i u2 = _mm_insert_epi64(a4, a5, 1); + const __m128i u3 = _mm_insert_epi64(a6, a7, 1); +#else + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]); + const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[1 * size + i]); + const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[2 * size + i]); + const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[3 * size + i]); + const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[4 * size + i]); + const __m128i a5 = _mm_loadl_epi64((const __m128i*)&in[5 * size + i]); + const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[6 * size + i]); + const __m128i a7 = _mm_loadl_epi64((const __m128i*)&in[7 * size + i]); + const __m128i u0 = _mm_unpacklo_epi64(a0, a1); + const __m128i u1 = _mm_unpacklo_epi64(a2, a3); + const __m128i u2 = _mm_unpacklo_epi64(a4, a5); + const __m128i u3 = _mm_unpacklo_epi64(a6, a7); +#endif + const __m256i v0 = _mm256_inserti128_si256(_mm256_castsi128_si256(u0), u1, 1); + const __m256i v1 = _mm256_inserti128_si256(_mm256_castsi128_si256(u2), u3, 1); + __m512i u; + u = _mm512_permutex2var_epi8(_mm512_castsi256_si512(v0), C, _mm512_castsi256_si512(v1)); + u = _mm512_gf2p8affine_epi64_epi8(I8, u, 0x00); + _mm512_storeu_si512(&out[i * 8], u); + } + if (i < size) { + const __mmask8 k = (1U << (size - i)) - 1; + const __m128i a0 = _mm_maskz_loadu_epi8(k, &in[0 * size + i]); + const __m128i a1 = _mm_maskz_loadu_epi8(k, &in[1 * size + i]); + const __m128i a2 = _mm_maskz_loadu_epi8(k, &in[2 * size + i]); + const __m128i a3 = _mm_maskz_loadu_epi8(k, &in[3 * size + i]); + const __m128i a4 = _mm_maskz_loadu_epi8(k, &in[4 * size + i]); + const __m128i a5 = _mm_maskz_loadu_epi8(k, &in[5 * size + i]); + const __m128i a6 = _mm_maskz_loadu_epi8(k, &in[6 * size + i]); + const __m128i a7 = _mm_maskz_loadu_epi8(k, &in[7 * size + i]); + const __m128i u0 = 
_mm_unpacklo_epi64(a0, a1); + const __m128i u1 = _mm_unpacklo_epi64(a2, a3); + const __m128i u2 = _mm_unpacklo_epi64(a4, a5); + const __m128i u3 = _mm_unpacklo_epi64(a6, a7); + const __m256i v0 = _mm256_inserti128_si256(_mm256_castsi128_si256(u0), u1, 1); + const __m256i v1 = _mm256_inserti128_si256(_mm256_castsi128_si256(u2), u3, 1); + __m512i u; + u = _mm512_permutex2var_epi8(_mm512_castsi256_si512(v0), C, _mm512_castsi256_si512(v1)); + u = _mm512_gf2p8affine_epi64_epi8(I8, u, 0x00); + _mm512_mask_storeu_epi64(&out[i * 8], k, u); + } +} +#endif + +#if defined(__AVX512BW__) && defined(__AVX512VL__) || BITSHUF_USE_IFUNC +IMPLEMENT_LOAD_FUNCTION(LOAD_MASK64, __mmask64) + +ATTRIBUTE_TARGET("sse2") +static ALWAYS_INLINE __mmask64 MM_CVTSI128_MASK64(__m128i a) { +#if defined(__x86_64__) || defined(_M_X64) + return _mm_cvtsi128_si64(a); +#else + __mmask64 k; + _mm_storel_epi64((__m128i*)&k, a); + return k; +#endif +} + +NO_INLINE +ATTRIBUTE_TARGET("avx512bw,avx512vl") +static void bitshuf_untrans_bit_avx512bw(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + size /= 8; + + const __m512i C0 = _mm512_set1_epi8(0x01); + const __m512i C1 = _mm512_set1_epi8(0x02); + const __m512i C2 = _mm512_set1_epi8(0x04); + const __m512i C3 = _mm512_set1_epi8(0x08); + const __m512i C4 = _mm512_set1_epi8(0x10); + const __m512i C5 = _mm512_set1_epi8(0x20); + const __m512i C6 = _mm512_set1_epi8(0x40); + const __m512i C7 = _mm512_set1_epi8(-128); + size_t i = 0; + for (; i + 8 <= size; i += 8) { + __m512i u = _mm512_maskz_mov_epi8(LOAD_MASK64(&in[0 * size + i]), C0); + u = _mm512_mask_add_epi8(X(u), LOAD_MASK64(&in[1 * size + i]), u, C1); + u = _mm512_mask_add_epi8(X(u), LOAD_MASK64(&in[2 * size + i]), u, C2); + u = _mm512_mask_add_epi8(X(u), LOAD_MASK64(&in[3 * size + i]), u, C3); + u = _mm512_mask_add_epi8(X(u), LOAD_MASK64(&in[4 * size + i]), u, C4); + u = _mm512_mask_add_epi8(X(u), LOAD_MASK64(&in[5 * size + i]), u, C5); + u = _mm512_mask_add_epi8(X(u), LOAD_MASK64(&in[6 * size + i]), u, C6); + u = _mm512_mask_add_epi8(X(u), LOAD_MASK64(&in[7 * size + i]), u, C7); + _mm512_storeu_si512(&out[i * 8], u); + } + if (i < size) { + const __mmask8 k = (1U << (size - i)) - 1; + const __mmask64 a0 = MM_CVTSI128_MASK64(_mm_maskz_loadu_epi8(k, &in[0 * size + i])); + const __mmask64 a1 = MM_CVTSI128_MASK64(_mm_maskz_loadu_epi8(k, &in[1 * size + i])); + const __mmask64 a2 = MM_CVTSI128_MASK64(_mm_maskz_loadu_epi8(k, &in[2 * size + i])); + const __mmask64 a3 = MM_CVTSI128_MASK64(_mm_maskz_loadu_epi8(k, &in[3 * size + i])); + const __mmask64 a4 = MM_CVTSI128_MASK64(_mm_maskz_loadu_epi8(k, &in[4 * size + i])); + const __mmask64 a5 = MM_CVTSI128_MASK64(_mm_maskz_loadu_epi8(k, &in[5 * size + i])); + const __mmask64 a6 = MM_CVTSI128_MASK64(_mm_maskz_loadu_epi8(k, &in[6 * size + i])); + const __mmask64 a7 = MM_CVTSI128_MASK64(_mm_maskz_loadu_epi8(k, &in[7 * size + i])); + __m512i u = _mm512_maskz_mov_epi8(a0, C0); + u = _mm512_mask_add_epi8(X(u), a1, u, C1); + u = _mm512_mask_add_epi8(X(u), a2, u, C2); + u = _mm512_mask_add_epi8(X(u), a3, u, C3); + u = _mm512_mask_add_epi8(X(u), a4, u, C4); + u = _mm512_mask_add_epi8(X(u), a5, u, C5); + u = _mm512_mask_add_epi8(X(u), a6, u, C6); + u = _mm512_mask_add_epi8(X(u), a7, u, C7); + _mm512_mask_storeu_epi64(&out[i * 8], k, u); + } +} +#endif + +#if defined(__AVX2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("avx2") +static void bitshuf_untrans_bit_avx2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + size /= 
8; + + const __m256i PERM = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + const __m256i MASK0 = _mm256_set1_epi64x(0x00aa00aa00aa00aa); + const __m256i MASK1 = _mm256_set1_epi64x(0x0000cccc0000cccc); + const __m256i MASK2 = _mm256_set1_epi64x(0x00000000f0f0f0f0); + size_t i = 0; + for (; i + 8 <= size; i += 8) { + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]); + const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[1 * size + i]); + const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[2 * size + i]); + const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[3 * size + i]); + const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[4 * size + i]); + const __m128i a5 = _mm_loadl_epi64((const __m128i*)&in[5 * size + i]); + const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[6 * size + i]); + const __m128i a7 = _mm_loadl_epi64((const __m128i*)&in[7 * size + i]); + __m256i u0 = MM256_SETR_M128I(_mm_unpacklo_epi8(a0, a1), _mm_unpacklo_epi8(a4, a5)); + __m256i u1 = MM256_SETR_M128I(_mm_unpacklo_epi8(a2, a3), _mm_unpacklo_epi8(a6, a7)); + __m256i v0 = _mm256_unpacklo_epi16(u0, u1); + __m256i v1 = _mm256_unpackhi_epi16(u0, u1); + u0 = _mm256_permutevar8x32_epi32(v0, PERM); + u1 = _mm256_permutevar8x32_epi32(v1, PERM); + v0 = _mm256_and_si256(_mm256_xor_si256(u0, _mm256_srli_epi64(u0, 07)), MASK0); + v1 = _mm256_and_si256(_mm256_xor_si256(u1, _mm256_srli_epi64(u1, 07)), MASK0); + u0 = _mm256_xor_si256(_mm256_xor_si256(u0, _mm256_slli_epi64(v0, 07)), v0); + u1 = _mm256_xor_si256(_mm256_xor_si256(u1, _mm256_slli_epi64(v1, 07)), v1); + v0 = _mm256_and_si256(_mm256_xor_si256(u0, _mm256_srli_epi64(u0, 14)), MASK1); + v1 = _mm256_and_si256(_mm256_xor_si256(u1, _mm256_srli_epi64(u1, 14)), MASK1); + u0 = _mm256_xor_si256(_mm256_xor_si256(u0, _mm256_slli_epi64(v0, 14)), v0); + u1 = _mm256_xor_si256(_mm256_xor_si256(u1, _mm256_slli_epi64(v1, 14)), v1); + v0 = _mm256_and_si256(_mm256_xor_si256(u0, _mm256_srli_epi64(u0, 28)), MASK2); + v1 = _mm256_and_si256(_mm256_xor_si256(u1, _mm256_srli_epi64(u1, 28)), MASK2); + u0 = _mm256_xor_si256(_mm256_xor_si256(u0, _mm256_slli_epi64(v0, 28)), v0); + u1 = _mm256_xor_si256(_mm256_xor_si256(u1, _mm256_slli_epi64(v1, 28)), v1); + _mm256_storeu_si256((__m256i*)&out[i * 8], u0); + _mm256_storeu_si256((__m256i*)&out[i * 8 + 32], u1); + } + if (i < size) + bitshuf_untrans_bit_tail(out, in, size * 8, i); +} +#endif + +#if defined(__SSE2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("sse2") +static void bitshuf_untrans_bit_sse2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + size /= 8; + + const __m128i MASK0 = _mm_set1_epi64x(0x00aa00aa00aa00aa); + const __m128i MASK1 = _mm_set1_epi64x(0x0000cccc0000cccc); + const __m128i MASK2 = _mm_set1_epi64x(0x00000000f0f0f0f0); + size_t i = 0; + for (; i + 8 <= size; i += 8) { + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]); + const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[1 * size + i]); + const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[2 * size + i]); + const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[3 * size + i]); + const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[4 * size + i]); + const __m128i a5 = _mm_loadl_epi64((const __m128i*)&in[5 * size + i]); + const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[6 * size + i]); + const __m128i a7 = _mm_loadl_epi64((const __m128i*)&in[7 * size + i]); + __m128i u0 = _mm_unpacklo_epi8(a0, a1); + __m128i u1 = _mm_unpacklo_epi8(a2, a3); + __m128i u2 = _mm_unpacklo_epi8(a4, 
a5); + __m128i u3 = _mm_unpacklo_epi8(a6, a7); + __m128i v0 = _mm_unpacklo_epi16(u0, u1); + __m128i v1 = _mm_unpackhi_epi16(u0, u1); + __m128i v2 = _mm_unpacklo_epi16(u2, u3); + __m128i v3 = _mm_unpackhi_epi16(u2, u3); + u0 = _mm_unpacklo_epi32(v0, v2); + u1 = _mm_unpackhi_epi32(v0, v2); + u2 = _mm_unpacklo_epi32(v1, v3); + u3 = _mm_unpackhi_epi32(v1, v3); + v0 = _mm_and_si128(_mm_xor_si128(u0, _mm_srli_epi64(u0, 07)), MASK0); + v1 = _mm_and_si128(_mm_xor_si128(u1, _mm_srli_epi64(u1, 07)), MASK0); + v2 = _mm_and_si128(_mm_xor_si128(u2, _mm_srli_epi64(u2, 07)), MASK0); + v3 = _mm_and_si128(_mm_xor_si128(u3, _mm_srli_epi64(u3, 07)), MASK0); + u0 = _mm_xor_si128(_mm_xor_si128(u0, _mm_slli_epi64(v0, 07)), v0); + u1 = _mm_xor_si128(_mm_xor_si128(u1, _mm_slli_epi64(v1, 07)), v1); + u2 = _mm_xor_si128(_mm_xor_si128(u2, _mm_slli_epi64(v2, 07)), v2); + u3 = _mm_xor_si128(_mm_xor_si128(u3, _mm_slli_epi64(v3, 07)), v3); + v0 = _mm_and_si128(_mm_xor_si128(u0, _mm_srli_epi64(u0, 14)), MASK1); + v1 = _mm_and_si128(_mm_xor_si128(u1, _mm_srli_epi64(u1, 14)), MASK1); + v2 = _mm_and_si128(_mm_xor_si128(u2, _mm_srli_epi64(u2, 14)), MASK1); + v3 = _mm_and_si128(_mm_xor_si128(u3, _mm_srli_epi64(u3, 14)), MASK1); + u0 = _mm_xor_si128(_mm_xor_si128(u0, _mm_slli_epi64(v0, 14)), v0); + u1 = _mm_xor_si128(_mm_xor_si128(u1, _mm_slli_epi64(v1, 14)), v1); + u2 = _mm_xor_si128(_mm_xor_si128(u2, _mm_slli_epi64(v2, 14)), v2); + u3 = _mm_xor_si128(_mm_xor_si128(u3, _mm_slli_epi64(v3, 14)), v3); + v0 = _mm_and_si128(_mm_xor_si128(u0, _mm_srli_epi64(u0, 28)), MASK2); + v1 = _mm_and_si128(_mm_xor_si128(u1, _mm_srli_epi64(u1, 28)), MASK2); + v2 = _mm_and_si128(_mm_xor_si128(u2, _mm_srli_epi64(u2, 28)), MASK2); + v3 = _mm_and_si128(_mm_xor_si128(u3, _mm_srli_epi64(u3, 28)), MASK2); + u0 = _mm_xor_si128(_mm_xor_si128(u0, _mm_slli_epi64(v0, 28)), v0); + u1 = _mm_xor_si128(_mm_xor_si128(u1, _mm_slli_epi64(v1, 28)), v1); + u2 = _mm_xor_si128(_mm_xor_si128(u2, _mm_slli_epi64(v2, 28)), v2); + u3 = _mm_xor_si128(_mm_xor_si128(u3, _mm_slli_epi64(v3, 28)), v3); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 0], u0); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 1], u1); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 2], u2); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 3], u3); + } + if (i < size) + bitshuf_untrans_bit_tail(out, in, size * 8, i); +} +#endif + +#if defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__GFNI__) +#define bitshuf_untrans_bit bitshuf_untrans_bit_avx512vbmi_gfni +#elif BITSHUF_USE_IFUNC +IMPLEMENT_IFUNC(bitshuf_untrans_bit_ifunc, + (char* restrict out, const char* restrict in, size_t size)) { + __builtin_cpu_init(); + + if (__builtin_cpu_supports("avx512vbmi") && __builtin_cpu_supports("avx512vl") && + __builtin_cpu_supports("gfni")) + { + return bitshuf_untrans_bit_avx512vbmi_gfni; + } +#if defined(__AVX512BW__) && defined(__AVX512VL__) + return bitshuf_untrans_bit_avx512bw; +#else + if (__builtin_cpu_supports("avx512bw") && __builtin_cpu_supports("avx512vl")) + return bitshuf_untrans_bit_avx512bw; +#if defined(__AVX2__) + return bitshuf_untrans_bit_avx2; +#else + if (__builtin_cpu_supports("avx2")) + return bitshuf_untrans_bit_avx2; +#if defined(__SSE2__) + return bitshuf_untrans_bit_sse2; +#else + if (__builtin_cpu_supports("sse2")) + return bitshuf_untrans_bit_sse2; + + return bitshuf_untrans_bit; +#endif +#endif +#endif +} +#define bitshuf_untrans_bit bitshuf_untrans_bit_ifunc +#elif defined(__AVX512BW__) && defined(__AVX512VL__) +#define bitshuf_untrans_bit bitshuf_untrans_bit_avx512bw +#elif 
defined(__AVX2__) +#define bitshuf_untrans_bit bitshuf_untrans_bit_avx2 +#elif defined(__SSE2__) +#define bitshuf_untrans_bit bitshuf_untrans_bit_sse2 +#endif + +#if defined(__SSE2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("sse2") +static void bitshuf_untrans_byte_sse2(char* restrict out, + const char* restrict in, + size_t size, + size_t elem_size) { + assert(size % 8 == 0); + + size_t j = 0; + for (; j + 8 <= elem_size; j += 8) { + for (size_t i = 0; i < size; i += 8) { + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[(j + 0) * size + i]); + const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[(j + 1) * size + i]); + const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[(j + 2) * size + i]); + const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[(j + 3) * size + i]); + const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[(j + 4) * size + i]); + const __m128i a5 = _mm_loadl_epi64((const __m128i*)&in[(j + 5) * size + i]); + const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[(j + 6) * size + i]); + const __m128i a7 = _mm_loadl_epi64((const __m128i*)&in[(j + 7) * size + i]); + __m128i u0 = _mm_unpacklo_epi8(a0, a1); + __m128i u1 = _mm_unpacklo_epi8(a2, a3); + __m128i u2 = _mm_unpacklo_epi8(a4, a5); + __m128i u3 = _mm_unpacklo_epi8(a6, a7); + const __m128i v0 = _mm_unpacklo_epi16(u0, u1); + const __m128i v1 = _mm_unpackhi_epi16(u0, u1); + const __m128i v2 = _mm_unpacklo_epi16(u2, u3); + const __m128i v3 = _mm_unpackhi_epi16(u2, u3); + u0 = _mm_unpacklo_epi32(v0, v2); + u1 = _mm_unpackhi_epi32(v0, v2); + u2 = _mm_unpacklo_epi32(v1, v3); + u3 = _mm_unpackhi_epi32(v1, v3); + _mm_storel_epi64((__m128i*)&out[(i + 0) * elem_size + j], u0); + _mm_storel_epi64((__m128i*)&out[(i + 1) * elem_size + j], _mm_srli_si128(u0, 8)); + _mm_storel_epi64((__m128i*)&out[(i + 2) * elem_size + j], u1); + _mm_storel_epi64((__m128i*)&out[(i + 3) * elem_size + j], _mm_srli_si128(u1, 8)); + _mm_storel_epi64((__m128i*)&out[(i + 4) * elem_size + j], u2); + _mm_storel_epi64((__m128i*)&out[(i + 5) * elem_size + j], _mm_srli_si128(u2, 8)); + _mm_storel_epi64((__m128i*)&out[(i + 6) * elem_size + j], u3); + _mm_storel_epi64((__m128i*)&out[(i + 7) * elem_size + j], _mm_srli_si128(u3, 8)); + } + } + if (j < elem_size) { + const size_t j0 = (j + 0) * size; + const size_t j1 = (j + 1) < elem_size ? 
(j + 1) * size : 1; + const size_t j2 = (j + 2) % elem_size * size + (j + 2) / elem_size; + const size_t j3 = (j + 3) % elem_size * size + (j + 3) / elem_size; + const size_t j4 = (j + 4) % elem_size * size + (j + 4) / elem_size; + const size_t j5 = (j + 5) % elem_size * size + (j + 5) / elem_size; + const size_t j6 = (j + 6) % elem_size * size + (j + 6) / elem_size; + const size_t j7 = (j + 7) % elem_size * size + (j + 7) / elem_size; + for (size_t i = 0; i + 8 < size; i += 8) { + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[j0 + i]); + const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[j1 + i]); + const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[j2 + i]); + const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[j3 + i]); + const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[j4 + i]); + const __m128i a5 = _mm_loadl_epi64((const __m128i*)&in[j5 + i]); + const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[j6 + i]); + const __m128i a7 = _mm_loadl_epi64((const __m128i*)&in[j7 + i]); + __m128i u0 = _mm_unpacklo_epi8(a0, a1); + __m128i u1 = _mm_unpacklo_epi8(a2, a3); + __m128i u2 = _mm_unpacklo_epi8(a4, a5); + __m128i u3 = _mm_unpacklo_epi8(a6, a7); + const __m128i v0 = _mm_unpacklo_epi16(u0, u1); + const __m128i v1 = _mm_unpackhi_epi16(u0, u1); + const __m128i v2 = _mm_unpacklo_epi16(u2, u3); + const __m128i v3 = _mm_unpackhi_epi16(u2, u3); + u0 = _mm_unpacklo_epi32(v0, v2); + u1 = _mm_unpackhi_epi32(v0, v2); + u2 = _mm_unpacklo_epi32(v1, v3); + u3 = _mm_unpackhi_epi32(v1, v3); + _mm_storel_epi64((__m128i*)&out[(i + 0) * elem_size + j], u0); + _mm_storel_epi64((__m128i*)&out[(i + 1) * elem_size + j], _mm_srli_si128(u0, 8)); + _mm_storel_epi64((__m128i*)&out[(i + 2) * elem_size + j], u1); + _mm_storel_epi64((__m128i*)&out[(i + 3) * elem_size + j], _mm_srli_si128(u1, 8)); + _mm_storel_epi64((__m128i*)&out[(i + 4) * elem_size + j], u2); + _mm_storel_epi64((__m128i*)&out[(i + 5) * elem_size + j], _mm_srli_si128(u2, 8)); + _mm_storel_epi64((__m128i*)&out[(i + 6) * elem_size + j], u3); + _mm_storel_epi64((__m128i*)&out[(i + 7) * elem_size + j], _mm_srli_si128(u3, 8)); + } + for (; j < elem_size; j++) { + for (size_t i = size - 8; i < size; i++) + out[i * elem_size + j] = in[j * size + i]; + } + } +} +#endif + +#if defined(__SSE2__) +#define bitshuf_untrans_byte bitshuf_untrans_byte_sse2 +#elif BITSHUF_USE_IFUNC +IMPLEMENT_IFUNC(bitshuf_untrans_byte_ifunc, + (char* restrict out, const char* restrict in, size_t size, size_t elem_size)) { + __builtin_cpu_init(); + + if (__builtin_cpu_supports("sse2")) + return bitshuf_untrans_byte_sse2; + + return bitshuf_untrans_byte; +} +#define bitshuf_untrans_byte bitshuf_untrans_byte_ifunc +#endif + +#if defined(__AVX2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("avx2") +static void bitshuf_untrans_byte_2_avx2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + + size_t i = 0; + for (; i + 32 <= size; i += 32) { + const __m256i a0 = _mm256_loadu_si256((const __m256i*)&in[0 * size + i]); + const __m256i a1 = _mm256_loadu_si256((const __m256i*)&in[1 * size + i]); + const __m256i u0 = _mm256_permute4x64_epi64(a0, 0xd8); + const __m256i u1 = _mm256_permute4x64_epi64(a1, 0xd8); + const __m256i v0 = _mm256_unpacklo_epi8(u0, u1); + const __m256i v1 = _mm256_unpackhi_epi8(u0, u1); + _mm256_storeu_si256((__m256i*)&out[i * 2], v0); + _mm256_storeu_si256((__m256i*)&out[i * 2 + 32], v1); + } + if (i + 16 <= size) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[0 * size + i]); + const __m128i a1 = 
_mm_loadu_si128((const __m128i*)&in[1 * size + i]); + const __m128i u0 = _mm_unpacklo_epi8(a0, a1); + const __m128i u1 = _mm_unpackhi_epi8(a0, a1); + _mm_storeu_si128((__m128i*)&out[i * 2], u0); + _mm_storeu_si128((__m128i*)&out[i * 2 + 16], u1); + i += 16; + } + if (i + 8 <= size) { + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]); + const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[1 * size + i]); + const __m128i u = _mm_unpacklo_epi8(a0, a1); + _mm_storeu_si128((__m128i*)&out[i * 2], u); + } +} +#endif + +#if defined(__SSE2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("sse2") +static void bitshuf_untrans_byte_2_sse2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + + size_t i = 0; + for (; i + 16 <= size; i += 16) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[0 * size + i]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[1 * size + i]); + const __m128i u0 = _mm_unpacklo_epi8(a0, a1); + const __m128i u1 = _mm_unpackhi_epi8(a0, a1); + _mm_storeu_si128((__m128i*)&out[i * 2], u0); + _mm_storeu_si128((__m128i*)&out[i * 2 + 16], u1); + } + if (i + 8 <= size) { + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]); + const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[1 * size + i]); + const __m128i u = _mm_unpacklo_epi8(a0, a1); + _mm_storeu_si128((__m128i*)&out[i * 2], u); + } +} +#endif + +#if defined(__AVX2__) +#define bitshuf_untrans_byte_2 bitshuf_untrans_byte_2_avx2 +#elif BITSHUF_USE_IFUNC +IMPLEMENT_IFUNC(bitshuf_untrans_byte_2_ifunc, + (char* restrict out, const char* restrict in, size_t size)) { + __builtin_cpu_init(); + + if (__builtin_cpu_supports("avx2")) + return bitshuf_untrans_byte_2_avx2; +#if defined(__SSE2__) + return bitshuf_untrans_byte_2_sse2; +#else + if (__builtin_cpu_supports("sse2")) + return bitshuf_untrans_byte_2_sse2; + + return bitshuf_untrans_byte_2; +#endif +} +#define bitshuf_untrans_byte_2 bitshuf_untrans_byte_2_ifunc +#elif defined(__SSE2__) +#define bitshuf_untrans_byte_2 bitshuf_untrans_byte_2_sse2 +#endif + +#if defined(__AVX2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("avx2") +static void bitshuf_untrans_byte_4_avx2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + + size_t i = 0; + for (; i + 32 <= size; i += 32) { + const __m256i a0 = _mm256_loadu_si256((const __m256i*)&in[0 * size + i]); + const __m256i a1 = _mm256_loadu_si256((const __m256i*)&in[1 * size + i]); + const __m256i a2 = _mm256_loadu_si256((const __m256i*)&in[2 * size + i]); + const __m256i a3 = _mm256_loadu_si256((const __m256i*)&in[3 * size + i]); + __m256i u0 = _mm256_unpacklo_epi8(a0, a1); + __m256i u1 = _mm256_unpackhi_epi8(a0, a1); + __m256i u2 = _mm256_unpacklo_epi8(a2, a3); + __m256i u3 = _mm256_unpackhi_epi8(a2, a3); + const __m256i v0 = _mm256_unpacklo_epi16(u0, u2); + const __m256i v1 = _mm256_unpackhi_epi16(u0, u2); + const __m256i v2 = _mm256_unpacklo_epi16(u1, u3); + const __m256i v3 = _mm256_unpackhi_epi16(u1, u3); + u0 = _mm256_inserti128_si256(v0, _mm256_castsi256_si128(v1), 1); + u1 = _mm256_inserti128_si256(v2, _mm256_castsi256_si128(v3), 1); + u2 = _mm256_permute2x128_si256(v0, v1, 0x31); + u3 = _mm256_permute2x128_si256(v2, v3, 0x31); + _mm256_storeu_si256((__m256i*)&out[i * 4 + 32 * 0], u0); + _mm256_storeu_si256((__m256i*)&out[i * 4 + 32 * 1], u1); + _mm256_storeu_si256((__m256i*)&out[i * 4 + 32 * 2], u2); + _mm256_storeu_si256((__m256i*)&out[i * 4 + 32 * 3], u3); + } + if (i + 16 <= size) { + const 
__m128i a0 = _mm_loadu_si128((const __m128i*)&in[0 * size + i]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[1 * size + i]); + const __m128i a2 = _mm_loadu_si128((const __m128i*)&in[2 * size + i]); + const __m128i a3 = _mm_loadu_si128((const __m128i*)&in[3 * size + i]); + const __m128i u0 = _mm_unpacklo_epi8(a0, a1); + const __m128i u1 = _mm_unpackhi_epi8(a0, a1); + const __m128i u2 = _mm_unpacklo_epi8(a2, a3); + const __m128i u3 = _mm_unpackhi_epi8(a2, a3); + const __m128i v0 = _mm_unpacklo_epi16(u0, u2); + const __m128i v1 = _mm_unpackhi_epi16(u0, u2); + const __m128i v2 = _mm_unpacklo_epi16(u1, u3); + const __m128i v3 = _mm_unpackhi_epi16(u1, u3); + _mm_storeu_si128((__m128i*)&out[i * 4 + 16 * 0], v0); + _mm_storeu_si128((__m128i*)&out[i * 4 + 16 * 1], v1); + _mm_storeu_si128((__m128i*)&out[i * 4 + 16 * 2], v2); + _mm_storeu_si128((__m128i*)&out[i * 4 + 16 * 3], v3); + i += 16; + } + if (i + 8 <= size) { + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]); + const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[1 * size + i]); + const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[2 * size + i]); + const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[3 * size + i]); + const __m128i u0 = _mm_unpacklo_epi8(a0, a1); + const __m128i u1 = _mm_unpacklo_epi8(a2, a3); + const __m128i v0 = _mm_unpacklo_epi16(u0, u1); + const __m128i v1 = _mm_unpackhi_epi16(u0, u1); + _mm_storeu_si128((__m128i*)&out[i * 4], v0); + _mm_storeu_si128((__m128i*)&out[i * 4 + 16], v1); + } +} +#endif + +#if defined(__SSE2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("sse2") +static void bitshuf_untrans_byte_4_sse2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + + size_t i = 0; + for (; i + 16 <= size; i += 16) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[0 * size + i]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[1 * size + i]); + const __m128i a2 = _mm_loadu_si128((const __m128i*)&in[2 * size + i]); + const __m128i a3 = _mm_loadu_si128((const __m128i*)&in[3 * size + i]); + const __m128i u0 = _mm_unpacklo_epi8(a0, a1); + const __m128i u1 = _mm_unpackhi_epi8(a0, a1); + const __m128i u2 = _mm_unpacklo_epi8(a2, a3); + const __m128i u3 = _mm_unpackhi_epi8(a2, a3); + const __m128i v0 = _mm_unpacklo_epi16(u0, u2); + const __m128i v1 = _mm_unpackhi_epi16(u0, u2); + const __m128i v2 = _mm_unpacklo_epi16(u1, u3); + const __m128i v3 = _mm_unpackhi_epi16(u1, u3); + _mm_storeu_si128((__m128i*)&out[i * 4 + 16 * 0], v0); + _mm_storeu_si128((__m128i*)&out[i * 4 + 16 * 1], v1); + _mm_storeu_si128((__m128i*)&out[i * 4 + 16 * 2], v2); + _mm_storeu_si128((__m128i*)&out[i * 4 + 16 * 3], v3); + } + if (i + 8 <= size) { + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]); + const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[1 * size + i]); + const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[2 * size + i]); + const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[3 * size + i]); + const __m128i u0 = _mm_unpacklo_epi8(a0, a1); + const __m128i u1 = _mm_unpacklo_epi8(a2, a3); + const __m128i v0 = _mm_unpacklo_epi16(u0, u1); + const __m128i v1 = _mm_unpackhi_epi16(u0, u1); + _mm_storeu_si128((__m128i*)&out[i * 4], v0); + _mm_storeu_si128((__m128i*)&out[i * 4 + 16], v1); + } +} +#endif + +#if defined(__AVX2__) +#define bitshuf_untrans_byte_4 bitshuf_untrans_byte_4_avx2 +#elif BITSHUF_USE_IFUNC +IMPLEMENT_IFUNC(bitshuf_untrans_byte_4_ifunc, + (char* restrict out, const char* restrict in, size_t size)) { + 
__builtin_cpu_init(); + + if (__builtin_cpu_supports("avx2")) + return bitshuf_untrans_byte_4_avx2; +#if defined(__SSE2__) + return bitshuf_untrans_byte_4_sse2; +#else + if (__builtin_cpu_supports("sse2")) + return bitshuf_untrans_byte_4_sse2; + + return bitshuf_untrans_byte_4; +#endif +} +#define bitshuf_untrans_byte_4 bitshuf_untrans_byte_4_ifunc +#elif defined(__SSE2__) +#define bitshuf_untrans_byte_4 bitshuf_untrans_byte_4_sse2 +#endif + +#if defined(__SSE2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("sse2") +static void bitshuf_untrans_byte_8_sse2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + + size_t i = 0; + for (; i + 16 <= size; i += 16) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[0 * size + i]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[1 * size + i]); + const __m128i a2 = _mm_loadu_si128((const __m128i*)&in[2 * size + i]); + const __m128i a3 = _mm_loadu_si128((const __m128i*)&in[3 * size + i]); + const __m128i a4 = _mm_loadu_si128((const __m128i*)&in[4 * size + i]); + const __m128i a5 = _mm_loadu_si128((const __m128i*)&in[5 * size + i]); + const __m128i a6 = _mm_loadu_si128((const __m128i*)&in[6 * size + i]); + const __m128i a7 = _mm_loadu_si128((const __m128i*)&in[7 * size + i]); + __m128i u0 = _mm_unpacklo_epi8(a0, a1); + __m128i u1 = _mm_unpackhi_epi8(a0, a1); + __m128i u2 = _mm_unpacklo_epi8(a2, a3); + __m128i u3 = _mm_unpackhi_epi8(a2, a3); + __m128i u4 = _mm_unpacklo_epi8(a4, a5); + __m128i u5 = _mm_unpackhi_epi8(a4, a5); + __m128i u6 = _mm_unpacklo_epi8(a6, a7); + __m128i u7 = _mm_unpackhi_epi8(a6, a7); + const __m128i v0 = _mm_unpacklo_epi16(u0, u2); + const __m128i v1 = _mm_unpackhi_epi16(u0, u2); + const __m128i v2 = _mm_unpacklo_epi16(u1, u3); + const __m128i v3 = _mm_unpackhi_epi16(u1, u3); + const __m128i v4 = _mm_unpacklo_epi16(u4, u6); + const __m128i v5 = _mm_unpackhi_epi16(u4, u6); + const __m128i v6 = _mm_unpacklo_epi16(u5, u7); + const __m128i v7 = _mm_unpackhi_epi16(u5, u7); + u0 = _mm_unpacklo_epi32(v0, v4); + u1 = _mm_unpackhi_epi32(v0, v4); + u2 = _mm_unpacklo_epi32(v1, v5); + u3 = _mm_unpackhi_epi32(v1, v5); + u4 = _mm_unpacklo_epi32(v2, v6); + u5 = _mm_unpackhi_epi32(v2, v6); + u6 = _mm_unpacklo_epi32(v3, v7); + u7 = _mm_unpackhi_epi32(v3, v7); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 0], u0); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 1], u1); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 2], u2); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 3], u3); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 4], u4); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 5], u5); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 6], u6); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 7], u7); + } + if (i + 8 <= size) { + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]); + const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[1 * size + i]); + const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[2 * size + i]); + const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[3 * size + i]); + const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[4 * size + i]); + const __m128i a5 = _mm_loadl_epi64((const __m128i*)&in[5 * size + i]); + const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[6 * size + i]); + const __m128i a7 = _mm_loadl_epi64((const __m128i*)&in[7 * size + i]); + __m128i u0 = _mm_unpacklo_epi8(a0, a1); + __m128i u1 = _mm_unpacklo_epi8(a2, a3); + __m128i u2 = _mm_unpacklo_epi8(a4, a5); + __m128i u3 = _mm_unpacklo_epi8(a6, a7); + const __m128i v0 = 
_mm_unpacklo_epi16(u0, u1); + const __m128i v1 = _mm_unpackhi_epi16(u0, u1); + const __m128i v2 = _mm_unpacklo_epi16(u2, u3); + const __m128i v3 = _mm_unpackhi_epi16(u2, u3); + u0 = _mm_unpacklo_epi32(v0, v2); + u1 = _mm_unpackhi_epi32(v0, v2); + u2 = _mm_unpacklo_epi32(v1, v3); + u3 = _mm_unpackhi_epi32(v1, v3); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 0], u0); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 1], u1); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 2], u2); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 3], u3); + } +} +#endif + +#if defined(__SSE2__) +#define bitshuf_untrans_byte_8 bitshuf_untrans_byte_8_sse2 +#elif BITSHUF_USE_IFUNC +IMPLEMENT_IFUNC(bitshuf_untrans_byte_8_ifunc, + (char* restrict out, const char* restrict in, size_t size)) { + __builtin_cpu_init(); + + if (__builtin_cpu_supports("sse2")) + return bitshuf_untrans_byte_8_sse2; + + return bitshuf_untrans_byte_8; +} +#define bitshuf_untrans_byte_8 bitshuf_untrans_byte_8_ifunc +#endif + +#endif + +int bitshuf_encode_block(char* restrict out, + const char* restrict in, + char* restrict scratch, + size_t size, + size_t elem_size) { + if (UNLIKELY(size & 7)) + return -1; + + if (elem_size == 1) { + bitshuf_trans_bit(out, in, size); + } else { + if (UNLIKELY(!scratch && elem_size > 1)) + return -1; + + switch (elem_size) { + case 2: + bitshuf_trans_byte_2(scratch, in, size); + break; + case 4: + bitshuf_trans_byte_4(scratch, in, size); + break; + case 8: + bitshuf_trans_byte_8(scratch, in, size); + break; + default: + bitshuf_trans_byte(scratch, in, size, elem_size); + break; + } + for (size_t i = 0; i < elem_size; i++) + bitshuf_trans_bit(&out[i * size], &scratch[i * size], size); + } + return 0; +} + +int bitshuf_decode_block(char* restrict out, + const char* restrict in, + char* restrict scratch, + size_t size, + size_t elem_size) { + if (UNLIKELY(size & 7)) + return -1; + + if (elem_size == 1) { + bitshuf_untrans_bit(out, in, size); + } else { + if (UNLIKELY(!scratch && elem_size > 1)) + return -1; + + for (size_t i = 0; i < elem_size; i++) + bitshuf_untrans_bit(&scratch[i * size], &in[i * size], size); + + switch (elem_size) { + case 2: + bitshuf_untrans_byte_2(out, scratch, size); + break; + case 4: + bitshuf_untrans_byte_4(out, scratch, size); + break; + case 8: + bitshuf_untrans_byte_8(out, scratch, size); + break; + default: + bitshuf_untrans_byte(out, scratch, size, elem_size); + break; + } + } + return 0; +} diff --git a/src/bitshuffle.h b/src/bitshuffle.h new file mode 100644 index 0000000..3405dd0 --- /dev/null +++ b/src/bitshuffle.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: MIT OR Apache-2.0 */ +/* Copyright (c) 2023 Kal Conley + */ +#ifndef BITSHUFFLE_H_ +#define BITSHUFFLE_H_ + +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +/* Transpose bits for compression. + * + * This function performs Bitshuffle transposition of a single block. The block + * size in bytes is given by the product of `size` and `elem_size`. + * + * If required, the `scratch` argument must point to a buffer that the function + * uses for scratch purposes. The size of this buffer is given by the block + * size. + * + * On success, the function returns 0; otherwise, -1 is returned to indicate an + * error. In case of error, the memory pointed to by `out` and `scratch` is left + * unmodified. + * + * Pointer arguments of this function have C99 `restrict` semantics. If the + * `out`, `in`, or `scratch` buffers overlap, the behavior is undefined. 
+ * + * Errors + * ------ + * The function returns -1 to indicate an error if: + * + * - The `scratch` argument is `NULL` and a scratch buffer is required for the + * specified element size. + * - The `size` argument is not a multiple of 8. + */ +int bitshuf_encode_block(char* out, const char* in, char* scratch, size_t size, size_t elem_size); + +/* Untranspose bits after decompression. + * + * This function performs the inverse of `bitshuf_encode_block()`. + * + * If required, the `scratch` argument must point to a buffer that the function + * uses for scratch purposes. The size of this buffer is given by the block + * size. + * + * On success, the function returns 0; otherwise, -1 is returned to indicate an + * error. In case of error, the memory pointed to by `out` and `scratch` is left + * unmodified. + * + * Pointer arguments of this function have C99 `restrict` semantics. If the + * `out`, `in`, or `scratch` buffers overlap, the behavior is undefined. + * + * Errors + * ------ + * The function returns -1 to indicate an error if: + * + * - The `scratch` argument is `NULL` and a scratch buffer is required for the + * specified element size. + * - The `size` argument is not a multiple of 8. + */ +int bitshuf_decode_block(char* out, const char* in, char* scratch, size_t size, size_t elem_size); + +#if defined(__cplusplus) +} /* extern "C" */ +#endif + +#endif /* BITSHUFFLE_H_ */ diff --git a/tests/bitshuffle_benchmark.cc b/tests/bitshuffle_benchmark.cc new file mode 100644 index 0000000..d8c2d3f --- /dev/null +++ b/tests/bitshuffle_benchmark.cc @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: MIT OR Apache-2.0 +// Copyright (c) 2023 Kal Conley +#include "src/bitshuffle.h" + +#include +#include +#include +#include +#include + +#include "benchmark/benchmark.h" +#include "bitshuffle/src/bitshuffle_internals.h" + +#if defined(_MSC_VER) +#pragma warning(disable : 4100) // unreferenced formal parameter +#pragma warning(disable : 4244) // conversion from 'type1' to 'type2', possible loss of data +#endif + +template +static void BM_bitshuffle(benchmark::State& state) { + const size_t elem_size = state.range(0); + const size_t size_bytes = + state.range(1) / (elem_size * BSHUF_BLOCKED_MULT) * (elem_size * BSHUF_BLOCKED_MULT); + const size_t size = size_bytes / elem_size; + if (size == 0) { + state.SkipWithError("size is zero"); + return; + } + std::unique_ptr out{new char[size_bytes]}; + std::unique_ptr in{new char[size_bytes]}; + std::unique_ptr scratch{new char[size_bytes]}; + for (auto _ : state) { + benchmark::DoNotOptimize(out.get()); + const int r = F(&out[0], &in[0], &scratch[0], size, elem_size); + assert(r == 0); + (void)r; + benchmark::ClobberMemory(); + } +} + +int bm_memcpy(char* out, const char* in, char* scratch, size_t size, size_t elem_size) { + std::memcpy(out, in, size * elem_size); + return 0; +} + +int bm_bshuf_trans_bit_elem(char* out, + const char* in, + char* scratch, + size_t size, + size_t elem_size) { + const int64_t n = bshuf_trans_bit_elem(in, out, size, elem_size); + return static_cast(n) == size * elem_size ? 0 : -1; +} + +int bm_bshuf_untrans_bit_elem(char* out, + const char* in, + char* scratch, + size_t size, + size_t elem_size) { + const int64_t n = bshuf_untrans_bit_elem(in, out, size, elem_size); + return static_cast(n) == size * elem_size ? 
0 : -1; +} + +#define memcpy bm_memcpy +#define bshuf_trans_bit_elem bm_bshuf_trans_bit_elem +#define bshuf_untrans_bit_elem bm_bshuf_untrans_bit_elem +// clang-format off +#define ELEM_SIZES {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 63, 64, 65, 66} +#define SIZES {256, 1024, 4096, 8192, 16384, 1'000'000, 16'000'000, 256'000'000} +// clang-format on +BENCHMARK_TEMPLATE(BM_bitshuffle, memcpy)->ArgsProduct({{1}, SIZES}); +BENCHMARK_TEMPLATE(BM_bitshuffle, bitshuf_encode_block)->ArgsProduct({ELEM_SIZES, SIZES}); +BENCHMARK_TEMPLATE(BM_bitshuffle, bitshuf_decode_block)->ArgsProduct({ELEM_SIZES, SIZES}); +BENCHMARK_TEMPLATE(BM_bitshuffle, bshuf_trans_bit_elem)->ArgsProduct({ELEM_SIZES, SIZES}); +BENCHMARK_TEMPLATE(BM_bitshuffle, bshuf_untrans_bit_elem)->ArgsProduct({ELEM_SIZES, SIZES}); diff --git a/tests/bitshuffle_test.cc b/tests/bitshuffle_test.cc new file mode 100644 index 0000000..053f186 --- /dev/null +++ b/tests/bitshuffle_test.cc @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: MIT OR Apache-2.0 +// Copyright (c) 2023 Kal Conley +#include "src/bitshuffle.h" + +#include +#include +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +#define BSHUF_BLOCKED_MULT 8 + +template +class unaligned_allocator { +public: + using value_type = T; + + unaligned_allocator() = default; + + template + unaligned_allocator(unaligned_allocator const&) noexcept {} + + value_type* allocate(size_t n) { + return static_cast(::operator new((n + 1) * sizeof(value_type))) + 1; + } + + void deallocate(value_type* p, size_t) noexcept { ::operator delete(p - 1); } +}; + +#define EXPECT_BITSHUF(ELEM_SIZE, ...) \ + [](const std::vector>& u, \ + const std::vector>& v, const size_t elem_size) { \ + assert(u.size() == v.size()); \ + assert(u.size() % (BSHUF_BLOCKED_MULT * elem_size) == 0); \ + std::vector> buf(u.size()); \ + std::vector> scratch(u.size()); \ + { \ + std::memset(buf.data(), 0xcc, buf.size()); \ + std::memset(scratch.data(), 0xcc, scratch.size()); \ + const int r = bitshuf_encode_block( \ + reinterpret_cast(buf.data()), reinterpret_cast(u.data()), \ + reinterpret_cast(scratch.data()), buf.size() / elem_size, elem_size); \ + ASSERT_EQ(r, 0); \ + if (std::memcmp(buf.data(), v.data(), buf.size()) != 0) { \ + ASSERT_THAT(buf, ::testing::ElementsAreArray(v)); \ + } \ + } \ + { \ + std::memset(buf.data(), 0xcc, buf.size()); \ + std::memset(scratch.data(), 0xcc, scratch.size()); \ + const int r = bitshuf_decode_block( \ + reinterpret_cast(buf.data()), reinterpret_cast(v.data()), \ + reinterpret_cast(scratch.data()), buf.size() / elem_size, elem_size); \ + ASSERT_EQ(r, 0); \ + if (std::memcmp(buf.data(), u.data(), buf.size()) != 0) { \ + ASSERT_THAT(buf, ::testing::ElementsAreArray(u)); \ + } \ + } \ + }(__VA_ARGS__, ELEM_SIZE) + +TEST(Bitshuffle, ScratchNull) { + char out[8], in[8]; + std::memset(in, 0, sizeof(in)); + for (size_t elem_size : {0, 1}) { + EXPECT_EQ(bitshuf_encode_block(out, in, NULL, 8, elem_size), 0); + EXPECT_EQ(bitshuf_decode_block(out, in, NULL, 8, elem_size), 0); + } +} + +TEST(Bitshuffle, ScratchNullError) { + char out[1], in[1]; + for (size_t elem_size = 2; elem_size <= 257; elem_size++) { + EXPECT_EQ(bitshuf_encode_block(out, in, NULL, 8, elem_size), -1); + EXPECT_EQ(bitshuf_decode_block(out, in, NULL, 8, elem_size), -1); + } +} + +TEST(Bitshuffle, SizeZero) { + char out[1], in[1], scratch[1]; + for (size_t elem_size = 0; elem_size <= 257; elem_size++) { + EXPECT_EQ(bitshuf_encode_block(out, in, scratch, 0, elem_size), 0); + 
EXPECT_EQ(bitshuf_decode_block(out, in, scratch, 0, elem_size), 0); + } +} + +TEST(Bitshuffle, ErrorIfSizeNotMultipleOf8) { + char out[1], in[1], scratch[1]; + for (size_t size : {1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17}) { + EXPECT_EQ(bitshuf_encode_block(out, in, scratch, size, 1), -1); + EXPECT_EQ(bitshuf_decode_block(out, in, scratch, size, 1), -1); + } +} + +TEST(Bitshuffle, ElemSizeZero) { + char out[1], in[1], scratch[1]; + for (size_t size = 0; size <= 256; size += BSHUF_BLOCKED_MULT) { + EXPECT_EQ(bitshuf_encode_block(out, in, scratch, size, 0), 0); + EXPECT_EQ(bitshuf_decode_block(out, in, scratch, size, 0), 0); + } +} + +TEST(Bitshuffle, OneBitSetExhaustive) { + const size_t N = 120; + static_assert(N % BSHUF_BLOCKED_MULT == 0, ""); + + for (size_t size = 0; size <= N; size += BSHUF_BLOCKED_MULT) { + for (size_t elem_size = 0; elem_size <= 17; elem_size++) { + std::vector> u(size * elem_size); + std::vector> v(size * elem_size); + for (size_t i = 0; i < size; i++) { + for (size_t b = 0; b < elem_size * 8; b++) { + u.assign(u.size(), 0); + v.assign(v.size(), 0); + u[(i * elem_size) + b / 8] = 1U << (b % 8); + v[(b * size + i) / 8] = 1U << (i % 8); + EXPECT_BITSHUF(elem_size, u, v); + } + } + } + } +} + +TEST(Bitshuffle, OneBitZeroExhaustive) { + const size_t N = 120; + static_assert(N % BSHUF_BLOCKED_MULT == 0, ""); + + for (size_t size = 0; size <= N; size += BSHUF_BLOCKED_MULT) { + for (size_t elem_size = 0; elem_size <= 17; elem_size++) { + std::vector> u(size * elem_size); + std::vector> v(size * elem_size); + for (size_t i = 0; i < size; i++) { + for (size_t b = 0; b < elem_size * 8; b++) { + u.assign(u.size(), 0xff); + v.assign(v.size(), 0xff); + u[(i * elem_size) + b / 8] = ~(1U << (b % 8)); + v[(b * size + i) / 8] = ~(1U << (i % 8)); + EXPECT_BITSHUF(elem_size, u, v); + } + } + } + } +} diff --git a/third_party/BUILD.bazel b/third_party/BUILD.bazel new file mode 100644 index 0000000..e69de29 diff --git a/third_party/bitshuffle.BUILD b/third_party/bitshuffle.BUILD new file mode 100644 index 0000000..a4ed965 --- /dev/null +++ b/third_party/bitshuffle.BUILD @@ -0,0 +1,16 @@ +load("@rules_cc//cc:defs.bzl", "cc_library") + +package(default_visibility = ["//visibility:public"]) + +cc_library( + name = "bitshuffle_core", + srcs = [ + "src/bitshuffle_core.c", + "src/iochain.c", + ], + hdrs = [ + "src/bitshuffle_core.h", + "src/bitshuffle_internals.h", + "src/iochain.h", + ], +)
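
A minimal usage sketch of the public API declared in src/bitshuffle.h, following the documented contract (size must be a multiple of 8; a caller-provided scratch buffer of size * elem_size bytes is required whenever elem_size is greater than 1). The element count, element size, and buffer contents below are illustrative only.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #include "src/bitshuffle.h"

    int main(void) {
        /* Illustrative block: 64 elements of 4 bytes each. */
        const size_t size = 64;       /* element count, must be a multiple of 8 */
        const size_t elem_size = 4;   /* bytes per element */
        const size_t block_bytes = size * elem_size;

        char* in = malloc(block_bytes);
        char* shuffled = malloc(block_bytes);
        char* restored = malloc(block_bytes);
        char* scratch = malloc(block_bytes); /* required because elem_size > 1 */
        if (!in || !shuffled || !restored || !scratch)
            return 1;
        for (size_t i = 0; i < block_bytes; i++)
            in[i] = (char)i;

        /* Transpose bits before compression; returns -1 on error, e.g. when
           size is not a multiple of 8 or scratch is NULL but needed. */
        if (bitshuf_encode_block(shuffled, in, scratch, size, elem_size) != 0)
            return 1;
        /* Inverse transform after decompression. */
        if (bitshuf_decode_block(restored, shuffled, scratch, size, elem_size) != 0)
            return 1;

        printf("round trip %s\n",
               memcmp(in, restored, block_bytes) == 0 ? "ok" : "FAILED");
        free(in); free(shuffled); free(restored); free(scratch);
        return 0;
    }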
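
For orientation, the bit layout that OneBitSetExhaustive and OneBitZeroExhaustive assert can be written as a plain scalar loop: bit b of element i moves to output bit position b * size + i. The sketch below is a non-vectorized illustration of that mapping, not one of the kernels in the patch, and assumes size is a multiple of 8.

    #include <stddef.h>
    #include <string.h>

    /* Scalar sketch of the mapping the exhaustive tests encode:
       input bit (element i, bit b) -> output bit index b * size + i. */
    static void bitshuf_reference_encode(unsigned char* out, const unsigned char* in,
                                         size_t size, size_t elem_size) {
        memset(out, 0, size * elem_size);
        for (size_t i = 0; i < size; i++) {
            for (size_t b = 0; b < elem_size * 8; b++) {
                const unsigned bit = (in[i * elem_size + b / 8] >> (b % 8)) & 1u;
                const size_t pos = b * size + i;
                out[pos / 8] |= (unsigned char)(bit << (pos % 8));
            }
        }
    }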