This repository has been archived by the owner on Nov 4, 2024. It is now read-only.

refactor: move JuliaSIMD deps to extensions #175

Merged
merged 11 commits on Oct 18, 2024
25 changes: 24 additions & 1 deletion .github/workflows/CI.yml
@@ -21,7 +21,7 @@ concurrency:

jobs:
ci:
name: Julia ${{ matrix.version }} - ${{ matrix.test_group }} - ${{ matrix.os }} - ${{ matrix.blas_backend }}
name: Julia ${{ matrix.version }} - ${{ matrix.test_group }} - ${{ matrix.os }} - ${{ matrix.blas_backend }} - ${{ matrix.loopvec }}
if: ${{ !contains(github.event.head_commit.message, '[skip tests]') }}
runs-on: ${{ matrix.os }}
strategy:
@@ -43,27 +43,49 @@ jobs:
- "others"
blas_backend:
- "default"
loopvec:
- "true"
include:
- os: ubuntu-latest
test_group: "dense"
blas_backend: "blis"
version: "1.10"
loopvec: "true"
- os: ubuntu-latest
test_group: "dense"
blas_backend: "mkl"
version: "1.10"
loopvec: "true"
- os: ubuntu-latest
test_group: "dense"
blas_backend: "default"
version: "1.10"
loopvec: "false"
- os: ubuntu-latest
test_group: "batched_ops"
blas_backend: "default"
version: "1.10"
loopvec: "false"
- os: ubuntu-latest
test_group: "other_ops"
blas_backend: "default"
version: "1.10"
loopvec: "false"
- os: macos-latest
test_group: "dense"
blas_backend: "appleaccelerate"
version: "1.10"
loopvec: "true"
- os: macos-latest
test_group: "all"
blas_backend: "default"
version: "1.10"
loopvec: "true"
- os: windows-latest
test_group: "all"
blas_backend: "default"
version: "1.10"
loopvec: "true"
steps:
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v2
@@ -84,6 +106,7 @@ jobs:
env:
LUXLIB_TEST_GROUP: ${{ matrix.test_group }}
LUXLIB_BLAS_BACKEND: ${{ matrix.blas_backend }}
LUXLIB_LOAD_LOOPVEC: ${{ matrix.loopvec }}
- uses: julia-actions/julia-processcoverage@v1
with:
directories: src,ext
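The new `loopvec` matrix axis reaches the test run via `LUXLIB_LOAD_LOOPVEC`. A minimal sketch of how the test harness might consume it — hypothetical, since the `runtests.jl` wiring is not part of this diff:

```julia
# Hypothetical sketch of how the test harness could consume the new
# variable; the actual runtests.jl wiring is not shown in this diff.
const LOAD_LOOPVEC = get(ENV, "LUXLIB_LOAD_LOOPVEC", "true") == "true"

if LOAD_LOOPVEC
    using LoopVectorization, Octavian  # loading these activates the new extensions
end
```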
13 changes: 9 additions & 4 deletions Project.toml
@@ -1,7 +1,7 @@
name = "LuxLib"
uuid = "82251201-b29d-42c6-8e01-566dec8acb11"
authors = ["Avik Pal <[email protected]> and contributors"]
version = "1.3.3"
version = "1.3.4"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -15,16 +15,14 @@ ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
Hwloc = "0e44f5e4-bd66-52a0-8798-143a42290a1d"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
LuxCore = "bb33d45b-7691-41d6-9220-0943567d0623"
MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
SLEEFPirates = "476501e8-09a2-5ece-8869-fb82de89a1fa"
Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
@@ -36,7 +34,10 @@ BLISBLAS = "6f275bd8-fec0-4d39-945b-7e95a765fa1e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
MKL = "33e6dc65-8f57-5167-99aa-e5a354878fb2"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
SLEEFPirates = "476501e8-09a2-5ece-8869-fb82de89a1fa"
Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"

@@ -46,7 +47,10 @@ LuxLibBLISBLASExt = "BLISBLAS"
LuxLibCUDAExt = "CUDA"
LuxLibMKLExt = "MKL"
LuxLibEnzymeExt = "Enzyme"
LuxLibLoopVectorizationExt = "LoopVectorization"
LuxLibOctavianExt = ["Octavian", "LoopVectorization"]
LuxLibReverseDiffExt = "ReverseDiff"
LuxLibSLEEFPiratesExt = "SLEEFPirates"
LuxLibTrackerAMDGPUExt = ["AMDGPU", "Tracker"]
LuxLibTrackerExt = "Tracker"
LuxLibcuDNNExt = ["CUDA", "cuDNN"]
@@ -75,6 +79,7 @@ MLDataDevices = "1.2"
Markdown = "1.10"
NNlib = "0.9.24"
Octavian = "0.3.28"
Preferences = "1.4.3"
Polyester = "0.7.15"
Random = "1.10"
Reexport = "1"
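With this change, `LoopVectorization.jl`, `Octavian.jl`, and `SLEEFPirates.jl` move from hard dependencies (`[deps]`) to weak dependencies, so the extension modules load only when a user loads the corresponding package. A quick sketch of the user-facing effect (Julia ≥ 1.9):

```julia
using LuxLib             # no longer loads LoopVectorization/Octavian/SLEEFPirates eagerly

using LoopVectorization  # triggers LuxLibLoopVectorizationExt
using Octavian           # with LoopVectorization loaded, triggers LuxLibOctavianExt

# On Julia ≥ 1.9, an extension module can be inspected once its triggers are loaded:
@assert Base.get_extension(LuxLib, :LuxLibLoopVectorizationExt) !== nothing
```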
2 changes: 2 additions & 0 deletions benchmarks/Project.toml
@@ -1,9 +1,11 @@
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
LuxLib = "82251201-b29d-42c6-8e01-566dec8acb11"
MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
1 change: 1 addition & 0 deletions benchmarks/runbenchmarks.jl
@@ -3,6 +3,7 @@ using Pkg
using BenchmarkTools
using InteractiveUtils
using LinearAlgebra
using Octavian, LoopVectorization

const SUITE = BenchmarkGroup()
BenchmarkTools.DEFAULT_PARAMETERS.seconds = 5
72 changes: 72 additions & 0 deletions ext/LuxLibLoopVectorizationExt.jl
@@ -0,0 +1,72 @@
module LuxLibLoopVectorizationExt

using LoopVectorization: LoopVectorization, @tturbo, @turbo, indices
using Polyester: @batch
using Static: True

using LuxLib: LuxLib, Utils

Utils.is_extension_loaded(::Val{:LoopVectorization}) = True()

Utils.can_loopvec_args_check(::True, args...) = LoopVectorization.check_args(args...)

# matmul
for serial in (true, false)
opname = serial ? :serial_matmul_loopvec! : :matmul_loopvec!
@eval @inline function LuxLib.Impl.$(opname)(
C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α::Number, β::Number)
if !iszero(β) # Special-case this because Base.FastMath.mul_fast(NaN, false) = NaN
@turbo thread=$(!serial) for K in indices((C, B), 2), J in indices((C, A), 1)
Cⱼₖ = zero(eltype(C))
for I in indices((A, B), (2, 1))
Cⱼₖ += A[J, I] * B[I, K]
end
C[J, K] = α * Cⱼₖ + β * C[J, K]
end
else
@turbo thread=$(!serial) for K in indices((C, B), 2), J in indices((C, A), 1)
Cⱼₖ = zero(eltype(C))
for I in indices((A, B), (2, 1))
Cⱼₖ += A[J, I] * B[I, K]
end
C[J, K] = α * Cⱼₖ
end
end
end
end

@inline function LuxLib.Impl.matmuladd_loopvec!(
C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, bias::AbstractVector)
@tturbo for K in indices((C, B), 2), J in indices((C, A), 1)
Cⱼₖ = zero(eltype(C))
for I in indices((A, B), (2, 1))
Cⱼₖ += A[J, I] * B[I, K]
end
C[J, K] = bias[J] + Cⱼₖ
end
return
end

# batched matmul
function LuxLib.Impl.batched_matmul_loopvec_impl!(
z::AbstractArray{zT, 3}, x::AbstractArray{xT, 3},
y::AbstractArray{yT, 3}, α::Number=true, β::Number=false) where {zT, xT, yT}
if size(x, 3) == size(y, 3)
@batch for L in axes(z, 3)
LuxLib.Impl.serial_matmul_loopvec!(
Utils.batchview(z, L), Utils.batchview(x, L), Utils.batchview(y, L), α, β)
end
elseif size(x, 3) == 1
@batch for L in axes(z, 3)
LuxLib.Impl.serial_matmul_loopvec!(
Utils.batchview(z, L), Utils.batchview(x, 1), Utils.batchview(y, L), α, β)
end
else # has to be size(y, 3) == 1
@batch for L in axes(z, 3)
LuxLib.Impl.serial_matmul_loopvec!(
Utils.batchview(z, L), Utils.batchview(x, L), Utils.batchview(y, 1), α, β)
end
end
end

end
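For reference, `matmul_loopvec!` implements the standard 5-argument GEMM contract `C = α*A*B + β*C`. A small sanity-check sketch — note `LuxLib.Impl` is internal API, so this is illustrative only:

```julia
using LuxLib, LoopVectorization

A, B, C = rand(8, 4), rand(4, 6), rand(8, 6)
α, β = 2.0, 0.5
expected = α * (A * B) + β * C   # compute the reference before C is overwritten

LuxLib.Impl.matmul_loopvec!(C, A, B, α, β)
@assert C ≈ expected
```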
16 changes: 16 additions & 0 deletions ext/LuxLibOctavianExt.jl
@@ -0,0 +1,16 @@
module LuxLibOctavianExt

using Octavian: Octavian
using Static: True

using LuxLib: LuxLib, Utils

Utils.is_extension_loaded(::Val{:Octavian}) = True()

@inline function LuxLib.Impl.matmul_octavian!(
C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α::Number, β::Number)
Octavian.matmul!(C, A, B, α, β)
return
end

end
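`Octavian.matmul!(C, A, B, α, β)` follows the same contract as 5-argument `LinearAlgebra.mul!`, which is what makes this thin wrapper sufficient. A quick equivalence check:

```julia
using Octavian, LinearAlgebra

A, B = rand(32, 32), rand(32, 32)
C1, C2 = zeros(32, 32), zeros(32, 32)

Octavian.matmul!(C1, A, B, 2.0, 0.0)  # C1 = 2.0 * A * B + 0.0 * C1
mul!(C2, A, B, 2.0, 0.0)              # 5-arg mul! has the same contract
@assert C1 ≈ C2
```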
58 changes: 58 additions & 0 deletions ext/LuxLibSLEEFPiratesExt.jl
@@ -0,0 +1,58 @@
module LuxLibSLEEFPiratesExt

using ChainRulesCore: ChainRulesCore
using NNlib: NNlib
using SLEEFPirates: SLEEFPirates

using LuxLib: Numeric, Impl

const CRC = ChainRulesCore

sigmoid_fast(x::Number) = SLEEFPirates.sigmoid_fast(x)
softplus(x::Number) = SLEEFPirates.softplus(x)
logsigmoid(x::Number) = -softplus(-x)
swish(x::Number) = Base.FastMath.mul_fast(x, sigmoid_fast(x))
lisht(x::Number) = Base.FastMath.mul_fast(x, tanh_fast(x))
tanh(x::Number) = SLEEFPirates.tanh(x)
tanh_fast(x::Number) = SLEEFPirates.tanh_fast(x)

for (f, dfdx) in [
#! format: off
(:sigmoid_fast, :(conj(Base.FastMath.mul_fast(Ω, Base.FastMath.sub_fast(1, Ω))))),
(:softplus, :(sigmoid_fast(x))),
(:logsigmoid, :(sigmoid_fast(-x))),
(:swish, :(Base.FastMath.add_fast(Ω, Base.FastMath.mul_fast(sigmoid_fast(x), Base.FastMath.sub_fast(1, Ω))))),
(:lisht, :(Base.FastMath.add_fast(x, Base.FastMath.mul_fast(tanh_fast(x), Base.FastMath.sub_fast(1, Ω))))),
(:tanh, :(conj(Base.FastMath.sub_fast(1, Base.FastMath.mul_fast(Ω, Ω))))),
(:tanh_fast, :(conj(Base.FastMath.sub_fast(1, Base.FastMath.mul_fast(Ω, Ω)))))
#! format: on
]
@eval CRC.@scalar_rule($f(x), $(dfdx))

∇f = Symbol(:∇broadcasted_, f)
@eval function CRC.rrule(::typeof(Broadcast.broadcasted), ::typeof($f),
x::Union{Numeric, Broadcast.Broadcasted})
Ω = $(f).(x)
function $(∇f)(dΩ)
∂x = CRC.InplaceableThunk(dx -> @.(dx+=dΩ * $(dfdx)), CRC.@thunk @.(dΩ*$(dfdx)))
return CRC.NoTangent(), CRC.NoTangent(), ∂x
end
return Ω, $(∇f)
end
end

for (fbase, ffast) in [
#! format: off
(NNlib.sigmoid_fast, sigmoid_fast),
(NNlib.softplus, softplus),
(NNlib.logsigmoid, logsigmoid),
(NNlib.swish, swish),
(NNlib.lisht, lisht),
(Base.tanh, tanh),
(NNlib.tanh_fast, tanh_fast)
#! format: on
]
@eval Impl.sleefpirates_fast_act(::typeof($fbase)) = $ffast
end

end
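The substituted kernels are numerically close to, but not bitwise identical with, NNlib's scalar definitions. A sketch of the kind of agreement to expect:

```julia
using NNlib, SLEEFPirates

x = randn(Float32, 128)

# Agreement is up to floating-point tolerance, not bitwise.
@assert SLEEFPirates.tanh_fast.(x) ≈ NNlib.tanh_fast.(x)
@assert SLEEFPirates.sigmoid_fast.(x) ≈ NNlib.sigmoid_fast.(x)
```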
3 changes: 3 additions & 0 deletions src/LuxLib.jl
@@ -1,6 +1,7 @@
module LuxLib

using Compat: @compat
using Preferences: @load_preference
using Reexport: @reexport
using Static: Static, known

@@ -15,6 +16,8 @@ const Numeric = Union{AbstractArray{<:T}, T} where {T <: Number}
const ∂∅ = NoTangent()
const CRC = ChainRulesCore

const DISABLE_LOOP_VECTORIZATION = @load_preference("disable_loop_vectorization", false)

include("utils.jl")
include("traits.jl")
include("impl/Impl.jl")
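Since `DISABLE_LOOP_VECTORIZATION` is read via `@load_preference`, users opt out through Preferences.jl rather than an environment variable. A sketch of how to set it (the UUID comes from the Project.toml above):

```julia
using Preferences, UUIDs

# LuxLib's UUID, taken from the Project.toml above.
luxlib = UUID("82251201-b29d-42c6-8e01-566dec8acb11")
set_preferences!(luxlib, "disable_loop_vectorization" => true; force=true)
# @load_preference is evaluated at compile time, so restart Julia
# (and let LuxLib re-precompile) for the change to take effect.
```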
2 changes: 1 addition & 1 deletion src/api/activation.jl
@@ -10,7 +10,7 @@ generic implementation.
This function doesn't replace `σ` with `NNlib.fast_act(σ, ...)`; that needs to be
done by the user if needed.

!!! tip
!!! tip "Load `SLEEFPirates.jl` to get faster activations"

Certain activation functions are replaced with specialized implementations from
[SLEEFPirates.jl](https://github.com/JuliaSIMD/SLEEFPirates.jl) for FP32. This might
5 changes: 5 additions & 0 deletions src/api/batched_mul.jl
@@ -4,6 +4,11 @@
Computes the batched matrix multiplication of `x` and `y`. For more details see the NNlib
documentation on `NNlib.batched_mul`. This function is mostly a wrapper around `batched_mul`
but attempts to be faster on CPUs.

!!! tip "Load `LoopVectorization.jl` to get faster batched matrix multiplication"

On CPUs, loading `LoopVectorization.jl` enables faster implementations of batched
matrix multiplication.
"""
function batched_matmul(x::AbstractMatrix, y::AbstractArray{yT, 3}) where {yT}
return batched_matmul(expand_batchdim(x), y)
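A minimal usage sketch (sizes here are arbitrary; the call is qualified with the module name in case `batched_matmul` is not exported):

```julia
using LuxLib, LoopVectorization  # LoopVectorization enables the fast CPU path

x = rand(Float32, 4, 3, 8)  # 8 batches of 4×3
y = rand(Float32, 3, 5, 8)  # 8 batches of 3×5
z = LuxLib.batched_matmul(x, y)
@assert size(z) == (4, 5, 8)
```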
5 changes: 5 additions & 0 deletions src/api/dense.jl
@@ -24,6 +24,11 @@ multiple operations.
- For small CPU arrays, we use `LoopVectorization.jl`. On `x86_64`, we use `Octavian.jl`
for medium-sized matrices. This is overridden if special BLAS implementations are loaded
(currently `MKL`, `AppleAccelerate`, and `BLISBLAS`).

!!! tip "Load `Octavian.jl` to get faster dense operations"

Loading `Octavian.jl` enables a polyalgorithm that uses different backends based on the
input sizes.
"""
function fused_dense_bias_activation(σ::F, weight::AbstractMatrix, x::AbstractMatrix,
b::Optional{<:AbstractVector}) where {F}
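A minimal usage sketch of the fused path (assuming `NNlib.relu`; loading `Octavian.jl` and `LoopVectorization.jl` opts into the CPU polyalgorithm described above):

```julia
using LuxLib, NNlib, LoopVectorization, Octavian  # opt into the CPU polyalgorithm

W, x, b = rand(Float32, 16, 8), rand(Float32, 8, 32), rand(Float32, 16)
y = LuxLib.fused_dense_bias_activation(NNlib.relu, W, x, b)
@assert y ≈ NNlib.relu.(W * x .+ b)
```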
5 changes: 1 addition & 4 deletions src/impl/Impl.jl
@@ -12,8 +12,6 @@ using ForwardDiff: ForwardDiff

using KernelAbstractions: KernelAbstractions, @kernel, @Const, @index

using LoopVectorization: LoopVectorization, @turbo, @tturbo, indices
using Octavian: Octavian
using Polyester: @batch

using LinearAlgebra: LinearAlgebra, mul!
@@ -31,15 +29,14 @@ using ..Utils: Utils, NotaNumber, batchview, concrete_bias_act_output_eltype, co
copy_drop_gradients, eltype_mismatch, expand_batchdim,
maybe_reduce_BLAS_threads, ofeltype_array, only_derivative, remove_tracking,
reset_BLAS_threads, run_ka_kernel, safe_eltype, safe_vec, safe_warning,
unsafe_known, unrolled_mapreduce, @enzyme_alternative
unsafe_known, unrolled_mapreduce, can_loopvec_args, @enzyme_alternative
using ..Traits: activation_intermediate_not_needed, activation_has_rrule, is_mutable_array,
fuse_cpu_activation
using ..System: explicit_blas_loaded, use_octavian, fits_in_l1cache, fits_in_l2cache,
fits_in_l3cache

const CRC = ChainRulesCore
const KA = KernelAbstractions
const LV = LoopVectorization

include("activation.jl")
include("batched_mul.jl")
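With the direct `using LoopVectorization` gone from `Impl`, dispatch presumably routes through the `is_extension_loaded` / `can_loopvec_args` traits (cf. `can_loopvec_args_check(::True, ...)` in the extension above). A hypothetical, simplified sketch of the fallback side — the real definitions live in `src/utils.jl` and may differ:

```julia
using Static: True, False

# Hypothetical, simplified reconstruction; the real definitions live in
# src/utils.jl and may differ in detail.
is_extension_loaded(::Val) = False()   # extensions override this to True()

can_loopvec_args(args...) =
    can_loopvec_args_check(is_extension_loaded(Val(:LoopVectorization)), args...)
can_loopvec_args_check(::False, args...) = false  # LV absent: never take @turbo paths
```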