Integrated Rotary Positional Embeddings (RoPEs) into flash_attn_kvcache #83

Open · wants to merge 26 commits into base: main_perf

Changes from all commits (26 commits)
8f8f42c
Enable fwd and varlen_fwd on AMD (#63)
micmelesse Jun 19, 2024
18ed767
enable flash_attn_with_kvcache (#68)
micmelesse Aug 6, 2024
b2f6523
Fixes after rebase
micmelesse Aug 9, 2024
7b8a15c
enable packed layouts and all configs (#72)
micmelesse Aug 28, 2024
75b5360
Clean up for Upstream (#81)
micmelesse Sep 4, 2024
8945558
feat: pytest case for pytorch implmentation of RoPE
alexkranias-amd Sep 12, 2024
ffecd6d
feat: pytest progress
alexkranias-amd Sep 12, 2024
12c6a68
feat: added MetaData and incorporated Tri Dao RoPE in class _attentio…
alexkranias-amd Sep 17, 2024
15d0683
fix: rotate input_metadata.k_new instead of k (k_cache)
alexkranias-amd Sep 17, 2024
7b83552
feat: added files to gitignore
alexkranias-amd Sep 17, 2024
97f751b
fix: changed batch and head size back to main_perf sizes
alexkranias-amd Sep 18, 2024
2a137f8
test: found a failing test in flash_attn_kvcache
alexkranias-amd Sep 18, 2024
9709453
tests: added debug prints to flash_attn_kvcache tests
alexkranias-amd Sep 23, 2024
75daa99
tests: added debug prints to kvcache tests
alexkranias-amd Sep 23, 2024
eba41a0
tests: found a failing test (no DEBUG)
alexkranias-amd Sep 23, 2024
95e4aa7
test: isolated a failing case
alexkranias-amd Sep 23, 2024
1c05ba7
test: got isolated failing case to pass by reordering when scaling qk…
alexkranias-amd Sep 23, 2024
5923025
test: found deviation in scores
alexkranias-amd Sep 23, 2024
aadf908
test: added prints to see that sum is equivalent
alexkranias-amd Sep 23, 2024
527ecb1
test: reduced to failing test
alexkranias-amd Sep 24, 2024
1b0a841
test: added tests for tl.dot & tl.exp2 with casting
alexkranias-amd Sep 25, 2024
a99aa64
test: added precision error test
alexkranias-amd Sep 26, 2024
1e6796a
chore: removed unnecessary prints and flags
alexkranias-amd Sep 27, 2024
745c864
fix: restored csrc dir
alexkranias-amd Sep 27, 2024
704f976
fix: removed csrc from gitignore and add back to gitmodules
alexkranias-amd Sep 27, 2024
cf49217
fix: fixed fp16 issues
alexkranias-amd Oct 7, 2024
68 changes: 68 additions & 0 deletions .github/workflows/amd_tests.yml
@@ -0,0 +1,68 @@
name: AMD Perf Kernel Tests

on:
workflow_dispatch:
pull_request:
branches: [main_perf]
merge_group:
branches: [main_perf]
types: [checks_requested]
push:
branches: [main_perf, micmelesse/upstream_pr]

concurrency:
group: ${{ github.ref }}
cancel-in-progress: true

permissions: read-all

jobs:
Runner-Preparation-AMD:
runs-on: ubuntu-latest
timeout-minutes: 30
outputs:
matrix-HIP: ${{ steps.set-matrix.outputs.matrix-HIP }}
steps:
- name: Prepare runner matrix
id: set-matrix
run: |
if [ x"${{ github.repository }}" == x"ROCm/flash-attention" ]; then
echo '::set-output name=matrix-HIP::[["self-hosted", "rocm"]]'
else
echo '::set-output name=matrix-HIP::[["ubuntu-latest"]]'
fi

Integration-Tests-AMD:
needs: Runner-Preparation-AMD
if: needs.Runner-Preparation-AMD.outputs.matrix-HIP != ''
runs-on: ${{ matrix.runner }}
strategy:
matrix:
runner: ${{fromJson(needs.Runner-Preparation-AMD.outputs.matrix-HIP)}}
container:
image: rocm/pytorch:rocm6.0.2_ubuntu22.04_py3.10_pytorch_2.1.2
options: --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install Triton
run: |
pip uninstall -y triton
pip install matplotlib pandas pytest
git clone https://github.com/triton-lang/triton
cd triton
git checkout 2e9f2c2d20601c24b91a4c32a7b97ad1f8a55d88
pip install --verbose -e python
cd ..
- name: Build
run: |
export FLASH_ATTENTION_USE_TRITON_ROCM="TRUE"
python setup.py install
- name: Flash Attention Tests
run: |
export FLASH_ATTENTION_USE_TRITON_ROCM="TRUE"
pytest tests/test_flash_attn.py
- name: AMD Kernel Tests
run: |
pytest -v -s flash_attn/flash_attn_triton_kernel_decode_amd.py::test_op_fwd
pytest -v -s flash_attn/flash_attn_triton_kernel_prefill_amd.py
6 changes: 5 additions & 1 deletion .gitignore
100644 → 100755
@@ -19,9 +19,13 @@ var/
*.egg-info/
.installed.cfg
*.egg
.eggs

# IDE-related
.idea/

# Dev
venv
venv
.venv
scripts
log
1 change: 1 addition & 0 deletions .gitmodules
@@ -4,3 +4,4 @@
[submodule "csrc/composable_kernel"]
path = csrc/composable_kernel
url = https://github.com/ROCm/composable_kernel.git

27 changes: 25 additions & 2 deletions README.md
@@ -112,7 +112,7 @@ FlashAttention-2 with CUDA currently supports:
3. All head dimensions up to 256. ~~Head dim > 192 backward requires A100/A800 or H100/H800~~. Head dim 256 backward now works on consumer GPUs (if there's no dropout) as of flash-attn 2.5.5.

### AMD ROCm Support
ROCm version uses [composable_kernel](https://github.com/ROCm/composable_kernel) as the backend. It provides the implementation of FlashAttention-2.
The ROCm version has two backends: [composable_kernel](https://github.com/ROCm/composable_kernel) (CK), which is the default, and a [Triton](https://github.com/triton-lang/triton) backend. Both provide an implementation of FlashAttention-2.

**Requirements:**
- ROCm 6.0 and above.
@@ -121,10 +121,33 @@ We recommend the
[Pytorch](https://hub.docker.com/r/rocm/pytorch)
container from ROCm, which has all the required tools to install FlashAttention.

FlashAttention-2 with ROCm currently supports:
#### Composable Kernel Backend
FlashAttention-2 ROCm CK backend currently supports:
1. MI200 or MI300 GPUs.
2. Datatype fp16 and bf16
3. Forward's head dimensions up to 256. Backward head dimensions up to 128.
#### Triton Backend
The FlashAttention-2 ROCm Triton backend is a work in progress.
It currently supports the forward pass only; some features, such as PagedAttention and sliding-window attention, are still missing. It runs on both MI and Navi machines. Backward-pass support is under development.

In order to use the Triton backend for ROCm, follow the steps below.

First, install Triton at the recommended [commit](https://github.com/triton-lang/triton/commit/2e9f2c2d20601c24b91a4c32a7b97ad1f8a55d88).

```
git clone https://github.com/triton-lang/triton
cd triton
git checkout 2e9f2c2d20601c24b91a4c32a7b97ad1f8a55d88
pip install --verbose -e python
```
Then install and test Flash Attention with the flag `FLASH_ATTENTION_USE_TRITON_ROCM` set to `"TRUE"`.

```
export FLASH_ATTENTION_USE_TRITON_ROCM="TRUE"
cd flash-attention
python setup.py install
pytest tests/test_flash_attn.py
```
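
The backend choice is made when `flash_attn` is imported (see `flash_attn/flash_attn_interface.py` in this PR), so the environment variable can also be set from Python before the import. The snippet below is a minimal sketch, assuming a ROCm GPU and illustrative tensor shapes:

```python
import os

# The flag is read at import time, so set it before importing flash_attn.
os.environ["FLASH_ATTENTION_USE_TRITON_ROCM"] = "TRUE"

import torch
from flash_attn import flash_attn_func

# Illustrative shapes: (batch, seqlen, nheads, headdim) in fp16 on the GPU.
q = torch.randn(2, 128, 8, 64, device="cuda", dtype=torch.float16)
k = torch.randn_like(q)
v = torch.randn_like(q)

out = flash_attn_func(q, k, v, causal=True)  # forward pass only on the Triton backend
print(out.shape)  # torch.Size([2, 128, 8, 64])
```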


## How to use FlashAttention
1 change: 0 additions & 1 deletion csrc/composable_kernel
Submodule composable_kernel deleted from 818297
1 change: 0 additions & 1 deletion csrc/cutlass
Submodule cutlass deleted from 756c35
33 changes: 25 additions & 8 deletions flash_attn/flash_attn_interface.py
100644 → 100755
@@ -4,10 +4,15 @@

import torch
import torch.nn as nn
import os

# isort: off
# We need to import the CUDA kernels after importing torch
import flash_attn_2_cuda as flash_attn_cuda
USE_TRITON_ROCM = os.getenv("FLASH_ATTENTION_USE_TRITON_ROCM", "FALSE") == "TRUE"
if USE_TRITON_ROCM:
from flash_attn import flash_attn_triton_interface_amd as flash_attn_gpu
else:
import flash_attn_2_cuda as flash_attn_gpu

# isort: on

@@ -49,7 +54,7 @@ def _flash_attn_forward(
q, k, v, dropout_p, softmax_scale, causal, window_size, softcap, alibi_slopes, return_softmax
):
q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = flash_attn_cuda.fwd(
out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = flash_attn_gpu.fwd(
q,
k,
v,
@@ -87,7 +92,7 @@ def _flash_attn_varlen_forward(
seqused_k=None,
):
q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = flash_attn_cuda.varlen_fwd(
out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = flash_attn_gpu.varlen_fwd(
q,
k,
v,
@@ -141,7 +146,7 @@ def _flash_attn_backward(
dk,
dv,
softmax_d,
) = flash_attn_cuda.bwd(
) = flash_attn_gpu.bwd(
dout,
q,
k,
@@ -195,7 +200,7 @@ def _flash_attn_varlen_backward(
dk,
dv,
softmax_d,
) = flash_attn_cuda.varlen_bwd(
) = flash_attn_gpu.varlen_bwd(
dout,
q,
k,
@@ -1149,15 +1154,20 @@ def flash_attn_with_kvcache(
v=None,
rotary_cos=None,
rotary_sin=None,
rotary_cos_k=None,
rotary_sin_k=None,
rotary_interleaved=True,
rotary_inplace=False,
rotary_conjugate=False,
cache_seqlens: Optional[Union[(int, torch.Tensor)]] = None,
cache_batch_idx: Optional[torch.Tensor] = None,
cache_leftpad: Optional[torch.Tensor] = None,
block_table: Optional[torch.Tensor] = None,
softmax_scale=None,
causal=False,
local=False,
window_size=(-1, -1), # -1 means infinite context window
softcap=0.0, # 0.0 means deactivated
rotary_interleaved=True,
alibi_slopes=None,
num_splits=0,
return_softmax_lse=False,
@@ -1249,6 +1259,7 @@ def flash_attn_with_kvcache(
logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
normalization factor).
"""
# assert ALIBI is not ROTARY ?
assert k_cache.stride(-1) == 1, "k_cache must have contiguous last dimension"
assert v_cache.stride(-1) == 1, "v_cache must have contiguous last dimension"
q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
@@ -1261,7 +1272,7 @@
cache_seqlens = maybe_contiguous(cache_seqlens)
cache_batch_idx = maybe_contiguous(cache_batch_idx)
block_table = maybe_contiguous(block_table)
out, softmax_lse = flash_attn_cuda.fwd_kvcache(
out, softmax_lse = flash_attn_gpu.fwd_kvcache(
q,
k_cache,
v_cache,
@@ -1270,17 +1281,23 @@
cache_seqlens,
rotary_cos,
rotary_sin,
rotary_cos_k,
rotary_sin_k,
rotary_interleaved,
rotary_inplace,
rotary_conjugate,
cache_seqlens,
cache_batch_idx,
cache_leftpad,
block_table,
alibi_slopes,
None,
softmax_scale,
causal,
local,
window_size[0],
window_size[1],
softcap,
rotary_interleaved,
num_splits,
)
return (out, softmax_lse) if return_softmax_lse else out
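
For context, here is a minimal usage sketch of the extended `flash_attn_with_kvcache` signature with RoPE applied to the new tokens. The shapes, the RoPE base of 10000, and the GPT-NeoX-style (non-interleaved) layout are illustrative assumptions, not requirements of this PR; the new `rotary_cos_k`, `rotary_sin_k`, `rotary_inplace`, and `rotary_conjugate` arguments are left at their defaults:

```python
import torch
from flash_attn import flash_attn_with_kvcache

device, dtype = "cuda", torch.float16
batch, nheads, headdim = 2, 8, 64
seqlen_cache, seqlen_new = 64, 1          # 64 cached tokens, 1 new decode token
rotary_dim = headdim

# Query and the new key/value tokens to append to the cache.
q = torch.randn(batch, seqlen_new, nheads, headdim, device=device, dtype=dtype)
k_new = torch.randn(batch, seqlen_new, nheads, headdim, device=device, dtype=dtype)
v_new = torch.randn(batch, seqlen_new, nheads, headdim, device=device, dtype=dtype)

# Preallocated KV cache and the current length of each sequence in the batch.
k_cache = torch.zeros(batch, seqlen_cache + seqlen_new, nheads, headdim, device=device, dtype=dtype)
v_cache = torch.zeros_like(k_cache)
cache_seqlens = torch.full((batch,), seqlen_cache, dtype=torch.int32, device=device)

# Precomputed RoPE tables of shape (max_seqlen, rotary_dim // 2).
inv_freq = 1.0 / (10000 ** (torch.arange(0, rotary_dim, 2, device=device, dtype=torch.float32) / rotary_dim))
t = torch.arange(seqlen_cache + seqlen_new, device=device, dtype=torch.float32)
freqs = torch.outer(t, inv_freq)
rotary_cos, rotary_sin = freqs.cos().to(dtype), freqs.sin().to(dtype)

out = flash_attn_with_kvcache(
    q,
    k_cache,
    v_cache,
    k=k_new,
    v=v_new,
    rotary_cos=rotary_cos,
    rotary_sin=rotary_sin,
    cache_seqlens=cache_seqlens,
    rotary_interleaved=False,   # GPT-NeoX-style rotation
    causal=True,
)
print(out.shape)  # torch.Size([2, 1, 8, 64])
```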