From 24c84bd2363a7195d928ce9beb568b66d8e64869 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov@arm.com>
Date: Fri, 15 Apr 2022 12:19:02 +0100
Subject: [PATCH] [AArch64] Async unwind - Fix MTE codegen emitting frame
 adjustments in a loop

When untagging the stack, the compiler may emit a sequence like:
```
        .LBB0_1:
          st2g sp, [sp], #32
          sub x8, x8, #32
          cbnz x8, .LBB0_1
          stg sp, [sp], #16
```
These stack adjustments cannot be described by CFI instructions.

This patch disables merging of SP update with untagging, i.e. makes the
compiler use an additional scratch register (there should be plenty
available at this point as we are in the epilogue) and generate:
```
            mov     x9, sp
            mov     x8, #256
            stg     x9, [x9], #16
    .LBB0_1:
            sub     x8, x8, #32
            st2g    x9, [x9], #32
            cbnz    x8, .LBB0_1
            add     sp, sp, #272
```
Merging is disabled only when we need to generate asynchronous unwind
tables.

Reviewed By: eugenis

Differential Revision: https://reviews.llvm.org/D114548
---
 .../Target/AArch64/AArch64FrameLowering.cpp   | 17 ++++++----
 .../Target/AArch64/AArch64RegisterInfo.cpp    | 32 ++++++++++++-------
 llvm/test/CodeGen/AArch64/settag.ll           | 31 +++++++++++++++---
 3 files changed, 58 insertions(+), 22 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 82640f7e7690..0a237265a4ed 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -3235,7 +3235,7 @@ class TagStoreEdit {
   // instructions. May skip if the replacement is not profitable. May invalidate
   // the input iterator and replace it with a valid one.
   void emitCode(MachineBasicBlock::iterator &InsertI,
-                const AArch64FrameLowering *TFI, bool IsLast);
+                const AArch64FrameLowering *TFI, bool TryMergeSPUpdate);
 };
 
 void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
@@ -3374,7 +3374,8 @@ void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
 }
 
 void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
-                            const AArch64FrameLowering *TFI, bool IsLast) {
+                            const AArch64FrameLowering *TFI,
+                            bool TryMergeSPUpdate) {
   if (TagStores.empty())
     return;
   TagStoreInstr &FirstTagStore = TagStores[0];
@@ -3404,8 +3405,8 @@ void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
     emitUnrolled(InsertI);
   } else {
     MachineInstr *UpdateInstr = nullptr;
-    int64_t TotalOffset;
-    if (IsLast) {
+    int64_t TotalOffset = 0;
+    if (TryMergeSPUpdate) {
       // See if we can merge base register update into the STGloop.
       // This is done in AArch64LoadStoreOptimizer for "normal" stores,
       // but STGloop is way too unusual for that, and also it only
@@ -3550,7 +3551,7 @@ MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
   for (auto &Instr : Instrs) {
     if (EndOffset && *EndOffset != Instr.Offset) {
       // Found a gap.
-      TSE.emitCode(InsertI, TFI, /*IsLast = */ false);
+      TSE.emitCode(InsertI, TFI, /*TryMergeSPUpdate = */ false);
       TSE.clear();
     }
 
@@ -3558,7 +3559,11 @@ MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
     EndOffset = Instr.Offset + Instr.Size;
   }
 
-  TSE.emitCode(InsertI, TFI, /*IsLast = */ true);
+  // Multiple FP/SP updates in a loop cannot be described by CFI instructions.
+  TSE.emitCode(InsertI, TFI, /*TryMergeSPUpdate = */
+               !MBB->getParent()
+                    ->getInfo<AArch64FunctionInfo>()
+                    ->needsAsyncDwarfUnwindInfo());
 
   return InsertI;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 6f42bafa4a37..ffde7f0ac5ef 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -589,23 +589,31 @@ void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
 
 // Create a scratch register for the frame index elimination in an instruction.
 // This function has special handling of stack tagging loop pseudos, in which
-// case it can also change the instruction opcode (but not the operands).
+// case it can also change the instruction opcode.
 static Register
-createScratchRegisterForInstruction(MachineInstr &MI,
+createScratchRegisterForInstruction(MachineInstr &MI, unsigned FIOperandNum,
                                     const AArch64InstrInfo *TII) {
   // ST*Gloop have a reserved scratch register in operand 1. Use it, and also
   // replace the instruction with the writeback variant because it will now
   // satisfy the operand constraints for it.
-  if (MI.getOpcode() == AArch64::STGloop) {
-    MI.setDesc(TII->get(AArch64::STGloop_wback));
-    return MI.getOperand(1).getReg();
-  } else if (MI.getOpcode() == AArch64::STZGloop) {
-    MI.setDesc(TII->get(AArch64::STZGloop_wback));
-    return MI.getOperand(1).getReg();
+  Register ScratchReg;
+  if (MI.getOpcode() == AArch64::STGloop ||
+      MI.getOpcode() == AArch64::STZGloop) {
+    assert(FIOperandNum == 3 &&
+           "Wrong frame index operand for STGloop/STZGloop");
+    unsigned Op = MI.getOpcode() == AArch64::STGloop ? AArch64::STGloop_wback
+                                                     : AArch64::STZGloop_wback;
+    ScratchReg = MI.getOperand(1).getReg();
+    MI.getOperand(3).ChangeToRegister(ScratchReg, false, false, true);
+    MI.setDesc(TII->get(Op));
+    MI.tieOperands(1, 3);
   } else {
-    return MI.getMF()->getRegInfo().createVirtualRegister(
-        &AArch64::GPR64RegClass);
+    ScratchReg =
+        MI.getMF()->getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+    MI.getOperand(FIOperandNum)
+        .ChangeToRegister(ScratchReg, false, false, true);
   }
+  return ScratchReg;
 }
 
 void AArch64RegisterInfo::getOffsetOpcodes(
@@ -722,9 +730,9 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   // If we get here, the immediate doesn't fit into the instruction.  We folded
   // as much as possible above.  Handle the rest, providing a register that is
   // SP+LargeImm.
-  Register ScratchReg = createScratchRegisterForInstruction(MI, TII);
+  Register ScratchReg =
+      createScratchRegisterForInstruction(MI, FIOperandNum, TII);
   emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII);
-  MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
 }
 
 unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
diff --git a/llvm/test/CodeGen/AArch64/settag.ll b/llvm/test/CodeGen/AArch64/settag.ll
index 1525249a1713..7f01b1fec755 100644
--- a/llvm/test/CodeGen/AArch64/settag.ll
+++ b/llvm/test/CodeGen/AArch64/settag.ll
@@ -146,14 +146,12 @@ entry:
   ret void
 }
 
-define void @stg_alloca17() uwtable {
+define void @stg_alloca17() nounwind {
 ; CHECK-LABEL: stg_alloca17:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    sub sp, sp, #288
-; CHECK-NEXT:    .cfi_def_cfa_offset 288
-; CHECK-NEXT:    str x29, [sp, #272] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    mov x8, #256
+; CHECK-NEXT:    str x29, [sp, #272] // 8-byte Folded Spill
 ; CHECK-NEXT:  .LBB11_1: // %entry
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    st2g sp, [sp], #32
@@ -161,6 +159,31 @@ define void @stg_alloca17() uwtable {
 ; CHECK-NEXT:    cbnz x8, .LBB11_1
 ; CHECK-NEXT:  // %bb.2: // %entry
 ; CHECK-NEXT:    stg sp, [sp], #16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %a = alloca i8, i32 272, align 16
+  call void @llvm.aarch64.settag(i8* %a, i64 272)
+  ret void
+}
+
+define void @stg_alloca18() uwtable {
+; CHECK-LABEL: stg_alloca18:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #288
+; CHECK-NEXT:    .cfi_def_cfa_offset 288
+; CHECK-NEXT:    str x29, [sp, #272] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    mov x8, #256
+; CHECK-NEXT:    stg x9, [x9], #16
+; CHECK-NEXT:  .LBB12_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub x8, x8, #32
+; CHECK-NEXT:    st2g x9, [x9], #32
+; CHECK-NEXT:    cbnz x8, .LBB12_1
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    add sp, sp, #272
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0