diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h index d0f8fe23831d78..4bdbd5d1cb02f3 100644 --- a/src/coreclr/jit/targetamd64.h +++ b/src/coreclr/jit/targetamd64.h @@ -540,7 +540,7 @@ // The registers trashed by the CORINFO_HELP_INIT_PINVOKE_FRAME helper. #define RBM_INIT_PINVOKE_FRAME_TRASH RBM_CALLEE_TRASH -#define RBM_INTERFACELOOKUP_FOR_SLOT_TRASH (RBM_RAX | RBM_R10 | RBM_R11) +#define RBM_INTERFACELOOKUP_FOR_SLOT_TRASH (RBM_CALLEE_TRASH & ~(RBM_ARG_REGS | RBM_FLTARG_REGS)) #define RBM_VALIDATE_INDIRECT_CALL_TRASH (RBM_INT_CALLEE_TRASH & ~(RBM_R10 | RBM_RCX)) #define RBM_VALIDATE_INDIRECT_CALL_TRASH_ALL (RBM_INT_CALLEE_TRASH_ALL & ~(RBM_R10 | RBM_RCX)) diff --git a/src/coreclr/jit/targetarm64.h b/src/coreclr/jit/targetarm64.h index 3f26dfab09ee93..f4edfbf7723b20 100644 --- a/src/coreclr/jit/targetarm64.h +++ b/src/coreclr/jit/targetarm64.h @@ -263,7 +263,7 @@ // The registers trashed by the CORINFO_HELP_INIT_PINVOKE_FRAME helper. #define RBM_INIT_PINVOKE_FRAME_TRASH RBM_CALLEE_TRASH -#define RBM_INTERFACELOOKUP_FOR_SLOT_TRASH (RBM_R12 | RBM_R13 | RBM_R14 | RBM_R15) +#define RBM_INTERFACELOOKUP_FOR_SLOT_TRASH (RBM_CALLEE_TRASH & ~(RBM_ARG_REGS | RBM_FLTARG_REGS)) #define RBM_INTERFACELOOKUP_FOR_SLOT_RETURN RBM_R15 #define RBM_VALIDATE_INDIRECT_CALL_TRASH (RBM_INT_CALLEE_TRASH & ~(RBM_R0 | RBM_R1 | RBM_R2 | RBM_R3 | RBM_R4 | RBM_R5 | RBM_R6 | RBM_R7 | RBM_R8 | RBM_R15)) #define REG_VALIDATE_INDIRECT_CALL_ADDR REG_R15 diff --git a/src/coreclr/nativeaot/Runtime/amd64/AsmMacros.inc b/src/coreclr/nativeaot/Runtime/amd64/AsmMacros.inc index 37e1f6d000dfcb..41b05098f86e7c 100644 --- a/src/coreclr/nativeaot/Runtime/amd64/AsmMacros.inc +++ b/src/coreclr/nativeaot/Runtime/amd64/AsmMacros.inc @@ -359,6 +359,9 @@ PTFF_SAVE_ALL_PRESERVED equ 000000F7h ;; NOTE: RBP is not included in this set PTFF_SAVE_RSP equ 00008000h PTFF_SAVE_RAX equ 00000100h ;; RAX is saved in hijack handler - in case it contains a GC ref PTFF_SAVE_RCX equ 00000200h ;; RCX is 
saved in hijack handler - in case it contains a GC ref +PTFF_SAVE_RDX equ 00000400h ;; RDX is saved in hijack handler - in case it contains a GC ref +PTFF_SAVE_R8 equ 00000800h ;; R8 is saved in hijack handler - in case it contains a GC ref +PTFF_SAVE_R9 equ 00001000h ;; R9 is saved in hijack handler - in case it contains a GC ref PTFF_SAVE_ALL_SCRATCH equ 00007F00h PTFF_THREAD_HIJACK equ 00100000h ;; indicates that this is a frame for a hijacked call diff --git a/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S b/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S index 07f43434a8532d..06ba8656b44c6b 100644 --- a/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S +++ b/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S @@ -6,12 +6,13 @@ #include // -// See PUSH_COOP_PINVOKE_FRAME, this macro is very similar, but also saves RAX/RCX/RDX and accepts the register -// bitmask in R8 +// See PUSH_COOP_PINVOKE_FRAME, this macro is very similar, but also saves volatile argument registers +// and accepts the register bitmask // // On entry: // - BITMASK: bitmask describing pushes, may be volatile register or constant value // - RAX: managed function return value, may be an object or byref +// - RSI, RDI, RCX, RDX, R8, R9: may contain objects or byrefs at the hijack point // - preserved regs: need to stay preserved, may contain objects or byrefs // // INVARIANTS @@ -19,29 +20,39 @@ // - All preserved registers remain unchanged from their values in managed code. 
// .macro PUSH_PROBE_FRAME threadReg, trashReg, BITMASK + push_register r9 // save R9, it might contain an objectref + push_register r8 // save R8, it might contain an objectref push_register rdx // save RDX, it might contain an objectref push_register rcx // save RCX, it might contain an objectref (async continuation) push_register rax // save RAX, it might contain an objectref - lea \trashReg, [rsp + 0x20] + lea \trashReg, [rsp + 0x30] push_register \trashReg // save caller`s RSP push_nonvol_reg r15 // save preserved registers push_nonvol_reg r14 // .. push_nonvol_reg r13 // .. push_nonvol_reg r12 // .. + push_register rdi // save RDI, volatile on Unix, might contain an objectref + push_register rsi // save RSI, volatile on Unix, might contain an objectref push_nonvol_reg rbx // .. push_register \BITMASK // save the register bitmask passed in by caller push_register \threadReg // Thread * (unused by stackwalker) push_nonvol_reg rbp // save caller`s RBP - mov \trashReg, [rsp + 12*8] // Find the return address + mov \trashReg, [rsp + 16*8] // Find the return address push_register \trashReg // save m_RIP lea \trashReg, [rsp + 0] // trashReg == address of frame - // allocate space for xmm0, xmm1 and alignment - alloc_stack 0x20 + 0 + // allocate space for xmm0..xmm7 (FP argument registers) + alloc_stack 0x80 + 0 - // save xmm0 and xmm1 in case they are used as return values - movdqa [rsp + 0x10], xmm0 - movdqa [rsp + 0] , xmm1 + // save FP argument registers in case they contain live values at the hijack point + movdqa [rsp + 0x70], xmm0 + movdqa [rsp + 0x60], xmm1 + movdqa [rsp + 0x50], xmm2 + movdqa [rsp + 0x40], xmm3 + movdqa [rsp + 0x30], xmm4 + movdqa [rsp + 0x20], xmm5 + movdqa [rsp + 0x10], xmm6 + movdqa [rsp + 0x00], xmm7 // link the frame into the Thread mov [\threadReg + OFFSETOF__Thread__m_pDeferredTransitionFrame], \trashReg @@ -52,21 +63,31 @@ // registers and return value to their values from before the probe was called (while also updating any // 
object refs or byrefs). .macro POP_PROBE_FRAME - movdqa xmm1, [rsp + 0] - movdqa xmm0, [rsp + 0x10] - add rsp, 0x20 + 8 // skip xmm0, xmm1 and discard RIP - pop rbp - pop rax // discard Thread* - pop rax // discard BITMASK - pop rbx - pop r12 - pop r13 - pop r14 - pop r15 - pop rax // discard caller RSP - pop rax - pop rcx - pop rdx + movdqa xmm7, [rsp + 0x00] + movdqa xmm6, [rsp + 0x10] + movdqa xmm5, [rsp + 0x20] + movdqa xmm4, [rsp + 0x30] + movdqa xmm3, [rsp + 0x40] + movdqa xmm2, [rsp + 0x50] + movdqa xmm1, [rsp + 0x60] + movdqa xmm0, [rsp + 0x70] + free_stack 0x80 + 8 // skip xmm0..xmm7 and discard RIP + pop_nonvol_reg rbp + pop_register rax // discard Thread* + pop_register rax // discard BITMASK + pop_nonvol_reg rbx + pop_register rsi + pop_register rdi + pop_nonvol_reg r12 + pop_nonvol_reg r13 + pop_nonvol_reg r14 + pop_nonvol_reg r15 + pop_register rax // discard caller RSP + pop_register rax + pop_register rcx + pop_register rdx + pop_register r8 + pop_register r9 .endm // @@ -78,38 +99,69 @@ // // Register state on exit: // R11: thread pointer -// RAX, RCX, RDX preserved, other volatile regs trashed +// RAX, RCX, RDX, RSI, RDI, R8, R9, xmm0-xmm7 preserved, other volatile regs (including R10) trashed // .macro FixupHijackedCallstack - // preserve RAX, RDX as they may contain return values - push rax - push rdx + // preserve volatile return/argument registers across INLINE_GETTHREAD + push_register rax + push_register rdx // preserve RCX as it may contain async continuation return value - push rcx - - // align stack - sub rsp, 0x8 + push_register rcx + + // preserve RSI, RDI, R8 and R9 as they may contain GC refs + push_register rsi + push_register rdi + push_register r8 + push_register r9 + + // allocate space for xmm0..xmm7 + alignment (0x80 for xmm regs + 0x8 for 16-byte alignment) + alloc_stack 0x88 + + // save FP argument registers that would be clobbered by INLINE_GETTHREAD call + movdqa [rsp + 0x70], xmm0 + movdqa [rsp + 0x60], xmm1 + movdqa [rsp + 0x50], xmm2 + movdqa [rsp + 0x40], 
xmm3 + movdqa [rsp + 0x30], xmm4 + movdqa [rsp + 0x20], xmm5 + movdqa [rsp + 0x10], xmm6 + movdqa [rsp + 0x00], xmm7 // rax = GetThread(), makes nested calls INLINE_GETTHREAD mov r11, rax - add rsp, 0x8 + // restore FP argument registers + movdqa xmm7, [rsp + 0x00] + movdqa xmm6, [rsp + 0x10] + movdqa xmm5, [rsp + 0x20] + movdqa xmm4, [rsp + 0x30] + movdqa xmm3, [rsp + 0x40] + movdqa xmm2, [rsp + 0x50] + movdqa xmm1, [rsp + 0x60] + movdqa xmm0, [rsp + 0x70] + + free_stack 0x88 + + pop_register r9 + pop_register r8 + pop_register rdi + pop_register rsi - pop rcx + pop_register rcx - pop rdx - pop rax + pop_register rdx + pop_register rax // Fix the stack by pushing the original return address - mov r8, [r11 + OFFSETOF__Thread__m_pvHijackedReturnAddress] - push r8 + mov r10, [r11 + OFFSETOF__Thread__m_pvHijackedReturnAddress] + push r10 // Clear hijack state - xor r8, r8 - mov [r11 + OFFSETOF__Thread__m_ppvHijackedReturnAddressLocation], r8 - mov [r11 + OFFSETOF__Thread__m_pvHijackedReturnAddress], r8 + xor r10, r10 + mov [r11 + OFFSETOF__Thread__m_ppvHijackedReturnAddressLocation], r10 + mov [r11 + OFFSETOF__Thread__m_pvHijackedReturnAddress], r10 .endm // @@ -124,12 +176,12 @@ NESTED_ENTRY RhpGcProbeHijack, _TEXT, NoHandler ret LOCAL_LABEL(WaitForGC): - mov r8d, DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_RAX + PTFF_SAVE_RCX + PTFF_SAVE_RDX + PTFF_THREAD_HIJACK + mov r10d, DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_RSI + PTFF_SAVE_RDI + PTFF_SAVE_RAX + PTFF_SAVE_RCX + PTFF_SAVE_RDX + PTFF_SAVE_R8 + PTFF_SAVE_R9 + PTFF_THREAD_HIJACK jmp C_FUNC(RhpWaitForGC) NESTED_END RhpGcProbeHijack, _TEXT NESTED_ENTRY RhpWaitForGC, _TEXT, NoHandler - PUSH_PROBE_FRAME r11, rax, r8 + PUSH_PROBE_FRAME r11, rax, r10 END_PROLOGUE mov rbx, r11 diff --git a/src/coreclr/nativeaot/Runtime/amd64/GcProbe.asm b/src/coreclr/nativeaot/Runtime/amd64/GcProbe.asm index b66d950b1f7327..ad4254edf586f3 100644 --- a/src/coreclr/nativeaot/Runtime/amd64/GcProbe.asm +++ 
b/src/coreclr/nativeaot/Runtime/amd64/GcProbe.asm @@ -4,13 +4,14 @@ include AsmMacros.inc ;; -;; See PUSH_COOP_PINVOKE_FRAME, this macro is very similar, but also saves RAX/RCX and accepts -;; the register bitmask +;; See PUSH_COOP_PINVOKE_FRAME, this macro is very similar, but also saves volatile argument registers +;; and accepts the register bitmask ;; ;; On entry: ;; - BITMASK: bitmask describing pushes, a volatile register ;; - RAX: managed function return value, may be an object or byref ;; - RCX: managed function return value (async continuation), may be an object +;; - RDX, R8, R9: may contain objects or byrefs at the hijack point ;; - preserved regs: need to stay preserved, may contain objects or byrefs ;; ;; INVARIANTS @@ -19,9 +20,12 @@ include AsmMacros.inc ;; PUSH_PROBE_FRAME macro threadReg, trashReg, BITMASK + push_vol_reg r9 ; save R9, it might contain an objectref + push_vol_reg r8 ; save R8, it might contain an objectref + push_vol_reg rdx ; save RDX, it might contain an objectref push_vol_reg rcx ; save RCX, it might contain an objectref (async continuation) push_vol_reg rax ; save RAX, it might contain an objectref - lea trashReg, [rsp + 18h] + lea trashReg, [rsp + 30h] push_vol_reg trashReg ; save caller's RSP push_nonvol_reg r15 ; save preserved registers push_nonvol_reg r14 ; .. 
@@ -33,15 +37,18 @@ PUSH_PROBE_FRAME macro threadReg, trashReg, BITMASK push_vol_reg BITMASK ; save the register bitmask passed in by caller push_vol_reg threadReg ; Thread * (unused by stackwalker) push_nonvol_reg rbp ; save caller's RBP - mov trashReg, [rsp + 13*8] ; Find the return address + mov trashReg, [rsp + 16*8] ; Find the return address push_vol_reg trashReg ; save m_RIP lea trashReg, [rsp + 0] ; trashReg == address of frame - ;; allocate scratch space and any required alignment - alloc_stack 20h + 10h + 8 + ;; allocate scratch space (20h home space + 40h for xmm0..xmm3) + alloc_stack 20h + 40h - ;; save xmm0 in case it's being used as a return value - movdqa [rsp + 20h], xmm0 + ;; save xmm argument registers in case they contain live values at the hijack point + movdqa [rsp + 20h + 00h], xmm0 + movdqa [rsp + 20h + 10h], xmm1 + movdqa [rsp + 20h + 20h], xmm2 + movdqa [rsp + 20h + 30h], xmm3 ;; link the frame into the Thread mov [threadReg + OFFSETOF__Thread__m_pDeferredTransitionFrame], trashReg @@ -53,8 +60,11 @@ endm ;; object refs or byrefs). ;; POP_PROBE_FRAME macro - movdqa xmm0, [rsp + 20h] - add rsp, 20h + 10h + 8 + 8 ; deallocate stack and discard saved m_RIP + movdqa xmm0, [rsp + 20h + 00h] + movdqa xmm1, [rsp + 20h + 10h] + movdqa xmm2, [rsp + 20h + 20h] + movdqa xmm3, [rsp + 20h + 30h] + add rsp, 20h + 40h + 8 ; deallocate scratch space and discard saved m_RIP pop rbp pop rax ; discard Thread* pop rax ; discard BITMASK @@ -68,6 +78,9 @@ POP_PROBE_FRAME macro pop rax ; discard caller RSP pop rax pop rcx + pop rdx + pop r8 + pop r9 endm ;; @@ -78,21 +91,21 @@ endm ;; All registers correct for return to the original return address. 
;; ;; Register state on exit: -;; RDX: thread pointer -;; RAX/RCX: preserved, other volatile regs trashed +;; R10: thread pointer +;; RAX/RCX/RDX/R8/R9: preserved, R11 trashed ;; FixupHijackedCallstack macro - ;; rdx <- GetThread(), TRASHES r8 - INLINE_GETTHREAD rdx, r8 + ;; r10 <- GetThread(), TRASHES r11 + INLINE_GETTHREAD r10, r11 ;; Fix the stack by pushing the original return address - mov r8, [rdx + OFFSETOF__Thread__m_pvHijackedReturnAddress] - push r8 + mov r11, [r10 + OFFSETOF__Thread__m_pvHijackedReturnAddress] + push r11 ;; Clear hijack state - xor r8, r8 - mov [rdx + OFFSETOF__Thread__m_ppvHijackedReturnAddressLocation], r8 - mov [rdx + OFFSETOF__Thread__m_pvHijackedReturnAddress], r8 + xor r11, r11 + mov [r10 + OFFSETOF__Thread__m_ppvHijackedReturnAddressLocation], r11 + mov [r10 + OFFSETOF__Thread__m_pvHijackedReturnAddress], r11 endm ;; @@ -106,15 +119,15 @@ NESTED_ENTRY RhpGcProbeHijack, _TEXT jnz @f ret @@: - mov r8d, DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_RAX + PTFF_SAVE_RCX + PTFF_THREAD_HIJACK + mov r11d, DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_RAX + PTFF_SAVE_RCX + PTFF_SAVE_RDX + PTFF_SAVE_R8 + PTFF_SAVE_R9 + PTFF_THREAD_HIJACK jmp RhpWaitForGC NESTED_END RhpGcProbeHijack, _TEXT NESTED_ENTRY RhpWaitForGC, _TEXT - PUSH_PROBE_FRAME rdx, rax, r8 + PUSH_PROBE_FRAME r10, rax, r11 END_PROLOGUE - mov rbx, rdx + mov rbx, r10 mov rcx, [rbx + OFFSETOF__Thread__m_pDeferredTransitionFrame] call RhpWaitForGC2 @@ -147,7 +160,7 @@ ifdef FEATURE_GC_STRESS ;; LEAF_ENTRY RhpGcStressHijack, _TEXT FixupHijackedCallstack - mov r8d, DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_RAX + PTFF_SAVE_RCX + mov r11d, DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_RAX + PTFF_SAVE_RCX + PTFF_SAVE_RDX + PTFF_SAVE_R8 + PTFF_SAVE_R9 jmp RhpGcStressProbe LEAF_END RhpGcStressHijack, _TEXT @@ -157,15 +170,15 @@ LEAF_END RhpGcStressHijack, _TEXT ;; This worker performs the GC Stress work and returns to the original return address. 
;; ;; Register state on entry: -;; RDX: thread pointer -;; R8: register bitmask +;; R10: thread pointer +;; R11: register bitmask ;; ;; Register state on exit: -;; Scratch registers, except for RAX/RCX, have been trashed +;; Scratch registers, except for RAX/RCX/RDX/R8/R9, have been trashed ;; All other registers restored as they were when the hijack was first reached. ;; NESTED_ENTRY RhpGcStressProbe, _TEXT - PUSH_PROBE_FRAME rdx, rax, r8 + PUSH_PROBE_FRAME r10, rax, r11 END_PROLOGUE call RhpStressGc diff --git a/src/coreclr/nativeaot/Runtime/arm64/GcProbe.S b/src/coreclr/nativeaot/Runtime/arm64/GcProbe.S index 093a3ec84ecfbc..c327e304b8e290 100644 --- a/src/coreclr/nativeaot/Runtime/arm64/GcProbe.S +++ b/src/coreclr/nativeaot/Runtime/arm64/GcProbe.S @@ -4,13 +4,14 @@ #include #include "AsmOffsets.inc" -#define PROBE_FRAME_SIZE 0xD0 // 4 * 8 for fixed part of PInvokeTransitionFrame (fp, lr, m_pThread, m_Flags) + - // 10 * 8 for callee saved registers + - // 1 * 8 for caller SP + - // 3 * 8 for int returns (x0, x1, x2) + - // 4 * 16 for FP/HFA/HVA returns - -// See PUSH_COOP_PINVOKE_FRAME, this macro is very similar, but also saves return registers +#define PROBE_FRAME_SIZE 0x140 // 4 * 8 for fixed part of PInvokeTransitionFrame (fp, lr, m_pThread, m_Flags) + + // 10 * 8 for callee saved registers + + // 1 * 8 for caller SP + + // 8 * 8 for int return/argument regs (x0..x7) + + // 1 * 8 for padding (16-byte alignment for q regs) + + // 8 * 16 for FP argument registers (q0..q7) + +// See PUSH_COOP_PINVOKE_FRAME, this macro is very similar, but also saves return/argument registers // and accepts the register bitmask // Call this macro first in the method (no further prolog instructions can be added after this). 
// @@ -37,13 +38,17 @@ // Slot at [sp, #0x70] is reserved for caller sp - // Save the integer return registers, x2 might contain an objectref (async continuation) + // Save the integer return/argument registers stp x0, x1, [sp, #0x78] - str x2, [sp, #0x88] + stp x2, x3, [sp, #0x88] + stp x4, x5, [sp, #0x98] + stp x6, x7, [sp, #0xA8] - // Save the FP/HFA/HVA return registers - stp q0, q1, [sp, #0x90] - stp q2, q3, [sp, #0xB0] + // Save the FP argument registers + stp q0, q1, [sp, #0xC0] + stp q2, q3, [sp, #0xE0] + stp q4, q5, [sp, #0x100] + stp q6, q7, [sp, #0x120] // Perform the rest of the PInvokeTransitionFrame initialization. // str \threadReg,[sp, #OFFSETOF__PInvokeTransitionFrame__m_pThread] // Thread * (unused by stackwalker) @@ -65,13 +70,17 @@ // .macro POP_PROBE_FRAME - // Restore the integer return registers + // Restore the integer return/argument registers ldp x0, x1, [sp, #0x78] - ldr x2, [sp, #0x88] + ldp x2, x3, [sp, #0x88] + ldp x4, x5, [sp, #0x98] + ldp x6, x7, [sp, #0xA8] - // Restore the FP/HFA/HVA return registers - ldp q0, q1, [sp, #0x90] - ldp q2, q3, [sp, #0xB0] + // Restore the FP argument registers + ldp q0, q1, [sp, #0xC0] + ldp q2, q3, [sp, #0xE0] + ldp q4, q5, [sp, #0x100] + ldp q6, q7, [sp, #0x120] // Restore callee saved registers EPILOG_RESTORE_REG_PAIR x19, x20, 0x20 @@ -91,32 +100,65 @@ // All registers correct for return to the original return address. 
// // Register state on exit: -// x4: thread pointer -// x0, x1, x2: preserved +// x9: thread pointer +// x0-x7, q0-q7 preserved; x10 and (with FEATURE_EMULATED_TLS) all other scratch registers trashed // .macro FixupHijackedCallstack - // Store thread pointer temporarily in x3, then move it to x4 - // x4 <- GetThread() + // x9 <- GetThread() #ifdef FEATURE_EMULATED_TLS - GETTHREAD_ETLS_3 + // Save x0-x7 and q0-q7 around RhpGetThread call (it trashes all scratch registers) + PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -0xD0 + stp x0, x1, [sp, #0x10] + stp x2, x3, [sp, #0x20] + stp x4, x5, [sp, #0x30] + stp x6, x7, [sp, #0x40] + stp q0, q1, [sp, #0x50] + stp q2, q3, [sp, #0x70] + stp q4, q5, [sp, #0x90] + stp q6, q7, [sp, #0xB0] + + bl C_FUNC(RhpGetThread) + mov x9, x0 + + ldp q6, q7, [sp, #0xB0] + ldp q4, q5, [sp, #0x90] + ldp q2, q3, [sp, #0x70] + ldp q0, q1, [sp, #0x50] + ldp x0, x1, [sp, #0x10] + ldp x2, x3, [sp, #0x20] + ldp x4, x5, [sp, #0x30] + ldp x6, x7, [sp, #0x40] + EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, 0xD0 #else - INLINE_GETTHREAD x3 + // INLINE_GETTHREAD makes blr calls that trash volatile FP registers. + // Save and restore q0-q7 around the call. 
+ sub sp, sp, #0x80 + stp q0, q1, [sp, #0x00] + stp q2, q3, [sp, #0x20] + stp q4, q5, [sp, #0x40] + stp q6, q7, [sp, #0x60] + + INLINE_GETTHREAD x9 + + ldp q6, q7, [sp, #0x60] + ldp q4, q5, [sp, #0x40] + ldp q2, q3, [sp, #0x20] + ldp q0, q1, [sp, #0x00] + add sp, sp, #0x80 #endif - mov x4, x3 - // // Fix the stack by restoring the original return address // - ldr lr, [x4, #OFFSETOF__Thread__m_pvHijackedReturnAddress] + ldr lr, [x9, #OFFSETOF__Thread__m_pvHijackedReturnAddress] // // Clear hijack state // // Clear m_ppvHijackedReturnAddressLocation and m_pvHijackedReturnAddress - stp xzr, xzr, [x4, #OFFSETOF__Thread__m_ppvHijackedReturnAddressLocation] + stp xzr, xzr, [x9, #OFFSETOF__Thread__m_ppvHijackedReturnAddressLocation] .endm // @@ -125,12 +167,12 @@ NESTED_ENTRY RhpGcProbeHijack, _TEXT, NoHandler FixupHijackedCallstack - PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, 3 - tbnz x3, #TrapThreadsFlags_TrapThreads_Bit, LOCAL_LABEL(WaitForGC) + PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, 10 + tbnz x10, #TrapThreadsFlags_TrapThreads_Bit, LOCAL_LABEL(WaitForGC) ret LOCAL_LABEL(WaitForGC): - mov x12, DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_X0 + PTFF_SAVE_X1 + PTFF_SAVE_X2 + mov x12, DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_X0 + PTFF_SAVE_X1 + PTFF_SAVE_X2 + PTFF_SAVE_X3 + PTFF_SAVE_X4 + PTFF_SAVE_X5 + PTFF_SAVE_X6 + PTFF_SAVE_X7 movk x12, PTFF_THREAD_HIJACK_HI, lsl #32 b C_FUNC(RhpWaitForGC) NESTED_END RhpGcProbeHijack @@ -138,9 +180,9 @@ NESTED_END RhpGcProbeHijack .global C_FUNC(RhpThrowHwEx) NESTED_ENTRY RhpWaitForGC, _TEXT, NoHandler - PUSH_PROBE_FRAME x4, x3, x12 + PUSH_PROBE_FRAME x9, x10, x12 - ldr x0, [x4, #OFFSETOF__Thread__m_pDeferredTransitionFrame] + ldr x0, [x9, #OFFSETOF__Thread__m_pDeferredTransitionFrame] bl C_FUNC(RhpWaitForGC2) POP_PROBE_FRAME diff --git a/src/coreclr/nativeaot/Runtime/arm64/GcProbe.asm b/src/coreclr/nativeaot/Runtime/arm64/GcProbe.asm index 0ebcfe1c675d22..bd44842a8a8be6 100644 --- a/src/coreclr/nativeaot/Runtime/arm64/GcProbe.asm 
+++ b/src/coreclr/nativeaot/Runtime/arm64/GcProbe.asm @@ -14,11 +14,12 @@ field OFFSETOF__PInvokeTransitionFrame__m_PreservedRegs field 10 * 8 ; x19..x28 m_CallersSP field 8 ; SP at routine entry - field 3 * 8 ; x0..x2 - field 4 * 16; q0..q3 + field 8 * 8 ; x0..x7 + field 8 ; padding for 16-byte alignment of q regs + field 8 * 16; q0..q7 PROBE_FRAME_SIZE field 0 - ;; See PUSH_COOP_PINVOKE_FRAME, this macro is very similar, but also saves return registers + ;; See PUSH_COOP_PINVOKE_FRAME, this macro is very similar, but also saves return/argument registers ;; and accepts the register bitmask ;; Call this macro first in the method (no further prolog instructions can be added after this). ;; @@ -46,13 +47,17 @@ PROBE_FRAME_SIZE field 0 ;; Slot at [sp, #0x70] is reserved for caller sp - ;; Save the integer return registers, x2 might contain an objectref (async continuation) + ;; Save the integer return/argument registers PROLOG_NOP stp x0, x1, [sp, #0x78] - PROLOG_NOP str x2, [sp, #0x88] + PROLOG_NOP stp x2, x3, [sp, #0x88] + PROLOG_NOP stp x4, x5, [sp, #0x98] + PROLOG_NOP stp x6, x7, [sp, #0xA8] - ;; Save the FP/HFA/HVA return registers - PROLOG_NOP stp q0, q1, [sp, #0x90] - PROLOG_NOP stp q2, q3, [sp, #0xB0] + ;; Save the FP argument registers + PROLOG_NOP stp q0, q1, [sp, #0xC0] + PROLOG_NOP stp q2, q3, [sp, #0xE0] + PROLOG_NOP stp q4, q5, [sp, #0x100] + PROLOG_NOP stp q6, q7, [sp, #0x120] ;; Perform the rest of the PInvokeTransitionFrame initialization. 
;; str $threadReg,[sp, #OFFSETOF__PInvokeTransitionFrame__m_pThread] ; Thread * (unused by stackwalker) @@ -76,13 +81,17 @@ PROBE_FRAME_SIZE field 0 MACRO POP_PROBE_FRAME - ;; Restore the integer return registers + ;; Restore the integer return/argument registers PROLOG_NOP ldp x0, x1, [sp, #0x78] - PROLOG_NOP ldr x2, [sp, #0x88] + PROLOG_NOP ldp x2, x3, [sp, #0x88] + PROLOG_NOP ldp x4, x5, [sp, #0x98] + PROLOG_NOP ldp x6, x7, [sp, #0xA8] - ; Restore the FP/HFA/HVA return registers - EPILOG_NOP ldp q0, q1, [sp, #0x90] - EPILOG_NOP ldp q2, q3, [sp, #0xB0] + ; Restore the FP argument registers + EPILOG_NOP ldp q0, q1, [sp, #0xC0] + EPILOG_NOP ldp q2, q3, [sp, #0xE0] + EPILOG_NOP ldp q4, q5, [sp, #0x100] + EPILOG_NOP ldp q6, q7, [sp, #0x120] ;; Restore callee saved registers EPILOG_RESTORE_REG_PAIR x19, x20, #0x20 @@ -102,26 +111,26 @@ PROBE_FRAME_SIZE field 0 ;; All registers correct for return to the original return address. ;; ;; Register state on exit: -;; x4: thread pointer -;; x3: trashed +;; x9: thread pointer +;; x0-x7 preserved, x10 trashed ;; MACRO FixupHijackedCallstack - ;; x4 <- GetThread(), TRASHES x3 - INLINE_GETTHREAD x4, x3 + ;; x9 <- GetThread(), TRASHES x10 + INLINE_GETTHREAD x9, x10 ;; ;; Fix the stack by restoring the original return address ;; - ldr lr, [x4, #OFFSETOF__Thread__m_pvHijackedReturnAddress] + ldr lr, [x9, #OFFSETOF__Thread__m_pvHijackedReturnAddress] ;; ;; Clear hijack state ;; ASSERT OFFSETOF__Thread__m_pvHijackedReturnAddress == (OFFSETOF__Thread__m_ppvHijackedReturnAddressLocation + 8) ;; Clear m_ppvHijackedReturnAddressLocation and m_pvHijackedReturnAddress - stp xzr, xzr, [x4, #OFFSETOF__Thread__m_ppvHijackedReturnAddressLocation] + stp xzr, xzr, [x9, #OFFSETOF__Thread__m_ppvHijackedReturnAddressLocation] MEND MACRO @@ -147,13 +156,13 @@ PROBE_FRAME_SIZE field 0 LABELED_RETURN_ADDRESS RhpGcProbeHijack FixupHijackedCallstack - ldr x3, =RhpTrapThreads - ldr w3, [x3] - tbnz x3, #TrapThreadsFlags_TrapThreads_Bit, WaitForGC + ldr 
x10, =RhpTrapThreads + ldr w10, [x10] + tbnz x10, #TrapThreadsFlags_TrapThreads_Bit, WaitForGC ret WaitForGC - mov x12, #(DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_X0 + PTFF_SAVE_X1 + PTFF_SAVE_X2) + mov x12, #(DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_X0 + PTFF_SAVE_X1 + PTFF_SAVE_X2 + PTFF_SAVE_X3 + PTFF_SAVE_X4 + PTFF_SAVE_X5 + PTFF_SAVE_X6 + PTFF_SAVE_X7) movk x12, #PTFF_THREAD_HIJACK_HI, lsl #32 b RhpWaitForGC NESTED_END RhpGcProbeHijackWrapper @@ -161,9 +170,9 @@ WaitForGC EXTERN RhpThrowHwEx NESTED_ENTRY RhpWaitForGC - PUSH_PROBE_FRAME x4, x3, x12 + PUSH_PROBE_FRAME x9, x10, x12 - ldr x0, [x4, #OFFSETOF__Thread__m_pDeferredTransitionFrame] + ldr x0, [x9, #OFFSETOF__Thread__m_pDeferredTransitionFrame] bl RhpWaitForGC2 POP_PROBE_FRAME @@ -193,7 +202,7 @@ WaitForGC ;; LEAF_ENTRY RhpGcStressHijack FixupHijackedCallstack - mov x12, #(DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_X0 + PTFF_SAVE_X1 + PTFF_SAVE_X2) + mov x12, #(DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_X0 + PTFF_SAVE_X1 + PTFF_SAVE_X2 + PTFF_SAVE_X3 + PTFF_SAVE_X4 + PTFF_SAVE_X5 + PTFF_SAVE_X6 + PTFF_SAVE_X7) b RhpGcStressProbe LEAF_END RhpGcStressHijack ;; @@ -202,18 +211,16 @@ WaitForGC ;; This worker performs the GC Stress work and returns to the original return address. ;; ;; Register state on entry: -;; x0: hijacked function return value -;; x1: hijacked function return value -;; x2: hijacked function async continuation value -;; x4: thread pointer +;; x0-x7: hijacked function return/argument values +;; x9: thread pointer ;; w12: register bitmask ;; ;; Register state on exit: -;; Scratch registers, except for x0, x1, x2, have been trashed +;; Scratch registers, except for x0-x7, have been trashed ;; All other registers restored as they were when the hijack was first reached. 
;; NESTED_ENTRY RhpGcStressProbe - PUSH_PROBE_FRAME x4, x3, x12 + PUSH_PROBE_FRAME x9, x10, x12 bl RhpStressGc diff --git a/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosamd64.inc b/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosamd64.inc index 7138bfc4326fea..a7cc2943ebffa9 100644 --- a/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosamd64.inc +++ b/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosamd64.inc @@ -242,9 +242,13 @@ C_FUNC(\Name): #define PTFF_SAVE_R15 0x00000080 #define PTFF_SAVE_ALL_PRESERVED 0x000000F1 // NOTE: RBP is not included in this set! #define PTFF_SAVE_RSP 0x00008000 +#define PTFF_SAVE_RSI 0x00000002 // RSI is saved in hijack handler - in case it contains a GC ref +#define PTFF_SAVE_RDI 0x00000004 // RDI is saved in hijack handler - in case it contains a GC ref #define PTFF_SAVE_RAX 0x00000100 // RAX is saved in hijack handler - in case it contains a GC ref #define PTFF_SAVE_RCX 0x00000200 // RCX is saved in hijack handler - in case it contains a GC ref #define PTFF_SAVE_RDX 0x00000400 // RDX is saved in hijack handler - in case it contains a GC ref +#define PTFF_SAVE_R8 0x00000800 // R8 is saved in hijack handler - in case it contains a GC ref +#define PTFF_SAVE_R9 0x00001000 // R9 is saved in hijack handler - in case it contains a GC ref #define PTFF_SAVE_ALL_SCRATCH 0x00007F00 #define PTFF_THREAD_HIJACK 0x00100000 // indicates that this is a frame for a hijacked call diff --git a/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosarm64.inc b/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosarm64.inc index 5bf15b8e8f6a79..3027a17d6c8fc1 100644 --- a/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosarm64.inc +++ b/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosarm64.inc @@ -331,6 +331,11 @@ C_FUNC(\Name): #define PTFF_SAVE_X0 0x00000800 #define PTFF_SAVE_X1 0x00001000 #define PTFF_SAVE_X2 0x00002000 +#define PTFF_SAVE_X3 0x00004000 +#define PTFF_SAVE_X4 0x00008000 +#define PTFF_SAVE_X5 0x00010000 +#define PTFF_SAVE_X6 0x00020000 +#define 
PTFF_SAVE_X7 0x00040000 #define PTFF_SAVE_ALL_PRESERVED 0x000003FF // NOTE: x19-x28 #define PTFF_THREAD_HIJACK_HI 0x00000001 // upper 32 bits of the PTFF_THREAD_HIJACK