/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Compatibility mode system call entry point for x86-64.
 *
 * Copyright 2000-2002 Andi Kleen, SuSE Labs.
 */
#include <asm/asm-offsets.h>
#include <asm/current.h>
#include <asm/errno.h>
#include <asm/ia32_unistd.h>
#include <asm/thread_info.h>
#include <asm/segment.h>
#include <asm/irqflags.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/nospec-branch.h>
#include <linux/linkage.h>
#include <linux/err.h>

#include "calling.h"

	.section .entry.text, "ax"

/*
 * 32-bit SYSENTER entry.
 *
 * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
 * on 64-bit kernels running on Intel CPUs.
 *
 * The SYSENTER instruction, in principle, should *only* occur in the
 * vDSO.  In practice, a small number of Android devices were shipped
 * with a copy of Bionic that inlined a SYSENTER instruction.  This
 * never happened in any of Google's Bionic versions -- it only happened
 * in a narrow range of Intel-provided versions.
 *
 * SYSENTER loads SS, RSP, CS, and RIP from previously programmed MSRs.
 * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
 * SYSENTER does not save anything on the stack,
 * and does not save old RIP (!!!), RSP, or RFLAGS.
 *
 * Arguments:
 * eax  system call number
 * ebx  arg1
 * ecx  arg2
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * ebp  user stack
 * 0(%ebp) arg6
 */
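/*
 * For illustration only (not part of this file's code): a rough sketch of
 * how a 32-bit caller typically reaches this entry.  It loads the usual
 * 32-bit syscall registers and calls the vDSO's __kernel_vsyscall (found
 * via the AT_SYSINFO auxv entry) rather than executing SYSENTER itself,
 * since SYSENTER does not save the old RIP.  "vsyscall_ptr" is a
 * hypothetical variable assumed to hold the AT_SYSINFO value.
 *
 *	movl	$__NR_getpid, %eax	# 32-bit system call number
 *	# for other calls, ebx/ecx/edx/esi/edi/ebp carry arg1..arg6
 *	call	*vsyscall_ptr		# vDSO picks SYSENTER/SYSCALL/INT80
 *	# on return, %eax holds the result or a negative errno
 */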
SYM_CODE_START(entry_SYSENTER_compat)
	UNWIND_HINT_ENTRY
	/* Interrupts are off on entry. */
	SWAPGS

	pushq	%rax
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
	popq	%rax

	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

	/* Construct struct pt_regs on stack */
	pushq	$__USER32_DS		/* pt_regs->ss */
	pushq	$0			/* pt_regs->sp = 0 (placeholder) */

	/*
	 * Push flags.  This is nasty.  First, interrupts are currently
	 * off, but we need pt_regs->flags to have IF set.  Second, if TF
	 * was set in usermode, it's still set, and we're singlestepping
	 * through this code.  do_SYSENTER_32() will fix up IF.
	 */
	pushfq				/* pt_regs->flags (except IF = 0) */
	pushq	$__USER32_CS		/* pt_regs->cs */
	pushq	$0			/* pt_regs->ip = 0 (placeholder) */
SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)

	/*
	 * User tracing code (ptrace or signal handlers) might assume that
	 * the saved RAX contains a 32-bit number when we're invoking a 32-bit
	 * syscall.  Just in case the high bits are nonzero, zero-extend
	 * the syscall number.  (This could almost certainly be deleted
	 * with no ill effects.)
	 */
	movl	%eax, %eax

	pushq	%rax			/* pt_regs->orig_ax */
	pushq	%rdi			/* pt_regs->di */
	pushq	%rsi			/* pt_regs->si */
	pushq	%rdx			/* pt_regs->dx */
	pushq	%rcx			/* pt_regs->cx */
	pushq	$-ENOSYS		/* pt_regs->ax */
	pushq	$0			/* pt_regs->r8  = 0 */
	xorl	%r8d, %r8d		/* nospec   r8 */
	pushq	$0			/* pt_regs->r9  = 0 */
	xorl	%r9d, %r9d		/* nospec   r9 */
	pushq	$0			/* pt_regs->r10 = 0 */
	xorl	%r10d, %r10d		/* nospec   r10 */
	pushq	$0			/* pt_regs->r11 = 0 */
	xorl	%r11d, %r11d		/* nospec   r11 */
	pushq	%rbx			/* pt_regs->rbx */
	xorl	%ebx, %ebx		/* nospec   rbx */
	pushq	%rbp			/* pt_regs->rbp (will be overwritten) */
	xorl	%ebp, %ebp		/* nospec   rbp */
	pushq	$0			/* pt_regs->r12 = 0 */
	xorl	%r12d, %r12d		/* nospec   r12 */
	pushq	$0			/* pt_regs->r13 = 0 */
	xorl	%r13d, %r13d		/* nospec   r13 */
	pushq	$0			/* pt_regs->r14 = 0 */
	xorl	%r14d, %r14d		/* nospec   r14 */
	pushq	$0			/* pt_regs->r15 = 0 */
	xorl	%r15d, %r15d		/* nospec   r15 */

	UNWIND_HINT_REGS

	cld

	IBRS_ENTER
	UNTRAIN_RET

	/*
	 * SYSENTER doesn't filter flags, so we need to clear NT and AC
	 * ourselves.  To save a few cycles, we can check whether
	 * either was set instead of doing an unconditional popfq.
	 * This needs to happen before enabling interrupts so that
	 * we don't get preempted with NT set.
	 *
	 * If TF is set, we will single-step all the way to here -- do_debug
	 * will ignore all the traps.  (Yes, this is slow, but so is
	 * single-stepping in general.  This allows us to avoid having
	 * more complicated code to handle the case where a user program
	 * forces us to single-step through the SYSENTER entry code.)
	 *
	 * NB.: .Lsysenter_fix_flags is a label with the code under it moved
	 * out-of-line as an optimization: NT is unlikely to be set in the
	 * majority of the cases and instead of polluting the I$ unnecessarily,
	 * we're keeping that code behind a branch which will predict as
	 * not-taken and therefore its instructions won't be fetched.
	 */
	testl	$X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, EFLAGS(%rsp)
	jnz	.Lsysenter_fix_flags
.Lsysenter_flags_fixed:

	movq	%rsp, %rdi
	call	do_SYSENTER_32
	/* XEN PV guests always use IRET path */
	ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
		    "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
	jmp	sysret32_from_system_call

.Lsysenter_fix_flags:
	pushq	$X86_EFLAGS_FIXED
	popfq
	jmp	.Lsysenter_flags_fixed
SYM_INNER_LABEL(__end_entry_SYSENTER_compat, SYM_L_GLOBAL)
SYM_CODE_END(entry_SYSENTER_compat)

/*
 * 32-bit SYSCALL entry.
 *
 * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
 * on 64-bit kernels running on AMD CPUs.
 *
 * The SYSCALL instruction, in principle, should *only* occur in the
 * vDSO.  In practice, it appears that this really is the case.
 * As evidence:
 *
 *  - The calling convention for SYSCALL has changed several times without
 *    anyone noticing.
 *
 *  - Prior to the in-kernel X86_BUG_SYSRET_SS_ATTRS fixup, any user task
 *    that did SYSCALL without immediately reloading SS would randomly
 *    crash.
 *
 *  - Most programmers do not directly target AMD CPUs, and the 32-bit
 *    SYSCALL instruction does not exist on Intel CPUs.  Even on AMD
 *    CPUs, Linux disables the SYSCALL instruction on 32-bit kernels
 *    because the SYSCALL instruction in legacy/native 32-bit mode (as
 *    opposed to compat mode) is sufficiently poorly designed as to be
 *    essentially unusable.
 *
 * 32-bit SYSCALL saves RIP to RCX, clears RFLAGS.RF, then saves
 * RFLAGS to R11, then loads new SS, CS, and RIP from previously
 * programmed MSRs.  RFLAGS gets masked by a value from another MSR
 * (so CLD and CLAC are not needed).  SYSCALL does not save anything on
 * the stack and does not change RSP.
 *
 * Note: RFLAGS saving+masking-with-MSR happens only in Long mode
 * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it).
 * Don't get confused: RFLAGS saving+masking depends on the Long Mode Active
 * bit (EFER.LMA=1), NOT on the bitness of the userspace where SYSCALL
 * executes or on the target CS descriptor's L bit (SYSCALL does not read
 * segment descriptors).
 *
 * Arguments:
 * eax  system call number
 * ecx  return address
 * ebx  arg1
 * ebp  arg2	(note: not saved in the stack frame, should not be touched)
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * esp  user stack
 * 0(%esp) arg6
 */
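/*
 * For orientation only (a summary, not an extra code path): how the
 * register state described above maps onto the pt_regs frame that the
 * code below constructs:
 *
 *	RCX (user return address)		-> pt_regs->ip
 *	R11 (user RFLAGS)			-> pt_regs->flags
 *	ESP (user stack pointer)		-> stashed in %r8d, then pt_regs->sp
 *	EBP (arg2; ECX is clobbered by SYSCALL,
 *	     so arg2 arrives here)		-> pt_regs->cx
 *	EAX/EBX/EDX/ESI/EDI			-> the corresponding pt_regs slots
 */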
SYM_CODE_START(entry_SYSCALL_compat)
	UNWIND_HINT_ENTRY
	/* Interrupts are off on entry. */
	swapgs

	/* Stash user ESP */
	movl	%esp, %r8d

	/* Use %rsp as scratch reg. User ESP is stashed in r8 */
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp

	/* Switch to the kernel stack */
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL)

	/* Construct struct pt_regs on stack */
	pushq	$__USER32_DS		/* pt_regs->ss */
	pushq	%r8			/* pt_regs->sp */
	pushq	%r11			/* pt_regs->flags */
	pushq	$__USER32_CS		/* pt_regs->cs */
	pushq	%rcx			/* pt_regs->ip */
SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL)
	movl	%eax, %eax		/* discard orig_ax high bits */
	pushq	%rax			/* pt_regs->orig_ax */
	pushq	%rdi			/* pt_regs->di */
	pushq	%rsi			/* pt_regs->si */
	xorl	%esi, %esi		/* nospec   si */
	pushq	%rdx			/* pt_regs->dx */
	xorl	%edx, %edx		/* nospec   dx */
	pushq	%rbp			/* pt_regs->cx (stashed in bp) */
	xorl	%ecx, %ecx		/* nospec   cx */
	pushq	$-ENOSYS		/* pt_regs->ax */
	pushq	$0			/* pt_regs->r8  = 0 */
	xorl	%r8d, %r8d		/* nospec   r8 */
	pushq	$0			/* pt_regs->r9  = 0 */
	xorl	%r9d, %r9d		/* nospec   r9 */
	pushq	$0			/* pt_regs->r10 = 0 */
	xorl	%r10d, %r10d		/* nospec   r10 */
	pushq	$0			/* pt_regs->r11 = 0 */
	xorl	%r11d, %r11d		/* nospec   r11 */
	pushq	%rbx			/* pt_regs->rbx */
	xorl	%ebx, %ebx		/* nospec   rbx */
	pushq	%rbp			/* pt_regs->rbp (will be overwritten) */
	xorl	%ebp, %ebp		/* nospec   rbp */
	pushq	$0			/* pt_regs->r12 = 0 */
	xorl	%r12d, %r12d		/* nospec   r12 */
	pushq	$0			/* pt_regs->r13 = 0 */
	xorl	%r13d, %r13d		/* nospec   r13 */
	pushq	$0			/* pt_regs->r14 = 0 */
	xorl	%r14d, %r14d		/* nospec   r14 */
	pushq	$0			/* pt_regs->r15 = 0 */
	xorl	%r15d, %r15d		/* nospec   r15 */

	UNWIND_HINT_REGS

	IBRS_ENTER
	UNTRAIN_RET

	movq	%rsp, %rdi
	call	do_fast_syscall_32
	/* XEN PV guests always use IRET path */
	ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
		    "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV

	/* Opportunistic SYSRET */
sysret32_from_system_call:
	/*
	 * We are not going to return to userspace from the trampoline
	 * stack. So let's erase the thread stack right now.
	 */
	STACKLEAK_ERASE

	IBRS_EXIT

	movq	RBX(%rsp), %rbx		/* pt_regs->rbx */
	movq	RBP(%rsp), %rbp		/* pt_regs->rbp */
	movq	EFLAGS(%rsp), %r11	/* pt_regs->flags (in r11) */
	movq	RIP(%rsp), %rcx		/* pt_regs->ip (in rcx) */
	addq	$RAX, %rsp		/* Skip r8-r15 */
	popq	%rax			/* pt_regs->rax */
	popq	%rdx			/* Skip pt_regs->cx */
	popq	%rdx			/* pt_regs->dx */
	popq	%rsi			/* pt_regs->si */
	popq	%rdi			/* pt_regs->di */

	/*
	 * USERGS_SYSRET32 does:
	 *  GSBASE = user's GS base
	 *  EIP    = ECX
	 *  RFLAGS = R11
	 *  CS     = __USER32_CS
	 *  SS     = __USER_DS
	 *
	 * ECX will not match pt_regs->cx, but we're returning to a vDSO
	 * trampoline that will fix up RCX, so this is okay.
	 *
	 * R12-R15 are callee-saved, so they contain whatever was in them
	 * when the system call started, which is already known to user
	 * code.  We zero R8-R10 to avoid info leaks.
	 */
	movq	RSP-ORIG_RAX(%rsp), %rsp

	/*
	 * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
	 * on the process stack which is not mapped to userspace and
	 * not readable after we SWITCH_TO_USER_CR3.  Delay the CR3
	 * switch until after the last reference to the process
	 * stack.
	 *
	 * %r8/%r9 are zeroed before the sysret, thus safe to clobber.
	 */
	SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9

	xorl	%r8d, %r8d
	xorl	%r9d, %r9d
	xorl	%r10d, %r10d
	swapgs
	sysretl
SYM_CODE_END(entry_SYSCALL_compat)

/*
 * 32-bit legacy system call entry.
 *
 * 32-bit x86 Linux system calls traditionally used the INT $0x80
 * instruction.  INT $0x80 lands here.
 *
 * This entry point can be used by 32-bit and 64-bit programs to perform
 * 32-bit system calls.  Instances of INT $0x80 can be found inline in
 * various programs and libraries.  It is also used by the vDSO's
 * __kernel_vsyscall fallback for hardware that doesn't support a faster
 * entry method.  Restarted 32-bit system calls also fall back to INT
 * $0x80 regardless of what instruction was originally used to do the
 * system call.
 *
 * This is considered a slow path.  It is not used by most libc
 * implementations on modern hardware except during process startup.
 *
 * Arguments:
 * eax  system call number
 * ebx  arg1
 * ecx  arg2
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * ebp  arg6
 */
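/*
 * For illustration only (not part of this file's code): a minimal 32-bit
 * user-mode sequence that lands on this entry point, using the register
 * convention listed above.  "msg" and "msg_len" are hypothetical; the
 * syscall number is __NR_write from the 32-bit ABI.
 *
 *	movl	$__NR_write, %eax	# system call number
 *	movl	$1, %ebx		# arg1: fd (stdout)
 *	movl	$msg, %ecx		# arg2: buffer address
 *	movl	$msg_len, %edx		# arg3: byte count
 *	int	$0x80			# enter the kernel here
 *	# %eax now holds the bytes written or a negative errno
 */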
SYM_CODE_START(entry_INT80_compat)
	UNWIND_HINT_ENTRY
	/*
	 * Interrupts are off on entry.
	 */
	ASM_CLAC			/* Do this early to minimize exposure */
	SWAPGS

	/*
	 * User tracing code (ptrace or signal handlers) might assume that
	 * the saved RAX contains a 32-bit number when we're invoking a 32-bit
	 * syscall.  Just in case the high bits are nonzero, zero-extend
	 * the syscall number.  (This could almost certainly be deleted
	 * with no ill effects.)
	 */
	movl	%eax, %eax

	/* switch to thread stack expects orig_ax and rdi to be pushed */
	pushq	%rax			/* pt_regs->orig_ax */
	pushq	%rdi			/* pt_regs->di */

	/* Need to switch before accessing the thread stack. */
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi

	/* In the Xen PV case we already run on the thread stack. */
	ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV

	movq	%rsp, %rdi
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

	pushq	6*8(%rdi)		/* regs->ss */
	pushq	5*8(%rdi)		/* regs->rsp */
	pushq	4*8(%rdi)		/* regs->eflags */
	pushq	3*8(%rdi)		/* regs->cs */
	pushq	2*8(%rdi)		/* regs->ip */
	pushq	1*8(%rdi)		/* regs->orig_ax */
	pushq	(%rdi)			/* pt_regs->di */
.Lint80_keep_stack:

	pushq	%rsi			/* pt_regs->si */
	xorl	%esi, %esi		/* nospec   si */
	pushq	%rdx			/* pt_regs->dx */
	xorl	%edx, %edx		/* nospec   dx */
	pushq	%rcx			/* pt_regs->cx */
	xorl	%ecx, %ecx		/* nospec   cx */
	pushq	$-ENOSYS		/* pt_regs->ax */
	pushq	%r8			/* pt_regs->r8 */
	xorl	%r8d, %r8d		/* nospec   r8 */
	pushq	%r9			/* pt_regs->r9 */
	xorl	%r9d, %r9d		/* nospec   r9 */
	pushq	%r10			/* pt_regs->r10 */
	xorl	%r10d, %r10d		/* nospec   r10 */
	pushq	%r11			/* pt_regs->r11 */
	xorl	%r11d, %r11d		/* nospec   r11 */
	pushq	%rbx			/* pt_regs->rbx */
	xorl	%ebx, %ebx		/* nospec   rbx */
	pushq	%rbp			/* pt_regs->rbp */
	xorl	%ebp, %ebp		/* nospec   rbp */
	pushq	%r12			/* pt_regs->r12 */
	xorl	%r12d, %r12d		/* nospec   r12 */
	pushq	%r13			/* pt_regs->r13 */
	xorl	%r13d, %r13d		/* nospec   r13 */
	pushq	%r14			/* pt_regs->r14 */
	xorl	%r14d, %r14d		/* nospec   r14 */
	pushq	%r15			/* pt_regs->r15 */
	xorl	%r15d, %r15d		/* nospec   r15 */

	UNWIND_HINT_REGS

	cld

	IBRS_ENTER
	UNTRAIN_RET

	movq	%rsp, %rdi
	call	do_int80_syscall_32
	jmp	swapgs_restore_regs_and_return_to_usermode
SYM_CODE_END(entry_INT80_compat)