[RHEL7,COMMIT] x86_64: fix crashes due to bogus iret traps handling #PSBM-107794

Submitted by Vasily Averin on Sept. 22, 2020, 7:32 a.m.

Details

Message ID 202009220732.08M7WNTN002994@vz7build.vvs.sw.ru
State New
Series "x86_64: fix crashes due to bogus iret traps handling #PSBM-107794"
Headers show

Commit Message

Vasily Averin Sept. 22, 2020, 7:32 a.m.
The commit is pushed to "branch-rh7-3.10.0-1127.18.2.vz7.163.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1127.18.2.vz7.163.27
------>
commit 32457eef8a6680864624049df7ebdbcf53676a93
Author: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date:   Tue Sep 22 10:32:22 2020 +0300

    x86_64: fix crashes due to bogus iret traps handling #PSBM-107794
    
    Our handling of bad irets seems to have been broken since the Meltdown fix.
    When an interrupt return to userspace fails, we are still running with the
    user CR3 and therefore fault in error_sti on access to the 'kernel_stack'
    variable. This continues with a series of faults in the page fault handler
    until we run out of stack and end up with:
    
    PANIC: double fault, error_code: 0x0
    RIP: 0010:[<ffffffff9f1c278d>]  [<ffffffff9f1c278d>] async_page_fault+0xd/0x30
    Call Trace:
    <IRQ>
     ? smp_apic_timer_interrupt+0x48/0x60
     ? apic_timer_interrupt+0x16a/0x170
    <EOI>
     ? bad_area+0x49/0x50
     ? __do_page_fault+0x477/0x500
     ? trace_do_page_fault+0x56/0x150
     ? do_async_page_fault+0x22/0xf0
     ? async_page_fault+0x28/0x30
     ? .E_write_words+0x5c/0x641
     ? putname+0x3d/0x60
     ? timerqueue_add+0x60/0xb0
     ? enqueue_hrtimer+0x25/0x80
     ? hrtimer_start_range_ns+0x1fd/0x3c0
     ? recalc_sigpending+0x1b/0x70
     ? __set_task_blocked+0x41/0xa0
     ? restore_altstack+0x18/0x30
     ? sys_rt_sigreturn+0xe8/0x100
     ? stub_rt_sigreturn+0x48/0x90
    
    Backport the fix for this from the RHEL 7.9 beta.
    
    https://jira.sw.ru/browse/PSBM-107794
    Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
---
 arch/x86/kernel/entry_64.S | 49 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 36 insertions(+), 13 deletions(-)

Patch hide | download patch | download mbox

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3e67d18..91e5503 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -949,12 +949,42 @@  irq_return:
 	 * when returning from IPI handler.
 	 */
 	INTERRUPT_RETURN
+	_ASM_EXTABLE(irq_return, bad_iret)
 
 #ifdef CONFIG_PARAVIRT
 ENTRY(native_iret)
 	iretq
+	_ASM_EXTABLE(native_iret, bad_iret)
 #endif
 
+	.section .fixup,"ax"
+bad_iret:
+	/*
+	 * The iret traps when the %cs or %ss being restored is bogus.
+	 * We've lost the original trap vector and error code.
+	 * #GPF is the most likely one to get for an invalid selector.
+	 * So pretend we completed the iret and took the #GPF in user mode,
+	 * pushing 0 in place of the lost error code.
+	 * We are now running with the kernel GS after exception recovery.
+	 * But error_entry expects us to have user GS to match the user %cs,
+	 * so swap back.
+	 */
+	pushq $0
+
+	/*
+	 * SWAPGS only if the saved CS still has its user-mode bits set. If a
+	 * kernel bug cleared them, general_protection skips its SWAPGS, so
+	 * skip it here as well; this keeps such a bug from causing a hard reboot.
+	 */
+	testl $3, 8*2(%rsp)
+	je 1f
+	SWAPGS
+1:
+
+	jmp general_protection
+
+	.previous
+
 	/* edi: workmask, edx: work */
 retint_careful:
 	CFI_RESTORE_STATE
@@ -1550,15 +1580,16 @@  error_sti:
 
 /*
  * There are two places in the kernel that can potentially fault with
- * usergs. Handle them here.  B stepping K8s sometimes report a
- * truncated RIP for IRET exceptions returning to compat mode. Check
- * for these here too.
+ * usergs. Handle them here. The exception handlers after iret run with
+ * kernel gs again, so don't set the user space flag. B stepping K8s
+ * sometimes report a truncated RIP for IRET exceptions returning to
+ * compat mode. Check for these here too.
  */
 error_kernelspace:
 	incl %ebx
 	leaq irq_return(%rip),%rcx
 	cmpq %rcx,RIP+8(%rsp)
-	je error_bad_iret
+	je error_swapgs
 	movl %ecx,%eax	/* zero extend */
 	cmpq %rax,RIP+8(%rsp)
 	je bstep_iret
@@ -1570,15 +1601,7 @@  error_kernelspace:
 bstep_iret:
 	/* Fix truncated RIP */
 	movq %rcx,RIP+8(%rsp)
-	/* fall through */
-
-error_bad_iret:
-	SWAPGS
-	mov %rsp,%rdi
-	call fixup_bad_iret
-	mov %rax,%rsp
-	decl %ebx	/* Return to usergs */
-	jmp error_sti
+	jmp error_swapgs
 	CFI_ENDPROC
 END(error_entry)