[2/8] x86/kerndat: Add a check for ptrace() bug on Skylake

Submitted by Dmitry Safonov on Jan. 31, 2018, 4:58 p.m.

Details

Message ID 20180131165815.25516-3-dima@arista.com
State New
Series "x86/ptrace: Workaround Skylake bug"
Headers show

Commit Message

Dmitry Safonov Jan. 31, 2018, 4:58 p.m.
We need to know if ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, iov)
returns xsave without FP state part.

Signed-off-by: Dmitry Safonov <dima@arista.com>
---
 criu/arch/x86/crtools.c              | 71 ++++++++++++++++++++++++++++++++++++
 criu/arch/x86/include/asm/restorer.h |  2 +
 criu/include/kerndat.h               |  1 +
 criu/kerndat.c                       | 18 +++++++++
 4 files changed, 92 insertions(+)

Patch hide | download patch | download mbox

diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c
index d7b21e7c9bae..a389e563a886 100644
--- a/criu/arch/x86/crtools.c
+++ b/criu/arch/x86/crtools.c
@@ -7,6 +7,7 @@ 
 #include <sys/syscall.h>
 #include <sys/auxv.h>
 #include <sys/wait.h>
+#include <sys/ptrace.h>
 
 #include "types.h"
 #include "log.h"
@@ -30,6 +31,7 @@ 
 #include "images/core.pb-c.h"
 #include "images/creds.pb-c.h"
 
+/* XXX: Move all kerndat features to per-arch kerndat .c */
 int kdat_can_map_vdso(void)
 {
 	pid_t child;
@@ -188,6 +190,75 @@  int kdat_compatible_cr(void)
 }
 #endif
 
+/*
+ * Pre v4.14 kernels have a bug on Skylake CPUs:
+ * copyout_from_xsaves() creates fpu state for
+ *   ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov)
+ * without MXCSR and MXCSR_FLAGS if there is SSE/YMM state, but no FP state.
+ * That is xfeatures had either/both XFEATURE_MASK_{SSE,YMM} set, but not
+ * XFEATURE_MASK_FP.
+ * But we *really* need to C/R MXCSR & MXCSR_FLAGS if SSE/YMM active,
+ * as mxcsr store part of the state.
+ */
+int kdat_x86_has_ptrace_fpu_xsave_bug(void)
+{
+	user_fpregs_struct_t xsave = { };
+	struct iovec iov;
+	int ret = -1;
+	pid_t child;
+	int stat;
+
+	/* OSXSAVE can't be changed during boot. */
+	if (!compel_cpu_has_feature(X86_FEATURE_OSXSAVE))
+		return 0;
+
+	child = fork();
+	if (child < 0) {
+		pr_perror("%s(): failed to fork()", __func__);
+		return -1;
+	}
+
+	if (child == 0) {
+		ptrace(PTRACE_TRACEME, 0, 0, 0);
+		kill(getpid(), SIGSTOP);
+		pr_err("Continue after SIGSTOP.. Urr what?\n");
+		exit(1);
+	}
+
+	if (waitpid(child, &stat, WUNTRACED) != child) {
+		/*
+		 * waitpid() may end with ECHILD if SIGCHLD == SIG_IGN,
+		 * and the child has stopped already.
+		 */
+		if (!(errno == ECHILD && WIFSTOPPED(stat))) {
+			pr_perror("Failed to wait for %s() test\n", __func__);
+			goto out_kill;
+		}
+	}
+
+	if (!WIFSTOPPED(stat)) {
+		pr_err("Born child is unstoppable!\n");
+		goto out_kill;
+	}
+
+	iov.iov_base = &xsave;
+	iov.iov_len = sizeof(xsave);
+
+	if (ptrace(PTRACE_GETREGSET, child, (unsigned)NT_X86_XSTATE, &iov) < 0) {
+		pr_perror("Can't obtain FPU registers for %d", child);
+		goto out_kill;
+	}
+	/*
+	 * MXCSR should be never 0x0: e.g., it should contain either:
+	 * R+/R-/RZ/RN to determine rounding model.
+	 */
+	ret = !xsave.i387.mxcsr;
+
+out_kill:
+	kill(child, SIGKILL);
+	return ret;
+}
+
 int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs)
 {
 	CoreEntry *core = x;
diff --git a/criu/arch/x86/include/asm/restorer.h b/criu/arch/x86/include/asm/restorer.h
index e39675d957d1..179f1942f9f8 100644
--- a/criu/arch/x86/include/asm/restorer.h
+++ b/criu/arch/x86/include/asm/restorer.h
@@ -80,8 +80,10 @@  static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len)
 # define ARCH_MAP_VDSO_64		0x2003
 #endif
 
+/* XXX: Introduce per-arch kerndat header */
 extern int kdat_compatible_cr(void);
 extern int kdat_can_map_vdso(void);
+extern int kdat_x86_has_ptrace_fpu_xsave_bug(void);
 
 static inline void
 __setup_sas_compat(struct ucontext_ia32* uc, ThreadSasEntry *sas)
diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h
index 5c87b2c350d8..9e7af14a39e5 100644
--- a/criu/include/kerndat.h
+++ b/criu/include/kerndat.h
@@ -72,6 +72,7 @@  struct kerndat_s {
 	unsigned int sysctl_nr_open;
 	unsigned long files_stat_max_files;
 	bool has_pid_for_children_ns;
+	bool x86_has_ptrace_fpu_xsave_bug;
 };
 
 extern struct kerndat_s kdat;
diff --git a/criu/kerndat.c b/criu/kerndat.c
index 68abd61805fe..b9c4fdaf402f 100644
--- a/criu/kerndat.c
+++ b/criu/kerndat.c
@@ -792,6 +792,22 @@  int kerndat_has_pid_for_children_ns(void)
 	return 0;
 }
 
+int __attribute__((weak)) kdat_x86_has_ptrace_fpu_xsave_bug(void)
+{
+	return 0;
+}
+
+static int kerndat_x86_has_ptrace_fpu_xsave_bug(void)
+{
+	int ret = kdat_x86_has_ptrace_fpu_xsave_bug();
+
+	if (ret < 0)
+		return ret;
+
+	kdat.x86_has_ptrace_fpu_xsave_bug = !!ret;
+	return 0;
+}
+
 #define KERNDAT_CACHE_FILE	KDAT_RUNDIR"/criu.kdat"
 #define KERNDAT_CACHE_FILE_TMP	KDAT_RUNDIR"/.criu.kdat"
 
@@ -1059,6 +1075,8 @@  int kerndat_init(void)
 		ret = kerndat_has_ns_get_parent();
 	if (!ret)
 		ret = kerndat_has_pid_for_children_ns();
+	if (!ret)
+		ret = kerndat_x86_has_ptrace_fpu_xsave_bug();
 
 	kerndat_lsm();
 	kerndat_mmap_min_addr();

Comments

Andrey Vagin Feb. 6, 2018, 1:40 a.m.
On Wed, Jan 31, 2018 at 04:58:09PM +0000, Dmitry Safonov wrote:
> We need to know if ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, iov)
> returns xsave without FP state part.
> 
> Signed-off-by: Dmitry Safonov <dima@arista.com>
> ---
>  criu/arch/x86/crtools.c              | 71 ++++++++++++++++++++++++++++++++++++
>  criu/arch/x86/include/asm/restorer.h |  2 +
>  criu/include/kerndat.h               |  1 +
>  criu/kerndat.c                       | 18 +++++++++
>  4 files changed, 92 insertions(+)
> 
> diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c
> index d7b21e7c9bae..a389e563a886 100644
> --- a/criu/arch/x86/crtools.c
> +++ b/criu/arch/x86/crtools.c
> @@ -7,6 +7,7 @@
>  #include <sys/syscall.h>
>  #include <sys/auxv.h>
>  #include <sys/wait.h>
> +#include <sys/ptrace.h>
>  
>  #include "types.h"
>  #include "log.h"
> @@ -30,6 +31,7 @@
>  #include "images/core.pb-c.h"
>  #include "images/creds.pb-c.h"
>  
> +/* XXX: Move all kerndat features to per-arch kerndat .c */
>  int kdat_can_map_vdso(void)
>  {
>  	pid_t child;
> @@ -188,6 +190,75 @@ int kdat_compatible_cr(void)
>  }
>  #endif
>  
> +/*
> + * Pre v4.14 kernels have a bug on Skylake CPUs:
> + * copyout_from_xsaves() creates fpu state for
> + *   ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov)
> + * without MXCSR and MXCSR_FLAGS if there is SSE/YMM state, but no FP state.
> + * That is xfeatures had either/both XFEATURE_MASK_{SSE,YMM} set, but not
> + * XFEATURE_MASK_FP.
> + * But we *really* need to C/R MXCSR & MXCSR_FLAGS if SSE/YMM active,
> + * as mxcsr store part of the state.
> + */
> +int kdat_x86_has_ptrace_fpu_xsave_bug(void)
> +{
> +	user_fpregs_struct_t xsave = { };
> +	struct iovec iov;
> +	int ret = -1;
> +	pid_t child;
> +	int stat;
> +
> +	/* OSXSAVE can't be changed during boot. */
> +	if (!compel_cpu_has_feature(X86_FEATURE_OSXSAVE))
> +		return 0;
> +
> +	child = fork();
> +	if (child < 0) {
> +		pr_perror("%s(): failed to fork()", __func__);
> +		return -1;
> +	}
> +
> +	if (child == 0) {
> +		ptrace(PTRACE_TRACEME, 0, 0, 0);

It can fail in a case when criu is running under strace. In this case I
would like to get a warning and conside that a kernel doesn't have this
bug.

> +		kill(getpid(), SIGSTOP);
> +		pr_err("Continue after SIGSTOP.. Urr what?\n");
> +		exit(1);
> +	}
> +
> +	if (waitpid(child, &stat, WUNTRACED) != child) {
> +		/*
> +		 * waitpid() may end with ECHILD if SIGCHLD == SIG_IGN,
> +		 * and the child has stopped already.
> +		 */
> +		if (!(errno == ECHILD && WIFSTOPPED(stat))) {

I don't understand this expression ^^^^

> +			pr_perror("Failed to wait for %s() test\n", __func__);
> +			goto out_kill;
> +		}
> +	}
> +
> +	if (!WIFSTOPPED(stat)) {
> +		pr_err("Born child is unstoppable!\n");
> +		goto out_kill;
> +	}
> +
> +	iov.iov_base = &xsave;
> +	iov.iov_len = sizeof(xsave);
> +
> +	if (ptrace(PTRACE_GETREGSET, child, (unsigned)NT_X86_XSTATE, &iov) < 0) {
> +		pr_perror("Can't obtain FPU registers for %d", child);
> +		goto out_kill;
> +	}
> +	/*
> +	 * MXCSR should be never 0x0: e.g., it should contain either:
> +	 * R+/R-/RZ/RN to determine rounding model.
> +	 */
> +	ret = !xsave.i387.mxcsr;
> +
> +out_kill:
> +	kill(child, SIGKILL);

You need to wait a child here.

> +	return ret;
> +}
> +
>  int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs)
>  {
>  	CoreEntry *core = x;
> diff --git a/criu/arch/x86/include/asm/restorer.h b/criu/arch/x86/include/asm/restorer.h
> index e39675d957d1..179f1942f9f8 100644
> --- a/criu/arch/x86/include/asm/restorer.h
> +++ b/criu/arch/x86/include/asm/restorer.h
> @@ -80,8 +80,10 @@ static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len)
>  # define ARCH_MAP_VDSO_64		0x2003
>  #endif
>  
> +/* XXX: Introduce per-arch kerndat header */
>  extern int kdat_compatible_cr(void);
>  extern int kdat_can_map_vdso(void);
> +extern int kdat_x86_has_ptrace_fpu_xsave_bug(void);
>  
>  static inline void
>  __setup_sas_compat(struct ucontext_ia32* uc, ThreadSasEntry *sas)
> diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h
> index 5c87b2c350d8..9e7af14a39e5 100644
> --- a/criu/include/kerndat.h
> +++ b/criu/include/kerndat.h
> @@ -72,6 +72,7 @@ struct kerndat_s {
>  	unsigned int sysctl_nr_open;
>  	unsigned long files_stat_max_files;
>  	bool has_pid_for_children_ns;
> +	bool x86_has_ptrace_fpu_xsave_bug;
>  };
>  
>  extern struct kerndat_s kdat;
> diff --git a/criu/kerndat.c b/criu/kerndat.c
> index 68abd61805fe..b9c4fdaf402f 100644
> --- a/criu/kerndat.c
> +++ b/criu/kerndat.c
> @@ -792,6 +792,22 @@ int kerndat_has_pid_for_children_ns(void)
>  	return 0;
>  }
>  
> +int __attribute__((weak)) kdat_x86_has_ptrace_fpu_xsave_bug(void)
> +{
> +	return 0;
> +}
> +
> +static int kerndat_x86_has_ptrace_fpu_xsave_bug(void)
> +{
> +	int ret = kdat_x86_has_ptrace_fpu_xsave_bug();
> +
> +	if (ret < 0)
> +		return ret;
> +
> +	kdat.x86_has_ptrace_fpu_xsave_bug = !!ret;
> +	return 0;
> +}
> +
>  #define KERNDAT_CACHE_FILE	KDAT_RUNDIR"/criu.kdat"
>  #define KERNDAT_CACHE_FILE_TMP	KDAT_RUNDIR"/.criu.kdat"
>  
> @@ -1059,6 +1075,8 @@ int kerndat_init(void)
>  		ret = kerndat_has_ns_get_parent();
>  	if (!ret)
>  		ret = kerndat_has_pid_for_children_ns();
> +	if (!ret)
> +		ret = kerndat_x86_has_ptrace_fpu_xsave_bug();
>  
>  	kerndat_lsm();
>  	kerndat_mmap_min_addr();
> -- 
> 2.13.6
>
Dmitry Safonov Feb. 6, 2018, 1:54 a.m.
2018-02-06 1:40 GMT+00:00 Andrei Vagin <avagin@virtuozzo.com>:
> On Wed, Jan 31, 2018 at 04:58:09PM +0000, Dmitry Safonov wrote:
>> We need to know if ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, iov)
>> returns xsave without FP state part.
>>
>> Signed-off-by: Dmitry Safonov <dima@arista.com>
>> ---
>>  criu/arch/x86/crtools.c              | 71 ++++++++++++++++++++++++++++++++++++
>>  criu/arch/x86/include/asm/restorer.h |  2 +
>>  criu/include/kerndat.h               |  1 +
>>  criu/kerndat.c                       | 18 +++++++++
>>  4 files changed, 92 insertions(+)
>>
>> diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c
>> index d7b21e7c9bae..a389e563a886 100644
>> --- a/criu/arch/x86/crtools.c
>> +++ b/criu/arch/x86/crtools.c
>> @@ -7,6 +7,7 @@
>>  #include <sys/syscall.h>
>>  #include <sys/auxv.h>
>>  #include <sys/wait.h>
>> +#include <sys/ptrace.h>
>>
>>  #include "types.h"
>>  #include "log.h"
>> @@ -30,6 +31,7 @@
>>  #include "images/core.pb-c.h"
>>  #include "images/creds.pb-c.h"
>>
>> +/* XXX: Move all kerndat features to per-arch kerndat .c */
>>  int kdat_can_map_vdso(void)
>>  {
>>       pid_t child;
>> @@ -188,6 +190,75 @@ int kdat_compatible_cr(void)
>>  }
>>  #endif
>>
>> +/*
>> + * Pre v4.14 kernels have a bug on Skylake CPUs:
>> + * copyout_from_xsaves() creates fpu state for
>> + *   ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov)
>> + * without MXCSR and MXCSR_FLAGS if there is SSE/YMM state, but no FP state.
>> + * That is xfeatures had either/both XFEATURE_MASK_{SSE,YMM} set, but not
>> + * XFEATURE_MASK_FP.
>> + * But we *really* need to C/R MXCSR & MXCSR_FLAGS if SSE/YMM active,
>> + * as mxcsr store part of the state.
>> + */
>> +int kdat_x86_has_ptrace_fpu_xsave_bug(void)
>> +{
>> +     user_fpregs_struct_t xsave = { };
>> +     struct iovec iov;
>> +     int ret = -1;
>> +     pid_t child;
>> +     int stat;
>> +
>> +     /* OSXSAVE can't be changed during boot. */
>> +     if (!compel_cpu_has_feature(X86_FEATURE_OSXSAVE))
>> +             return 0;
>> +
>> +     child = fork();
>> +     if (child < 0) {
>> +             pr_perror("%s(): failed to fork()", __func__);
>> +             return -1;
>> +     }
>> +
>> +     if (child == 0) {
>> +             ptrace(PTRACE_TRACEME, 0, 0, 0);
>
> It can fail in a case when criu is running under strace. In this case I
> would like to get a warning and conside that a kernel doesn't have this
> bug.

Makes sense, will do.

>
>> +             kill(getpid(), SIGSTOP);
>> +             pr_err("Continue after SIGSTOP.. Urr what?\n");
>> +             exit(1);
>> +     }
>> +
>> +     if (waitpid(child, &stat, WUNTRACED) != child) {
>> +             /*
>> +              * waitpid() may end with ECHILD if SIGCHLD == SIG_IGN,
>> +              * and the child has stopped already.
>> +              */
>> +             if (!(errno == ECHILD && WIFSTOPPED(stat))) {
>
> I don't understand this expression ^^^^

If I'm not mistaken, if criu process has SIG_IGN set for SIGCHLD
waitpid will return ECHILD, but stat will be still filled.
(criu doesn't set SIG_IGN at the moment, but I didn't want to rely
on that. Having in mind how many modes criu now has..
and for each we need kerndat to be initialized)
Anyway, I can delete this sub-check if you think it'll be better.

>
>> +                     pr_perror("Failed to wait for %s() test\n", __func__);
>> +                     goto out_kill;
>> +             }
>> +     }
>> +
>> +     if (!WIFSTOPPED(stat)) {
>> +             pr_err("Born child is unstoppable!\n");
>> +             goto out_kill;
>> +     }
>> +
>> +     iov.iov_base = &xsave;
>> +     iov.iov_len = sizeof(xsave);
>> +
>> +     if (ptrace(PTRACE_GETREGSET, child, (unsigned)NT_X86_XSTATE, &iov) < 0) {
>> +             pr_perror("Can't obtain FPU registers for %d", child);
>> +             goto out_kill;
>> +     }
>> +     /*
>> +      * MXCSR should be never 0x0: e.g., it should contain either:
>> +      * R+/R-/RZ/RN to determine rounding model.
>> +      */
>> +     ret = !xsave.i387.mxcsr;
>> +
>> +out_kill:
>> +     kill(child, SIGKILL);
>
> You need to wait a child here.

Yep, thanks for help with this, Andrey.
Andrey Vagin Feb. 6, 2018, 1:57 a.m.
On Tue, Feb 06, 2018 at 01:54:21AM +0000, Dmitry Safonov wrote:
> 2018-02-06 1:40 GMT+00:00 Andrei Vagin <avagin@virtuozzo.com>:
> > On Wed, Jan 31, 2018 at 04:58:09PM +0000, Dmitry Safonov wrote:
> >> We need to know if ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, iov)
> >> returns xsave without FP state part.
> >>
> >> Signed-off-by: Dmitry Safonov <dima@arista.com>
> >> ---
> >>  criu/arch/x86/crtools.c              | 71 ++++++++++++++++++++++++++++++++++++
> >>  criu/arch/x86/include/asm/restorer.h |  2 +
> >>  criu/include/kerndat.h               |  1 +
> >>  criu/kerndat.c                       | 18 +++++++++
> >>  4 files changed, 92 insertions(+)
> >>
> >> diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c
> >> index d7b21e7c9bae..a389e563a886 100644
> >> --- a/criu/arch/x86/crtools.c
> >> +++ b/criu/arch/x86/crtools.c
> >> @@ -7,6 +7,7 @@
> >>  #include <sys/syscall.h>
> >>  #include <sys/auxv.h>
> >>  #include <sys/wait.h>
> >> +#include <sys/ptrace.h>
> >>
> >>  #include "types.h"
> >>  #include "log.h"
> >> @@ -30,6 +31,7 @@
> >>  #include "images/core.pb-c.h"
> >>  #include "images/creds.pb-c.h"
> >>
> >> +/* XXX: Move all kerndat features to per-arch kerndat .c */
> >>  int kdat_can_map_vdso(void)
> >>  {
> >>       pid_t child;
> >> @@ -188,6 +190,75 @@ int kdat_compatible_cr(void)
> >>  }
> >>  #endif
> >>
> >> +/*
> >> + * Pre v4.14 kernels have a bug on Skylake CPUs:
> >> + * copyout_from_xsaves() creates fpu state for
> >> + *   ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov)
> >> + * without MXCSR and MXCSR_FLAGS if there is SSE/YMM state, but no FP state.
> >> + * That is xfeatures had either/both XFEATURE_MASK_{SSE,YMM} set, but not
> >> + * XFEATURE_MASK_FP.
> >> + * But we *really* need to C/R MXCSR & MXCSR_FLAGS if SSE/YMM active,
> >> + * as mxcsr store part of the state.
> >> + */
> >> +int kdat_x86_has_ptrace_fpu_xsave_bug(void)
> >> +{
> >> +     user_fpregs_struct_t xsave = { };
> >> +     struct iovec iov;
> >> +     int ret = -1;
> >> +     pid_t child;
> >> +     int stat;
> >> +
> >> +     /* OSXSAVE can't be changed during boot. */
> >> +     if (!compel_cpu_has_feature(X86_FEATURE_OSXSAVE))
> >> +             return 0;
> >> +
> >> +     child = fork();
> >> +     if (child < 0) {
> >> +             pr_perror("%s(): failed to fork()", __func__);
> >> +             return -1;
> >> +     }
> >> +
> >> +     if (child == 0) {
> >> +             ptrace(PTRACE_TRACEME, 0, 0, 0);
> >
> > It can fail in a case when criu is running under strace. In this case I
> > would like to get a warning and conside that a kernel doesn't have this
> > bug.
> 
> Makes sense, will do.
> 
> >
> >> +             kill(getpid(), SIGSTOP);
> >> +             pr_err("Continue after SIGSTOP.. Urr what?\n");
> >> +             exit(1);
> >> +     }
> >> +
> >> +     if (waitpid(child, &stat, WUNTRACED) != child) {
> >> +             /*
> >> +              * waitpid() may end with ECHILD if SIGCHLD == SIG_IGN,
> >> +              * and the child has stopped already.
> >> +              */
> >> +             if (!(errno == ECHILD && WIFSTOPPED(stat))) {
> >
> > I don't understand this expression ^^^^
> 
> If I'm not mistaken, if criu process has SIG_IGN set for SIGCHLD
> waitpid will return ECHILD, but stat will be still filled.

No, it is impossible. waitpid will return ECHILD only if someone else
waited it.

> (criu doesn't set SIG_IGN at the moment, but I didn't want to rely
> on that. Having in mind how many modes criu now has..
> and for each we need kerndat to be initialized)
> Anyway, I can delete this sub-check if you think it'll be better.
>
Dmitry Safonov Feb. 6, 2018, 2:01 a.m.
2018-02-06 1:57 GMT+00:00 Andrei Vagin <avagin@virtuozzo.com>:
> On Tue, Feb 06, 2018 at 01:54:21AM +0000, Dmitry Safonov wrote:
>> 2018-02-06 1:40 GMT+00:00 Andrei Vagin <avagin@virtuozzo.com>:
>> > On Wed, Jan 31, 2018 at 04:58:09PM +0000, Dmitry Safonov wrote:
>> >> We need to know if ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, iov)
>> >> returns xsave without FP state part.
>> >>
>> >> Signed-off-by: Dmitry Safonov <dima@arista.com>
>> >> ---
>> >>  criu/arch/x86/crtools.c              | 71 ++++++++++++++++++++++++++++++++++++
>> >>  criu/arch/x86/include/asm/restorer.h |  2 +
>> >>  criu/include/kerndat.h               |  1 +
>> >>  criu/kerndat.c                       | 18 +++++++++
>> >>  4 files changed, 92 insertions(+)
>> >>
>> >> diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c
>> >> index d7b21e7c9bae..a389e563a886 100644
>> >> --- a/criu/arch/x86/crtools.c
>> >> +++ b/criu/arch/x86/crtools.c
>> >> @@ -7,6 +7,7 @@
>> >>  #include <sys/syscall.h>
>> >>  #include <sys/auxv.h>
>> >>  #include <sys/wait.h>
>> >> +#include <sys/ptrace.h>
>> >>
>> >>  #include "types.h"
>> >>  #include "log.h"
>> >> @@ -30,6 +31,7 @@
>> >>  #include "images/core.pb-c.h"
>> >>  #include "images/creds.pb-c.h"
>> >>
>> >> +/* XXX: Move all kerndat features to per-arch kerndat .c */
>> >>  int kdat_can_map_vdso(void)
>> >>  {
>> >>       pid_t child;
>> >> @@ -188,6 +190,75 @@ int kdat_compatible_cr(void)
>> >>  }
>> >>  #endif
>> >>
>> >> +/*
>> >> + * Pre v4.14 kernels have a bug on Skylake CPUs:
>> >> + * copyout_from_xsaves() creates fpu state for
>> >> + *   ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov)
>> >> + * without MXCSR and MXCSR_FLAGS if there is SSE/YMM state, but no FP state.
>> >> + * That is xfeatures had either/both XFEATURE_MASK_{SSE,YMM} set, but not
>> >> + * XFEATURE_MASK_FP.
>> >> + * But we *really* need to C/R MXCSR & MXCSR_FLAGS if SSE/YMM active,
>> >> + * as mxcsr store part of the state.
>> >> + */
>> >> +int kdat_x86_has_ptrace_fpu_xsave_bug(void)
>> >> +{
>> >> +     user_fpregs_struct_t xsave = { };
>> >> +     struct iovec iov;
>> >> +     int ret = -1;
>> >> +     pid_t child;
>> >> +     int stat;
>> >> +
>> >> +     /* OSXSAVE can't be changed during boot. */
>> >> +     if (!compel_cpu_has_feature(X86_FEATURE_OSXSAVE))
>> >> +             return 0;
>> >> +
>> >> +     child = fork();
>> >> +     if (child < 0) {
>> >> +             pr_perror("%s(): failed to fork()", __func__);
>> >> +             return -1;
>> >> +     }
>> >> +
>> >> +     if (child == 0) {
>> >> +             ptrace(PTRACE_TRACEME, 0, 0, 0);
>> >
>> > It can fail in a case when criu is running under strace. In this case I
>> > would like to get a warning and conside that a kernel doesn't have this
>> > bug.
>>
>> Makes sense, will do.
>>
>> >
>> >> +             kill(getpid(), SIGSTOP);
>> >> +             pr_err("Continue after SIGSTOP.. Urr what?\n");
>> >> +             exit(1);
>> >> +     }
>> >> +
>> >> +     if (waitpid(child, &stat, WUNTRACED) != child) {
>> >> +             /*
>> >> +              * waitpid() may end with ECHILD if SIGCHLD == SIG_IGN,
>> >> +              * and the child has stopped already.
>> >> +              */
>> >> +             if (!(errno == ECHILD && WIFSTOPPED(stat))) {
>> >
>> > I don't understand this expression ^^^^
>>
>> If I'm not mistaken, if criu process has SIG_IGN set for SIGCHLD
>> waitpid will return ECHILD, but stat will be still filled.
>
> No, it is impossible. waitpid will return ECHILD only if someone else
> waited it.

I'm maybe misreading this, eh?

:ERRORS
:       ECHILD (for wait()) The calling process does not have any  unwaited-for
:              children.
:
:       ECHILD (for  waitpid() or waitid()) The process specified by pid (wait‐
:              pid()) or idtype and id (waitid()) does not exist or  is  not  a
:              child  of  the  calling process.  (This can happen for one's own
:              child if the action for SIGCHLD is set to SIG_IGN.  See also the
:              Linux Notes section about threads.)

>
>> (criu doesn't set SIG_IGN at the moment, but I didn't want to rely
>> on that. Having in mind how many modes criu now has..
>> and for each we need kerndat to be initialized)
>> Anyway, I can delete this sub-check if you think it'll be better.
>>
Dmitry Safonov Feb. 6, 2018, 2:52 a.m.
2018-02-06 1:40 GMT+00:00 Andrei Vagin <avagin@virtuozzo.com>:
> On Wed, Jan 31, 2018 at 04:58:09PM +0000, Dmitry Safonov wrote:
>> We need to know if ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, iov)
>> returns xsave without FP state part.
>>
>> Signed-off-by: Dmitry Safonov <dima@arista.com>
>> ---
>>  criu/arch/x86/crtools.c              | 71 ++++++++++++++++++++++++++++++++++++
>>  criu/arch/x86/include/asm/restorer.h |  2 +
>>  criu/include/kerndat.h               |  1 +
>>  criu/kerndat.c                       | 18 +++++++++
>>  4 files changed, 92 insertions(+)
>>
>> diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c
>> index d7b21e7c9bae..a389e563a886 100644
>> --- a/criu/arch/x86/crtools.c
>> +++ b/criu/arch/x86/crtools.c
>> @@ -7,6 +7,7 @@
>>  #include <sys/syscall.h>
>>  #include <sys/auxv.h>
>>  #include <sys/wait.h>
>> +#include <sys/ptrace.h>
>>
>>  #include "types.h"
>>  #include "log.h"
>> @@ -30,6 +31,7 @@
>>  #include "images/core.pb-c.h"
>>  #include "images/creds.pb-c.h"
>>
>> +/* XXX: Move all kerndat features to per-arch kerndat .c */
>>  int kdat_can_map_vdso(void)
>>  {
>>       pid_t child;
>> @@ -188,6 +190,75 @@ int kdat_compatible_cr(void)
>>  }
>>  #endif
>>
>> +/*
>> + * Pre v4.14 kernels have a bug on Skylake CPUs:
>> + * copyout_from_xsaves() creates fpu state for
>> + *   ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov)
>> + * without MXCSR and MXCSR_FLAGS if there is SSE/YMM state, but no FP state.
>> + * That is xfeatures had either/both XFEATURE_MASK_{SSE,YMM} set, but not
>> + * XFEATURE_MASK_FP.
>> + * But we *really* need to C/R MXCSR & MXCSR_FLAGS if SSE/YMM active,
>> + * as mxcsr store part of the state.
>> + */
>> +int kdat_x86_has_ptrace_fpu_xsave_bug(void)
>> +{
>> +     user_fpregs_struct_t xsave = { };
>> +     struct iovec iov;
>> +     int ret = -1;
>> +     pid_t child;
>> +     int stat;
>> +
>> +     /* OSXSAVE can't be changed during boot. */
>> +     if (!compel_cpu_has_feature(X86_FEATURE_OSXSAVE))
>> +             return 0;
>> +
>> +     child = fork();
>> +     if (child < 0) {
>> +             pr_perror("%s(): failed to fork()", __func__);
>> +             return -1;
>> +     }
>> +
>> +     if (child == 0) {
>> +             ptrace(PTRACE_TRACEME, 0, 0, 0);
>
> It can fail in a case when criu is running under strace. In this case I
> would like to get a warning and conside that a kernel doesn't have this
> bug.

I actually think after fixing that it might be better to consider that kernel
*has* this bug. It's safer because two syscall would work if bug is present
or not. So it's just one syscall more (per-thread) for the unlikely case when
someone firstly filled kerndat under ptrace(). This way we will not end up
debugging this bug again. Makes sense?
Andrey Vagin Feb. 6, 2018, 4:31 a.m.
On Tue, Feb 06, 2018 at 02:52:05AM +0000, Dmitry Safonov wrote:
> 2018-02-06 1:40 GMT+00:00 Andrei Vagin <avagin@virtuozzo.com>:
> > On Wed, Jan 31, 2018 at 04:58:09PM +0000, Dmitry Safonov wrote:
> >> We need to know if ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, iov)
> >> returns xsave without FP state part.
> >>
> >> Signed-off-by: Dmitry Safonov <dima@arista.com>
> >> ---
> >>  criu/arch/x86/crtools.c              | 71 ++++++++++++++++++++++++++++++++++++
> >>  criu/arch/x86/include/asm/restorer.h |  2 +
> >>  criu/include/kerndat.h               |  1 +
> >>  criu/kerndat.c                       | 18 +++++++++
> >>  4 files changed, 92 insertions(+)
> >>
> >> diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c
> >> index d7b21e7c9bae..a389e563a886 100644
> >> --- a/criu/arch/x86/crtools.c
> >> +++ b/criu/arch/x86/crtools.c
> >> @@ -7,6 +7,7 @@
> >>  #include <sys/syscall.h>
> >>  #include <sys/auxv.h>
> >>  #include <sys/wait.h>
> >> +#include <sys/ptrace.h>
> >>
> >>  #include "types.h"
> >>  #include "log.h"
> >> @@ -30,6 +31,7 @@
> >>  #include "images/core.pb-c.h"
> >>  #include "images/creds.pb-c.h"
> >>
> >> +/* XXX: Move all kerndat features to per-arch kerndat .c */
> >>  int kdat_can_map_vdso(void)
> >>  {
> >>       pid_t child;
> >> @@ -188,6 +190,75 @@ int kdat_compatible_cr(void)
> >>  }
> >>  #endif
> >>
> >> +/*
> >> + * Pre v4.14 kernels have a bug on Skylake CPUs:
> >> + * copyout_from_xsaves() creates fpu state for
> >> + *   ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov)
> >> + * without MXCSR and MXCSR_FLAGS if there is SSE/YMM state, but no FP state.
> >> + * That is xfeatures had either/both XFEATURE_MASK_{SSE,YMM} set, but not
> >> + * XFEATURE_MASK_FP.
> >> + * But we *really* need to C/R MXCSR & MXCSR_FLAGS if SSE/YMM active,
> >> + * as mxcsr store part of the state.
> >> + */
> >> +int kdat_x86_has_ptrace_fpu_xsave_bug(void)
> >> +{
> >> +     user_fpregs_struct_t xsave = { };
> >> +     struct iovec iov;
> >> +     int ret = -1;
> >> +     pid_t child;
> >> +     int stat;
> >> +
> >> +     /* OSXSAVE can't be changed during boot. */
> >> +     if (!compel_cpu_has_feature(X86_FEATURE_OSXSAVE))
> >> +             return 0;
> >> +
> >> +     child = fork();
> >> +     if (child < 0) {
> >> +             pr_perror("%s(): failed to fork()", __func__);
> >> +             return -1;
> >> +     }
> >> +
> >> +     if (child == 0) {
> >> +             ptrace(PTRACE_TRACEME, 0, 0, 0);
> >
> > It can fail in a case when criu is running under strace. In this case I
> > would like to get a warning and conside that a kernel doesn't have this
> > bug.
> 
> I actually think after fixing that it might be better to consider that kernel
> *has* this bug. It's safer because two syscall would work if bug is present
> or not. So it's just one syscall more (per-thread) for the unlikely case when
> someone firstly filled kerndat under ptrace(). This way we will not end up
> debugging this bug again. Makes sense?

Yes. Let's do this way.
> 
> -- 
>              Dmitry
Andrey Vagin Feb. 6, 2018, 6:45 p.m.
On Tue, Feb 06, 2018 at 02:01:41AM +0000, Dmitry Safonov wrote:
> 2018-02-06 1:57 GMT+00:00 Andrei Vagin <avagin@virtuozzo.com>:
> > On Tue, Feb 06, 2018 at 01:54:21AM +0000, Dmitry Safonov wrote:
> >> 2018-02-06 1:40 GMT+00:00 Andrei Vagin <avagin@virtuozzo.com>:
> >> > On Wed, Jan 31, 2018 at 04:58:09PM +0000, Dmitry Safonov wrote:
> >> >> We need to know if ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, iov)
> >> >> returns xsave without FP state part.
> >> >>
> >> >> Signed-off-by: Dmitry Safonov <dima@arista.com>
> >> >> ---
> >> >>  criu/arch/x86/crtools.c              | 71 ++++++++++++++++++++++++++++++++++++
> >> >>  criu/arch/x86/include/asm/restorer.h |  2 +
> >> >>  criu/include/kerndat.h               |  1 +
> >> >>  criu/kerndat.c                       | 18 +++++++++
> >> >>  4 files changed, 92 insertions(+)
> >> >>
> >> >> diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c
> >> >> index d7b21e7c9bae..a389e563a886 100644
> >> >> --- a/criu/arch/x86/crtools.c
> >> >> +++ b/criu/arch/x86/crtools.c
> >> >> @@ -7,6 +7,7 @@
> >> >>  #include <sys/syscall.h>
> >> >>  #include <sys/auxv.h>
> >> >>  #include <sys/wait.h>
> >> >> +#include <sys/ptrace.h>
> >> >>
> >> >>  #include "types.h"
> >> >>  #include "log.h"
> >> >> @@ -30,6 +31,7 @@
> >> >>  #include "images/core.pb-c.h"
> >> >>  #include "images/creds.pb-c.h"
> >> >>
> >> >> +/* XXX: Move all kerndat features to per-arch kerndat .c */
> >> >>  int kdat_can_map_vdso(void)
> >> >>  {
> >> >>       pid_t child;
> >> >> @@ -188,6 +190,75 @@ int kdat_compatible_cr(void)
> >> >>  }
> >> >>  #endif
> >> >>
> >> >> +/*
> >> >> + * Pre v4.14 kernels have a bug on Skylake CPUs:
> >> >> + * copyout_from_xsaves() creates fpu state for
> >> >> + *   ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov)
> >> >> + * without MXCSR and MXCSR_FLAGS if there is SSE/YMM state, but no FP state.
> >> >> + * That is xfeatures had either/both XFEATURE_MASK_{SSE,YMM} set, but not
> >> >> + * XFEATURE_MASK_FP.
> >> >> + * But we *really* need to C/R MXCSR & MXCSR_FLAGS if SSE/YMM active,
> >> >> + * as mxcsr store part of the state.
> >> >> + */
> >> >> +int kdat_x86_has_ptrace_fpu_xsave_bug(void)
> >> >> +{
> >> >> +     user_fpregs_struct_t xsave = { };
> >> >> +     struct iovec iov;
> >> >> +     int ret = -1;
> >> >> +     pid_t child;
> >> >> +     int stat;
> >> >> +
> >> >> +     /* OSXSAVE can't be changed during boot. */
> >> >> +     if (!compel_cpu_has_feature(X86_FEATURE_OSXSAVE))
> >> >> +             return 0;
> >> >> +
> >> >> +     child = fork();
> >> >> +     if (child < 0) {
> >> >> +             pr_perror("%s(): failed to fork()", __func__);
> >> >> +             return -1;
> >> >> +     }
> >> >> +
> >> >> +     if (child == 0) {
> >> >> +             ptrace(PTRACE_TRACEME, 0, 0, 0);
> >> >
> >> > It can fail in a case when criu is running under strace. In this case I
> >> > would like to get a warning and conside that a kernel doesn't have this
> >> > bug.
> >>
> >> Makes sense, will do.
> >>
> >> >
> >> >> +             kill(getpid(), SIGSTOP);
> >> >> +             pr_err("Continue after SIGSTOP.. Urr what?\n");
> >> >> +             exit(1);
> >> >> +     }
> >> >> +
> >> >> +     if (waitpid(child, &stat, WUNTRACED) != child) {
> >> >> +             /*
> >> >> +              * waitpid() may end with ECHILD if SIGCHLD == SIG_IGN,
> >> >> +              * and the child has stopped already.
> >> >> +              */
> >> >> +             if (!(errno == ECHILD && WIFSTOPPED(stat))) {
> >> >
> >> > I don't understand this expression ^^^^
> >>
> >> If I'm not mistaken, if criu process has SIG_IGN set for SIGCHLD
> >> waitpid will return ECHILD, but stat will be still filled.
> >
> > No, it is impossible. waitpid will return ECHILD only if someone else
> > waited it.
> 
> I'm maybe misreading this, eh?
> 
> :ERRORS
> :       ECHILD (for wait()) The calling process does not have any  unwaited-for
> :              children.
> :
> :       ECHILD (for  waitpid() or waitid()) The process specified by pid (wait‐
> :              pid()) or idtype and id (waitid()) does not exist or  is  not  a
> :              child  of  the  calling process.  (This can happen for one's own
> :              child if the action for SIGCHLD is set to SIG_IGN.  See also the
> :              Linux Notes section about threads.)
>

Just for the history. Dima was partly right. If we set SIG_IGN for
SIGCHLD, the kernel is auto-reaping zombies and waitpid() returns
ECHILD, but status isn't initialized in this case.
 
> >
> >> (criu doesn't set SIG_IGN at the moment, but I didn't want to rely
> >> on that. Having in mind how many modes criu now has..
> >> and for each we need kerndat to be initialized)
> >> Anyway, I can delete this sub-check if you think it'll be better.
> >>
> 
> 
> 
> -- 
>              Dmitry