[01/11] cr-check: Add check for mremap() of special mappings

Submitted by Dmitry Safonov on May 22, 2019, 6:18 p.m.

Details

Message ID 20190522181825.15064-2-dima@arista.com
State New
Series "vdso: arm32 support"
Headers show

Commit Message

Dmitry Safonov May 22, 2019, 6:18 p.m.
During restore any VMA that's a subject to ASLR should be moved at the
same address as was on a checkpoint. Previously, ports to non-x86
architectures had problems with VDSO mremap(). On those platforms kernel
needs "landing" for return to userspace in some cases.
Usually, vdso provides this landing and finishes restoring of registers.
That's `int80_landing_pad` on ia32. On arm64/arm32 it's sigtrap for
SA_RESTORER - to proceed after signal processing.

That's why kernel needs to track the position of landing.
On modern kernels for platform we support it's already done - however,
for older kernels some patches needs to be backported for C/R.

Provide the checks for mremap() of special VMAs: that CRIU has suitable
kernel to work on and if we'll have some new platforms - that kernel
tracks the position of landing.

Signed-off-by: Dmitry Safonov <dima@arista.com>
---
 criu/cr-check.c | 149 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 148 insertions(+), 1 deletion(-)

Patch hide | download patch | download mbox

diff --git a/criu/cr-check.c b/criu/cr-check.c
index e24668305838..54fa44091ad8 100644
--- a/criu/cr-check.c
+++ b/criu/cr-check.c
@@ -582,7 +582,7 @@  static pid_t fork_and_ptrace_attach(int (*child_setup)(void))
 	return pid;
 }
 
-static int check_ptrace_peeksiginfo()
+static int check_ptrace_peeksiginfo(void)
 {
 	struct ptrace_peeksiginfo_args arg;
 	siginfo_t siginfo;
@@ -611,6 +611,152 @@  static int check_ptrace_peeksiginfo()
 	return ret;
 }
 
+struct special_mapping {
+	const char	*name;
+	void		*addr;
+	size_t		size;
+};
+
+static int parse_special_maps(struct special_mapping *vmas, size_t nr)
+{
+	FILE *maps;
+	char buf[256];
+	int ret = 0;
+
+	maps = fopen_proc(PROC_SELF, "maps");
+	if (!maps)
+		return -1;
+
+	while (fgets(buf, sizeof(buf), maps)) {
+		unsigned long start, end;
+		int r, tail;
+		size_t i;
+
+		r = sscanf(buf, "%lx-%lx %*s %*s %*s %*s %n\n",
+				&start, &end, &tail);
+		if (r != 2) {
+			fclose(maps);
+			pr_err("Bad maps format %d.%d (%s)\n", r, tail, buf + tail);
+			return -1;
+		}
+
+		for (i = 0; i < nr; i++) {
+			if (strcmp(buf + tail, vmas[i].name) != 0)
+				continue;
+			if (vmas[i].addr != MAP_FAILED) {
+				pr_err("Special mapping meet twice: %s\n", vmas[i].name);
+				ret = -1;
+				goto out;
+			}
+			vmas[i].addr = (void *)start;
+			vmas[i].size = end - start;
+		}
+	}
+
+out:
+	fclose(maps);
+	return ret;
+}
+
+static void dummy_sighandler(int sig)
+{
+}
+
+static void check_special_mapping_mremap_child(struct special_mapping *vmas,
+					       size_t nr)
+{
+	size_t i, parking_size = 0;
+	void *parking_lot;
+	pid_t self = getpid();
+
+	for (i = 0; i < nr; i++) {
+		if (vmas[i].addr != MAP_FAILED)
+			parking_size += vmas[i].size;
+	}
+
+	if (signal(SIGUSR1, dummy_sighandler) == SIG_ERR) {
+		pr_perror("signal() failed");
+		exit(1);
+	}
+
+	parking_lot = mmap(NULL, parking_size, PROT_NONE,
+			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (parking_lot == MAP_FAILED) {
+		pr_perror("mmap(%zu) failed", parking_size);
+		exit(1);
+	}
+
+	for (i = 0; i < nr; i++) {
+		unsigned long ret;
+
+		if (vmas[i].addr == MAP_FAILED)
+			continue;
+
+		ret = syscall(__NR_mremap, (unsigned long)vmas[i].addr,
+			      vmas[i].size, vmas[i].size,
+			      MREMAP_FIXED | MREMAP_MAYMOVE,
+			      (unsigned long)parking_lot);
+		if (ret != (unsigned long)parking_lot)
+			syscall(__NR_exit, 1);
+		parking_lot += vmas[i].size;
+	}
+
+	syscall(__NR_kill, self, SIGUSR1);
+	syscall(__NR_exit, 0);
+}
+
+static int check_special_mapping_mremap(void)
+{
+	struct special_mapping special_vmas[] = {
+		{
+			.name = "[vvar]\n",
+			.addr = MAP_FAILED,
+		},
+		{
+			.name = "[vdso]\n",
+			.addr = MAP_FAILED,
+		},
+		{
+			.name = "[sigpage]\n",
+			.addr = MAP_FAILED,
+		},
+		/* XXX: { .name = "[uprobes]\n" }, */
+		/*
+		 * Not subjects for ASLR, skipping:
+		 * { .name = "[vectors]\n", },
+		 * { .name = "[vsyscall]\n" },
+		 */
+	};
+	size_t vmas_nr = ARRAY_SIZE(special_vmas);
+	pid_t child;
+	int stat;
+
+	if (parse_special_maps(special_vmas, vmas_nr))
+		return -1;
+
+	child = fork();
+	if (child < 0) {
+		pr_perror("%s(): failed to fork()", __func__);
+		return -1;
+	}
+
+	if (child == 0)
+		check_special_mapping_mremap_child(special_vmas, vmas_nr);
+
+	if (waitpid(child, &stat, 0) != child) {
+		pr_err("Failed to wait for special mapping mremap() test\n");
+		kill(child, SIGKILL);
+		return -1;
+	}
+
+	if (!WIFEXITED(stat) || WEXITSTATUS(stat) != 0) {
+		pr_err("mremap() for special mapping did something bad to a child\n");
+		return -1;
+	}
+
+	return 0;
+}
+
 static int check_ptrace_suspend_seccomp(void)
 {
 	pid_t pid;
@@ -1172,6 +1318,7 @@  int cr_check(void)
 	CHECK_CAT1(check_ipc());
 	CHECK_CAT1(check_sigqueuinfo());
 	CHECK_CAT1(check_ptrace_peeksiginfo());
+	CHECK_CAT1(check_special_mapping_mremap());
 
 	/*
 	 * Category 2 - required for specific cases.

Comments

Andrei Vagin May 25, 2019, 5:42 a.m.
On Wed, May 22, 2019 at 07:18:15PM +0100, Dmitry Safonov wrote:
> During restore any VMA that's a subject to ASLR should be moved at the
> same address as was on a checkpoint. Previously, ports to non-x86
> architectures had problems with VDSO mremap(). On those platforms kernel
> needs "landing" for return to userspace in some cases.
> Usually, vdso provides this landing and finishes restoring of registers.
> That's `int80_landing_pad` on ia32. On arm64/arm32 it's sigtrap for
> SA_RESTORER - to proceed after signal processing.
> 
> That's why kernel needs to track the position of landing.
> On modern kernels for platform we support it's already done - however,
> for older kernels some patches needs to be backported for C/R.
> 
> Provide the checks for mremap() of special VMAs: that CRIU has suitable
> kernel to work on and if we'll have some new platforms - that kernel
> tracks the position of landing.
> 
> Signed-off-by: Dmitry Safonov <dima@arista.com>
> ---
>  criu/cr-check.c | 149 +++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 148 insertions(+), 1 deletion(-)
> 
> diff --git a/criu/cr-check.c b/criu/cr-check.c
> index e24668305838..54fa44091ad8 100644
> --- a/criu/cr-check.c
> +++ b/criu/cr-check.c
> @@ -582,7 +582,7 @@ static pid_t fork_and_ptrace_attach(int (*child_setup)(void))
>  	return pid;
>  }
>  
> -static int check_ptrace_peeksiginfo()
> +static int check_ptrace_peeksiginfo(void)
>  {
>  	struct ptrace_peeksiginfo_args arg;
>  	siginfo_t siginfo;
> @@ -611,6 +611,152 @@ static int check_ptrace_peeksiginfo()
>  	return ret;
>  }
>  
> +struct special_mapping {
> +	const char	*name;
> +	void		*addr;
> +	size_t		size;
> +};
> +
> +static int parse_special_maps(struct special_mapping *vmas, size_t nr)
> +{
> +	FILE *maps;
> +	char buf[256];
> +	int ret = 0;
> +
> +	maps = fopen_proc(PROC_SELF, "maps");
> +	if (!maps)
> +		return -1;
> +
> +	while (fgets(buf, sizeof(buf), maps)) {
> +		unsigned long start, end;
> +		int r, tail;
> +		size_t i;
> +
> +		r = sscanf(buf, "%lx-%lx %*s %*s %*s %*s %n\n",
> +				&start, &end, &tail);
> +		if (r != 2) {
> +			fclose(maps);
> +			pr_err("Bad maps format %d.%d (%s)\n", r, tail, buf + tail);
> +			return -1;
> +		}
> +
> +		for (i = 0; i < nr; i++) {
> +			if (strcmp(buf + tail, vmas[i].name) != 0)
> +				continue;
> +			if (vmas[i].addr != MAP_FAILED) {
> +				pr_err("Special mapping meet twice: %s\n", vmas[i].name);
> +				ret = -1;
> +				goto out;
> +			}
> +			vmas[i].addr = (void *)start;
> +			vmas[i].size = end - start;
> +		}
> +	}
> +
> +out:
> +	fclose(maps);
> +	return ret;
> +}
> +
> +static void dummy_sighandler(int sig)
> +{
> +}
> +
> +static void check_special_mapping_mremap_child(struct special_mapping *vmas,
> +					       size_t nr)
> +{
> +	size_t i, parking_size = 0;
> +	void *parking_lot;
> +	pid_t self = getpid();
> +
> +	for (i = 0; i < nr; i++) {
> +		if (vmas[i].addr != MAP_FAILED)
> +			parking_size += vmas[i].size;
> +	}
> +

Could you write a comment why we need to handle SIGUSR1 here?

> +	if (signal(SIGUSR1, dummy_sighandler) == SIG_ERR) {
> +		pr_perror("signal() failed");
> +		exit(1);
> +	}
> +
> +	parking_lot = mmap(NULL, parking_size, PROT_NONE,
> +			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> +	if (parking_lot == MAP_FAILED) {
> +		pr_perror("mmap(%zu) failed", parking_size);
> +		exit(1);
> +	}
> +
> +	for (i = 0; i < nr; i++) {
> +		unsigned long ret;
> +
> +		if (vmas[i].addr == MAP_FAILED)
> +			continue;
> +
> +		ret = syscall(__NR_mremap, (unsigned long)vmas[i].addr,
> +			      vmas[i].size, vmas[i].size,
> +			      MREMAP_FIXED | MREMAP_MAYMOVE,
> +			      (unsigned long)parking_lot);
> +		if (ret != (unsigned long)parking_lot)

if it fails, we probably can log this error

> +			syscall(__NR_exit, 1);
> +		parking_lot += vmas[i].size;
> +	}
> +
> +	syscall(__NR_kill, self, SIGUSR1);
> +	syscall(__NR_exit, 0);
> +}
> +
> +static int check_special_mapping_mremap(void)
> +{
> +	struct special_mapping special_vmas[] = {
> +		{
> +			.name = "[vvar]\n",
> +			.addr = MAP_FAILED,
> +		},
> +		{
> +			.name = "[vdso]\n",
> +			.addr = MAP_FAILED,
> +		},
> +		{
> +			.name = "[sigpage]\n",
> +			.addr = MAP_FAILED,
> +		},
> +		/* XXX: { .name = "[uprobes]\n" }, */
> +		/*
> +		 * Not subjects for ASLR, skipping:
> +		 * { .name = "[vectors]\n", },
> +		 * { .name = "[vsyscall]\n" },
> +		 */
> +	};
> +	size_t vmas_nr = ARRAY_SIZE(special_vmas);
> +	pid_t child;
> +	int stat;
> +
> +	if (parse_special_maps(special_vmas, vmas_nr))
> +		return -1;
> +
> +	child = fork();
> +	if (child < 0) {
> +		pr_perror("%s(): failed to fork()", __func__);
> +		return -1;
> +	}
> +
> +	if (child == 0) {
> +		check_special_mapping_mremap_child(special_vmas, vmas_nr);
		exit(1); /* unreachable */
	}
> +
> +	if (waitpid(child, &stat, 0) != child) {
> +		pr_err("Failed to wait for special mapping mremap() test\n");
> +		kill(child, SIGKILL);
if waitpid failed, we probably doesn't have this child, so you would
prefer to not kill a process with this pid.
> +		return -1;
> +	}
> +
> +	if (!WIFEXITED(stat) || WEXITSTATUS(stat) != 0) {
> +		pr_err("mremap() for special mapping did something bad to a child\n");
> +		return -1;
> +	}
> +
> +	return 0;
> +}
> +
>  static int check_ptrace_suspend_seccomp(void)
>  {
>  	pid_t pid;
> @@ -1172,6 +1318,7 @@ int cr_check(void)
>  	CHECK_CAT1(check_ipc());
>  	CHECK_CAT1(check_sigqueuinfo());
>  	CHECK_CAT1(check_ptrace_peeksiginfo());
> +	CHECK_CAT1(check_special_mapping_mremap());
>  
>  	/*
>  	 * Category 2 - required for specific cases.
> -- 
> 2.21.0
> 
> _______________________________________________
> CRIU mailing list
> CRIU@openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
Dmitry Safonov May 27, 2019, 2:21 p.m.
On 5/25/19 6:42 AM, Andrei Vagin wrote:
> On Wed, May 22, 2019 at 07:18:15PM +0100, Dmitry Safonov wrote:
[..]
>> +static void check_special_mapping_mremap_child(struct special_mapping *vmas,
>> +					       size_t nr)
>> +{
>> +	size_t i, parking_size = 0;
>> +	void *parking_lot;
>> +	pid_t self = getpid();
>> +
>> +	for (i = 0; i < nr; i++) {
>> +		if (vmas[i].addr != MAP_FAILED)
>> +			parking_size += vmas[i].size;
>> +	}
>> +
> 
> Could you write a comment why we need to handle SIGUSR1 here?

Sure.

> 
>> +	if (signal(SIGUSR1, dummy_sighandler) == SIG_ERR) {
>> +		pr_perror("signal() failed");
>> +		exit(1);
>> +	}
>> +
>> +	parking_lot = mmap(NULL, parking_size, PROT_NONE,
>> +			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
>> +	if (parking_lot == MAP_FAILED) {
>> +		pr_perror("mmap(%zu) failed", parking_size);
>> +		exit(1);
>> +	}
>> +
>> +	for (i = 0; i < nr; i++) {
>> +		unsigned long ret;
>> +
>> +		if (vmas[i].addr == MAP_FAILED)
>> +			continue;
>> +
>> +		ret = syscall(__NR_mremap, (unsigned long)vmas[i].addr,
>> +			      vmas[i].size, vmas[i].size,
>> +			      MREMAP_FIXED | MREMAP_MAYMOVE,
>> +			      (unsigned long)parking_lot);
>> +		if (ret != (unsigned long)parking_lot)
> 
> if it fails, we probably can log this error

It would be ugly at this place as it needs raw sys_write().
Probably, it will be prettier if I just print exit status in the parent.

[..]
>> +	child = fork();
>> +	if (child < 0) {
>> +		pr_perror("%s(): failed to fork()", __func__);
>> +		return -1;
>> +	}
>> +
>> +	if (child == 0) {
>> +		check_special_mapping_mremap_child(special_vmas, vmas_nr);
> 		exit(1); /* unreachable */
> 	}
>> +
>> +	if (waitpid(child, &stat, 0) != child) {
>> +		pr_err("Failed to wait for special mapping mremap() test\n");
>> +		kill(child, SIGKILL);
> if waitpid failed, we probably doesn't have this child, so you would
> prefer to not kill a process with this pid.

Yeah, I guess I'll convert it to ECHILD check and exit without shooting
somebody.