[v5,4/8] memory: don't use parent memdump if detected possible pid reuse

Submitted by Pavel Tikhomirov on Feb. 16, 2018, 7:55 a.m.

Details

Message ID 20180216075550.8427-1-ptikhomirov@virtuozzo.com
State New
Series "don't use wrong pagemap (from other task) on pid reuse"
Headers show

Commit Message

Pavel Tikhomirov Feb. 16, 2018, 7:55 a.m.
From: ptikhomirov <ptikhomirov@virtuozzo.com>

We have a problem when a pid is reused between consequent dumps we can't
understand if pagemap and pages from images of parent dump are invalid
to restore these pid already. That can lead even to wrong memory
restored for these pid, see the test in last patch.

So these is a try do separate processes with (likely) invalid previous
memory dump from processes with 100% valid previous dump.

For that we use the value of /proc/<pid>/stat's start_time and also the
timestamp of each (pre)dump. If the start time is strictly less than the
timestamp, that means that the pagemap for these pid from previous dump
is valid - was done for exactly the same process.

Creation time is in centiseconds by default so if predump is really fast
(<1csec) we can have false negative decisions for some processes, but in
case of long running processes we are fine.

https://jira.sw.ru/browse/PSBM-67502

v2: remove __maybe_unused for get_parent_stats; fix get_parent_stats to
have static typing; print warning only if unsure; check has_dump_uptime
v3: read parent stats from image only once; reuse stat from previous
parse_pid_stat call on dump
v4: move code to function; use unsigned long long for ticks; put
proc_pid_stat on mem_dump_ctl; print warning on all pid-reuse cases
v5: free parent's stats entry properly, pass it in arguments to
(pre_)dump_one_task

Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
---
 criu/cr-dump.c     | 26 +++++++++++++++++++++----
 criu/include/mem.h |  9 +++++++--
 criu/mem.c         | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 criu/stats.c       |  2 +-
 4 files changed, 86 insertions(+), 8 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/cr-dump.c b/criu/cr-dump.c
index 9e8425504..354818fb4 100644
--- a/criu/cr-dump.c
+++ b/criu/cr-dump.c
@@ -1229,7 +1229,7 @@  static int assign_parasite_pids(struct pstree_item *item, struct parasite_dump_m
 	return 0;
 }
 
-static int pre_dump_one_task(struct pstree_item *item)
+static int pre_dump_one_task(struct pstree_item *item, StatsEntry *parent_se)
 {
 	pid_t pid = item->pid->real;
 	struct vm_area_list vmas;
@@ -1289,6 +1289,8 @@  static int pre_dump_one_task(struct pstree_item *item)
 
 	mdc.pre_dump = true;
 	mdc.lazy = false;
+	mdc.stat = NULL;
+	mdc.parent_se = parent_se;
 
 	ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl);
 	if (ret)
@@ -1307,7 +1309,7 @@  static int pre_dump_one_task(struct pstree_item *item)
 	goto err_free;
 }
 
-static int dump_one_task(struct pstree_item *item)
+static int dump_one_task(struct pstree_item *item, StatsEntry *parent_se)
 {
 	pid_t pid = item->pid->real;
 	struct vm_area_list vmas;
@@ -1448,6 +1450,8 @@  static int dump_one_task(struct pstree_item *item)
 
 	mdc.pre_dump = false;
 	mdc.lazy = opts.lazy_pages;
+	mdc.stat = &pps_buf;
+	mdc.parent_se = parent_se;
 
 	ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl);
 	if (ret)
@@ -1635,6 +1639,7 @@  static int cr_pre_dump_finish(int ret)
 int cr_pre_dump_tasks(pid_t pid)
 {
 	struct pstree_item *item;
+	StatsEntry *parent_se;
 	int ret = -1;
 
 	if (opts.remote && push_snapshot_id() < 0) {
@@ -1698,10 +1703,16 @@  int cr_pre_dump_tasks(pid_t pid)
 	if (collect_namespaces(false) < 0)
 		goto err;
 
+	/* Errors handled later in detect_pid_reuse */
+	parent_se = get_parent_stats();
+
 	for_each_pstree_item(item)
-		if (pre_dump_one_task(item))
+		if (pre_dump_one_task(item, parent_se))
 			goto err;
 
+	if (parent_se)
+		stats_entry__free_unpacked(parent_se, NULL);
+
 	ret = cr_dump_shmem();
 	if (ret)
 		goto err;
@@ -1832,6 +1843,7 @@  int cr_dump_tasks(pid_t pid)
 {
 	InventoryEntry he = INVENTORY_ENTRY__INIT;
 	struct pstree_item *item;
+	StatsEntry *parent_se;
 	int pre_dump_ret = 0;
 	int ret = -1;
 
@@ -1925,11 +1937,17 @@  int cr_dump_tasks(pid_t pid)
 	if (collect_seccomp_filters() < 0)
 		goto err;
 
+	/* Errors handled later in detect_pid_reuse */
+	parent_se = get_parent_stats();
+
 	for_each_pstree_item(item) {
-		if (dump_one_task(item))
+		if (dump_one_task(item, parent_se))
 			goto err;
 	}
 
+	if (parent_se)
+		stats_entry__free_unpacked(parent_se, NULL);
+
 	/*
 	 * It may happen that a process has completed but its files in
 	 * /proc/PID/ are still open by another process. If the PID has been
diff --git a/criu/include/mem.h b/criu/include/mem.h
index bb897c599..e61e175a8 100644
--- a/criu/include/mem.h
+++ b/criu/include/mem.h
@@ -4,6 +4,9 @@ 
 #include <stdbool.h>
 #include "int.h"
 #include "vma.pb-c.h"
+#include "pid.h"
+#include "proc_parse.h"
+#include "stats.pb-c.h"
 
 struct parasite_ctl;
 struct vm_area_list;
@@ -12,8 +15,10 @@  struct pstree_item;
 struct vma_area;
 
 struct mem_dump_ctl {
-	bool	pre_dump;
-	bool	lazy;
+	bool			pre_dump;
+	bool			lazy;
+	struct proc_pid_stat	*stat;
+	StatsEntry		*parent_se;
 };
 
 extern bool vma_has_guard_gap_hidden(struct vma_area *vma);
diff --git a/criu/mem.c b/criu/mem.c
index 4c6942a11..63df25d54 100644
--- a/criu/mem.c
+++ b/criu/mem.c
@@ -30,9 +30,11 @@ 
 #include "fault-injection.h"
 #include "prctl.h"
 #include <compel/compel.h>
+#include "proc_parse.h"
 
 #include "protobuf.h"
 #include "images/pagemap.pb-c.h"
+#include "images/stats.pb-c.h"
 
 static int task_reset_dirty_track(int pid)
 {
@@ -290,6 +292,50 @@  static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
 	return ret;
 }
 
+static int detect_pid_reuse(struct pstree_item *item,
+			    struct proc_pid_stat* pps,
+			    StatsEntry *parent_se)
+{
+	struct proc_pid_stat pps_buf;
+	unsigned long long tps; /* ticks per second */
+	int ret;
+
+	tps = sysconf(_SC_CLK_TCK);
+	if (tps == -1) {
+		pr_perror("Failed to get clock ticks via sysconf");
+		return -1;
+	}
+
+	if (!pps) {
+		pps = &pps_buf;
+		ret = parse_pid_stat(item->pid->real, pps);
+		if (ret < 0)
+			return -1;
+	}
+
+	if (!parent_se) {
+		pr_perror("No parent stats, see errors in get_parent_stats");
+		return -1;
+	}
+
+	if (parent_se->dump->has_dump_uptime) {
+		unsigned long long dump_ticks;
+
+		dump_ticks = parent_se->dump->dump_uptime/(USEC_PER_SEC/tps);
+
+		if (pps->start_time >= dump_ticks) {
+			/* Print "*" if unsure */
+			pr_warn("Pid reuse%s detected for pid %d\n",
+				pps_buf.start_time == dump_ticks ? "*" : "",
+				item->pid->real);
+			return 1;
+		}
+	} else
+		pr_warn_once("Parent image has no dump timestamp, " \
+			     "pid reuse detection is OFF!\n");
+	return 0;
+}
+
 static int __parasite_dump_pages_seized(struct pstree_item *item,
 		struct parasite_dump_pages_args *args,
 		struct vm_area_list *vma_area_list,
@@ -303,6 +349,7 @@  static int __parasite_dump_pages_seized(struct pstree_item *item,
 	int ret = -1;
 	unsigned cpp_flags = 0;
 	unsigned long pmc_size;
+	int possible_pid_reuse = 0;
 
 	if (opts.check_only)
 		return 0;
@@ -360,6 +407,14 @@  static int __parasite_dump_pages_seized(struct pstree_item *item,
 			xfer.parent = NULL + 1;
 	}
 
+	if (xfer.parent) {
+		possible_pid_reuse = detect_pid_reuse(item, mdc->stat,
+						      mdc->parent_se);
+		if (possible_pid_reuse == -1)
+			goto out_xfer;
+	}
+
+
 	/*
 	 * Step 1 -- generate the pagemap
 	 */
@@ -386,7 +441,7 @@  static int __parasite_dump_pages_seized(struct pstree_item *item,
 		else {
 again:
 			ret = generate_iovs(vma_area, pp, map, &off,
-				has_parent);
+				has_parent && !possible_pid_reuse);
 			if (ret == -EAGAIN) {
 				BUG_ON(!(pp->flags & PP_CHUNK_MODE));
 
diff --git a/criu/stats.c b/criu/stats.c
index 4823e48de..4d646c104 100644
--- a/criu/stats.c
+++ b/criu/stats.c
@@ -213,7 +213,7 @@  void write_stats(int what)
 		display_stats(what, &stats);
 }
 
-__maybe_unused StatsEntry *get_parent_stats(void)
+StatsEntry *get_parent_stats(void)
 {
 	struct cr_img *img;
 	StatsEntry *se;

Comments

Pavel Tikhomirov Feb. 16, 2018, 8:16 a.m.
Please drop v5.

On 02/16/2018 10:55 AM, Pavel Tikhomirov wrote:
> From: ptikhomirov <ptikhomirov@virtuozzo.com>
> 
> We have a problem when a pid is reused between consequent dumps we can't
> understand if pagemap and pages from images of parent dump are invalid
> to restore these pid already. That can lead even to wrong memory
> restored for these pid, see the test in last patch.
> 
> So these is a try do separate processes with (likely) invalid previous
> memory dump from processes with 100% valid previous dump.
> 
> For that we use the value of /proc/<pid>/stat's start_time and also the
> timestamp of each (pre)dump. If the start time is strictly less than the
> timestamp, that means that the pagemap for these pid from previous dump
> is valid - was done for exactly the same process.
> 
> Creation time is in centiseconds by default so if predump is really fast
> (<1csec) we can have false negative decisions for some processes, but in
> case of long running processes we are fine.
> 
> https://jira.sw.ru/browse/PSBM-67502
> 
> v2: remove __maybe_unused for get_parent_stats; fix get_parent_stats to
> have static typing; print warning only if unsure; check has_dump_uptime
> v3: read parent stats from image only once; reuse stat from previous
> parse_pid_stat call on dump
> v4: move code to function; use unsigned long long for ticks; put
> proc_pid_stat on mem_dump_ctl; print warning on all pid-reuse cases
> v5: free parent's stats entry properly, pass it in arguments to
> (pre_)dump_one_task
> 
> Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
> ---
>   criu/cr-dump.c     | 26 +++++++++++++++++++++----
>   criu/include/mem.h |  9 +++++++--
>   criu/mem.c         | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
>   criu/stats.c       |  2 +-
>   4 files changed, 86 insertions(+), 8 deletions(-)
> 
> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
> index 9e8425504..354818fb4 100644
> --- a/criu/cr-dump.c
> +++ b/criu/cr-dump.c
> @@ -1229,7 +1229,7 @@ static int assign_parasite_pids(struct pstree_item *item, struct parasite_dump_m
>   	return 0;
>   }
>   
> -static int pre_dump_one_task(struct pstree_item *item)
> +static int pre_dump_one_task(struct pstree_item *item, StatsEntry *parent_se)
>   {
>   	pid_t pid = item->pid->real;
>   	struct vm_area_list vmas;
> @@ -1289,6 +1289,8 @@ static int pre_dump_one_task(struct pstree_item *item)
>   
>   	mdc.pre_dump = true;
>   	mdc.lazy = false;
> +	mdc.stat = NULL;
> +	mdc.parent_se = parent_se;
>   
>   	ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl);
>   	if (ret)
> @@ -1307,7 +1309,7 @@ static int pre_dump_one_task(struct pstree_item *item)
>   	goto err_free;
>   }
>   
> -static int dump_one_task(struct pstree_item *item)
> +static int dump_one_task(struct pstree_item *item, StatsEntry *parent_se)
>   {
>   	pid_t pid = item->pid->real;
>   	struct vm_area_list vmas;
> @@ -1448,6 +1450,8 @@ static int dump_one_task(struct pstree_item *item)
>   
>   	mdc.pre_dump = false;
>   	mdc.lazy = opts.lazy_pages;
> +	mdc.stat = &pps_buf;
> +	mdc.parent_se = parent_se;
>   
>   	ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl);
>   	if (ret)
> @@ -1635,6 +1639,7 @@ static int cr_pre_dump_finish(int ret)
>   int cr_pre_dump_tasks(pid_t pid)
>   {
>   	struct pstree_item *item;
> +	StatsEntry *parent_se;
>   	int ret = -1;
>   
>   	if (opts.remote && push_snapshot_id() < 0) {
> @@ -1698,10 +1703,16 @@ int cr_pre_dump_tasks(pid_t pid)
>   	if (collect_namespaces(false) < 0)
>   		goto err;
>   
> +	/* Errors handled later in detect_pid_reuse */
> +	parent_se = get_parent_stats();
> +
>   	for_each_pstree_item(item)
> -		if (pre_dump_one_task(item))
> +		if (pre_dump_one_task(item, parent_se))
>   			goto err;
>   
> +	if (parent_se)
> +		stats_entry__free_unpacked(parent_se, NULL);
> +
>   	ret = cr_dump_shmem();
>   	if (ret)
>   		goto err;
> @@ -1832,6 +1843,7 @@ int cr_dump_tasks(pid_t pid)
>   {
>   	InventoryEntry he = INVENTORY_ENTRY__INIT;
>   	struct pstree_item *item;
> +	StatsEntry *parent_se;
>   	int pre_dump_ret = 0;
>   	int ret = -1;
>   
> @@ -1925,11 +1937,17 @@ int cr_dump_tasks(pid_t pid)
>   	if (collect_seccomp_filters() < 0)
>   		goto err;
>   
> +	/* Errors handled later in detect_pid_reuse */
> +	parent_se = get_parent_stats();
> +
>   	for_each_pstree_item(item) {
> -		if (dump_one_task(item))
> +		if (dump_one_task(item, parent_se))
>   			goto err;
>   	}
>   
> +	if (parent_se)
> +		stats_entry__free_unpacked(parent_se, NULL);
> +
>   	/*
>   	 * It may happen that a process has completed but its files in
>   	 * /proc/PID/ are still open by another process. If the PID has been
> diff --git a/criu/include/mem.h b/criu/include/mem.h
> index bb897c599..e61e175a8 100644
> --- a/criu/include/mem.h
> +++ b/criu/include/mem.h
> @@ -4,6 +4,9 @@
>   #include <stdbool.h>
>   #include "int.h"
>   #include "vma.pb-c.h"
> +#include "pid.h"
> +#include "proc_parse.h"
> +#include "stats.pb-c.h"
>   
>   struct parasite_ctl;
>   struct vm_area_list;
> @@ -12,8 +15,10 @@ struct pstree_item;
>   struct vma_area;
>   
>   struct mem_dump_ctl {
> -	bool	pre_dump;
> -	bool	lazy;
> +	bool			pre_dump;
> +	bool			lazy;
> +	struct proc_pid_stat	*stat;
> +	StatsEntry		*parent_se;
>   };
>   
>   extern bool vma_has_guard_gap_hidden(struct vma_area *vma);
> diff --git a/criu/mem.c b/criu/mem.c
> index 4c6942a11..63df25d54 100644
> --- a/criu/mem.c
> +++ b/criu/mem.c
> @@ -30,9 +30,11 @@
>   #include "fault-injection.h"
>   #include "prctl.h"
>   #include <compel/compel.h>
> +#include "proc_parse.h"
>   
>   #include "protobuf.h"
>   #include "images/pagemap.pb-c.h"
> +#include "images/stats.pb-c.h"
>   
>   static int task_reset_dirty_track(int pid)
>   {
> @@ -290,6 +292,50 @@ static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
>   	return ret;
>   }
>   
> +static int detect_pid_reuse(struct pstree_item *item,
> +			    struct proc_pid_stat* pps,
> +			    StatsEntry *parent_se)
> +{
> +	struct proc_pid_stat pps_buf;
> +	unsigned long long tps; /* ticks per second */
> +	int ret;
> +
> +	tps = sysconf(_SC_CLK_TCK);
> +	if (tps == -1) {
> +		pr_perror("Failed to get clock ticks via sysconf");
> +		return -1;
> +	}
> +
> +	if (!pps) {
> +		pps = &pps_buf;
> +		ret = parse_pid_stat(item->pid->real, pps);
> +		if (ret < 0)
> +			return -1;
> +	}
> +
> +	if (!parent_se) {
> +		pr_perror("No parent stats, see errors in get_parent_stats");
> +		return -1;
> +	}
> +
> +	if (parent_se->dump->has_dump_uptime) {
> +		unsigned long long dump_ticks;
> +
> +		dump_ticks = parent_se->dump->dump_uptime/(USEC_PER_SEC/tps);
> +
> +		if (pps->start_time >= dump_ticks) {
> +			/* Print "*" if unsure */
> +			pr_warn("Pid reuse%s detected for pid %d\n",
> +				pps_buf.start_time == dump_ticks ? "*" : "",
> +				item->pid->real);
> +			return 1;
> +		}
> +	} else
> +		pr_warn_once("Parent image has no dump timestamp, " \
> +			     "pid reuse detection is OFF!\n");
> +	return 0;
> +}
> +
>   static int __parasite_dump_pages_seized(struct pstree_item *item,
>   		struct parasite_dump_pages_args *args,
>   		struct vm_area_list *vma_area_list,
> @@ -303,6 +349,7 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>   	int ret = -1;
>   	unsigned cpp_flags = 0;
>   	unsigned long pmc_size;
> +	int possible_pid_reuse = 0;
>   
>   	if (opts.check_only)
>   		return 0;
> @@ -360,6 +407,14 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>   			xfer.parent = NULL + 1;
>   	}
>   
> +	if (xfer.parent) {
> +		possible_pid_reuse = detect_pid_reuse(item, mdc->stat,
> +						      mdc->parent_se);
> +		if (possible_pid_reuse == -1)
> +			goto out_xfer;
> +	}
> +
> +
>   	/*
>   	 * Step 1 -- generate the pagemap
>   	 */
> @@ -386,7 +441,7 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>   		else {
>   again:
>   			ret = generate_iovs(vma_area, pp, map, &off,
> -				has_parent);
> +				has_parent && !possible_pid_reuse);
>   			if (ret == -EAGAIN) {
>   				BUG_ON(!(pp->flags & PP_CHUNK_MODE));
>   
> diff --git a/criu/stats.c b/criu/stats.c
> index 4823e48de..4d646c104 100644
> --- a/criu/stats.c
> +++ b/criu/stats.c
> @@ -213,7 +213,7 @@ void write_stats(int what)
>   		display_stats(what, &stats);
>   }
>   
> -__maybe_unused StatsEntry *get_parent_stats(void)
> +StatsEntry *get_parent_stats(void)
>   {
>   	struct cr_img *img;
>   	StatsEntry *se;
>