[2/2] rpc: Report back first error message on failure

Submitted by Pavel Emelianov on Aug. 24, 2016, 6:12 p.m.

Details

Message ID 57BDE373.6000108@virtuozzo.com
State Rejected
Series "Bring some verbosity to RPC"
Headers show

Commit Message

Pavel Emelianov Aug. 24, 2016, 6:12 p.m.
When running criu in swrk mode the client typically wants to know
the reason of failure. Right now criu reports back NOTHING but the
fact that dump/restore/etc fails. We've tried to address this by
introducing the cr-errno engine, but it doesn't seem to be informative
enough and is hard to maintain -- adding new errno-s is boring :(

I propose to report back the first message with ERROR level upon
failrure as __typically__ the very first error message indicates
that proceeding is impossible and criu rolls back (generating more
error messages, so it's crucial to know the very first one).

If we ever meet the situation that the first pr_err/pr_perror doesn't
cause criu to exit, this printing should be fixed to be pr_warn.

Signed-off-by: Pavel Emelyanov <xemul@virtuozzo.com>
---
 criu/cr-service.c  |  7 +++++++
 criu/include/log.h |  2 ++
 criu/log.c         | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 images/rpc.proto   |  1 +
 4 files changed, 59 insertions(+)

Patch hide | download patch | download mbox

diff --git a/criu/cr-service.c b/criu/cr-service.c
index a2b4027..c29e6ef 100644
--- a/criu/cr-service.c
+++ b/criu/cr-service.c
@@ -119,6 +119,8 @@  static void set_resp_err(CriuResp *resp)
 		resp->has_cr_errno = true;
 		resp->cr_errno = errn;
 	}
+
+	resp->cr_errmsg = log_first_err();
 }
 
 static void send_criu_err(int sk, char *msg)
@@ -289,6 +291,11 @@  static int setup_opts_from_req(int sk, CriuOpts *req)
 		goto err;
 	}
 
+	if (log_keep_err()) {
+		pr_perror("Can't tune log");
+		goto err;
+	}
+
 	/* checking flags from client */
 	if (req->has_leave_running && req->leave_running)
 		opts.final_state = TASK_ALIVE;
diff --git a/criu/include/log.h b/criu/include/log.h
index fe53a7c..d2b25ec 100644
--- a/criu/include/log.h
+++ b/criu/include/log.h
@@ -9,6 +9,8 @@  extern int log_init(const char *output);
 extern void log_fini(void);
 extern int log_init_by_pid(void);
 extern void log_closedir(void);
+extern int log_keep_err(void);
+extern char *log_first_err(void);
 
 extern void log_set_fd(int fd);
 extern int log_get_fd(void);
diff --git a/criu/log.c b/criu/log.c
index 6c5d807..64ce8c5 100644
--- a/criu/log.c
+++ b/criu/log.c
@@ -17,6 +17,9 @@ 
 #include "util.h"
 #include "cr_options.h"
 #include "servicefd.h"
+#include "rst-malloc.h"
+#include "lock.h"
+#include "string.h"
 
 #define DEFAULT_LOGFD		STDERR_FILENO
 /* Enable timestamps if verbosity is increased from default */
@@ -72,6 +75,49 @@  static void reset_buf_off(void)
 		buf_off = 0;
 }
 
+/*
+ * Keeping the very first error messsage for RPC to report back.
+ */
+struct str_and_lock {
+	mutex_t l;
+	char s[1024];
+};
+
+static struct str_and_lock *first_err;
+
+int log_keep_err(void)
+{
+	first_err = shmalloc(sizeof(struct str_and_lock));
+	if (first_err == NULL)
+		return -1;
+
+	mutex_init(&first_err->l);
+	first_err->s[0] = '\0';
+	return 0;
+}
+
+static void log_note_err(char *msg)
+{
+	if (first_err && first_err->s[0] == '\0') {
+		/*
+		 * In any action other than restore this locking is
+		 * actually not required, but ... it's error path
+		 * anyway, so it doesn't make much sence to try hard
+		 * and optimize this out.
+		 */
+		mutex_lock(&first_err->l);
+		if (first_err->s[0] == '\0')
+			strlcpy(first_err->s, msg, sizeof(first_err->s));
+		mutex_unlock(&first_err->l);
+	}
+}
+
+char *log_first_err(void)
+{
+	BUG_ON(!first_err);
+	return first_err->s[0] == '\0' ? NULL : first_err->s;
+}
+
 int log_init(const char *output)
 {
 	int new_logfd, fd;
@@ -178,6 +224,9 @@  static void __print_on_level(unsigned int loglevel, const char *format, va_list
 		off += ret;
 	}
 	errno =  __errno;
+
+	if (loglevel == LOG_ERROR)
+		log_note_err(buffer + buf_off);
 }
 
 void print_on_level(unsigned int loglevel, const char *format, ...)
diff --git a/images/rpc.proto b/images/rpc.proto
index 11d169b..9b36178 100644
--- a/images/rpc.proto
+++ b/images/rpc.proto
@@ -187,4 +187,5 @@  message criu_resp {
 
 	optional int32			cr_errno	= 7;
 	optional criu_features		features	= 8;
+	optional string			cr_errmsg	= 9;
 }

Comments

Adrian Reber Sept. 5, 2016, 3:19 p.m.
This patch breaks p.haul:

15:18:07.038: 30677: 	Issuing pre-dump command to service
Error (criu/log.c:117): BUG at criu/log.c:117
15:18:08.686: 30677: CRIU RPC error (0/4)
Traceback (most recent call last):
  File "./p.haul", line 47, in <module>
    worker.start_migration()
  File "/share/p.haul/phaul/iters.py", line 159, in start_migration
    self.__start_live_migration()
  File "/share/p.haul/phaul/iters.py", line 197, in __start_live_migration
    self.criu_connection, self.fs)
  File "/share/p.haul/phaul/criu_cr.py", line 14, in criu_predump
    resp = criu_connection.send_req(req)
  File "/share/p.haul/phaul/criu_api.py", line 70, in send_req
    return self._recv_resp()
  File "/share/p.haul/phaul/criu_api.py", line 57, in _recv_resp
    raise Exception("CRIU RPC error (%d/%d)" % (resp.type, self._last_req))
Exception: CRIU RPC error (0/4)

Running p.haul with '--no-pre-dump' does not show that error. Reverting
commit 2157799c also fixes the error.

		Adrian

On Wed, Aug 24, 2016 at 09:12:03PM +0300, Pavel Emelyanov wrote:
> When running criu in swrk mode the client typically wants to know
> the reason of failure. Right now criu reports back NOTHING but the
> fact that dump/restore/etc fails. We've tried to address this by
> introducing the cr-errno engine, but it doesn't seem to be informative
> enough and is hard to maintain -- adding new errno-s is boring :(
> 
> I propose to report back the first message with ERROR level upon
> failrure as __typically__ the very first error message indicates
> that proceeding is impossible and criu rolls back (generating more
> error messages, so it's crucial to know the very first one).
> 
> If we ever meet the situation that the first pr_err/pr_perror doesn't
> cause criu to exit, this printing should be fixed to be pr_warn.
> 
> Signed-off-by: Pavel Emelyanov <xemul@virtuozzo.com>
> ---
>  criu/cr-service.c  |  7 +++++++
>  criu/include/log.h |  2 ++
>  criu/log.c         | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
>  images/rpc.proto   |  1 +
>  4 files changed, 59 insertions(+)
> 
> diff --git a/criu/cr-service.c b/criu/cr-service.c
> index a2b4027..c29e6ef 100644
> --- a/criu/cr-service.c
> +++ b/criu/cr-service.c
> @@ -119,6 +119,8 @@ static void set_resp_err(CriuResp *resp)
>  		resp->has_cr_errno = true;
>  		resp->cr_errno = errn;
>  	}
> +
> +	resp->cr_errmsg = log_first_err();
>  }
>  
>  static void send_criu_err(int sk, char *msg)
> @@ -289,6 +291,11 @@ static int setup_opts_from_req(int sk, CriuOpts *req)
>  		goto err;
>  	}
>  
> +	if (log_keep_err()) {
> +		pr_perror("Can't tune log");
> +		goto err;
> +	}
> +
>  	/* checking flags from client */
>  	if (req->has_leave_running && req->leave_running)
>  		opts.final_state = TASK_ALIVE;
> diff --git a/criu/include/log.h b/criu/include/log.h
> index fe53a7c..d2b25ec 100644
> --- a/criu/include/log.h
> +++ b/criu/include/log.h
> @@ -9,6 +9,8 @@ extern int log_init(const char *output);
>  extern void log_fini(void);
>  extern int log_init_by_pid(void);
>  extern void log_closedir(void);
> +extern int log_keep_err(void);
> +extern char *log_first_err(void);
>  
>  extern void log_set_fd(int fd);
>  extern int log_get_fd(void);
> diff --git a/criu/log.c b/criu/log.c
> index 6c5d807..64ce8c5 100644
> --- a/criu/log.c
> +++ b/criu/log.c
> @@ -17,6 +17,9 @@
>  #include "util.h"
>  #include "cr_options.h"
>  #include "servicefd.h"
> +#include "rst-malloc.h"
> +#include "lock.h"
> +#include "string.h"
>  
>  #define DEFAULT_LOGFD		STDERR_FILENO
>  /* Enable timestamps if verbosity is increased from default */
> @@ -72,6 +75,49 @@ static void reset_buf_off(void)
>  		buf_off = 0;
>  }
>  
> +/*
> + * Keeping the very first error messsage for RPC to report back.
> + */
> +struct str_and_lock {
> +	mutex_t l;
> +	char s[1024];
> +};
> +
> +static struct str_and_lock *first_err;
> +
> +int log_keep_err(void)
> +{
> +	first_err = shmalloc(sizeof(struct str_and_lock));
> +	if (first_err == NULL)
> +		return -1;
> +
> +	mutex_init(&first_err->l);
> +	first_err->s[0] = '\0';
> +	return 0;
> +}
> +
> +static void log_note_err(char *msg)
> +{
> +	if (first_err && first_err->s[0] == '\0') {
> +		/*
> +		 * In any action other than restore this locking is
> +		 * actually not required, but ... it's error path
> +		 * anyway, so it doesn't make much sence to try hard
> +		 * and optimize this out.
> +		 */
> +		mutex_lock(&first_err->l);
> +		if (first_err->s[0] == '\0')
> +			strlcpy(first_err->s, msg, sizeof(first_err->s));
> +		mutex_unlock(&first_err->l);
> +	}
> +}
> +
> +char *log_first_err(void)
> +{
> +	BUG_ON(!first_err);
> +	return first_err->s[0] == '\0' ? NULL : first_err->s;
> +}
> +
>  int log_init(const char *output)
>  {
>  	int new_logfd, fd;
> @@ -178,6 +224,9 @@ static void __print_on_level(unsigned int loglevel, const char *format, va_list
>  		off += ret;
>  	}
>  	errno =  __errno;
> +
> +	if (loglevel == LOG_ERROR)
> +		log_note_err(buffer + buf_off);
>  }
>  
>  void print_on_level(unsigned int loglevel, const char *format, ...)
> diff --git a/images/rpc.proto b/images/rpc.proto
> index 11d169b..9b36178 100644
> --- a/images/rpc.proto
> +++ b/images/rpc.proto
> @@ -187,4 +187,5 @@ message criu_resp {
>  
>  	optional int32			cr_errno	= 7;
>  	optional criu_features		features	= 8;
> +	optional string			cr_errmsg	= 9;
>  }
> -- 
> 2.5.0
> 
> _______________________________________________
> CRIU mailing list
> CRIU@openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
Pavel Emelianov Sept. 6, 2016, 9:40 p.m.
On 09/05/2016 06:19 PM, Adrian Reber wrote:
> This patch breaks p.haul:
> 
> 15:18:07.038: 30677: 	Issuing pre-dump command to service
> Error (criu/log.c:117): BUG at criu/log.c:117

:( That's because of service calls fork() on pre-dump, child initializes
log to keep the message, but the responce is sent in parent process...

Hm...

Thanks for catching this, will think how to fix :)

-- Pavel

> 15:18:08.686: 30677: CRIU RPC error (0/4)
> Traceback (most recent call last):
>   File "./p.haul", line 47, in <module>
>     worker.start_migration()
>   File "/share/p.haul/phaul/iters.py", line 159, in start_migration
>     self.__start_live_migration()
>   File "/share/p.haul/phaul/iters.py", line 197, in __start_live_migration
>     self.criu_connection, self.fs)
>   File "/share/p.haul/phaul/criu_cr.py", line 14, in criu_predump
>     resp = criu_connection.send_req(req)
>   File "/share/p.haul/phaul/criu_api.py", line 70, in send_req
>     return self._recv_resp()
>   File "/share/p.haul/phaul/criu_api.py", line 57, in _recv_resp
>     raise Exception("CRIU RPC error (%d/%d)" % (resp.type, self._last_req))
> Exception: CRIU RPC error (0/4)
> 
> Running p.haul with '--no-pre-dump' does not show that error. Reverting
> commit 2157799c also fixes the error.
> 
> 		Adrian
> 
> On Wed, Aug 24, 2016 at 09:12:03PM +0300, Pavel Emelyanov wrote:
>> When running criu in swrk mode the client typically wants to know
>> the reason of failure. Right now criu reports back NOTHING but the
>> fact that dump/restore/etc fails. We've tried to address this by
>> introducing the cr-errno engine, but it doesn't seem to be informative
>> enough and is hard to maintain -- adding new errno-s is boring :(
>>
>> I propose to report back the first message with ERROR level upon
>> failrure as __typically__ the very first error message indicates
>> that proceeding is impossible and criu rolls back (generating more
>> error messages, so it's crucial to know the very first one).
>>
>> If we ever meet the situation that the first pr_err/pr_perror doesn't
>> cause criu to exit, this printing should be fixed to be pr_warn.
>>
>> Signed-off-by: Pavel Emelyanov <xemul@virtuozzo.com>
>> ---
>>  criu/cr-service.c  |  7 +++++++
>>  criu/include/log.h |  2 ++
>>  criu/log.c         | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
>>  images/rpc.proto   |  1 +
>>  4 files changed, 59 insertions(+)
>>
>> diff --git a/criu/cr-service.c b/criu/cr-service.c
>> index a2b4027..c29e6ef 100644
>> --- a/criu/cr-service.c
>> +++ b/criu/cr-service.c
>> @@ -119,6 +119,8 @@ static void set_resp_err(CriuResp *resp)
>>  		resp->has_cr_errno = true;
>>  		resp->cr_errno = errn;
>>  	}
>> +
>> +	resp->cr_errmsg = log_first_err();
>>  }
>>  
>>  static void send_criu_err(int sk, char *msg)
>> @@ -289,6 +291,11 @@ static int setup_opts_from_req(int sk, CriuOpts *req)
>>  		goto err;
>>  	}
>>  
>> +	if (log_keep_err()) {
>> +		pr_perror("Can't tune log");
>> +		goto err;
>> +	}
>> +
>>  	/* checking flags from client */
>>  	if (req->has_leave_running && req->leave_running)
>>  		opts.final_state = TASK_ALIVE;
>> diff --git a/criu/include/log.h b/criu/include/log.h
>> index fe53a7c..d2b25ec 100644
>> --- a/criu/include/log.h
>> +++ b/criu/include/log.h
>> @@ -9,6 +9,8 @@ extern int log_init(const char *output);
>>  extern void log_fini(void);
>>  extern int log_init_by_pid(void);
>>  extern void log_closedir(void);
>> +extern int log_keep_err(void);
>> +extern char *log_first_err(void);
>>  
>>  extern void log_set_fd(int fd);
>>  extern int log_get_fd(void);
>> diff --git a/criu/log.c b/criu/log.c
>> index 6c5d807..64ce8c5 100644
>> --- a/criu/log.c
>> +++ b/criu/log.c
>> @@ -17,6 +17,9 @@
>>  #include "util.h"
>>  #include "cr_options.h"
>>  #include "servicefd.h"
>> +#include "rst-malloc.h"
>> +#include "lock.h"
>> +#include "string.h"
>>  
>>  #define DEFAULT_LOGFD		STDERR_FILENO
>>  /* Enable timestamps if verbosity is increased from default */
>> @@ -72,6 +75,49 @@ static void reset_buf_off(void)
>>  		buf_off = 0;
>>  }
>>  
>> +/*
>> + * Keeping the very first error messsage for RPC to report back.
>> + */
>> +struct str_and_lock {
>> +	mutex_t l;
>> +	char s[1024];
>> +};
>> +
>> +static struct str_and_lock *first_err;
>> +
>> +int log_keep_err(void)
>> +{
>> +	first_err = shmalloc(sizeof(struct str_and_lock));
>> +	if (first_err == NULL)
>> +		return -1;
>> +
>> +	mutex_init(&first_err->l);
>> +	first_err->s[0] = '\0';
>> +	return 0;
>> +}
>> +
>> +static void log_note_err(char *msg)
>> +{
>> +	if (first_err && first_err->s[0] == '\0') {
>> +		/*
>> +		 * In any action other than restore this locking is
>> +		 * actually not required, but ... it's error path
>> +		 * anyway, so it doesn't make much sence to try hard
>> +		 * and optimize this out.
>> +		 */
>> +		mutex_lock(&first_err->l);
>> +		if (first_err->s[0] == '\0')
>> +			strlcpy(first_err->s, msg, sizeof(first_err->s));
>> +		mutex_unlock(&first_err->l);
>> +	}
>> +}
>> +
>> +char *log_first_err(void)
>> +{
>> +	BUG_ON(!first_err);
>> +	return first_err->s[0] == '\0' ? NULL : first_err->s;
>> +}
>> +
>>  int log_init(const char *output)
>>  {
>>  	int new_logfd, fd;
>> @@ -178,6 +224,9 @@ static void __print_on_level(unsigned int loglevel, const char *format, va_list
>>  		off += ret;
>>  	}
>>  	errno =  __errno;
>> +
>> +	if (loglevel == LOG_ERROR)
>> +		log_note_err(buffer + buf_off);
>>  }
>>  
>>  void print_on_level(unsigned int loglevel, const char *format, ...)
>> diff --git a/images/rpc.proto b/images/rpc.proto
>> index 11d169b..9b36178 100644
>> --- a/images/rpc.proto
>> +++ b/images/rpc.proto
>> @@ -187,4 +187,5 @@ message criu_resp {
>>  
>>  	optional int32			cr_errno	= 7;
>>  	optional criu_features		features	= 8;
>> +	optional string			cr_errmsg	= 9;
>>  }
>> -- 
>> 2.5.0
>>
>> _______________________________________________
>> CRIU mailing list
>> CRIU@openvz.org
>> https://lists.openvz.org/mailman/listinfo/criu
> .
>