[Devel,RH7,2/2] fence-watchdog: panic after 30 sec of reboot/halt

Submitted by Pavel Tikhomirov on July 19, 2017, 9:03 a.m.

Details

Message ID 20170719090313.4325-2-ptikhomirov@virtuozzo.com
State New
Series "Series without cover letter"
Headers show

Commit Message

Pavel Tikhomirov July 19, 2017, 9:03 a.m.
Since the reboot and halt actions are performed from a scheduled work
item, they may never happen if scheduling does not work properly. So
panic if the previous action has not succeeded within 30 seconds.

https://jira.sw.ru/browse/PSBM-54747
Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
---
 kernel/fence-watchdog.c | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

Patch hide | download patch | download mbox

diff --git a/kernel/fence-watchdog.c b/kernel/fence-watchdog.c
index 607045a..3ee5b89 100644
--- a/kernel/fence-watchdog.c
+++ b/kernel/fence-watchdog.c
@@ -42,7 +42,14 @@  const char *action_names[] = {"crash", "reboot", "halt", "netfilter", NULL};
 
 DEFINE_VVAR(volatile unsigned long, fence_wdog_jiffies64) = MAX_U64;
 static int fence_wdog_action = FENCE_WDOG_CRASH;
-static atomic_t not_fenced = ATOMIC_INIT(-1);
+
+enum {
+	NOT_FENCED = 0,
+	FENCED = 1,
+	FENCED_TIMEOUT = 2,
+};
+
+static atomic_t fence_stage = ATOMIC_INIT(NOT_FENCED);
 static char fence_wdog_log_path[PATH_MAX] = "/fence_wdog.log";
 
 #define MSG_LEN 32
@@ -114,19 +121,26 @@  static DECLARE_WORK(halt_or_reboot_work, do_halt_or_reboot);
 
 void fence_wdog_do_fence(void)
 {
-	if (fence_wdog_action == FENCE_WDOG_CRASH)
+	if (fence_wdog_action == FENCE_WDOG_CRASH ||
+			atomic_read(&fence_stage) == FENCED_TIMEOUT)
 		panic("fence-watchdog: %s\n",
 		      action_names[fence_wdog_action]);
 	else
 		schedule_work(&halt_or_reboot_work);
 }
 
+#define FENCE_WDOG_TIMEOUT 30
+
 inline int fence_wdog_check_timer(void)
 {
 	if (unlikely(get_jiffies_64() > fence_wdog_jiffies64 &&
 			fence_wdog_action != FENCE_WDOG_NETFILTER)) {
-		if (atomic_inc_not_zero(&not_fenced))
+		if (atomic_cmpxchg(&fence_stage, NOT_FENCED, FENCED) == NOT_FENCED
+		    || (get_jiffies_64() > fence_wdog_jiffies64
+		    + FENCE_WDOG_TIMEOUT * HZ
+		    && atomic_cmpxchg(&fence_stage, FENCED, FENCED_TIMEOUT) == FENCED))
 			fence_wdog_do_fence();
+
 		return 1;
 	}
 

Comments

Pavel Tikhomirov July 19, 2017, 2:08 p.m.
We schedule the work on system_wq, which has max_active=256. Quoting
Documentation/workqueue.txt:

@max_active determines the maximum number of execution contexts per
CPU which can be assigned to the work items of a wq.  For example,
with @max_active of 16, at most 16 work items of the wq can be
executing at the same time per CPU.

So each workqueue has a pool of worker processes handling the work items
scheduled on it, and one worker sleeping while it waits for fsync will
still allow other work items to run.

==
Second thing: I will try to use kernel_write instead of calling
file->f_op->write directly, and resend.

On 07/19/2017 12:03 PM, Pavel Tikhomirov wrote:
> As we do reboot and halt actions in scope of scheduled worker
> it can never happen if scheduling does not work properly, so
> panic in case that previous action was not successful.
> 
> https://jira.sw.ru/browse/PSBM-54747
> Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
> ---
>   kernel/fence-watchdog.c | 20 +++++++++++++++++---
>   1 file changed, 17 insertions(+), 3 deletions(-)
> 
> diff --git a/kernel/fence-watchdog.c b/kernel/fence-watchdog.c
> index 607045a..3ee5b89 100644
> --- a/kernel/fence-watchdog.c
> +++ b/kernel/fence-watchdog.c
> @@ -42,7 +42,14 @@ const char *action_names[] = {"crash", "reboot", "halt", "netfilter", NULL};
>   
>   DEFINE_VVAR(volatile unsigned long, fence_wdog_jiffies64) = MAX_U64;
>   static int fence_wdog_action = FENCE_WDOG_CRASH;
> -static atomic_t not_fenced = ATOMIC_INIT(-1);
> +
> +enum {
> +	NOT_FENCED = 0,
> +	FENCED = 1,
> +	FENCED_TIMEOUT = 2,
> +};
> +
> +static atomic_t fence_stage = ATOMIC_INIT(NOT_FENCED);
>   static char fence_wdog_log_path[PATH_MAX] = "/fence_wdog.log";
>   
>   #define MSG_LEN 32
> @@ -114,19 +121,26 @@ static DECLARE_WORK(halt_or_reboot_work, do_halt_or_reboot);
>   
>   void fence_wdog_do_fence(void)
>   {
> -	if (fence_wdog_action == FENCE_WDOG_CRASH)
> +	if (fence_wdog_action == FENCE_WDOG_CRASH ||
> +			atomic_read(&fence_stage) == FENCED_TIMEOUT)
>   		panic("fence-watchdog: %s\n",
>   		      action_names[fence_wdog_action]);
>   	else
>   		schedule_work(&halt_or_reboot_work);
>   }
>   
> +#define FENCE_WDOG_TIMEOUT 30
> +
>   inline int fence_wdog_check_timer(void)
>   {
>   	if (unlikely(get_jiffies_64() > fence_wdog_jiffies64 &&
>   			fence_wdog_action != FENCE_WDOG_NETFILTER)) {
> -		if (atomic_inc_not_zero(&not_fenced))
> +		if (atomic_cmpxchg(&fence_stage, NOT_FENCED, FENCED) == NOT_FENCED
> +		    || (get_jiffies_64() > fence_wdog_jiffies64
> +		    + FENCE_WDOG_TIMEOUT * HZ
> +		    && atomic_cmpxchg(&fence_stage, FENCED, FENCED_TIMEOUT) == FENCED))
>   			fence_wdog_do_fence();
> +
>   		return 1;
>   	}
>   
>