[Devel] sched/numa: Fix use-after-free bug in the task_numa_compare

Submitted by Kirill Tkhai on April 13, 2017, 10:09 a.m.

Details

Message ID 149207810119.32507.6506683695489060804.stgit@localhost.localdomain
State New
Series "sched/numa: Fix use-after-free bug in the task_numa_compare"
Headers show

Commit Message

Kirill Tkhai April 13, 2017, 10:09 a.m.
ms commit 1dff76b92f69 by Gavin Guo <gavin.guo@canonical.com>

The following message can be observed on the Ubuntu v3.13.0-65 with KASan
backported:

  ==================================================================
  BUG: KASan: use after free in task_numa_find_cpu+0x64c/0x890 at addr ffff880dd393ecd8
  Read of size 8 by task qemu-system-x86/3998900
  =============================================================================
  BUG kmalloc-128 (Tainted: G    B        ): kasan: bad access detected
  -----------------------------------------------------------------------------

  INFO: Allocated in task_numa_fault+0xc1b/0xed0 age=41980 cpu=18 pid=3998890
        __slab_alloc+0x4f8/0x560
        __kmalloc+0x1eb/0x280
        task_numa_fault+0xc1b/0xed0
        do_numa_page+0x192/0x200
        handle_mm_fault+0x808/0x1160
        __do_page_fault+0x218/0x750
        do_page_fault+0x1a/0x70
        page_fault+0x28/0x30
        SyS_poll+0x66/0x1a0
        system_call_fastpath+0x1a/0x1f
  INFO: Freed in task_numa_free+0x1d2/0x200 age=62 cpu=18 pid=0
        __slab_free+0x2ab/0x3f0
        kfree+0x161/0x170
        task_numa_free+0x1d2/0x200
        finish_task_switch+0x1d2/0x210
        __schedule+0x5d4/0xc60
        schedule_preempt_disabled+0x40/0xc0
        cpu_startup_entry+0x2da/0x340
        start_secondary+0x28f/0x360
  Call Trace:
   [<ffffffff81a6ce35>] dump_stack+0x45/0x56
   [<ffffffff81244aed>] print_trailer+0xfd/0x170
   [<ffffffff8124ac36>] object_err+0x36/0x40
   [<ffffffff8124cbf9>] kasan_report_error+0x1e9/0x3a0
   [<ffffffff8124d260>] kasan_report+0x40/0x50
   [<ffffffff810dda7c>] ? task_numa_find_cpu+0x64c/0x890
   [<ffffffff8124bee9>] __asan_load8+0x69/0xa0
   [<ffffffff814f5c38>] ? find_next_bit+0xd8/0x120
   [<ffffffff810dda7c>] task_numa_find_cpu+0x64c/0x890
   [<ffffffff810de16c>] task_numa_migrate+0x4ac/0x7b0
   [<ffffffff810de523>] numa_migrate_preferred+0xb3/0xc0
   [<ffffffff810e0b88>] task_numa_fault+0xb88/0xed0
   [<ffffffff8120ef02>] do_numa_page+0x192/0x200
   [<ffffffff81211038>] handle_mm_fault+0x808/0x1160
   [<ffffffff810d7dbd>] ? sched_clock_cpu+0x10d/0x160
   [<ffffffff81068c52>] ? native_load_tls+0x82/0xa0
   [<ffffffff81a7bd68>] __do_page_fault+0x218/0x750
   [<ffffffff810c2186>] ? hrtimer_try_to_cancel+0x76/0x160
   [<ffffffff81a6f5e7>] ? schedule_hrtimeout_range_clock.part.24+0xf7/0x1c0
   [<ffffffff81a7c2ba>] do_page_fault+0x1a/0x70
   [<ffffffff81a772e8>] page_fault+0x28/0x30
   [<ffffffff8128cbd4>] ? do_sys_poll+0x1c4/0x6d0
   [<ffffffff810e64f6>] ? enqueue_task_fair+0x4b6/0xaa0
   [<ffffffff810233c9>] ? sched_clock+0x9/0x10
   [<ffffffff810cf70a>] ? resched_task+0x7a/0xc0
   [<ffffffff810d0663>] ? check_preempt_curr+0xb3/0x130
   [<ffffffff8128b5c0>] ? poll_select_copy_remaining+0x170/0x170
   [<ffffffff810d3bc0>] ? wake_up_state+0x10/0x20
   [<ffffffff8112a28f>] ? drop_futex_key_refs.isra.14+0x1f/0x90
   [<ffffffff8112d40e>] ? futex_requeue+0x3de/0xba0
   [<ffffffff8112e49e>] ? do_futex+0xbe/0x8f0
   [<ffffffff81022c89>] ? read_tsc+0x9/0x20
   [<ffffffff8111bd9d>] ? ktime_get_ts+0x12d/0x170
   [<ffffffff8108f699>] ? timespec_add_safe+0x59/0xe0
   [<ffffffff8128d1f6>] SyS_poll+0x66/0x1a0
   [<ffffffff81a830dd>] system_call_fastpath+0x1a/0x1f

As commit 1effd9f19324 ("sched/numa: Fix unsafe get_task_struct() in
task_numa_assign()") points out, the rcu_read_lock() cannot protect the
task_struct from being freed in the finish_task_switch(). And the bug
happens in the process of calculation of imp which requires the access of
p->numa_faults being freed in the following path:

do_exit()
        current->flags |= PF_EXITING;
    release_task()
        ~~delayed_put_task_struct()~~
    schedule()
    ...
    ...
rq->curr = next;
    context_switch()
        finish_task_switch()
            put_task_struct()
                __put_task_struct()
                    task_numa_free()

The fix here to get_task_struct() early before end of dst_rq->lock to
protect the calculation process and also put_task_struct() in the
corresponding point if finally the dst_rq->curr somehow cannot be
assigned.

Additional credit to Liang Chen who helped fix the error logic and add the
put_task_struct() to the place it missed.

Signed-off-by: Gavin Guo <gavin.guo@canonical.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: jay.vosburgh@canonical.com
Cc: liang.chen@canonical.com
Link: http://lkml.kernel.org/r/1453264618-17645-1-git-send-email-gavin.guo@canonical.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

In scope of https://jira.sw.ru/browse/PSBM-62208

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
 kernel/sched/fair.c |   29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

Patch hide | download patch | download mbox

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index daf709e40b6..4fde0d42a95 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1388,8 +1388,6 @@  static void task_numa_assign(struct task_numa_env *env,
 {
 	if (env->best_task)
 		put_task_struct(env->best_task);
-	if (p)
-		get_task_struct(p);
 
 	env->best_task = p;
 	env->best_imp = imp;
@@ -1411,19 +1409,29 @@  static void task_numa_compare(struct task_numa_env *env,
 	long dst_load, src_load;
 	long load;
 	long imp = (groupimp > 0) ? groupimp : taskimp;
+	bool assigned = false;
 
 	rcu_read_lock();
 	raw_spin_lock_irq(&dst_rq->lock);
 	cur = dst_rq->curr;
 	/*
-	 * No need to move the exiting task, and this ensures that ->curr
-	 * wasn't reaped and thus get_task_struct() in task_numa_assign()
-	 * is safe under RCU read lock.
-	 * Note that rcu_read_lock() itself can't protect from the final
-	 * put_task_struct() after the last schedule().
+	 * No need to move the exiting task or idle task.
 	 */
 	if ((cur->flags & PF_EXITING) || is_idle_task(cur))
 		cur = NULL;
+	else {
+		/*
+		 * The task_struct must be protected here to protect the
+		 * p->numa_faults access in the task_weight since the
+		 * numa_faults could already be freed in the following path:
+		 * finish_task_switch()
+		 *     --> put_task_struct()
+		 *         --> __put_task_struct()
+		 *             --> task_numa_free()
+		 */
+		get_task_struct(cur);
+	}
+
 	raw_spin_unlock_irq(&dst_rq->lock);
 
 	/*
@@ -1513,9 +1521,16 @@  static void task_numa_compare(struct task_numa_env *env,
 		goto unlock;
 
 assign:
+	assigned = true;
 	task_numa_assign(env, cur, imp);
 unlock:
 	rcu_read_unlock();
+	/*
+	 * The dst_rq->curr isn't assigned. The protection for task_struct is
+	 * finished.
+	 */
+	if (cur && !assigned)
+		put_task_struct(cur);
 }
 
 static void task_numa_find_cpu(struct task_numa_env *env,

Comments

Konstantin Khorenko April 13, 2017, 10:58 a.m.
May be apply following patch as well?
It mostly reverts your patch back.

commit bac7857319bcf7fed329a10bb760053e761115c0
Author: Oleg Nesterov <oleg@redhat.com>
Date:   Wed May 18 21:57:33 2016 +0200

     sched/fair: Use task_rcu_dereference()

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

On 04/13/2017 01:09 PM, Kirill Tkhai wrote:
> ms commit 1dff76b92f69 by Gavin Guo <gavin.guo@canonical.com>
>
> The following message can be observed on the Ubuntu v3.13.0-65 with KASan
> backported:
>
>   ==================================================================
>   BUG: KASan: use after free in task_numa_find_cpu+0x64c/0x890 at addr ffff880dd393ecd8
>   Read of size 8 by task qemu-system-x86/3998900
>   =============================================================================
>   BUG kmalloc-128 (Tainted: G    B        ): kasan: bad access detected
>   -----------------------------------------------------------------------------
>
>   INFO: Allocated in task_numa_fault+0xc1b/0xed0 age=41980 cpu=18 pid=3998890
>         __slab_alloc+0x4f8/0x560
>         __kmalloc+0x1eb/0x280
>         task_numa_fault+0xc1b/0xed0
>         do_numa_page+0x192/0x200
>         handle_mm_fault+0x808/0x1160
>         __do_page_fault+0x218/0x750
>         do_page_fault+0x1a/0x70
>         page_fault+0x28/0x30
>         SyS_poll+0x66/0x1a0
>         system_call_fastpath+0x1a/0x1f
>   INFO: Freed in task_numa_free+0x1d2/0x200 age=62 cpu=18 pid=0
>         __slab_free+0x2ab/0x3f0
>         kfree+0x161/0x170
>         task_numa_free+0x1d2/0x200
>         finish_task_switch+0x1d2/0x210
>         __schedule+0x5d4/0xc60
>         schedule_preempt_disabled+0x40/0xc0
>         cpu_startup_entry+0x2da/0x340
>         start_secondary+0x28f/0x360
>   Call Trace:
>    [<ffffffff81a6ce35>] dump_stack+0x45/0x56
>    [<ffffffff81244aed>] print_trailer+0xfd/0x170
>    [<ffffffff8124ac36>] object_err+0x36/0x40
>    [<ffffffff8124cbf9>] kasan_report_error+0x1e9/0x3a0
>    [<ffffffff8124d260>] kasan_report+0x40/0x50
>    [<ffffffff810dda7c>] ? task_numa_find_cpu+0x64c/0x890
>    [<ffffffff8124bee9>] __asan_load8+0x69/0xa0
>    [<ffffffff814f5c38>] ? find_next_bit+0xd8/0x120
>    [<ffffffff810dda7c>] task_numa_find_cpu+0x64c/0x890
>    [<ffffffff810de16c>] task_numa_migrate+0x4ac/0x7b0
>    [<ffffffff810de523>] numa_migrate_preferred+0xb3/0xc0
>    [<ffffffff810e0b88>] task_numa_fault+0xb88/0xed0
>    [<ffffffff8120ef02>] do_numa_page+0x192/0x200
>    [<ffffffff81211038>] handle_mm_fault+0x808/0x1160
>    [<ffffffff810d7dbd>] ? sched_clock_cpu+0x10d/0x160
>    [<ffffffff81068c52>] ? native_load_tls+0x82/0xa0
>    [<ffffffff81a7bd68>] __do_page_fault+0x218/0x750
>    [<ffffffff810c2186>] ? hrtimer_try_to_cancel+0x76/0x160
>    [<ffffffff81a6f5e7>] ? schedule_hrtimeout_range_clock.part.24+0xf7/0x1c0
>    [<ffffffff81a7c2ba>] do_page_fault+0x1a/0x70
>    [<ffffffff81a772e8>] page_fault+0x28/0x30
>    [<ffffffff8128cbd4>] ? do_sys_poll+0x1c4/0x6d0
>    [<ffffffff810e64f6>] ? enqueue_task_fair+0x4b6/0xaa0
>    [<ffffffff810233c9>] ? sched_clock+0x9/0x10
>    [<ffffffff810cf70a>] ? resched_task+0x7a/0xc0
>    [<ffffffff810d0663>] ? check_preempt_curr+0xb3/0x130
>    [<ffffffff8128b5c0>] ? poll_select_copy_remaining+0x170/0x170
>    [<ffffffff810d3bc0>] ? wake_up_state+0x10/0x20
>    [<ffffffff8112a28f>] ? drop_futex_key_refs.isra.14+0x1f/0x90
>    [<ffffffff8112d40e>] ? futex_requeue+0x3de/0xba0
>    [<ffffffff8112e49e>] ? do_futex+0xbe/0x8f0
>    [<ffffffff81022c89>] ? read_tsc+0x9/0x20
>    [<ffffffff8111bd9d>] ? ktime_get_ts+0x12d/0x170
>    [<ffffffff8108f699>] ? timespec_add_safe+0x59/0xe0
>    [<ffffffff8128d1f6>] SyS_poll+0x66/0x1a0
>    [<ffffffff81a830dd>] system_call_fastpath+0x1a/0x1f
>
> As commit 1effd9f19324 ("sched/numa: Fix unsafe get_task_struct() in
> task_numa_assign()") points out, the rcu_read_lock() cannot protect the
> task_struct from being freed in the finish_task_switch(). And the bug
> happens in the process of calculation of imp which requires the access of
> p->numa_faults being freed in the following path:
>
> do_exit()
>         current->flags |= PF_EXITING;
>     release_task()
>         ~~delayed_put_task_struct()~~
>     schedule()
>     ...
>     ...
> rq->curr = next;
>     context_switch()
>         finish_task_switch()
>             put_task_struct()
>                 __put_task_struct()
>                     task_numa_free()
>
> The fix here to get_task_struct() early before end of dst_rq->lock to
> protect the calculation process and also put_task_struct() in the
> corresponding point if finally the dst_rq->curr somehow cannot be
> assigned.
>
> Additional credit to Liang Chen who helped fix the error logic and add the
> put_task_struct() to the place it missed.
>
> Signed-off-by: Gavin Guo <gavin.guo@canonical.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Linus Torvalds <torvalds@linux-foundation.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Rik van Riel <riel@redhat.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: jay.vosburgh@canonical.com
> Cc: liang.chen@canonical.com
> Link: http://lkml.kernel.org/r/1453264618-17645-1-git-send-email-gavin.guo@canonical.com
> Signed-off-by: Ingo Molnar <mingo@kernel.org>
>
> In scope of https://jira.sw.ru/browse/PSBM-62208
>
> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> ---
>  kernel/sched/fair.c |   29 ++++++++++++++++++++++-------
>  1 file changed, 22 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index daf709e40b6..4fde0d42a95 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1388,8 +1388,6 @@ static void task_numa_assign(struct task_numa_env *env,
>  {
>  	if (env->best_task)
>  		put_task_struct(env->best_task);
> -	if (p)
> -		get_task_struct(p);
>
>  	env->best_task = p;
>  	env->best_imp = imp;
> @@ -1411,19 +1409,29 @@ static void task_numa_compare(struct task_numa_env *env,
>  	long dst_load, src_load;
>  	long load;
>  	long imp = (groupimp > 0) ? groupimp : taskimp;
> +	bool assigned = false;
>
>  	rcu_read_lock();
>  	raw_spin_lock_irq(&dst_rq->lock);
>  	cur = dst_rq->curr;
>  	/*
> -	 * No need to move the exiting task, and this ensures that ->curr
> -	 * wasn't reaped and thus get_task_struct() in task_numa_assign()
> -	 * is safe under RCU read lock.
> -	 * Note that rcu_read_lock() itself can't protect from the final
> -	 * put_task_struct() after the last schedule().
> +	 * No need to move the exiting task or idle task.
>  	 */
>  	if ((cur->flags & PF_EXITING) || is_idle_task(cur))
>  		cur = NULL;
> +	else {
> +		/*
> +		 * The task_struct must be protected here to protect the
> +		 * p->numa_faults access in the task_weight since the
> +		 * numa_faults could already be freed in the following path:
> +		 * finish_task_switch()
> +		 *     --> put_task_struct()
> +		 *         --> __put_task_struct()
> +		 *             --> task_numa_free()
> +		 */
> +		get_task_struct(cur);
> +	}
> +
>  	raw_spin_unlock_irq(&dst_rq->lock);
>
>  	/*
> @@ -1513,9 +1521,16 @@ static void task_numa_compare(struct task_numa_env *env,
>  		goto unlock;
>
>  assign:
> +	assigned = true;
>  	task_numa_assign(env, cur, imp);
>  unlock:
>  	rcu_read_unlock();
> +	/*
> +	 * The dst_rq->curr isn't assigned. The protection for task_struct is
> +	 * finished.
> +	 */
> +	if (cur && !assigned)
> +		put_task_struct(cur);
>  }
>
>  static void task_numa_find_cpu(struct task_numa_env *env,
>
> .
>
Kirill Tkhai April 13, 2017, 11 a.m.
On 13.04.2017 13:58, Konstantin Khorenko wrote:
> May be apply following patch as well?
> It mostly reverts your patch back.

It has a patch-dependence, so I didn't choose it. Which way you prefer?
 
> commit bac7857319bcf7fed329a10bb760053e761115c0
> Author: Oleg Nesterov <oleg@redhat.com>
> Date:   Wed May 18 21:57:33 2016 +0200
> 
>     sched/fair: Use task_rcu_dereference()
> 
> -- 
> Best regards,
> 
> Konstantin Khorenko,
> Virtuozzo Linux Kernel Team
> 
> On 04/13/2017 01:09 PM, Kirill Tkhai wrote:
>> ms commit 1dff76b92f69 by Gavin Guo <gavin.guo@canonical.com>
>>
>> The following message can be observed on the Ubuntu v3.13.0-65 with KASan
>> backported:
>>
>>   ==================================================================
>>   BUG: KASan: use after free in task_numa_find_cpu+0x64c/0x890 at addr ffff880dd393ecd8
>>   Read of size 8 by task qemu-system-x86/3998900
>>   =============================================================================
>>   BUG kmalloc-128 (Tainted: G    B        ): kasan: bad access detected
>>   -----------------------------------------------------------------------------
>>
>>   INFO: Allocated in task_numa_fault+0xc1b/0xed0 age=41980 cpu=18 pid=3998890
>>         __slab_alloc+0x4f8/0x560
>>         __kmalloc+0x1eb/0x280
>>         task_numa_fault+0xc1b/0xed0
>>         do_numa_page+0x192/0x200
>>         handle_mm_fault+0x808/0x1160
>>         __do_page_fault+0x218/0x750
>>         do_page_fault+0x1a/0x70
>>         page_fault+0x28/0x30
>>         SyS_poll+0x66/0x1a0
>>         system_call_fastpath+0x1a/0x1f
>>   INFO: Freed in task_numa_free+0x1d2/0x200 age=62 cpu=18 pid=0
>>         __slab_free+0x2ab/0x3f0
>>         kfree+0x161/0x170
>>         task_numa_free+0x1d2/0x200
>>         finish_task_switch+0x1d2/0x210
>>         __schedule+0x5d4/0xc60
>>         schedule_preempt_disabled+0x40/0xc0
>>         cpu_startup_entry+0x2da/0x340
>>         start_secondary+0x28f/0x360
>>   Call Trace:
>>    [<ffffffff81a6ce35>] dump_stack+0x45/0x56
>>    [<ffffffff81244aed>] print_trailer+0xfd/0x170
>>    [<ffffffff8124ac36>] object_err+0x36/0x40
>>    [<ffffffff8124cbf9>] kasan_report_error+0x1e9/0x3a0
>>    [<ffffffff8124d260>] kasan_report+0x40/0x50
>>    [<ffffffff810dda7c>] ? task_numa_find_cpu+0x64c/0x890
>>    [<ffffffff8124bee9>] __asan_load8+0x69/0xa0
>>    [<ffffffff814f5c38>] ? find_next_bit+0xd8/0x120
>>    [<ffffffff810dda7c>] task_numa_find_cpu+0x64c/0x890
>>    [<ffffffff810de16c>] task_numa_migrate+0x4ac/0x7b0
>>    [<ffffffff810de523>] numa_migrate_preferred+0xb3/0xc0
>>    [<ffffffff810e0b88>] task_numa_fault+0xb88/0xed0
>>    [<ffffffff8120ef02>] do_numa_page+0x192/0x200
>>    [<ffffffff81211038>] handle_mm_fault+0x808/0x1160
>>    [<ffffffff810d7dbd>] ? sched_clock_cpu+0x10d/0x160
>>    [<ffffffff81068c52>] ? native_load_tls+0x82/0xa0
>>    [<ffffffff81a7bd68>] __do_page_fault+0x218/0x750
>>    [<ffffffff810c2186>] ? hrtimer_try_to_cancel+0x76/0x160
>>    [<ffffffff81a6f5e7>] ? schedule_hrtimeout_range_clock.part.24+0xf7/0x1c0
>>    [<ffffffff81a7c2ba>] do_page_fault+0x1a/0x70
>>    [<ffffffff81a772e8>] page_fault+0x28/0x30
>>    [<ffffffff8128cbd4>] ? do_sys_poll+0x1c4/0x6d0
>>    [<ffffffff810e64f6>] ? enqueue_task_fair+0x4b6/0xaa0
>>    [<ffffffff810233c9>] ? sched_clock+0x9/0x10
>>    [<ffffffff810cf70a>] ? resched_task+0x7a/0xc0
>>    [<ffffffff810d0663>] ? check_preempt_curr+0xb3/0x130
>>    [<ffffffff8128b5c0>] ? poll_select_copy_remaining+0x170/0x170
>>    [<ffffffff810d3bc0>] ? wake_up_state+0x10/0x20
>>    [<ffffffff8112a28f>] ? drop_futex_key_refs.isra.14+0x1f/0x90
>>    [<ffffffff8112d40e>] ? futex_requeue+0x3de/0xba0
>>    [<ffffffff8112e49e>] ? do_futex+0xbe/0x8f0
>>    [<ffffffff81022c89>] ? read_tsc+0x9/0x20
>>    [<ffffffff8111bd9d>] ? ktime_get_ts+0x12d/0x170
>>    [<ffffffff8108f699>] ? timespec_add_safe+0x59/0xe0
>>    [<ffffffff8128d1f6>] SyS_poll+0x66/0x1a0
>>    [<ffffffff81a830dd>] system_call_fastpath+0x1a/0x1f
>>
>> As commit 1effd9f19324 ("sched/numa: Fix unsafe get_task_struct() in
>> task_numa_assign()") points out, the rcu_read_lock() cannot protect the
>> task_struct from being freed in the finish_task_switch(). And the bug
>> happens in the process of calculation of imp which requires the access of
>> p->numa_faults being freed in the following path:
>>
>> do_exit()
>>         current->flags |= PF_EXITING;
>>     release_task()
>>         ~~delayed_put_task_struct()~~
>>     schedule()
>>     ...
>>     ...
>> rq->curr = next;
>>     context_switch()
>>         finish_task_switch()
>>             put_task_struct()
>>                 __put_task_struct()
>>                     task_numa_free()
>>
>> The fix here to get_task_struct() early before end of dst_rq->lock to
>> protect the calculation process and also put_task_struct() in the
>> corresponding point if finally the dst_rq->curr somehow cannot be
>> assigned.
>>
>> Additional credit to Liang Chen who helped fix the error logic and add the
>> put_task_struct() to the place it missed.
>>
>> Signed-off-by: Gavin Guo <gavin.guo@canonical.com>
>> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>> Cc: Andrea Arcangeli <aarcange@redhat.com>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: Hugh Dickins <hughd@google.com>
>> Cc: Linus Torvalds <torvalds@linux-foundation.org>
>> Cc: Mel Gorman <mgorman@suse.de>
>> Cc: Peter Zijlstra <peterz@infradead.org>
>> Cc: Rik van Riel <riel@redhat.com>
>> Cc: Thomas Gleixner <tglx@linutronix.de>
>> Cc: jay.vosburgh@canonical.com
>> Cc: liang.chen@canonical.com
>> Link: http://lkml.kernel.org/r/1453264618-17645-1-git-send-email-gavin.guo@canonical.com
>> Signed-off-by: Ingo Molnar <mingo@kernel.org>
>>
>> In scope of https://jira.sw.ru/browse/PSBM-62208
>>
>> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
>> ---
>>  kernel/sched/fair.c |   29 ++++++++++++++++++++++-------
>>  1 file changed, 22 insertions(+), 7 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index daf709e40b6..4fde0d42a95 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -1388,8 +1388,6 @@ static void task_numa_assign(struct task_numa_env *env,
>>  {
>>      if (env->best_task)
>>          put_task_struct(env->best_task);
>> -    if (p)
>> -        get_task_struct(p);
>>
>>      env->best_task = p;
>>      env->best_imp = imp;
>> @@ -1411,19 +1409,29 @@ static void task_numa_compare(struct task_numa_env *env,
>>      long dst_load, src_load;
>>      long load;
>>      long imp = (groupimp > 0) ? groupimp : taskimp;
>> +    bool assigned = false;
>>
>>      rcu_read_lock();
>>      raw_spin_lock_irq(&dst_rq->lock);
>>      cur = dst_rq->curr;
>>      /*
>> -     * No need to move the exiting task, and this ensures that ->curr
>> -     * wasn't reaped and thus get_task_struct() in task_numa_assign()
>> -     * is safe under RCU read lock.
>> -     * Note that rcu_read_lock() itself can't protect from the final
>> -     * put_task_struct() after the last schedule().
>> +     * No need to move the exiting task or idle task.
>>       */
>>      if ((cur->flags & PF_EXITING) || is_idle_task(cur))
>>          cur = NULL;
>> +    else {
>> +        /*
>> +         * The task_struct must be protected here to protect the
>> +         * p->numa_faults access in the task_weight since the
>> +         * numa_faults could already be freed in the following path:
>> +         * finish_task_switch()
>> +         *     --> put_task_struct()
>> +         *         --> __put_task_struct()
>> +         *             --> task_numa_free()
>> +         */
>> +        get_task_struct(cur);
>> +    }
>> +
>>      raw_spin_unlock_irq(&dst_rq->lock);
>>
>>      /*
>> @@ -1513,9 +1521,16 @@ static void task_numa_compare(struct task_numa_env *env,
>>          goto unlock;
>>
>>  assign:
>> +    assigned = true;
>>      task_numa_assign(env, cur, imp);
>>  unlock:
>>      rcu_read_unlock();
>> +    /*
>> +     * The dst_rq->curr isn't assigned. The protection for task_struct is
>> +     * finished.
>> +     */
>> +    if (cur && !assigned)
>> +        put_task_struct(cur);
>>  }
>>
>>  static void task_numa_find_cpu(struct task_numa_env *env,
>>
>> .
>>
Konstantin Khorenko April 13, 2017, 11:01 a.m.
On 04/13/2017 01:58 PM, Konstantin Khorenko wrote:
> May be apply following patch as well?
Surely with preparation patch as well:

150593b sched/api: Introduce task_rcu_dereference() and try_get_task_struct()

> It mostly reverts your patch back.
>
> commit bac7857319bcf7fed329a10bb760053e761115c0
> Author: Oleg Nesterov <oleg@redhat.com>
> Date:   Wed May 18 21:57:33 2016 +0200
>
>      sched/fair: Use task_rcu_dereference()
>
> --
> Best regards,
>
> Konstantin Khorenko,
> Virtuozzo Linux Kernel Team
>
> On 04/13/2017 01:09 PM, Kirill Tkhai wrote:
>> ms commit 1dff76b92f69 by Gavin Guo <gavin.guo@canonical.com>
>>
>> The following message can be observed on the Ubuntu v3.13.0-65 with KASan
>> backported:
>>
>>   ==================================================================
>>   BUG: KASan: use after free in task_numa_find_cpu+0x64c/0x890 at addr ffff880dd393ecd8
>>   Read of size 8 by task qemu-system-x86/3998900
>>   =============================================================================
>>   BUG kmalloc-128 (Tainted: G    B        ): kasan: bad access detected
>>   -----------------------------------------------------------------------------
>>
>>   INFO: Allocated in task_numa_fault+0xc1b/0xed0 age=41980 cpu=18 pid=3998890
>>         __slab_alloc+0x4f8/0x560
>>         __kmalloc+0x1eb/0x280
>>         task_numa_fault+0xc1b/0xed0
>>         do_numa_page+0x192/0x200
>>         handle_mm_fault+0x808/0x1160
>>         __do_page_fault+0x218/0x750
>>         do_page_fault+0x1a/0x70
>>         page_fault+0x28/0x30
>>         SyS_poll+0x66/0x1a0
>>         system_call_fastpath+0x1a/0x1f
>>   INFO: Freed in task_numa_free+0x1d2/0x200 age=62 cpu=18 pid=0
>>         __slab_free+0x2ab/0x3f0
>>         kfree+0x161/0x170
>>         task_numa_free+0x1d2/0x200
>>         finish_task_switch+0x1d2/0x210
>>         __schedule+0x5d4/0xc60
>>         schedule_preempt_disabled+0x40/0xc0
>>         cpu_startup_entry+0x2da/0x340
>>         start_secondary+0x28f/0x360
>>   Call Trace:
>>    [<ffffffff81a6ce35>] dump_stack+0x45/0x56
>>    [<ffffffff81244aed>] print_trailer+0xfd/0x170
>>    [<ffffffff8124ac36>] object_err+0x36/0x40
>>    [<ffffffff8124cbf9>] kasan_report_error+0x1e9/0x3a0
>>    [<ffffffff8124d260>] kasan_report+0x40/0x50
>>    [<ffffffff810dda7c>] ? task_numa_find_cpu+0x64c/0x890
>>    [<ffffffff8124bee9>] __asan_load8+0x69/0xa0
>>    [<ffffffff814f5c38>] ? find_next_bit+0xd8/0x120
>>    [<ffffffff810dda7c>] task_numa_find_cpu+0x64c/0x890
>>    [<ffffffff810de16c>] task_numa_migrate+0x4ac/0x7b0
>>    [<ffffffff810de523>] numa_migrate_preferred+0xb3/0xc0
>>    [<ffffffff810e0b88>] task_numa_fault+0xb88/0xed0
>>    [<ffffffff8120ef02>] do_numa_page+0x192/0x200
>>    [<ffffffff81211038>] handle_mm_fault+0x808/0x1160
>>    [<ffffffff810d7dbd>] ? sched_clock_cpu+0x10d/0x160
>>    [<ffffffff81068c52>] ? native_load_tls+0x82/0xa0
>>    [<ffffffff81a7bd68>] __do_page_fault+0x218/0x750
>>    [<ffffffff810c2186>] ? hrtimer_try_to_cancel+0x76/0x160
>>    [<ffffffff81a6f5e7>] ? schedule_hrtimeout_range_clock.part.24+0xf7/0x1c0
>>    [<ffffffff81a7c2ba>] do_page_fault+0x1a/0x70
>>    [<ffffffff81a772e8>] page_fault+0x28/0x30
>>    [<ffffffff8128cbd4>] ? do_sys_poll+0x1c4/0x6d0
>>    [<ffffffff810e64f6>] ? enqueue_task_fair+0x4b6/0xaa0
>>    [<ffffffff810233c9>] ? sched_clock+0x9/0x10
>>    [<ffffffff810cf70a>] ? resched_task+0x7a/0xc0
>>    [<ffffffff810d0663>] ? check_preempt_curr+0xb3/0x130
>>    [<ffffffff8128b5c0>] ? poll_select_copy_remaining+0x170/0x170
>>    [<ffffffff810d3bc0>] ? wake_up_state+0x10/0x20
>>    [<ffffffff8112a28f>] ? drop_futex_key_refs.isra.14+0x1f/0x90
>>    [<ffffffff8112d40e>] ? futex_requeue+0x3de/0xba0
>>    [<ffffffff8112e49e>] ? do_futex+0xbe/0x8f0
>>    [<ffffffff81022c89>] ? read_tsc+0x9/0x20
>>    [<ffffffff8111bd9d>] ? ktime_get_ts+0x12d/0x170
>>    [<ffffffff8108f699>] ? timespec_add_safe+0x59/0xe0
>>    [<ffffffff8128d1f6>] SyS_poll+0x66/0x1a0
>>    [<ffffffff81a830dd>] system_call_fastpath+0x1a/0x1f
>>
>> As commit 1effd9f19324 ("sched/numa: Fix unsafe get_task_struct() in
>> task_numa_assign()") points out, the rcu_read_lock() cannot protect the
>> task_struct from being freed in the finish_task_switch(). And the bug
>> happens in the process of calculation of imp which requires the access of
>> p->numa_faults being freed in the following path:
>>
>> do_exit()
>>         current->flags |= PF_EXITING;
>>     release_task()
>>         ~~delayed_put_task_struct()~~
>>     schedule()
>>     ...
>>     ...
>> rq->curr = next;
>>     context_switch()
>>         finish_task_switch()
>>             put_task_struct()
>>                 __put_task_struct()
>>                     task_numa_free()
>>
>> The fix here to get_task_struct() early before end of dst_rq->lock to
>> protect the calculation process and also put_task_struct() in the
>> corresponding point if finally the dst_rq->curr somehow cannot be
>> assigned.
>>
>> Additional credit to Liang Chen who helped fix the error logic and add the
>> put_task_struct() to the place it missed.
>>
>> Signed-off-by: Gavin Guo <gavin.guo@canonical.com>
>> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>> Cc: Andrea Arcangeli <aarcange@redhat.com>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: Hugh Dickins <hughd@google.com>
>> Cc: Linus Torvalds <torvalds@linux-foundation.org>
>> Cc: Mel Gorman <mgorman@suse.de>
>> Cc: Peter Zijlstra <peterz@infradead.org>
>> Cc: Rik van Riel <riel@redhat.com>
>> Cc: Thomas Gleixner <tglx@linutronix.de>
>> Cc: jay.vosburgh@canonical.com
>> Cc: liang.chen@canonical.com
>> Link: http://lkml.kernel.org/r/1453264618-17645-1-git-send-email-gavin.guo@canonical.com
>> Signed-off-by: Ingo Molnar <mingo@kernel.org>
>>
>> In scope of https://jira.sw.ru/browse/PSBM-62208
>>
>> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
>> ---
>>  kernel/sched/fair.c |   29 ++++++++++++++++++++++-------
>>  1 file changed, 22 insertions(+), 7 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index daf709e40b6..4fde0d42a95 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -1388,8 +1388,6 @@ static void task_numa_assign(struct task_numa_env *env,
>>  {
>>  	if (env->best_task)
>>  		put_task_struct(env->best_task);
>> -	if (p)
>> -		get_task_struct(p);
>>
>>  	env->best_task = p;
>>  	env->best_imp = imp;
>> @@ -1411,19 +1409,29 @@ static void task_numa_compare(struct task_numa_env *env,
>>  	long dst_load, src_load;
>>  	long load;
>>  	long imp = (groupimp > 0) ? groupimp : taskimp;
>> +	bool assigned = false;
>>
>>  	rcu_read_lock();
>>  	raw_spin_lock_irq(&dst_rq->lock);
>>  	cur = dst_rq->curr;
>>  	/*
>> -	 * No need to move the exiting task, and this ensures that ->curr
>> -	 * wasn't reaped and thus get_task_struct() in task_numa_assign()
>> -	 * is safe under RCU read lock.
>> -	 * Note that rcu_read_lock() itself can't protect from the final
>> -	 * put_task_struct() after the last schedule().
>> +	 * No need to move the exiting task or idle task.
>>  	 */
>>  	if ((cur->flags & PF_EXITING) || is_idle_task(cur))
>>  		cur = NULL;
>> +	else {
>> +		/*
>> +		 * The task_struct must be protected here to protect the
>> +		 * p->numa_faults access in the task_weight since the
>> +		 * numa_faults could already be freed in the following path:
>> +		 * finish_task_switch()
>> +		 *     --> put_task_struct()
>> +		 *         --> __put_task_struct()
>> +		 *             --> task_numa_free()
>> +		 */
>> +		get_task_struct(cur);
>> +	}
>> +
>>  	raw_spin_unlock_irq(&dst_rq->lock);
>>
>>  	/*
>> @@ -1513,9 +1521,16 @@ static void task_numa_compare(struct task_numa_env *env,
>>  		goto unlock;
>>
>>  assign:
>> +	assigned = true;
>>  	task_numa_assign(env, cur, imp);
>>  unlock:
>>  	rcu_read_unlock();
>> +	/*
>> +	 * The dst_rq->curr isn't assigned. The protection for task_struct is
>> +	 * finished.
>> +	 */
>> +	if (cur && !assigned)
>> +		put_task_struct(cur);
>>  }
>>
>>  static void task_numa_find_cpu(struct task_numa_env *env,
>>
>> .
>>