[RHEL8,COMMIT] oom: resurrect berserker mode

Submitted by Konstantin Khorenko on Dec. 21, 2020, 4:49 p.m.

Details

Message ID 202012211649.0BLGnWTg227012@finist-co8.sw.ru
State New
Series "Series without cover letter"
Headers show

Commit Message

Konstantin Khorenko Dec. 21, 2020, 4:49 p.m.
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.3
------>
commit fd0c0eddf619ad335ed60170bdb7024e6df818d6
Author: Vladimir Davydov <vdavydov.dev@gmail.com>
Date:   Mon Dec 21 19:49:32 2020 +0300

    oom: resurrect berserker mode
    
    The logic behind the OOM berserker is the same as in PCS6: if processes
    are killed by the oom killer too often (less than sysctl
    vm.oom_relaxation apart, 1 sec by default), we increase "rage" (min -10,
    max 20) and kill 1 << "rage" youngest worst processes if "rage" >= 0.
    
    https://jira.sw.ru/browse/PSBM-17930
    
    Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
    
    [aryabinin: vz8 rebase]
    Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
---
 include/linux/memcontrol.h |  6 +++
 include/linux/oom.h        |  5 +++
 mm/oom_kill.c              | 97 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 108 insertions(+)

Patch hide | download patch | download mbox

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 917e6ab9b1ab..d4d49160ee40 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -258,6 +258,12 @@  struct mem_cgroup {
 	/* OOM-Killer disable */
 	int		oom_kill_disable;
 
+	int		oom_rage;
+	spinlock_t	oom_rage_lock;
+	unsigned long	prev_oom_time;
+	unsigned long	oom_time;
+
+
 	/* memory.events */
 	struct cgroup_file events_file;
 
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 0dc94a5bad9e..8ae3aaa00a0f 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -22,6 +22,10 @@  enum oom_constraint {
 	CONSTRAINT_MEMCG,
 };
 
+
+#define OOM_BASE_RAGE	(-10)	/* value oom_rage is reset to */
+#define OOM_MAX_RAGE	20	/* oom_rage is not incremented past this */
+
 /*
  * Details of the page allocation that triggered the oom killer that are used to
  * determine what should be killed.
@@ -51,6 +55,7 @@  struct oom_control {
 	unsigned long totalpages;
 	struct task_struct *chosen;
 	unsigned long chosen_points;
+	unsigned long overdraft;
 
 	/* Used to print the constraint info. */
 	enum oom_constraint constraint;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index fe34e85f62ec..353fb22da98c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -53,6 +53,7 @@ 
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks;
+int sysctl_oom_relaxation = HZ;	/* jiffies; ooms closer than this raise rage */
 
 DEFINE_MUTEX(oom_lock);
 
@@ -955,6 +956,101 @@  static int oom_kill_memcg_member(struct task_struct *task, void *message)
 	return 0;
 }
 
+/*
+ * Kill more processes if oom happens too often in this context.
+ *
+ * The context is oc->memcg, or root_mem_cgroup when oc->memcg is NULL.
+ * Nothing extra is killed while the context's rage is negative.
+ */
+static void oom_berserker(struct oom_control *oc)
+{
+	static DEFINE_RATELIMIT_STATE(berserker_rs,
+				DEFAULT_RATELIMIT_INTERVAL,
+				DEFAULT_RATELIMIT_BURST);
+	struct task_struct *p;
+	struct mem_cgroup *memcg;
+	unsigned long now = jiffies;
+	int rage;
+	int killed = 0;
+
+	memcg = oc->memcg ?: root_mem_cgroup;
+
+	/* NOTE(review): oom_rage_lock init is not in this patch - confirm. */
+	spin_lock(&memcg->oom_rage_lock);
+	memcg->prev_oom_time = memcg->oom_time;
+	memcg->oom_time = now;
+	/*
+	 * Bump rage (capped at OOM_MAX_RAGE) if the previous oom was at most
+	 * sysctl_oom_relaxation jiffies ago, else reset it to OOM_BASE_RAGE.
+	 *
+	 * previous oom                            this oom (unfinished)
+	 * +++++++++----------------------------++++++++
+	 *        ^                                    ^
+	 *  prev_oom_time  <<oom_relaxation>>      oom_time
+	 */
+	if (time_after(now, memcg->prev_oom_time + sysctl_oom_relaxation))
+		memcg->oom_rage = OOM_BASE_RAGE;
+	else if (memcg->oom_rage < OOM_MAX_RAGE)
+		memcg->oom_rage++;
+	rage = memcg->oom_rage;
+	spin_unlock(&memcg->oom_rage_lock);
+
+	if (rage < 0)
+		return;
+
+	/*
+	 * So, we are in rage. Kill up to (1 << rage) youngest tasks that
+	 * are as bad as the victim.
+	 */
+	read_lock(&tasklist_lock);
+	list_for_each_entry_reverse(p, &init_task.tasks, tasks) {
+		unsigned long tsk_points;
+		unsigned long tsk_overdraft;
+
+		if (!p->mm || test_tsk_thread_flag(p, TIF_MEMDIE) ||
+			fatal_signal_pending(p) || p->flags & PF_EXITING ||
+			oom_unkillable_task(p, oc->memcg, oc->nodemask))
+			continue;
+
+		tsk_points = oom_badness(p, oc->memcg, oc->nodemask,
+					oc->totalpages, &tsk_overdraft);
+		if (tsk_overdraft < oc->overdraft)
+			continue;
+
+		/*
+		 * oom_badness never returns a negative value, even if
+		 * oom_score_adj would make badness so, instead it returns 1.
+		 * So we do not kill a task with badness 1 if the victim has
+		 * badness > 1 so as not to risk killing protected tasks.
+		 */
+		if (tsk_points <= 1 && oc->chosen_points > 1)
+			continue;
+
+		/* Tasks with equal normalized scores count as equally bad. */
+		if (tsk_points * 1000 / oc->totalpages <
+			oc->chosen_points * 1000 / oc->totalpages)
+			continue;
+
+		if (__ratelimit(&berserker_rs)) {
+			task_lock(p);
+			pr_err("Rage kill process %d (%s)\n",
+				task_pid_nr(p), p->comm);
+			task_unlock(p);
+		}
+
+		count_vm_event(OOM_KILL);
+		memcg_memory_event(memcg, MEMCG_OOM_KILL);
+
+		do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
+
+		if (++killed >= 1 << rage)
+			break;
+	}
+	read_unlock(&tasklist_lock);
+
+	pr_err("OOM killer in rage %d: %d tasks killed\n", rage, killed);
+}
+
 static void oom_kill_process(struct oom_control *oc, const char *message)
 {
 	struct task_struct *victim = oc->chosen;
@@ -998,6 +1094,7 @@  static void oom_kill_process(struct oom_control *oc, const char *message)
 				      (void*)message);
 		mem_cgroup_put(oom_group);
 	}
+	oom_berserker(oc);
 }
 
 /*