[Devel,rh7] mm: memcontrol: add memory.numa_migrate file

Submitted by Vladimir Davydov on Aug. 18, 2016, 12:44 p.m.

Details

Message ID 1471524291-6643-1-git-send-email-vdavydov@virtuozzo.com
State New
Series "mm: memcontrol: add memory.numa_migrate file"

Commit Message

Vladimir Davydov Aug. 18, 2016, 12:44 p.m.
The new file is supposed to be used for migrating pages accounted to a
memory cgroup to a particular set of NUMA nodes. The reason to add it is
that there is currently no API for migrating unmapped file pages used for
storing page cache (neither the migrate_pages syscall nor the cpuset
subsystem provides this functionality).

The file is added to the memory cgroup and has the following format:

  NODELIST[ MAX_SCAN]

where NODELIST is a comma-separated list of ranges N1-N2 specifying the set
of nodes to migrate pages of this cgroup to, and the optional MAX_SCAN
imposes a limit on the number of pages that can be migrated in one go.

The call may be interrupted by a signal, in which case -EINTR is returned.
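
For illustration, the migration could be triggered from userspace roughly as
follows (a minimal sketch; the cgroup path and the numbers are arbitrary
examples, not part of the patch):

  #include <fcntl.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
          /* Illustrative path and values. */
          const char *path = "/sys/fs/cgroup/memory/test/memory.numa_migrate";
          const char *req = "0-1 4096";   /* NODELIST "0-1", MAX_SCAN 4096 */
          int fd = open(path, O_WRONLY);

          if (fd < 0)
                  return 1;
          /* One write per request; -EINTR shows up as a failed write */
          if (write(fd, req, strlen(req)) < 0) {
                  close(fd);
                  return 1;
          }
          return close(fd) ? 1 : 0;
  }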

https://jira.sw.ru/browse/PSBM-50875

Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Igor Redko <redkoi@virtuozzo.com>
Cc: Konstantin Neumoin <kneumoin@virtuozzo.com>
---
 mm/memcontrol.c | 223 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 223 insertions(+)


diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e3a16b99ccc6..8c6c4fb9c153 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -54,6 +54,7 @@ 
 #include <linux/cpu.h>
 #include <linux/oom.h>
 #include <linux/virtinfo.h>
+#include <linux/migrate.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -5697,6 +5698,223 @@  static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
 	seq_putc(m, '\n');
 	return 0;
 }
+
+/*
+ * memcg_numa_migrate_new_page() private argument. @target_nodes specifies the
+ * set of nodes to allocate pages from. @current_node is the currently
+ * preferred node; it gets rotated after each allocation.
+ */
+struct memcg_numa_migrate_struct {
+	nodemask_t *target_nodes;
+	int current_node;
+};
+
+/*
+ * Used as an argument for migrate_pages(). Allocated pages are spread evenly
+ * among destination nodes.
+ */
+static struct page *memcg_numa_migrate_new_page(struct page *page,
+				unsigned long private, int **result)
+{
+	struct memcg_numa_migrate_struct *ms = (void *)private;
+	gfp_t gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_NORETRY | __GFP_NOWARN;
+
+	ms->current_node = next_node(ms->current_node, *ms->target_nodes);
+	if (ms->current_node >= MAX_NUMNODES) {
+		ms->current_node = first_node(*ms->target_nodes);
+		BUG_ON(ms->current_node >= MAX_NUMNODES);
+	}
+
+	return __alloc_pages_nodemask(gfp_mask, 0,
+			node_zonelist(ms->current_node, gfp_mask),
+			ms->target_nodes);
+}
+
+/*
+ * Isolate at most @nr_to_scan pages from @lruvec for further migration and
+ * store them in @dst. Returns the number of pages scanned. Return value of 0
+ * means that @lruvec is empty.
+ */
+static long memcg_numa_isolate_pages(struct lruvec *lruvec, enum lru_list lru,
+				     long nr_to_scan, struct list_head *dst)
+{
+	struct list_head *src = &lruvec->lists[lru];
+	struct zone *zone = lruvec_zone(lruvec);
+	long scanned = 0, taken = 0;
+
+	spin_lock_irq(&zone->lru_lock);
+	while (!list_empty(src) && scanned < nr_to_scan && taken < nr_to_scan) {
+		struct page *page = list_last_entry(src, struct page, lru);
+		int nr_pages;
+
+		VM_BUG_ON_PAGE(!PageLRU(page), page);
+
+		scanned++;
+
+		switch (__isolate_lru_page(page, ISOLATE_ASYNC_MIGRATE)) {
+		case 0:
+			nr_pages = hpage_nr_pages(page);
+			mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
+			list_move(&page->lru, dst);
+			taken += nr_pages;
+			break;
+
+		case -EBUSY:
+			list_move(&page->lru, src);
+			continue;
+
+		default:
+			BUG();
+		}
+	}
+	__mod_zone_page_state(zone, NR_LRU_BASE + lru, -taken);
+	__mod_zone_page_state(zone, NR_ISOLATED_ANON + is_file_lru(lru), taken);
+	spin_unlock_irq(&zone->lru_lock);
+
+	return scanned;
+}
+
+static long __memcg_numa_migrate_pages(struct lruvec *lruvec, enum lru_list lru,
+				       nodemask_t *target_nodes, long nr_to_scan)
+{
+	struct memcg_numa_migrate_struct ms = {
+		.target_nodes = target_nodes,
+		.current_node = -1,
+	};
+	LIST_HEAD(pages);
+	long total_scanned = 0;
+
+	/*
+	 * If no limit on the maximal number of migrated pages is specified,
+	 * assume the caller wants to migrate them all.
+	 */
+	if (nr_to_scan < 0)
+		nr_to_scan = mem_cgroup_get_lru_size(lruvec, lru);
+
+	while (total_scanned < nr_to_scan) {
+		int ret;
+		long scanned;
+
+		scanned = memcg_numa_isolate_pages(lruvec, lru,
+						   SWAP_CLUSTER_MAX, &pages);
+		if (!scanned)
+			break;
+
+		ret = migrate_pages(&pages, memcg_numa_migrate_new_page,
+				    (unsigned long)&ms, MIGRATE_ASYNC,
+				    MR_SYSCALL);
+		putback_lru_pages(&pages);
+		if (ret < 0)
+			return ret;
+
+		if (signal_pending(current))
+			return -EINTR;
+
+		total_scanned += scanned;
+	}
+
+	return total_scanned;
+}
+
+/*
+ * Migrate at most @nr_to_scan pages accounted to @memcg to @target_nodes.
+ * Pages are spread evenly among destination nodes. If @nr_to_scan is <= 0,
+ * then the function will attempt to migrate all pages accounted to @memcg.
+ */
+static int memcg_numa_migrate_pages(struct mem_cgroup *memcg,
+				    nodemask_t *target_nodes, long nr_to_scan)
+{
+	struct mem_cgroup *mi;
+	long total_scanned = 0;
+
+again:
+	for_each_mem_cgroup_tree(mi, memcg) {
+		struct zone *zone;
+
+		for_each_populated_zone(zone) {
+			struct lruvec *lruvec;
+			enum lru_list lru;
+			long scanned;
+
+			if (node_isset(zone_to_nid(zone), *target_nodes))
+				continue;
+
+			lruvec = mem_cgroup_zone_lruvec(zone, mi);
+			/*
+			 * For the sake of simplicity, do not attempt to migrate
+			 * unevictable pages. It should be fine as long as there
+			 * aren't too many of them, which is usually true.
+			 */
+			for_each_evictable_lru(lru) {
+				scanned = __memcg_numa_migrate_pages(lruvec,
+						lru, target_nodes,
+						nr_to_scan > 0 ?
+						SWAP_CLUSTER_MAX : -1);
+				if (scanned < 0) {
+					mem_cgroup_iter_break(memcg, mi);
+					return scanned;
+				}
+				total_scanned += scanned;
+			}
+		}
+	}
+
+	if (nr_to_scan > 0 && total_scanned < nr_to_scan)
+		goto again;
+
+	return 0;
+}
+
+/*
+ * The format of memory.numa_migrate is
+ *
+ *   NODELIST[ MAX_SCAN]
+ *
+ * where NODELIST is a comma-separated list of ranges N1-N2 specifying the set
+ * of nodes to migrate pages of this cgroup to, and the optional MAX_SCAN
+ * imposes a limit on the number of pages that can be migrated in one go.
+ *
+ * The call may be interrupted by a signal, in which case -EINTR is returned.
+ */
+static int memcg_numa_migrate_write(struct cgroup *cont,
+		struct cftype *cft, const char *buf)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	NODEMASK_ALLOC(nodemask_t, target_nodes, GFP_KERNEL);
+	const char *nodes_str = buf, *nr_str;
+	long nr_to_scan = -1;
+	int ret = -ENOMEM;
+
+	if (!target_nodes)
+		goto out;
+
+	nr_str = strchr(buf, ' ');
+	if (nr_str) {
+		nodes_str = kstrndup(buf, nr_str - buf, GFP_KERNEL);
+		if (!nodes_str)
+			goto out;
+		nr_str += 1;
+	}
+
+	ret = nodelist_parse(nodes_str, *target_nodes);
+	if (ret)
+		goto out;
+
+	ret = -EINVAL;
+	if (!nodes_subset(*target_nodes, node_states[N_MEMORY]))
+		goto out;
+
+	if (nr_str && (kstrtol(nr_str, 10, &nr_to_scan) || nr_to_scan <= 0))
+		goto out;
+
+	ret = memcg_numa_migrate_pages(memcg, target_nodes, nr_to_scan);
+out:
+	if (nodes_str != buf)
+		kfree(nodes_str);
+	NODEMASK_FREE(target_nodes);
+	return ret;
+}
+
 #endif /* CONFIG_NUMA */
 
 static inline void mem_cgroup_lru_names_not_uptodate(void)
@@ -6355,6 +6573,11 @@  static struct cftype mem_cgroup_files[] = {
 		.name = "numa_stat",
 		.read_seq_string = memcg_numa_stat_show,
 	},
+	{
+		.name = "numa_migrate",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write_string = memcg_numa_migrate_write,
+	},
 #endif
 #ifdef CONFIG_CLEANCACHE
 	{

Comments

Andrey Ryabinin Aug. 23, 2016, 9:57 a.m.
On 08/18/2016 03:44 PM, Vladimir Davydov wrote:
> The new file is supposed to be used for migrating pages accounted to a
> memory cgroup to a particular set of NUMA nodes. The reason to add it is
> that there is currently no API for migrating unmapped file pages used for
> storing page cache (neither the migrate_pages syscall nor the cpuset
> subsystem provides this functionality).
> 
> The file is added to the memory cgroup and has the following format:
> 
>   NODELIST[ MAX_SCAN]
> 
> where NODELIST is a comma-separated list of ranges N1-N2 specifying the set
> of nodes to migrate pages of this cgroup to, and the optional MAX_SCAN
> imposes a limit on the number of pages that can be migrated in one go.
> 
> The call may be interrupted by a signal, in which case -EINTR is returned.
> 
> https://jira.sw.ru/browse/PSBM-50875
> 
> Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
> Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
> Cc: Igor Redko <redkoi@virtuozzo.com>
> Cc: Konstantin Neumoin <kneumoin@virtuozzo.com>
> ---


echo "0 100" > /sys/fs/cgroup/memory/machine.slice/100/memory.numa_migrate

[  296.073002] BUG: soft lockup - CPU#1 stuck for 22s! [bash:4028]
[  296.073002] Modules linked in: veth xt_CHECKSUM iptable_mangle ip6t_REJECT iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT tun ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter 8021q garp mrp dm_mirror dm_region_hash dm_log ppdev crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd pcspkr sg parport_pc parport i2c_piix4 ip6_vzprivnet ip6_vznetstat pio_nfs ip_vznetstat ip_vzprivnet vziolimit vzevent vzlist vzstat vznetstat vznetdev vzmon nfsd auth_rpcgss nfs_acl lockd grace sunrpc vzdev bridge stp llc ip_tables ext4 mbcache jbd2 sd_mod crc_t10dif sr_mod crct10dif_generic cdrom ata_generic pata_acpi bochs_drm syscopyarea sysfillrect sysimgblt drm_kms_helper ttm drm crct10dif_pclmul crct10dif_common i2c_core serio_raw crc32c_intel ata_piix e1000 libata floppy
[  296.073002] irq event stamp: 440960
[  296.073002] hardirqs last  enabled at (440959): [<ffffffff823aadb3>] restore_args+0x0/0x30
[  296.073002] hardirqs last disabled at (440960): [<ffffffff823c39ed>] apic_timer_interrupt+0x6d/0x80
[  296.073002] softirqs last  enabled at (440958): [<ffffffff81158edd>] __do_softirq+0x3fd/0x7b0
[  296.073002] softirqs last disabled at (440953): [<ffffffff823c46bc>] call_softirq+0x1c/0x30
[  296.073002] CPU: 1 PID: 4028 Comm: bash ve: 0 Not tainted 3.10.0-327.28.2.ovz.17.1 #123 17.1
[  296.073002] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.2-0-g33fbe13 by qemu-project.org 04/01/2014
[  296.073002] task: ffff8801f1c95b00 ti: ffff8801f10b8000 task.ti: ffff8801f10b8000
[  296.073002] RIP: 0010:[<ffffffff8189d422>]  [<ffffffff8189d422>] find_first_bit+0xd2/0x140
[  296.073002] RSP: 0018:ffff8801f10bfb00  EFLAGS: 00000286
[  296.073002] RAX: 0000000000000000 RBX: ffff8801f10bfce0 RCX: 1ffff1003e392caa
[  296.073002] RDX: 0000000000000000 RSI: 0000000000000400 RDI: ffffffff82d3d1c0
[  296.073002] RBP: ffff8801f10bfb00 R08: 0000000000000000 R09: 0000000000000001
[  296.073002] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801efd22260
[  296.073002] R13: dffffc0000000000 R14: 0000000000000246 R15: 0000000000000000
[  296.073002] FS:  00007efcec2d8740(0000) GS:ffff8801f6e80000(0000) knlGS:0000000000000000
[  296.073002] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  296.073002] CR2: 00007efcec2e7000 CR3: 00000001ecd7a000 CR4: 00000000001407e0
[  296.073002] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  296.073002] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  296.073002] Stack:
[  296.073002]  ffff8801f10bfb18 ffffffff8148511e 0000000000000000 ffff8801f10bfd08
[  296.073002]  ffffffff81548de9 ffff8801f1c95b00 ffff8801f1c95b08 ffff8801f1c96570
[  296.073002]  ffff8801f1c96528 0000000000000000 1ffff1003e392b61 ffff8801f1c95b00
[  296.073002] Call Trace:
[  296.073002]  [<ffffffff8148511e>] first_online_pgdat+0x1e/0x70
[  296.073002]  [<ffffffff81548de9>] memcg_numa_migrate_write+0x2a9/0xa40
[  296.073002]  [<ffffffff812a3940>] ? debug_check_no_locks_freed+0x2a0/0x2a0
[  296.073002]  [<ffffffff81548b40>] ? mem_cgroup_write+0x270/0x270
[  296.073002]  [<ffffffff814a7435>] ? might_fault+0xf5/0x1a0
[  296.073002]  [<ffffffff814a7435>] ? might_fault+0xf5/0x1a0
[  296.073002]  [<ffffffff81548b40>] ? mem_cgroup_write+0x270/0x270
[  296.073002]  [<ffffffff812efb86>] cgroup_file_write+0x606/0xba0
[  296.073002]  [<ffffffff812ef580>] ? cgroup_transfer_one_task+0x80/0x80
[  296.073002]  [<ffffffff815752c6>] ? __sb_start_write+0x1b6/0x380
[  296.073002]  [<ffffffff8156dc86>] ? vfs_write+0x3d6/0x5f0
[  296.073002]  [<ffffffff8156dc86>] ? vfs_write+0x3d6/0x5f0
[  296.073002]  [<ffffffff81575110>] ? __sb_end_write+0xf0/0xf0
[  296.073002]  [<ffffffff814b0df0>] ? __pmd_alloc+0x310/0x310
[  296.073002]  [<ffffffff8156da89>] vfs_write+0x1d9/0x5f0
[  296.073002]  [<ffffffff815700ac>] SyS_write+0x16c/0x260
[  296.073002]  [<ffffffff8156ff40>] ? SyS_read+0x260/0x260
[  296.073002]  [<ffffffff823b43ce>] ? trace_do_page_fault+0x6e/0x230
[  296.073002]  [<ffffffff823aad98>] ? retint_swapgs+0x13/0x1b
[  296.073002]  [<ffffffff8189261b>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[  296.073002]  [<ffffffff823c2cc9>] system_call_fastpath+0x16/0x1b


>  mm/memcontrol.c | 223 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 223 insertions(+)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e3a16b99ccc6..8c6c4fb9c153 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -54,6 +54,7 @@
>  #include <linux/cpu.h>
>  #include <linux/oom.h>
>  #include <linux/virtinfo.h>
> +#include <linux/migrate.h>
>  #include "internal.h"
>  #include <net/sock.h>
>  #include <net/ip.h>
> @@ -5697,6 +5698,223 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
>  	seq_putc(m, '\n');
>  	return 0;
>  }
> +
> +/*
> + * memcg_numa_migrate_new_page() private argument. @target_nodes specifies the
> + * set of nodes to allocate pages from. @current_node is the current preferable
> + * node, it gets rotated after each allocation.
> + */
> +struct memcg_numa_migrate_struct {
> +	nodemask_t *target_nodes;
> +	int current_node;
> +};
> +
> +/*
> + * Used as an argument for migrate_pages(). Allocated pages are spread evenly
> + * among destination nodes.
> + */
> +static struct page *memcg_numa_migrate_new_page(struct page *page,
> +				unsigned long private, int **result)
> +{
> +	struct memcg_numa_migrate_struct *ms = (void *)private;
> +	gfp_t gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_NORETRY | __GFP_NOWARN;
> +
> +	ms->current_node = next_node(ms->current_node, *ms->target_nodes);
> +	if (ms->current_node >= MAX_NUMNODES) {
> +		ms->current_node = first_node(*ms->target_nodes);
> +		BUG_ON(ms->current_node >= MAX_NUMNODES);

Maybe WARN_ON() or VM_BUG_ON() ?
> +	}
> +
> +	return __alloc_pages_nodemask(gfp_mask, 0,
> +			node_zonelist(ms->current_node, gfp_mask),
> +			ms->target_nodes);
> +}
> +
> +/*
> + * Isolate at most @nr_to_scan pages from @lruvec for further migration and
> + * store them in @dst. Returns the number of pages scanned. Return value of 0
> + * means that @lruvec is empty.
> + */
> +static long memcg_numa_isolate_pages(struct lruvec *lruvec, enum lru_list lru,
> +				     long nr_to_scan, struct list_head *dst)
> +{
> +	struct list_head *src = &lruvec->lists[lru];
> +	struct zone *zone = lruvec_zone(lruvec);
> +	long scanned = 0, taken = 0;
> +
> +	spin_lock_irq(&zone->lru_lock);
> +	while (!list_empty(src) && scanned < nr_to_scan && taken < nr_to_scan) {
> +		struct page *page = list_last_entry(src, struct page, lru);
> +		int nr_pages;
> +
> +		VM_BUG_ON_PAGE(!PageLRU(page), page);
> +

__isolate_lru_page() will return -EINVAL for !PageLRU, so either this or the BUG() below is unnecessary.

> +		scanned++;
> +
> +		switch (__isolate_lru_page(page, ISOLATE_ASYNC_MIGRATE)) {
> +		case 0:
> +			nr_pages = hpage_nr_pages(page);
> +			mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
> +			list_move(&page->lru, dst);
> +			taken += nr_pages;
> +			break;
> +
> +		case -EBUSY:
> +			list_move(&page->lru, src);
> +			continue;
> +
> +		default:
> +			BUG();
> +		}
> +	}
> +	__mod_zone_page_state(zone, NR_LRU_BASE + lru, -taken);
> +	__mod_zone_page_state(zone, NR_ISOLATED_ANON + is_file_lru(lru), taken);
> +	spin_unlock_irq(&zone->lru_lock);
> +
> +	return scanned;
> +}
> +
> +static long __memcg_numa_migrate_pages(struct lruvec *lruvec, enum lru_list lru,
> +				       nodemask_t *target_nodes, long nr_to_scan)
> +{
> +	struct memcg_numa_migrate_struct ms = {
> +		.target_nodes = target_nodes,
> +		.current_node = -1,
> +	};
> +	LIST_HEAD(pages);
> +	long total_scanned = 0;
> +
> +	/*
> +	 * If no limit on the maximal number of migrated pages is specified,
> +	 * assume the caller wants to migrate them all.
> +	 */
> +	if (nr_to_scan < 0)
> +		nr_to_scan = mem_cgroup_get_lru_size(lruvec, lru);
> +
> +	while (total_scanned < nr_to_scan) {
> +		int ret;
> +		long scanned;
> +
> +		scanned = memcg_numa_isolate_pages(lruvec, lru,
> +						   SWAP_CLUSTER_MAX, &pages);
> +		if (!scanned)
> +			break;
> +
> +		ret = migrate_pages(&pages, memcg_numa_migrate_new_page,
> +				    (unsigned long)&ms, MIGRATE_ASYNC,
> +				    MR_SYSCALL);
> +		putback_lru_pages(&pages);
> +		if (ret < 0)
> +			return ret;
> +
> +		if (signal_pending(current))
> +			return -EINTR;
> +
> +		total_scanned += scanned;
> +	}
> +
> +	return total_scanned;
> +}
> +
> +/*
> + * Migrate at most @nr_to_scan pages accounted to @memcg to @target_nodes.
> + * Pages are spread evenly among destination nodes. If @nr_to_scan is <= 0,
> + * then the function will attempt to migrate all pages accounted to @memcg.
> + */
> +static int memcg_numa_migrate_pages(struct mem_cgroup *memcg,
> +				    nodemask_t *target_nodes, long nr_to_scan)
> +{
> +	struct mem_cgroup *mi;
> +	long total_scanned = 0;
> +
> +again:
> +	for_each_mem_cgroup_tree(mi, memcg) {
> +		struct zone *zone;
> +
> +		for_each_populated_zone(zone) {
> +			struct lruvec *lruvec;
> +			enum lru_list lru;
> +			long scanned;
> +
> +			if (node_isset(zone_to_nid(zone), *target_nodes))
> +				continue;
> +
> +			lruvec = mem_cgroup_zone_lruvec(zone, mi);
> +			/*
> +			 * For the sake of simplicity, do not attempt to migrate
> +			 * unevictable pages. It should be fine as long as there
> +			 * aren't too many of them, which is usually true.
> +			 */
> +			for_each_evictable_lru(lru) {
> +				scanned = __memcg_numa_migrate_pages(lruvec,
> +						lru, target_nodes,
> +						nr_to_scan > 0 ?
> +						SWAP_CLUSTER_MAX : -1);

					Shouldn't we just pass nr_to_scan here?

> +				if (scanned < 0) {
> +					mem_cgroup_iter_break(memcg, mi);
> +					return scanned;
> +				}
> +				total_scanned += scanned;
> +			}
> +		}
> +	}
> +
> +	if (nr_to_scan > 0 && total_scanned < nr_to_scan)
> +		goto again;
> +
> +	return 0;
> +}
> +
> +/*
> + * The format of memory.numa_migrate is
> + *
> + *   NODELIST[ MAX_SCAN]
> + *
> + * where NODELIST is a comma-separated list of ranges N1-N2 specifying the set
> + * of nodes to migrate pages of this cgroup to, and the optional MAX_SCAN
> + * imposes a limit on the number of pages that can be migrated in one go.
> + *
> + * The call may be interrupted by a signal, in which case -EINTR is returned.
> + */
> +static int memcg_numa_migrate_write(struct cgroup *cont,
> +		struct cftype *cft, const char *buf)
> +{
> +	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
> +	NODEMASK_ALLOC(nodemask_t, target_nodes, GFP_KERNEL);
> +	const char *nodes_str = buf, *nr_str;
> +	long nr_to_scan = -1;
> +	int ret = -ENOMEM;
> +
> +	if (!target_nodes)
> +		goto out;
> +
> +	nr_str = strchr(buf, ' ');
> +	if (nr_str) {
> +		nodes_str = kstrndup(buf, nr_str - buf, GFP_KERNEL);
> +		if (!nodes_str)
> +			goto out;
> +		nr_str += 1;
> +	}
> +
> +	ret = nodelist_parse(nodes_str, *target_nodes);
> +	if (ret)
> +		goto out;
> +
> +	ret = -EINVAL;
> +	if (!nodes_subset(*target_nodes, node_states[N_MEMORY]))
> +		goto out;
> +
> +	if (nr_str && (kstrtol(nr_str, 10, &nr_to_scan) || nr_to_scan <= 0))
> +		goto out;
> +
> +	ret = memcg_numa_migrate_pages(memcg, target_nodes, nr_to_scan);
> +out:
> +	if (nodes_str != buf)
> +		kfree(nodes_str);
> +	NODEMASK_FREE(target_nodes);
> +	return ret;
> +}
> +
>  #endif /* CONFIG_NUMA */
>  
>  static inline void mem_cgroup_lru_names_not_uptodate(void)
> @@ -6355,6 +6573,11 @@ static struct cftype mem_cgroup_files[] = {
>  		.name = "numa_stat",
>  		.read_seq_string = memcg_numa_stat_show,
>  	},
> +	{
> +		.name = "numa_migrate",
> +		.flags = CFTYPE_NOT_ON_ROOT,
> +		.write_string = memcg_numa_migrate_write,
> +	},
>  #endif
>  #ifdef CONFIG_CLEANCACHE
>  	{
>
Vladimir Davydov Aug. 23, 2016, 10:27 a.m.
On Tue, Aug 23, 2016 at 12:57:53PM +0300, Andrey Ryabinin wrote:
...
> echo "0 100" > /sys/fs/cgroup/memory/machine.slice/100/memory.numa_migrate
> 
> [  296.073002] BUG: soft lockup - CPU#1 stuck for 22s! [bash:4028]

Thanks for catching, will fix in v2.

> > +static struct page *memcg_numa_migrate_new_page(struct page *page,
> > +				unsigned long private, int **result)
> > +{
> > +	struct memcg_numa_migrate_struct *ms = (void *)private;
> > +	gfp_t gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_NORETRY | __GFP_NOWARN;
> > +
> > +	ms->current_node = next_node(ms->current_node, *ms->target_nodes);
> > +	if (ms->current_node >= MAX_NUMNODES) {
> > +		ms->current_node = first_node(*ms->target_nodes);
> > +		BUG_ON(ms->current_node >= MAX_NUMNODES);
> 
> Maybe WARN_ON() or VM_BUG_ON() ?

Will replace with VM_BUG_ON.

> > +	}
> > +
> > +	return __alloc_pages_nodemask(gfp_mask, 0,
> > +			node_zonelist(ms->current_node, gfp_mask),
> > +			ms->target_nodes);
> > +}
> > +
> > +/*
> > + * Isolate at most @nr_to_scan pages from @lruvec for further migration and
> > + * store them in @dst. Returns the number of pages scanned. Return value of 0
> > + * means that @lruvec is empty.
> > + */
> > +static long memcg_numa_isolate_pages(struct lruvec *lruvec, enum lru_list lru,
> > +				     long nr_to_scan, struct list_head *dst)
> > +{
> > +	struct list_head *src = &lruvec->lists[lru];
> > +	struct zone *zone = lruvec_zone(lruvec);
> > +	long scanned = 0, taken = 0;
> > +
> > +	spin_lock_irq(&zone->lru_lock);
> > +	while (!list_empty(src) && scanned < nr_to_scan && taken < nr_to_scan) {
> > +		struct page *page = list_last_entry(src, struct page, lru);
> > +		int nr_pages;
> > +
> > +		VM_BUG_ON_PAGE(!PageLRU(page), page);
> > +
> 
> __isolate_lru_page() will return -EINVAL for !PageLRU, so either this or the BUG() bellow is unnecessary.

OK, will remove the VM_BUG_ON_PAGE.

...
> > +static int memcg_numa_migrate_pages(struct mem_cgroup *memcg,
> > +				    nodemask_t *target_nodes, long nr_to_scan)
> > +{
> > +	struct mem_cgroup *mi;
> > +	long total_scanned = 0;
> > +
> > +again:
> > +	for_each_mem_cgroup_tree(mi, memcg) {
> > +		struct zone *zone;
> > +
> > +		for_each_populated_zone(zone) {
> > +			struct lruvec *lruvec;
> > +			enum lru_list lru;
> > +			long scanned;
> > +
> > +			if (node_isset(zone_to_nid(zone), *target_nodes))
> > +				continue;
> > +
> > +			lruvec = mem_cgroup_zone_lruvec(zone, mi);
> > +			/*
> > +			 * For the sake of simplicity, do not attempt to migrate
> > +			 * unevictable pages. It should be fine as long as there
> > +			 * aren't too many of them, which is usually true.
> > +			 */
> > +			for_each_evictable_lru(lru) {
> > +				scanned = __memcg_numa_migrate_pages(lruvec,
> > +						lru, target_nodes,
> > +						nr_to_scan > 0 ?
> > +						SWAP_CLUSTER_MAX : -1);
> 
> 					Shouldn't we just pass nr_to_scan here?

No, I want to migrate memory evenly from all nodes. I.e. if you have 2
source nodes and nr_to_scan=100, there should be ~50 pages migrated from
one node and ~50 from another, not 100-vs-0.
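
To make the "spread evenly" behaviour concrete, here is a toy standalone
sketch of the same round-robin placement that memcg_numa_migrate_new_page()
does with next_node()/first_node() (plain arrays stand in for the nodemask;
the node ids and counts are made up):

  #include <stdio.h>

  /* With 2 target nodes and 100 allocations the round-robin yields
   * ~50 pages per node, never 100-vs-0. */
  int main(void)
  {
          int target_nodes[] = { 0, 3 };  /* illustrative node ids */
          int nr_targets = 2, current = -1;
          int allocated[4] = { 0 };

          for (int i = 0; i < 100; i++) {
                  /* rotate over the target set, like next_node()/first_node() */
                  current = (current + 1) % nr_targets;
                  allocated[target_nodes[current]]++;
          }

          for (int node = 0; node < 4; node++)
                  if (allocated[node])
                          printf("node %d: %d pages\n", node, allocated[node]);
          return 0;
  }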