[3/5] userns: Don't read extents twice in m_start

Submitted by Peter Zijlstra on Nov. 1, 2017, 1:05 p.m.

Details

Message ID 20171101130539.j5bxmhs2trqurrr2@hirez.programming.kicks-ass.net
State New
Series "Series without cover letter"

Commit Message

Peter Zijlstra Nov. 1, 2017, 1:05 p.m.
On Wed, Nov 01, 2017 at 06:08:35AM -0500, Eric W. Biederman wrote:
> Nikolay Borisov <nborisov@suse.com> writes:
> 
> > On  1.11.2017 01:48, Eric W. Biederman wrote:
> >> 
> >> This is important so reading /proc/<pid>/{uid_map,gid_map,projid_map} while
> >> the map is being written does not do strange things.
> >> 
> >> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> >> ---
> >>  kernel/user_namespace.c | 6 ++++--
> >>  1 file changed, 4 insertions(+), 2 deletions(-)
> >> 
> >> diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
> >> index 563a2981d7c7..4f7e357ac1e2 100644
> >> --- a/kernel/user_namespace.c
> >> +++ b/kernel/user_namespace.c
> >> @@ -683,11 +683,13 @@ static void *m_start(struct seq_file *seq, loff_t *ppos,
> >>  		     struct uid_gid_map *map)
> >>  {
> >>  	loff_t pos = *ppos;
> >> +	unsigned extents = map->nr_extents;
> >> +	smp_rmb();
> >
> > Barriers need to be paired to work correctly as well as have explicit
> > comments describing the pairing as per kernel coding style. Checkpatch
> > will actually produce warning for that particular memory barrier.
> 
> So please look at the code and read the comment.

What comment, there isn't any, which is what he's complaining about.

> The fact the barrier was not in m_start earlier is strictly speaking a
> bug.

Sure; doesn't excuse you for not writing sensible comments to go with
it.

> In practice except for a very narrow window when this data is changing
> the one time it can, this code does not matter at all.
> 
> As for checkpatch I have sympathy for it, checkpatch has a hard job,
> but I won't listen to checkpatch when it is wrong.

No, undocumented barriers are a royal pain. Memory barriers should come
with a comment that describes the desired ordering and points to the
pairing barrier(s).

Also, you probably want READ_ONCE() here and WRITE_ONCE() in
map_write(), the compiler is free to do unordered byte loads/stores
without it.
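
That would look something like this rough sketch (keeping the existing
smp_rmb()/smp_wmb() pairing, only marking the accesses):

	/* reader side, e.g. in m_start(); pairs with the smp_wmb() below */
	unsigned extents = READ_ONCE(map->nr_extents);
	smp_rmb();

	/* writer side, in map_write(), after filling in map->extent[] */
	smp_wmb();
	WRITE_ONCE(map->nr_extents, new_map.nr_extents);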

And finally, did you want to use smp_store_release() and
smp_load_acquire() instead?

Something like so perhaps?

---
 kernel/user_namespace.c | 73 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 45 insertions(+), 28 deletions(-)


diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index c490f1e4313b..f758911cabd5 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -25,8 +25,47 @@ 
 #include <linux/fs_struct.h>
 
 static struct kmem_cache *user_ns_cachep __read_mostly;
+
+/*
+ * The userns_state_mutex serializes all writes to any given map.
+ *
+ * Any map is only ever written once.
+ *
+ * An id map fits within 1 cache line on most architectures.
+ */
 static DEFINE_MUTEX(userns_state_mutex);
 
+/*
+ *
+ * There is a one time data dependency between reading the count of the extents
+ * and the values of the extents.  The desired behavior is to see the values of
+ * the extents that were written before the count of the extents.
+ *
+ * To achieve this smp_store_release() is used to guarantee the write order and
+ * smp_load_acquire() guarantees that we don't have weakly ordered
+ * architectures returning stale data.
+ */
+static inline void map_store_extents(struct uid_gid_map *map, unsigned int extents)
+{
+	/*
+	 * Ensure the map->extent[] stores happen-before we grow map->nr_extents
+	 * to cover it. Matches the load_acquire in map_load_extents().
+	 */
+	smp_store_release(&map->nr_extents, extents);
+}
+
+static inline unsigned int map_load_extents(struct uid_gid_map *map)
+{
+	/*
+	 * Ensure the map->nr_extents load happens-before we try and access
+	 * map->extent[], such that we guarantee the data is in fact there.
+	 *
+	 * Matches the store-release in map_store_extents().
+	 */
+	return smp_load_acquire(&map->nr_extents);
+}
+
+
 static bool new_idmap_permitted(const struct file *file,
 				struct user_namespace *ns, int cap_setid,
 				struct uid_gid_map *map);
@@ -206,8 +245,7 @@  static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
 	id2 = id + count - 1;
 
 	/* Find the matching extent */
-	extents = map->nr_extents;
-	smp_rmb();
+	extents = map_load_extents(map);
 	for (idx = 0; idx < extents; idx++) {
 		first = map->extent[idx].first;
 		last = first + map->extent[idx].count - 1;
@@ -230,8 +268,7 @@  static u32 map_id_down(struct uid_gid_map *map, u32 id)
 	u32 first, last;
 
 	/* Find the matching extent */
-	extents = map->nr_extents;
-	smp_rmb();
+	extents = map_load_extents(map);
 	for (idx = 0; idx < extents; idx++) {
 		first = map->extent[idx].first;
 		last = first + map->extent[idx].count - 1;
@@ -253,8 +290,7 @@  static u32 map_id_up(struct uid_gid_map *map, u32 id)
 	u32 first, last;
 
 	/* Find the matching extent */
-	extents = map->nr_extents;
-	smp_rmb();
+	extents = map_load_extents(map);
 	for (idx = 0; idx < extents; idx++) {
 		first = map->extent[idx].lower_first;
 		last = first + map->extent[idx].count - 1;
@@ -543,7 +579,7 @@  static void *m_start(struct seq_file *seq, loff_t *ppos,
 	struct uid_gid_extent *extent = NULL;
 	loff_t pos = *ppos;
 
-	if (pos < map->nr_extents)
+	if (pos < map_load_extents(map))
 		extent = &map->extent[pos];
 
 	return extent;
@@ -652,25 +688,6 @@  static ssize_t map_write(struct file *file, const char __user *buf,
 	char *kbuf = NULL, *pos, *next_line;
 	ssize_t ret = -EINVAL;
 
-	/*
-	 * The userns_state_mutex serializes all writes to any given map.
-	 *
-	 * Any map is only ever written once.
-	 *
-	 * An id map fits within 1 cache line on most architectures.
-	 *
-	 * On read nothing needs to be done unless you are on an
-	 * architecture with a crazy cache coherency model like alpha.
-	 *
-	 * There is a one time data dependency between reading the
-	 * count of the extents and the values of the extents.  The
-	 * desired behavior is to see the values of the extents that
-	 * were written before the count of the extents.
-	 *
-	 * To achieve this smp_wmb() is used on guarantee the write
-	 * order and smp_rmb() is guaranteed that we don't have crazy
-	 * architectures returning stale data.
-	 */
 	mutex_lock(&userns_state_mutex);
 
 	ret = -EPERM;
@@ -790,8 +807,8 @@  static ssize_t map_write(struct file *file, const char __user *buf,
 	/* Install the map */
 	memcpy(map->extent, new_map.extent,
 		new_map.nr_extents*sizeof(new_map.extent[0]));
-	smp_wmb();
-	map->nr_extents = new_map.nr_extents;
+
+	map_store_extents(map, new_map.nr_extents);
 
 	*ppos = count;
 	ret = count;

Comments

Christian Brauner Nov. 1, 2017, 2:01 p.m.
On Wed, Nov 01, 2017 at 02:05:39PM +0100, Peter Zijlstra wrote:
> On Wed, Nov 01, 2017 at 06:08:35AM -0500, Eric W. Biederman wrote:
> > Nikolay Borisov <nborisov@suse.com> writes:
> > 
> > > On  1.11.2017 01:48, Eric W. Biederman wrote:
> > >> 
> > >> This is important so reading /proc/<pid>/{uid_map,gid_map,projid_map} while
> > >> the map is being written does not do strange things.
> > >> 
> > >> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> > >> ---
> > >>  kernel/user_namespace.c | 6 ++++--
> > >>  1 file changed, 4 insertions(+), 2 deletions(-)
> > >> 
> > >> diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
> > >> index 563a2981d7c7..4f7e357ac1e2 100644
> > >> --- a/kernel/user_namespace.c
> > >> +++ b/kernel/user_namespace.c
> > >> @@ -683,11 +683,13 @@ static void *m_start(struct seq_file *seq, loff_t *ppos,
> > >>  		     struct uid_gid_map *map)
> > >>  {
> > >>  	loff_t pos = *ppos;
> > >> +	unsigned extents = map->nr_extents;
> > >> +	smp_rmb();
> > >
> > > Barriers need to be paired to work correctly as well as have explicit
> > > comments describing the pairing as per kernel coding style. Checkpatch
> > > will actually produce warning for that particular memory barrier.
> > 
> > So please look at the code and read the comment.
> 
> What comment, there isn't any, which is what he's complaining about.
> 
> > The fact the barrier was not in m_start earlier is strictly speaking a
> > bug.
> 
> Sure; doesn't excuse you for not writing sensible comments to go with
> it.
> 
> > In practice except for a very narrow window when this data is changing
> > the one time it can, this code does not matter at all.
> > 
> > As for checkpatch I have sympathy for it, checkpatch has a hard job,
> > but I won't listen to checkpatch when it is wrong.
> 
> No, undocumented barriers are a royal pain. Memory barriers should come
> with a comment that describes the desired ordering and points to the
> pairing barrier(s).

Tbf, this isn't solely Eric's fault. I'm to blame here too since I didn't
document the already existing smp_rmb()s and the new one I introduced when
writing the patches. I didn't know that there was a hard-set requirement to
document those. I also didn't see anything in the kernel coding style or the
memory barriers documentation (But it has been some time since I read those.).

> 
> Also, you probably want READ_ONCE() here and WRITE_ONCE() in
> map_write(), the compiler is free to do unordered byte loads/stores
> without it.
> 
> And finally, did you want to use smp_store_release() and
> smp_load_acquire() instead?

Maybe a stupid question but do you suspect this is a real problem in this case
since you're phrasing it as a question? Iirc, *_acquire() operations include
locking operations and might come with a greater performance impact than
smp_{rmb,wmb}(). Given that this is a very performance-critical path we should
be sure.

Peter Zijlstra Nov. 1, 2017, 2:16 p.m.
On Wed, Nov 01, 2017 at 03:01:45PM +0100, Christian Brauner wrote:
> Tbf, this isn't solely Eric's fault. I'm to blame here too since I didn't
> > document the already existing smp_rmb()s and the new one I introduced when
> writing the patches. I didn't know that there was a hard-set requirement to
> document those. I also didn't see anything in the kernel coding style or the
> memory barriers documentation (But it has been some time since I read those.).

There's too many documents to read.. I'm not sure we changed
coding-style, and I suspect that'll just end up being another bike-shed
in any case.

We did get checkpatch changed though, which is a strong enough clue that
something needs to happen.

But what Nikolay said: memory ordering is hard enough when you're clear on
what exactly you intend to do. But if you later try to reconstruct it
without comments, it's nearly impossible.

It gets even better if someone changes the ordering requirements over
time and you grow hidden and non-obvious dependencies :/

> > Also, you probably want READ_ONCE() here and WRITE_ONCE() in
> > map_write(), the compiler is free to do unordered byte loads/stores
> > without it.
> > 
> > And finally, did you want to use smp_store_release() and
> > smp_load_acquire() instead?
> 
> Maybe a stupid question but do you suspect this is a real problem in
> this case since you're phrasing it as a question?

Rhetorical question mostly, I suspect it's just what you meant to do, as
per the proposed patch.

> Iirc, *_acquire() operations include
> locking operations and might come with a greater performance impact than
> smp_{rmb,wmb}(). Given that this is a very performance-critical path we should
> be sure.

No locking whatsoever. LOAD-ACQUIRE and STORE-RELEASE are memory ordering
flavours that are paired with the respective memory operation.

It is true that locking ops provide these exact orderings, but that
doesn't imply the reverse.

In short, store-release is a store that ensures all prior loads _and_
stores happen-before this store. A load-acquire is a load which
happens-before any subsequent loads or stores.

But a release does not constrain later loads or stores, and an acquire
does not constrain prior loads or stores.
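
As a generic sketch of the pattern (illustrative variables, not the idmap
code):

	/* writer: payload, published and val are illustrative ints */
	payload = 42;				/* A */
	smp_store_release(&published, 1);	/* B: orders A before B */

	/* reader */
	if (smp_load_acquire(&published))	/* C: pairs with B */
		val = payload;			/* D: guaranteed to see 42 */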
Christian Brauner Nov. 1, 2017, 4:29 p.m.
On Wed, Nov 01, 2017 at 03:16:54PM +0100, Peter Zijlstra wrote:
> On Wed, Nov 01, 2017 at 03:01:45PM +0100, Christian Brauner wrote:
> > Tbf, this isn't solely Eric's fault. I'm to blame here too since I didn't
> > document the already existing smp_rmb()s and the new one I introduced when
> > writing the patches. I didn't know that there was a hard-set requirement to
> > document those. I also didn't see anything in the kernel coding style or the
> > memory barriers documentation (But it has been some time since I read those.).
> 
> There's too many documents to read.. I'm not sure we changed
> coding-style, and I suspect that'll just end up being another bike-shed
> in any case.
> 
> We did get checkpatch changed though, which is a strong enough clue that
> something needs to happen.
> 
> But what Nikolay said: memory ordering is hard enough when you're clear on
> what exactly you intend to do. But if you later try to reconstruct it
> without comments, it's nearly impossible.

Yeah, agreed. I was happy to see that Eric explained his smp_wmb() in detail.
That was quite helpful in figuring this out!

> 
> It gets even better if someone changes the ordering requirements over
> time and you grow hidden and non-obvious dependencies :/
> 
> > > Also, you probably want READ_ONCE() here and WRITE_ONCE() in
> > > map_write(), the compiler is free to do unordered byte loads/stores
> > > without it.
> > > 
> > > And finally, did you want to use smp_store_release() and
> > > smp_load_acquire() instead?
> > 
> > Maybe a stupid question but do you suspect this is a real problem in
> > this case since you're phrasing it as a question?
> 
> Rhetorical question mostly, I suspect it's just what you meant to do, as
> per the proposed patch.
> 
> > Iirc, *_acquire() operations include
> > locking operations and might come with a greater performance impact than
> > smp_{rmb,wmb}(). Given that this is a very performance-critical path we should
> > be sure.
> 
> No locking whatsoever. LOAD-ACQUIRE and STORE-RELEASE are memory ordering
> flavours that are paired with the respective memory operation.

Ah right, now I remember I was confused by a part of the memory barriers
documentation that referenced locks. Acquire operations include locks and
smp_load_acquire().  Right, should've remembered that. Thanks!
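
As a rough mental model (illustrative only, not what the read path actually
does):

	/* both forms have ACQUIRE semantics; only the first also serializes */
	mutex_lock(&userns_state_mutex);		/* ACQUIRE + mutual exclusion */
	extents = map->nr_extents;
	mutex_unlock(&userns_state_mutex);		/* RELEASE */

	extents = smp_load_acquire(&map->nr_extents);	/* ACQUIRE, otherwise just a load */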

> 
> It is true that locking ops provide these exact orderings, but that
> doesn't imply the reverse.
> 
> In short, store-release is a store that ensures all prior loads _and_
> stores happen-before this store. A load-acquire is a load which
> happens-before any subsequent loads or stores.
> 
> But a release does not constrain later loads or stores, and an acquire
> does not constrain prior loads or stores.
Christian Brauner Nov. 1, 2017, 4:31 p.m.
On Wed, Nov 01, 2017 at 02:05:39PM +0100, Peter Zijlstra wrote:
> On Wed, Nov 01, 2017 at 06:08:35AM -0500, Eric W. Biederman wrote:
> > Nikolay Borisov <nborisov@suse.com> writes:
> > 
> > > On  1.11.2017 01:48, Eric W. Biederman wrote:
> > >> 
> > >> This is important so reading /proc/<pid>/{uid_map,gid_map,projid_map} while
> > >> the map is being written does not do strange things.
> > >> 
> > >> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> > >> ---
> > >>  kernel/user_namespace.c | 6 ++++--
> > >>  1 file changed, 4 insertions(+), 2 deletions(-)
> > >> 
> > >> diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
> > >> index 563a2981d7c7..4f7e357ac1e2 100644
> > >> --- a/kernel/user_namespace.c
> > >> +++ b/kernel/user_namespace.c
> > >> @@ -683,11 +683,13 @@ static void *m_start(struct seq_file *seq, loff_t *ppos,
> > >>  		     struct uid_gid_map *map)
> > >>  {
> > >>  	loff_t pos = *ppos;
> > >> +	unsigned extents = map->nr_extents;
> > >> +	smp_rmb();
> > >
> > > Barriers need to be paired to work correctly as well as have explicit
> > > comments describing the pairing as per kernel coding style. Checkpatch
> > > will actually produce warning for that particular memory barrier.
> > 
> > So please look at the code and read the comment.
> 
> What comment, there isn't any, which is what he's complaining about.
> 
> > The fact the barrier was not in m_start earlier is strictly speaking a
> > bug.
> 
> Sure; doesn't excuse you for not writing sensible comments to go with
> it.
> 
> > In practice except for a very narrow window when this data is changing
> > the one time it can, this code does not matter at all.
> > 
> > As for checkpatch I have sympathy for it, checkpatch has a hard job,
> > but I won't listen to checkpatch when it is wrong.
> 
> No, undocumented barriers are a royal pain. Memory barriers should come
> with a comment that describes the desired ordering and points to the
> pairing barrier(s).
> 
> Also, you probably want READ_ONCE() here and WRITE_ONCE() in
> map_write(), the compiler is free to do unordered byte loads/stores
> without it.
> 
> And finally, did you want to use smp_store_release() and
> smp_load_acquire() instead?
> 
> Something like so perhaps?

Given the discussion this looks fine to me unless Eric spots some issues I'm
too blind to see. So for whatever it's worth, this patch would get an ack from
me.
