[v2,09/24] file: Replace fcheck_files with files_lookup_fd_rcu

Submitted by Eric W. Biederman on Nov. 20, 2020, 11:14 p.m.

Details

Message ID 20201120231441.29911-9-ebiederm@xmission.com
State New
Series "exec: Move unshare_files and guarantee files_struct.count is correct"
Headers show

Commit Message

Eric W. Biederman Nov. 20, 2020, 11:14 p.m.
This change renames fcheck_files to files_lookup_fd_rcu.  All of the
remaining callers take the rcu_read_lock before calling this function
so the _rcu suffix is appropriate.  This change also tightens up the
debug check to verify that all callers hold the rcu_read_lock.

All callers that used to call files_check with the files->file_lock
held have now been changed to call files_lookup_fd_locked.

This change of name has helped remind me of which locks and which
guarantees are in place helping me to catch bugs later in the
patchset.

The need for better names became apparent in the last round of
discussion of this set of changes[1].

[1] https://lkml.kernel.org/r/CAHk-=wj8BQbgJFLa+J0e=iT-1qpmCRTbPAJ8gd6MJQ=kbRPqyQ@mail.gmail.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 Documentation/filesystems/files.rst | 6 +++---
 fs/file.c                           | 4 ++--
 fs/proc/fd.c                        | 4 ++--
 include/linux/fdtable.h             | 7 +++----
 kernel/bpf/task_iter.c              | 2 +-
 kernel/kcmp.c                       | 2 +-
 6 files changed, 12 insertions(+), 13 deletions(-)

Patch hide | download patch | download mbox

diff --git a/Documentation/filesystems/files.rst b/Documentation/filesystems/files.rst
index cbf8e57376bf..ea75acdb632c 100644
--- a/Documentation/filesystems/files.rst
+++ b/Documentation/filesystems/files.rst
@@ -62,7 +62,7 @@  the fdtable structure -
    be held.
 
 4. To look up the file structure given an fd, a reader
-   must use either fcheck() or fcheck_files() APIs. These
+   must use either fcheck() or files_lookup_fd_rcu() APIs. These
    take care of barrier requirements due to lock-free lookup.
 
    An example::
@@ -84,7 +84,7 @@  the fdtable structure -
    on ->f_count::
 
 	rcu_read_lock();
-	file = fcheck_files(files, fd);
+	file = files_lookup_fd_rcu(files, fd);
 	if (file) {
 		if (atomic_long_inc_not_zero(&file->f_count))
 			*fput_needed = 1;
@@ -104,7 +104,7 @@  the fdtable structure -
    lock-free, they must be installed using rcu_assign_pointer()
    API. If they are looked up lock-free, rcu_dereference()
    must be used. However it is advisable to use files_fdtable()
-   and fcheck()/fcheck_files() which take care of these issues.
+   and fcheck()/files_lookup_fd_rcu() which take care of these issues.
 
 7. While updating, the fdtable pointer must be looked up while
    holding files->file_lock. If ->file_lock is dropped, then
diff --git a/fs/file.c b/fs/file.c
index 9d0e91168be1..5861c4f89419 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -814,7 +814,7 @@  static struct file *__fget_files(struct files_struct *files, unsigned int fd,
 
 	rcu_read_lock();
 loop:
-	file = fcheck_files(files, fd);
+	file = files_lookup_fd_rcu(files, fd);
 	if (file) {
 		/* File object ref couldn't be taken.
 		 * dup2() atomicity guarantee is the reason
@@ -1127,7 +1127,7 @@  SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
 		int retval = oldfd;
 
 		rcu_read_lock();
-		if (!fcheck_files(files, oldfd))
+		if (!files_lookup_fd_rcu(files, oldfd))
 			retval = -EBADF;
 		rcu_read_unlock();
 		return retval;
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 2cca9bca3b3a..3dec44d7c5c5 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -90,7 +90,7 @@  static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
 		return false;
 
 	rcu_read_lock();
-	file = fcheck_files(files, fd);
+	file = files_lookup_fd_rcu(files, fd);
 	if (file)
 		*mode = file->f_mode;
 	rcu_read_unlock();
@@ -243,7 +243,7 @@  static int proc_readfd_common(struct file *file, struct dir_context *ctx,
 		char name[10 + 1];
 		unsigned int len;
 
-		f = fcheck_files(files, fd);
+		f = files_lookup_fd_rcu(files, fd);
 		if (!f)
 			continue;
 		data.mode = f->f_mode;
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index fda4b81dd735..fa8c402a7790 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -98,10 +98,9 @@  static inline struct file *files_lookup_fd_locked(struct files_struct *files, un
 	return files_lookup_fd_raw(files, fd);
 }
 
-static inline struct file *fcheck_files(struct files_struct *files, unsigned int fd)
+static inline struct file *files_lookup_fd_rcu(struct files_struct *files, unsigned int fd)
 {
-	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&
-			   !lockdep_is_held(&files->file_lock),
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
 			   "suspicious rcu_dereference_check() usage");
 	return files_lookup_fd_raw(files, fd);
 }
@@ -109,7 +108,7 @@  static inline struct file *fcheck_files(struct files_struct *files, unsigned int
 /*
  * Check whether the specified fd has an open file.
  */
-#define fcheck(fd)	fcheck_files(current->files, fd)
+#define fcheck(fd)	files_lookup_fd_rcu(current->files, fd)
 
 struct task_struct;
 
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 5b6af30bfbcd..5ab2ccfb96cb 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -183,7 +183,7 @@  task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info,
 	for (; curr_fd < max_fds; curr_fd++) {
 		struct file *f;
 
-		f = fcheck_files(curr_files, curr_fd);
+		f = files_lookup_fd_rcu(curr_files, curr_fd);
 		if (!f)
 			continue;
 		if (!get_file_rcu(f))
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 87c48c0104ad..990717c1aed3 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -67,7 +67,7 @@  get_file_raw_ptr(struct task_struct *task, unsigned int idx)
 	rcu_read_lock();
 
 	if (task->files)
-		file = fcheck_files(task->files, idx);
+		file = files_lookup_fd_rcu(task->files, idx);
 
 	rcu_read_unlock();
 	task_unlock(task);

Comments

Al Viro Dec. 7, 2020, 10:46 p.m.
On Fri, Nov 20, 2020 at 05:14:26PM -0600, Eric W. Biederman wrote:

>  /*
>   * Check whether the specified fd has an open file.
>   */
> -#define fcheck(fd)	fcheck_files(current->files, fd)
> +#define fcheck(fd)	files_lookup_fd_rcu(current->files, fd)

Huh?
fs/file.c:1113: file = fcheck(oldfd);
	dup3(), under ->file_lock, no rcu_read_lock() in sight

fs/locks.c:2548:                f = fcheck(fd);
	fcntl_setlk(), ditto

fs/locks.c:2679:                f = fcheck(fd);
	fcntl_setlk64(), ditto

fs/notify/dnotify/dnotify.c:330:        f = fcheck(fd);
	fcntl_dirnotify(); this one _is_ under rcu_read_lock().


IOW, unless I've missed something earlier in the series, this is wrong.
Al Viro Dec. 7, 2020, 10:49 p.m.
On Mon, Dec 07, 2020 at 10:46:57PM +0000, Al Viro wrote:
> On Fri, Nov 20, 2020 at 05:14:26PM -0600, Eric W. Biederman wrote:
> 
> >  /*
> >   * Check whether the specified fd has an open file.
> >   */
> > -#define fcheck(fd)	fcheck_files(current->files, fd)
> > +#define fcheck(fd)	files_lookup_fd_rcu(current->files, fd)
> 
> Huh?
> fs/file.c:1113: file = fcheck(oldfd);
> 	dup3(), under ->file_lock, no rcu_read_lock() in sight
> 
> fs/locks.c:2548:                f = fcheck(fd);
> 	fcntl_setlk(), ditto
> 
> fs/locks.c:2679:                f = fcheck(fd);
> 	fcntl_setlk64(), ditto
> 
> fs/notify/dnotify/dnotify.c:330:        f = fcheck(fd);
> 	fcntl_dirnotify(); this one _is_ under rcu_read_lock().
> 
> 
> IOW, unless I've missed something earlier in the series, this is wrong.

I have missed something, all right.  Ignore that comment...
Al Viro Dec. 9, 2020, 4:54 p.m.
[paulmck Cc'd]

On Mon, Dec 07, 2020 at 10:49:04PM +0000, Al Viro wrote:
> On Mon, Dec 07, 2020 at 10:46:57PM +0000, Al Viro wrote:
> > On Fri, Nov 20, 2020 at 05:14:26PM -0600, Eric W. Biederman wrote:
> > 
> > >  /*
> > >   * Check whether the specified fd has an open file.
> > >   */
> > > -#define fcheck(fd)	fcheck_files(current->files, fd)
> > > +#define fcheck(fd)	files_lookup_fd_rcu(current->files, fd)
> > 
> > Huh?
> > fs/file.c:1113: file = fcheck(oldfd);
> > 	dup3(), under ->file_lock, no rcu_read_lock() in sight
> > 
> > fs/locks.c:2548:                f = fcheck(fd);
> > 	fcntl_setlk(), ditto
> > 
> > fs/locks.c:2679:                f = fcheck(fd);
> > 	fcntl_setlk64(), ditto
> > 
> > fs/notify/dnotify/dnotify.c:330:        f = fcheck(fd);
> > 	fcntl_dirnotify(); this one _is_ under rcu_read_lock().
> > 
> > 
> > IOW, unless I've missed something earlier in the series, this is wrong.
> 
> I have missed something, all right.  Ignore that comment...

Actually, I take that back - the use of fcheck() in dnotify _is_ interesting.
At the very least it needs to be commented upon; what that code is trying
to prevent is a race between fcntl_dirnotify() and close(2)/dup2(2) closing
the descriptor in question.  The problem is, dnotify marks are removed
when we detach from descriptor table; that's done by filp_close() calling
dnotify_flush().

Suppose fcntl(fd, F_NOTIFY, 0) in one thread races with close(fd) in another
(both sharing the same descriptor table).  If the former had created and
inserted a mark *after* the latter has gotten past dnotify_flush(), there
would be nothing to evict that mark.

That's the reason that fcheck() is there.  rcu_read_lock() used to be
sufficient, but the locking has changed since then and even if it is
still enough, that's not at all obvious.

Exclusion is not an issue; barriers, OTOH...  Back then we had
->i_lock taken both by dnotify_flush() before any checks and
by fcntl_dirnotify() around the fcheck+insertion.  So on close
side we had
	store NULL into descriptor table
	acquire ->i_lock
	fetch ->i_dnotify
	...
	release ->i_lock
while on fcntl() side we had
	acquire ->i_lock
	fetch from descriptor table, sod off if not our file
	...
	store ->i_dnotify
	...
	release ->i_lock
Storing NULL into descriptor table could get reordered into
->i_lock-protected area in dnotify_flush(), but it could not
get reordered past releasing ->i_lock.  So fcntl_dirnotify()
either grabbed ->i_lock before dnotify_flush() (in which case
missing the store of NULL into descriptor table wouldn't
matter, since dnotify_flush() would've observed the mark
we'd inserted) or it would've seen that store to descriptor
table.

Nowadays it's nowhere near as straightforward; in fcntl_dirnotify()
we have
        /* this is needed to prevent the fcntl/close race described below */
        mutex_lock(&dnotify_group->mark_mutex);
and it would appear to be similar to the original situation, with
->mark_mutex serving in place of ->i_lock.  However, dnotify_flush()
might not take that mutex at all - it has
        fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group);
        if (!fsn_mark)
                return;
before grabbing that thing.  So the things are trickier - we actually
rely upon the barriers in fsnotify_find_mark().  And those are complicated.
The case when it returns non-NULL is not a problem - there we have that
mutex providing the barriers we need.  NULL can be returned in two cases:
	a) ->i_fsnotify_marks is not empty, but it contains no
dnotify marks.  In that case we have ->i_fsnotify_marks.lock acquired
and released.  By the time it gets around to fcheck(), fcntl_dirnotify() has
either found or created and inserted a dnotify mark into that list, with
->i_fsnotify_marks.lock acquired/released around the insertion, so we
are fine - either fcntl_dirnotify() gets there first (in which case
dnotify_flush() will observe it), or release of that lock by
fsnotify_find_mark() called by dnotify_flush() will serve as a barrier,
making sure that store to descriptor table will be observed.
	b) fsnotify_find_mark() (fsnotify_grab_connector() it calls,
actually) finds ->i_fsnotify_marks empty.  That's where the things
get interesting; we have
        idx = srcu_read_lock(&fsnotify_mark_srcu);
        conn = srcu_dereference(*connp, &fsnotify_mark_srcu);
        if (!conn)
                goto out;
on dnotify_flush() side.  The matching store of fcntl_dirnotify()
side would be in fsnotify_attach_connector_to_object(), where
we have
        /*
         * cmpxchg() provides the barrier so that readers of *connp can see
         * only initialized structure
         */
        if (cmpxchg(connp, NULL, conn)) {
                /* Someone else created list structure for us */

So we have
A:
	store NULL to descriptor table
	srcu_read_lock
	srcu_dereference fetches NULL from ->i_fsnotify_marks
vs.
B:
	cmpxchg replaces NULL with non-NULL in ->i_fsnotify_marks
	fetch from descriptor table, can't miss the store done by A

Which might be safe, but the whole thing *RELLY* needs to be discussed
in fcntl_dirnotify() in more details.  fs/notify/* guts are convoluted
enough to confuse anyone unfamiliar with them.
Paul E. McKenney Dec. 9, 2020, 5:44 p.m.
On Wed, Dec 09, 2020 at 04:54:11PM +0000, Al Viro wrote:
> [paulmck Cc'd]
> 
> On Mon, Dec 07, 2020 at 10:49:04PM +0000, Al Viro wrote:
> > On Mon, Dec 07, 2020 at 10:46:57PM +0000, Al Viro wrote:
> > > On Fri, Nov 20, 2020 at 05:14:26PM -0600, Eric W. Biederman wrote:
> > > 
> > > >  /*
> > > >   * Check whether the specified fd has an open file.
> > > >   */
> > > > -#define fcheck(fd)	fcheck_files(current->files, fd)
> > > > +#define fcheck(fd)	files_lookup_fd_rcu(current->files, fd)
> > > 
> > > Huh?
> > > fs/file.c:1113: file = fcheck(oldfd);
> > > 	dup3(), under ->file_lock, no rcu_read_lock() in sight
> > > 
> > > fs/locks.c:2548:                f = fcheck(fd);
> > > 	fcntl_setlk(), ditto
> > > 
> > > fs/locks.c:2679:                f = fcheck(fd);
> > > 	fcntl_setlk64(), ditto
> > > 
> > > fs/notify/dnotify/dnotify.c:330:        f = fcheck(fd);
> > > 	fcntl_dirnotify(); this one _is_ under rcu_read_lock().
> > > 
> > > 
> > > IOW, unless I've missed something earlier in the series, this is wrong.
> > 
> > I have missed something, all right.  Ignore that comment...
> 
> Actually, I take that back - the use of fcheck() in dnotify _is_ interesting.
> At the very least it needs to be commented upon; what that code is trying
> to prevent is a race between fcntl_dirnotify() and close(2)/dup2(2) closing
> the descriptor in question.  The problem is, dnotify marks are removed
> when we detach from descriptor table; that's done by filp_close() calling
> dnotify_flush().
> 
> Suppose fcntl(fd, F_NOTIFY, 0) in one thread races with close(fd) in another
> (both sharing the same descriptor table).  If the former had created and
> inserted a mark *after* the latter has gotten past dnotify_flush(), there
> would be nothing to evict that mark.
> 
> That's the reason that fcheck() is there.  rcu_read_lock() used to be
> sufficient, but the locking has changed since then and even if it is
> still enough, that's not at all obvious.
> 
> Exclusion is not an issue; barriers, OTOH...  Back then we had
> ->i_lock taken both by dnotify_flush() before any checks and
> by fcntl_dirnotify() around the fcheck+insertion.  So on close
> side we had
> 	store NULL into descriptor table
> 	acquire ->i_lock
> 	fetch ->i_dnotify
> 	...
> 	release ->i_lock
> while on fcntl() side we had
> 	acquire ->i_lock
> 	fetch from descriptor table, sod off if not our file
> 	...
> 	store ->i_dnotify
> 	...
> 	release ->i_lock
> Storing NULL into descriptor table could get reordered into
> ->i_lock-protected area in dnotify_flush(), but it could not
> get reordered past releasing ->i_lock.  So fcntl_dirnotify()
> either grabbed ->i_lock before dnotify_flush() (in which case
> missing the store of NULL into descriptor table wouldn't
> matter, since dnotify_flush() would've observed the mark
> we'd inserted) or it would've seen that store to descriptor
> table.
> 
> Nowadays it's nowhere near as straightforward; in fcntl_dirnotify()
> we have
>         /* this is needed to prevent the fcntl/close race described below */
>         mutex_lock(&dnotify_group->mark_mutex);
> and it would appear to be similar to the original situation, with
> ->mark_mutex serving in place of ->i_lock.  However, dnotify_flush()
> might not take that mutex at all - it has
>         fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group);
>         if (!fsn_mark)
>                 return;
> before grabbing that thing.  So the things are trickier - we actually
> rely upon the barriers in fsnotify_find_mark().  And those are complicated.
> The case when it returns non-NULL is not a problem - there we have that
> mutex providing the barriers we need.  NULL can be returned in two cases:
> 	a) ->i_fsnotify_marks is not empty, but it contains no
> dnotify marks.  In that case we have ->i_fsnotify_marks.lock acquired
> and released.  By the time it gets around to fcheck(), fcntl_dirnotify() has
> either found or created and inserted a dnotify mark into that list, with
> ->i_fsnotify_marks.lock acquired/released around the insertion, so we
> are fine - either fcntl_dirnotify() gets there first (in which case
> dnotify_flush() will observe it), or release of that lock by
> fsnotify_find_mark() called by dnotify_flush() will serve as a barrier,
> making sure that store to descriptor table will be observed.
> 	b) fsnotify_find_mark() (fsnotify_grab_connector() it calls,
> actually) finds ->i_fsnotify_marks empty.  That's where the things
> get interesting; we have
>         idx = srcu_read_lock(&fsnotify_mark_srcu);
>         conn = srcu_dereference(*connp, &fsnotify_mark_srcu);
>         if (!conn)
>                 goto out;
> on dnotify_flush() side.  The matching store of fcntl_dirnotify()
> side would be in fsnotify_attach_connector_to_object(), where
> we have
>         /*
>          * cmpxchg() provides the barrier so that readers of *connp can see
>          * only initialized structure
>          */
>         if (cmpxchg(connp, NULL, conn)) {
>                 /* Someone else created list structure for us */
> 
> So we have
> A:
> 	store NULL to descriptor table
> 	srcu_read_lock
> 	srcu_dereference fetches NULL from ->i_fsnotify_marks
> vs.
> B:
> 	cmpxchg replaces NULL with non-NULL in ->i_fsnotify_marks
> 	fetch from descriptor table, can't miss the store done by A
> 
> Which might be safe, but the whole thing *RELLY* needs to be discussed
> in fcntl_dirnotify() in more details.  fs/notify/* guts are convoluted
> enough to confuse anyone unfamiliar with them.

This ordering relies on the full barrier in srcu_read_lock().  There was
a time when srcu_read_lock() did not have a full barrier, and there
might well be a time in the future when srcu_read_lock() no longer has a
full barrier.  So please add smp_mb__after_srcu_read_unlock() immediately
after the srcu_read_lock() if you are relying on that full barrier.

							Thanx, Paul