[RHEL7,COMMIT] ms/fsnotify: Fix busy inodes during unmount

Submitted by Konstantin Khorenko on July 1, 2019, 9:23 a.m.

Details

Message ID 201907010923.x619N5pb032354@finist-ce7.sw.ru
State New
Series "ms/fsnotify: Fix busy inodes during unmount"
Headers show

Commit Message

Konstantin Khorenko July 1, 2019, 9:23 a.m.
The commit is pushed to "branch-rh7-3.10.0-957.21.3.vz7.106.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-957.21.3.vz7.106.3
------>
commit 497015bfeb8feb4dad5a5477a996268956f0637a
Author: Jan Kara <jack@suse.cz>
Date:   Mon Jul 1 12:23:05 2019 +0300

    ms/fsnotify: Fix busy inodes during unmount
    
    Detaching of mark connector from fsnotify_put_mark() can race with
    unmounting of the filesystem like:
    
      CPU1                          CPU2
    fsnotify_put_mark()
      spin_lock(&conn->lock);
      ...
      inode = fsnotify_detach_connector_from_object(conn)
      spin_unlock(&conn->lock);
                                    generic_shutdown_super()
                                      fsnotify_unmount_inodes()
                                        sees connector detached for inode
                                          -> nothing to do
                                      evict_inode()
                                        barfs on pending inode reference
      iput(inode);
    
    Resulting in "Busy inodes after unmount" message and possible kernel
    oops. Make fsnotify_unmount_inodes() properly wait for outstanding inode
    references from detached connectors.
    
    Note that the accounting of outstanding inode references in the
    superblock can cause some cacheline contention on the counter. OTOH it
    happens only during deletion of the last notification mark from an inode
    (or during unlinking of watched inode) and that is not too bad. I have
    measured time to create & delete inotify watch 100000 times from 64
    processes in parallel (each process having its own inotify group and its
    own file on a shared superblock) on a 64 CPU machine. Average and
    standard deviation of 15 runs look like:
    
            Avg             Stddev
    Vanilla 9.817400        0.276165
    Fixed   9.710467        0.228294
    
    So there's no statistically significant difference.
    
    Fixes: 6b3f05d24d35 ("fsnotify: Detach mark from object list when last reference is dropped")
    CC: stable@vger.kernel.org
    Signed-off-by: Jan Kara <jack@suse.cz>
    
    https://jira.sw.ru/browse/PSBM-95177
    Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
---
 fs/notify/fsnotify.c |  3 +++
 fs/notify/mark.c     | 39 +++++++++++++++++++++++++++++++--------
 include/linux/fs.h   |  3 +++
 3 files changed, 37 insertions(+), 8 deletions(-)

Patch hide | download patch | download mbox

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index d09f72b3a790..2501a2af6b10 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -96,6 +96,9 @@  void fsnotify_unmount_inodes(struct super_block *sb)
 
 	if (iput_inode)
 		iput(iput_inode);
+	/* Wait for outstanding inode references from connectors */
+	wait_var_event(&sb->s_fsnotify_inode_refs,
+		       !atomic_long_read(&sb->s_fsnotify_inode_refs));
 }
 
 /*
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 258d99087183..30f289c3912e 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -161,17 +161,20 @@  static void fsnotify_connector_destroy_workfn(struct work_struct *work)
 	}
 }
 
-static struct inode *fsnotify_detach_connector_from_object(
-					struct fsnotify_mark_connector *conn)
+static void *fsnotify_detach_connector_from_object(
+					struct fsnotify_mark_connector *conn,
+					unsigned int *flags)
 {
 	struct inode *inode = NULL;
 
+	*flags = conn->flags;
 	if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) {
 		inode = conn->inode;
 		rcu_assign_pointer(inode->i_fsnotify_marks, NULL);
 		inode->i_fsnotify_mask = 0;
 		conn->inode = NULL;
 		conn->flags &= ~FSNOTIFY_OBJ_TYPE_INODE;
+		atomic_long_inc(&inode->i_sb->s_fsnotify_inode_refs);
 	} else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
 		rcu_assign_pointer(real_mount(conn->mnt)->mnt_fsnotify_marks,
 				   NULL);
@@ -193,10 +196,29 @@  static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark)
 	fsnotify_put_group(group);
 }
 
+/* Drop object reference originally held by a connector */
+static void fsnotify_drop_object(unsigned int flags, void *objp)
+{
+	struct inode *inode;
+	struct super_block *sb;
+
+	if (!objp)
+		return;
+	/* Currently only inode references are passed to be dropped */
+	if (WARN_ON_ONCE(!(flags & FSNOTIFY_OBJ_TYPE_INODE)))
+		return;
+	inode = objp;
+	sb = inode->i_sb;
+	iput(inode);
+	if (atomic_long_dec_and_test(&sb->s_fsnotify_inode_refs))
+		wake_up_var(&sb->s_fsnotify_inode_refs);
+}
+
 void fsnotify_put_mark(struct fsnotify_mark *mark)
 {
 	struct fsnotify_mark_connector *conn;
-	struct inode *inode = NULL;
+	void *objp = NULL;
+	unsigned int flags = 0;
 	bool free_conn = false;
 
 	/* Catch marks that were actually never attached to object */
@@ -216,7 +238,7 @@  void fsnotify_put_mark(struct fsnotify_mark *mark)
 	conn = mark->connector;
 	hlist_del_init_rcu(&mark->obj_list);
 	if (hlist_empty(&conn->list)) {
-		inode = fsnotify_detach_connector_from_object(conn);
+		objp = fsnotify_detach_connector_from_object(conn, &flags);
 		free_conn = true;
 	} else {
 		__fsnotify_recalc_mask(conn);
@@ -224,7 +246,7 @@  void fsnotify_put_mark(struct fsnotify_mark *mark)
 	mark->connector = NULL;
 	spin_unlock(&conn->lock);
 
-	iput(inode);
+	fsnotify_drop_object(flags, objp);
 
 	if (free_conn) {
 		spin_lock(&destroy_lock);
@@ -694,7 +716,8 @@  void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp)
 {
 	struct fsnotify_mark_connector *conn;
 	struct fsnotify_mark *mark, *old_mark = NULL;
-	struct inode *inode;
+	void *objp;
+	unsigned int flags;
 
 	conn = fsnotify_grab_connector(connp);
 	if (!conn)
@@ -720,11 +743,11 @@  void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp)
 	 * mark references get dropped. It would lead to strange results such
 	 * as delaying inode deletion or blocking unmount.
 	 */
-	inode = fsnotify_detach_connector_from_object(conn);
+	objp = fsnotify_detach_connector_from_object(conn, &flags);
 	spin_unlock(&conn->lock);
 	if (old_mark)
 		fsnotify_put_mark(old_mark);
-	iput(inode);
+	fsnotify_drop_object(flags, objp);
 }
 
 /*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 67262dadfa01..d8195b005b1d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1739,6 +1739,9 @@  struct super_block {
 	RH_KABI_EXTEND(unsigned long	s_iflags)
 	RH_KABI_EXTEND(struct user_namespace *s_user_ns)
 
+	/* Pending fsnotify inode refs */
+	atomic_long_t s_fsnotify_inode_refs;
+
 	/*
 	 * Keep the lru lists last in the structure so they always sit on their
 	 * own individual cachelines.