[RHEL7,COMMIT] ms/jbd2: add support for avoiding data writes during transaction commits

Submitted by Konstantin Khorenko on Oct. 25, 2019, 10:03 p.m.

Details

Message ID 201910252203.x9PM3IjQ006880@finist-ce7.sw.ru
State New
Series "ms/jbd2: add support for avoiding data writes during transaction commits"
Headers show

Commit Message

Konstantin Khorenko Oct. 25, 2019, 10:03 p.m.
The commit is pushed to "branch-rh7-3.10.0-1062.1.2.vz7.114.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1062.1.2.vz7.114.10
------>
commit ecc4310112d7c3fa39be7237ac0f1e50308e6494
Author: Jan Kara <jack@suse.cz>
Date:   Sat Oct 26 01:03:18 2019 +0300

    ms/jbd2: add support for avoiding data writes during transaction commits
    
    Currently, when a filesystem needs to make sure data is on permanent
    storage before committing a transaction, it adds the inode to the
    transaction's inode list. During transaction commit, jbd2 writes back
    all dirty buffers that have allocated underlying blocks and waits for
    the IO to finish. However, when doing writeback for delayed-allocated
    data, we allocate blocks and immediately submit the data. Thus, asking
    jbd2 to write dirty pages just adds unnecessary work to jbd2, possibly
    writing back other redirtied blocks.
    
    Add support to jbd2 that allows a filesystem to ask jbd2 to only wait
    for outstanding data writes before committing a transaction, and thus
    avoid unnecessary writes.
    
    Signed-off-by: Jan Kara <jack@suse.cz>
    Signed-off-by: Theodore Ts'o <tytso@mit.edu>
    (cherry picked from commit 41617e1a8dec9fe082ba5dec26bacb154eb55482)
    Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
---
 fs/ext4/ext4_jbd2.h   |  3 ++-
 fs/jbd2/commit.c      |  4 ++++
 fs/jbd2/journal.c     |  3 ++-
 fs/jbd2/transaction.c | 22 ++++++++++++++++++----
 fs/ocfs2/journal.h    |  2 +-
 include/linux/jbd2.h  | 13 +++++++++++--
 6 files changed, 38 insertions(+), 9 deletions(-)

Patch hide | download patch | download mbox

diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 17c00ff202f2..4688225d8b23 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -364,7 +364,8 @@  static inline int ext4_journal_force_commit(journal_t *journal)
 static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
 {
 	if (ext4_handle_valid(handle))
-		return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode);
+		return jbd2_journal_inode_add_write(handle,
+						    EXT4_I(inode)->jinode);
 	return 0;
 }
 
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index fd83a2abed7e..06f1c8697f27 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -223,6 +223,8 @@  static int journal_submit_data_buffers(journal_t *journal,
 
 	spin_lock(&journal->j_list_lock);
 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+		if (!(jinode->i_flags & JI_WRITE_DATA))
+			continue;
 		mapping = jinode->i_vfs_inode->i_mapping;
 		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 		spin_unlock(&journal->j_list_lock);
@@ -260,6 +262,8 @@  static int journal_finish_inode_data_buffers(journal_t *journal,
 	/* For locking, see the comment in journal_submit_data_buffers() */
 	spin_lock(&journal->j_list_lock);
 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+		if (!(jinode->i_flags & JI_WAIT_DATA))
+			continue;
 		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 		spin_unlock(&journal->j_list_lock);
 		err = filemap_fdatawait_keep_errors(
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 9a128e1d719c..977e8a4b5088 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -94,7 +94,8 @@  EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
 EXPORT_SYMBOL(jbd2_journal_invalidatepage);
 EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
 EXPORT_SYMBOL(jbd2_journal_force_commit);
-EXPORT_SYMBOL(jbd2_journal_file_inode);
+EXPORT_SYMBOL(jbd2_journal_inode_add_write);
+EXPORT_SYMBOL(jbd2_journal_inode_add_wait);
 EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
 EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
 EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 9a7d1a57f4f7..55003633997c 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2444,7 +2444,8 @@  void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
 /*
  * File inode in the inode list of the handle's transaction
  */
-int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
+static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
+				   unsigned long flags)
 {
 	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal;
@@ -2469,12 +2470,14 @@  int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
 	 * and if jinode->i_next_transaction == transaction, commit code
 	 * will only file the inode where we want it.
 	 */
-	if (jinode->i_transaction == transaction ||
-	    jinode->i_next_transaction == transaction)
+	if ((jinode->i_transaction == transaction ||
+	    jinode->i_next_transaction == transaction) &&
+	    (jinode->i_flags & flags) == flags)
 		return 0;
 
 	spin_lock(&journal->j_list_lock);
-
+	jinode->i_flags |= flags;
+	/* Is inode already attached where we need it? */
 	if (jinode->i_transaction == transaction ||
 	    jinode->i_next_transaction == transaction)
 		goto done;
@@ -2505,6 +2508,17 @@  int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
 	return 0;
 }
 
+int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode)
+{
+	return jbd2_journal_file_inode(handle, jinode,
+				       JI_WRITE_DATA | JI_WAIT_DATA);
+}
+
+int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode)
+{
+	return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA);
+}
+
 /*
  * File truncate and transaction commit interact with each other in a
  * non-trivial way.  If a transaction writing data block A is
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index a3385b63ff5e..ce007f610f71 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -605,7 +605,7 @@  static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
 
 static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
 {
-	return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode);
+	return jbd2_journal_inode_add_write(handle, &OCFS2_I(inode)->ip_jinode);
 }
 
 static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index e908348fe348..06d7a6febe5b 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -367,11 +367,19 @@  BUFFER_FNS(Verified, verified)
 
 /* Flags in jbd_inode->i_flags */
 #define __JI_COMMIT_RUNNING 0
-/* Commit of the inode data in progress. We use this flag to protect us from
+#define __JI_WRITE_DATA 1
+#define __JI_WAIT_DATA 2
+
+/*
+ * Commit of the inode data in progress. We use this flag to protect us from
  * concurrent deletion of inode. We cannot use reference to inode for this
  * since we cannot afford doing last iput() on behalf of kjournald
  */
 #define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING)
+/* Write allocated dirty buffers in this inode before commit */
+#define JI_WRITE_DATA (1 << __JI_WRITE_DATA)
+/* Wait for outstanding data writes for this inode before commit */
+#define JI_WAIT_DATA (1 << __JI_WAIT_DATA)
 
 /**
  * struct jbd_inode is the structure linking inodes in ordered mode
@@ -1170,7 +1178,8 @@  extern int	   jbd2_journal_clear_err  (journal_t *);
 extern int	   jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
 extern int	   jbd2_journal_force_commit(journal_t *);
 extern int	   jbd2_journal_force_commit_nested(journal_t *);
-extern int	   jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode);
+extern int	   jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *inode);
+extern int	   jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *inode);
 extern int	   jbd2_journal_begin_ordered_truncate(journal_t *journal,
 				struct jbd2_inode *inode, loff_t new_size);
 extern void	   jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);