[RHEL7,COMMIT] ploop: Provide interface to pass freezing fs during snapshot

Submitted by Vasily Averin on Sept. 15, 2020, 11:12 a.m.

Details

Message ID 202009151112.08FBCLrT021036@vz7build.vvs.sw.ru
State New
Series "ploop: Provide interface to pass freezing fs during snapshot"
Headers show

Commit Message

Vasily Averin Sept. 15, 2020, 11:12 a.m.
The commit is pushed to "branch-rh7-3.10.0-1127.18.2.vz7.163.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1127.18.2.vz7.163.22
------>
commit 56e1c4d968993c8985bb90d983cb563a43d32010
Author: Kirill Tkhai <ktkhai@virtuozzo.com>
Date:   Tue Sep 15 14:12:20 2020 +0300

    ploop: Provide interface to pass freezing fs during snapshot
    
    Userspace passes PLOOP_FLAG_FS_SYNC flag, when it wants
    to sync filesystem on snapshot. We need that because
    of ext4 may start transaction in one snapshot, and then
    the transaction metadata will actually be written in another
    snapshot.
    
    So, later, if we jump to old snapshot (snapshot-switch),
    ext4 mount code sees that transaction is in progress,
    and it tries to replay journal (it tries to write metadata
    to appropriate places like it sees it has to be according
    to journal).
    
    This is a reason of several problems such as impossibility
    to jump to old snapshot in RO. But not only that.
    
    To workaround that, we have PLOOP_FLAG_FS_SYNC, but it does
    not work with any configurations of device-mapper stack.
    Device mapper is supported by autodetect
    (see ploop_set_dm_crypt_bdev()) This autodetect supports
    only one configuration:
    
    ploop->dm_crypt->fs,
    
    while now we have another:
    
    ploop->dm_crypt->linear->fs.
    
    Instead of introducing new crutches in the autodetect,
    we better introduce a generic interface, which allows
    userspace to pass fd of a mountpoint we want to sync
    via the same ioctl. So, we won't need more crutches
    in autodetect, if configuration will change in the future.
    Userspace will just pass appropriate fd, while kernel
    remains the same.
    
    We reuse second chunk of struct ploop_ctl for fd of
    mountpoint we want to sync/freeze. In case we want to freeze
    several mount points in the future, we may reuse more
    chunks for that.
    
    (
    	We could introduce another interface, say,
    
    	struct ploop_snapshot
    	{
    		struct ploop_ctl ctl;
    		struct ploop_ctl_chunk chunk;
    		int mountpoints_fds[0]
    	}
    
    	but it requires a new flag in struct ploop_ctl::pctl_flags,
    	and it looks worse in general.
    )
    
    Userspace will able to detect, whether kernel supports
    passing mountpoint fd by a test request with ctl.pctl_chunks == 0.
    Old kernel returns -EINVAL, new kernel -ENOTSUPP.
    
    https://jira.sw.ru/browse/PSBM-105630
    Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
 drivers/block/ploop/dev.c | 64 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 56 insertions(+), 8 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index 113394d..24e69e0 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -3643,14 +3643,46 @@  void ploop_relax(struct ploop_device * plo)
 }
 EXPORT_SYMBOL(ploop_relax);
 
+static int get_bdev_from_fd(int fd, struct block_device **bdev)
+{
+	struct file *file = fget(fd);
+	struct inode *inode;
+	int ret = -ENODEV;
+
+	if (!file)
+		return -ENODEV;
+
+	inode = file_inode(file);
+	if (!inode)
+		goto fput;
+
+	*bdev = inode->i_sb->s_bdev;
+	if (*bdev) {
+		bdgrab(*bdev);
+		ret = 0;
+	}
+fput:
+	fput(file);
+	return ret;
+}
+
 /* search disk for first partition bdev with mounted fs and freeze it */
 static struct super_block *find_and_freeze_bdev(struct ploop_device *plo,
-						struct block_device ** bdev_pp)
+						int sync_fd,
+						struct block_device **bdev_pp)
 {
 	struct super_block  * sb   = NULL;
 	struct block_device * bdev = NULL;
 	struct gendisk *disk = plo->disk;
-	int i;
+	int i, ret;
+
+	if (sync_fd != -1) {
+		ret = get_bdev_from_fd(sync_fd, &bdev);
+		if (ret)
+			return ERR_PTR(ret);
+		sb = freeze_bdev(bdev);
+		goto out;
+	}
 
 	bdev = ploop_get_dm_crypt_bdev(plo);
 	if (bdev) {
@@ -3683,12 +3715,12 @@  out:
 static int ploop_snapshot(struct ploop_device * plo, unsigned long arg,
 			  struct block_device * bdev)
 {
-	int err;
 	struct ploop_ctl ctl;
-	struct ploop_ctl_chunk chunk;
+	struct ploop_ctl_chunk chunk, __user *uchunks;
 	struct ploop_delta * delta, * top_delta;
 	struct ploop_snapdata snapdata;
 	struct super_block * sb;
+	int err, sync_fd = -1;
 
 	if (list_empty(&plo->map.delta_list))
 		return -ENOENT;
@@ -3700,10 +3732,26 @@  static int ploop_snapshot(struct ploop_device * plo, unsigned long arg,
 
 	if (copy_from_user(&ctl, (void*)arg, sizeof(struct ploop_ctl)))
 		return -EFAULT;
+	uchunks = (void *)((void*)arg + sizeof(struct ploop_ctl));
+	if (ctl.pctl_chunks == 2) {
+		/*
+		 * We reused second chunk for fd of mountpoint to sync.
+		 * In case of we need to support more mountpoints,
+		 * we may reuse more chunks (chunks below #0 are mountpoints).
+		 */
+		if (copy_from_user(&chunk, &uchunks[1],
+				   sizeof(struct ploop_ctl_chunk)))
+			return -EFAULT;
+		if (chunk.pctl_type || chunk.pctl_flags || chunk.pctl_offset ||
+		    chunk.pctl_start || chunk.pctl_len)
+			return -EINVAL;
+		/* The rest of fields are ignored */
+		sync_fd = chunk.pctl_fd;
+		ctl.pctl_chunks = 1;
+	}
 	if (ctl.pctl_chunks != 1)
-		return -EINVAL;
-	if (copy_from_user(&chunk, (void*)arg + sizeof(struct ploop_ctl),
-			   sizeof(struct ploop_ctl_chunk)))
+		return -ENOTSUPP;
+	if (copy_from_user(&chunk, uchunks, sizeof(struct ploop_ctl_chunk)))
 		return -EFAULT;
 
 	delta = init_delta(plo, &ctl, -1);
@@ -3739,7 +3787,7 @@  static int ploop_snapshot(struct ploop_device * plo, unsigned long arg,
 		/* freeze_bdev() may trigger ploop_bd_full() */
 		plo->maintenance_type = PLOOP_MNTN_SNAPSHOT;
 		mutex_unlock(&plo->ctl_mutex);
-		sb = find_and_freeze_bdev(plo, &bdev);
+		sb = find_and_freeze_bdev(plo, sync_fd, &bdev);
 		mutex_lock(&plo->ctl_mutex);
 		plo->maintenance_type = PLOOP_MNTN_OFF;
 		if (IS_ERR(sb)) {