[RHEL8,COMMIT] cbt: New interface to save current mask snapshot in cbt

Submitted by Konstantin Khorenko on Oct. 25, 2019, 2:41 p.m.

Details

Message ID 201910251441.x9PEf3vh005124@finist_co8.work.ct
State New
Series "cbt: New interface to save current mask snapshot in cbt"
Headers show

Commit Message

Konstantin Khorenko Oct. 25, 2019, 2:41 p.m.
The commit is pushed to "branch-rh8-4.18.0-80.1.2.vz8.2.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-80.1.2.vz8.2.3
------>
commit c249ec2d605e6df3acf4c3bbab0f6c034b8ee953
Author: Kirill Tkhai <ktkhai@virtuozzo.com>
Date:   Fri Oct 25 17:41:03 2019 +0300

    cbt: New interface to save current mask snapshot in cbt
    
    During a backup, we want to save the current changed-blocks mask
    and to start tracking again from a clean mask.
    
    Previously, the mask was saved in another driver:
    ploop called cbt primitives and saved the mask in ploop
    device structures. This is better than saving the mask
    in userspace, because the mask stays alive even if
    userspace dies. The only thing needed after a backup process
    has died is to merge the saved mask back from the ploop driver
    into the cbt driver. Thus, all blocks changed since the previous
    successful backup are still available, and it is possible
    to create a partial backup even after userspace has segfaulted.
    
    This patchset continues the practice of saving the mask in the kernel,
    but makes it possible to save a CBT snapshot in the cbt driver itself,
    without spreading CBT structures over the kernel. It adds
    a new BLKCBTMISC ioctl, which allows creating, dropping and
    merging back a snapshot. The ioctl has 3 actions:
    
    * CMI_SNP_CREATE: creates a new mask snapshot and moves the
    changed-blocks mask there (the changed-blocks mask becomes empty
    after that).
    
    * CMI_SNP_DROP: drops the created snapshot (should be called after
    a successful backup).
    
    * CMI_SNP_MERGE_BACK: merges the snapshot bits back into the
    changed-blocks mask and destroys the snapshot (should be called
    after a failed backup).
    
    Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
 block/blk-cbt.c         | 148 +++++++++++++++++++++++++++++++++++++++++++-----
 include/linux/blkdev.h  |   6 --
 include/uapi/linux/fs.h |  19 +++++++
 3 files changed, 152 insertions(+), 21 deletions(-)

Patch hide | download patch | download mbox

diff --git a/block/blk-cbt.c b/block/blk-cbt.c
index 761ed683ea47..32949219e5b7 100644
--- a/block/blk-cbt.c
+++ b/block/blk-cbt.c
@@ -46,6 +46,10 @@  struct cbt_info {
 	unsigned int count;
 	struct cbt_extent __percpu *cache;
 	struct page **map;
+
+	struct page **snp_map;
+	blkcnt_t snp_block_max;
+
 	spinlock_t lock;
 };
 
@@ -297,14 +301,47 @@  static struct cbt_info* do_cbt_alloc(struct request_queue *q, __u8 *uuid,
 	return ERR_PTR(-ENOMEM);
 }
 
-int blk_cbt_map_copy_once(struct request_queue *q, __u8 *uuid,
-			  struct page ***map_ptr, blkcnt_t *block_max,
-			  blkcnt_t *block_bits)
+static void free_map(struct page **map, unsigned long npages)
+{
+	struct page **slot, **end = map + npages;
+
+	for (slot = map; slot != end; slot++)	/* walk all npages slots */
+		if (*slot)			/* empty slots hold no page */
+			__free_page(*slot);
+	vfree(map);	/* finally release the page-pointer array itself */
+}
+
+/* Copy @npages map pages to @user_addr; NULL slots are sent as zero pages. */
+static int copy_cbt_to_user(struct page **map, unsigned long npages,
+			    void __user *user_addr)
+{
+	unsigned long i;
+
+	for (i = 0; i < npages; i++) {
+		struct page *page = map[i] ? : ZERO_PAGE(0);
+
+		if (copy_to_user(user_addr, page_address(page), PAGE_SIZE))
+			return -EFAULT;
+		user_addr += PAGE_SIZE;	/* one full page per map slot */
+	}
+	return 0;
+}
+
+static int blk_cbt_snp_create(struct request_queue *q, __u8 *uuid,
+			      struct blk_user_cbt_snp_create __user *arg)
 {
 	struct cbt_info *cbt;
 	struct page **map;
 	unsigned long npages;
 	unsigned long i;
+	__u64 to_addr;
+	int ret;
+
+	if (copy_from_user(&to_addr, &arg->addr, sizeof(to_addr)))
+		return -EFAULT;
+
+	if ((unsigned long)to_addr != to_addr)
+		return -EINVAL;
 
 	mutex_lock(&cbt_mutex);
 	cbt = q->cbt;
@@ -322,6 +359,11 @@  int blk_cbt_map_copy_once(struct request_queue *q, __u8 *uuid,
 		return -EINVAL;
 	}
 
+	if (cbt->snp_map) {
+		mutex_unlock(&cbt_mutex);
+		return -EBUSY;
+	}
+
 	cbt_flush_cache(cbt);
 
 	npages = NR_PAGES(cbt->block_max);
@@ -348,12 +390,13 @@  int blk_cbt_map_copy_once(struct request_queue *q, __u8 *uuid,
 			unlock_page(page);
 		}
 	}
-	mutex_unlock(&cbt_mutex);
 
-	*map_ptr = map;
-	*block_max = cbt->block_max;
-	*block_bits = cbt->block_bits;
-	return 0;
+	cbt->snp_map = map;
+	cbt->snp_block_max = cbt->block_max;
+	ret = copy_cbt_to_user(map, npages, (void *)to_addr);
+
+	mutex_unlock(&cbt_mutex);
+	return ret;
 
 fail_pages:
 	while (--i >= 0) {
@@ -365,7 +408,43 @@  int blk_cbt_map_copy_once(struct request_queue *q, __u8 *uuid,
 	mutex_unlock(&cbt_mutex);
 	return -ENOMEM;
 }
-EXPORT_SYMBOL(blk_cbt_map_copy_once);
+
+/* CMI_SNP_DROP: detach the snapshot map under cbt_mutex, free it afterwards. */
+static int blk_cbt_snp_drop(struct request_queue *q, __u8 *uuid)
+{
+	struct page **snp = NULL;
+	unsigned long nr = 0;
+	struct cbt_info *cbt;
+	int err = -ENOENT;
+
+	mutex_lock(&cbt_mutex);
+	cbt = q->cbt;
+	if (!cbt)
+		goto unlock;
+
+	BUG_ON(!cbt->map);
+	BUG_ON(!cbt->block_max);
+
+	if (!uuid || memcmp(uuid, cbt->uuid, sizeof(cbt->uuid))) {
+		err = -EINVAL;
+		goto unlock;
+	}
+
+	/* err is still -ENOENT here: reported when there is no snapshot */
+	snp = cbt->snp_map;
+	if (!snp)
+		goto unlock;
+
+	nr = NR_PAGES(cbt->snp_block_max);
+	cbt->snp_map = NULL;
+	cbt->snp_block_max = 0;
+	err = 0;
+unlock:
+	mutex_unlock(&cbt_mutex);
+	if (snp)
+		free_map(snp, nr);
+	return err;
+}
 
 static void blk_cbt_page_merge(struct page *pg_from, struct page *pg_to)
 {
@@ -380,11 +459,11 @@  static void blk_cbt_page_merge(struct page *pg_from, struct page *pg_to)
 	}
 }
 
-int blk_cbt_map_merge(struct request_queue *q, __u8 *uuid,
-		      struct page **map, blkcnt_t block_max,
-		      blkcnt_t block_bits)
+static int blk_cbt_snp_merge_back(struct request_queue *q, __u8 *uuid)
 {
 	struct cbt_info *cbt;
+	blkcnt_t block_max;
+	struct page **map;
 	unsigned long i;
 
 	mutex_lock(&cbt_mutex);
@@ -398,8 +477,11 @@  int blk_cbt_map_merge(struct request_queue *q, __u8 *uuid,
 	BUG_ON(!cbt->map);
 	BUG_ON(!cbt->block_max);
 
+	map = cbt->snp_map;
+	block_max = cbt->snp_block_max;
+
 	if (!map || !uuid || memcmp(uuid, cbt->uuid, sizeof(cbt->uuid)) ||
-	    block_max != cbt->block_max || block_bits != cbt->block_bits) {
+	    block_max != cbt->block_max) {
 		mutex_unlock(&cbt_mutex);
 		return -EINVAL;
 	}
@@ -429,10 +511,14 @@  int blk_cbt_map_merge(struct request_queue *q, __u8 *uuid,
 		blk_cbt_page_merge(page_addon, page_main);
 		unlock_page(page_main);
 	}
+
+	cbt->snp_map = NULL;
+	cbt->snp_block_max = 0;
 	mutex_unlock(&cbt_mutex);
+
+	free_map(map, NR_PAGES(block_max));
 	return 0;
 }
-EXPORT_SYMBOL(blk_cbt_map_merge);
 
 void blk_cbt_update_size(struct block_device *bdev)
 {
@@ -522,8 +608,13 @@  static void cbt_release_callback(struct rcu_head *head)
 	for (i = 0; i < nr_pages; i++)
 		if (CBT_PAGE(cbt, i))
 			__free_page(CBT_PAGE(cbt, i));
-
 	vfree(cbt->map);
+
+	if (cbt->snp_map) {
+		nr_pages = NR_PAGES(cbt->snp_block_max);
+		free_map(cbt->snp_map, nr_pages);
+	}
+
 	free_percpu(cbt->cache);
 	kfree(cbt);
 }
@@ -829,6 +920,28 @@  static int cbt_ioc_set(struct block_device *bdev, struct blk_user_cbt_info __use
 	return ret;
 }
 
+/* BLKCBTMISC: read the common header and dispatch on its action field. */
+static int cbt_ioc_misc(struct block_device *bdev, void __user *arg)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+	struct blk_user_cbt_misc_info cmi;
+
+	if (copy_from_user(&cmi, arg, sizeof(cmi)))
+		return -EFAULT;
+
+	switch (cmi.action) {
+	case CMI_SNP_CREATE:
+		return blk_cbt_snp_create(q, cmi.uuid, arg);
+	case CMI_SNP_DROP:
+		return blk_cbt_snp_drop(q, cmi.uuid);
+	case CMI_SNP_MERGE_BACK:
+		return blk_cbt_snp_merge_back(q, cmi.uuid);
+	default:
+		/* ENOTSUPP is kernel-internal and must not reach userspace */
+		return -EOPNOTSUPP;
+	}
+}
+
 int blk_cbt_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 {
 	struct blk_user_cbt_info __user *ucbt_ioc = (struct blk_user_cbt_info __user *) arg;
@@ -855,6 +968,11 @@  int blk_cbt_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 			return -EACCES;
 
 		return cbt_ioc_set(bdev, ucbt_ioc, 0);
+	case BLKCBTMISC:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+
+		return cbt_ioc_misc(bdev, arg);
 	default:
 		BUG();
 	}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f7cbe3088873..85739fd0e298 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1666,12 +1666,6 @@  extern void blk_cbt_update_size(struct block_device *bdev);
 extern void blk_cbt_release(struct request_queue *q);
 extern void blk_cbt_bio_queue(struct request_queue *q, struct bio *bio);
 extern int blk_cbt_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg);
-extern int blk_cbt_map_copy_once(struct request_queue *q, __u8 *uuid,
-				 struct page ***map_ptr, blkcnt_t *block_max,
-				 blkcnt_t *block_bits);
-extern int blk_cbt_map_merge(struct request_queue *q, __u8 *uuid,
-			     struct page **map, blkcnt_t block_max,
-			     blkcnt_t block_bits);
 #else /* CONFIG_BLK_DEV_CBT */
 static inline void blk_cbt_update_size(struct block_device *bdev)
 {
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index f05175b1cdf7..f30b87dc0c31 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -259,11 +259,30 @@  enum CI_FLAGS
 	CI_FLAG_NEW_UUID = 2 /* BLKCBTSET update uuid */
 };
 
+/* Extension of cbt ioctls: common header read by the BLKCBTMISC handler */
+struct blk_user_cbt_misc_info {
+	__u8 uuid[16]; /* Bitmap UUID */
+/* Allocate and move pending map to CBT snapshot */
+#define CMI_SNP_CREATE		0
+/* Drop CBT snapshot */
+#define CMI_SNP_DROP		1
+/* Merge CBT snapshot bits back and drop CBT snapshot */
+#define CMI_SNP_MERGE_BACK	2
+	__u64 action; /* One of the CMI_SNP_* values above */
+	__u8 data[0]; /* Zero-length tail; presumably action-specific payload */
+};
+
+struct blk_user_cbt_snp_create {
+	struct blk_user_cbt_misc_info cmi; /* Common header (uuid + action) */
+	__u64 addr; /* User buffer address the snapshot map is copied to */
+};
+
 #define BLKCBTSTART _IOR(0x12,200, struct blk_user_cbt_info)
 #define BLKCBTSTOP _IO(0x12,201)
 #define BLKCBTGET _IOWR(0x12,202,struct blk_user_cbt_info)
 #define BLKCBTSET _IOR(0x12,203,struct blk_user_cbt_info)
 #define BLKCBTCLR _IOR(0x12,204,struct blk_user_cbt_info)
+#define BLKCBTMISC _IOWR(0x12,205,struct blk_user_cbt_misc_info)
 
 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 #define FIBMAP	   _IO(0x00,1)	/* bmap access */