[Devel,vz7] md: propagate QUEUE_FLAG_SG_GAPS

Submitted by Maxim Patlasov on Dec. 9, 2016, 5:17 a.m.

Details

Message ID 148126051878.4972.9055285145385534351.stgit@maxim-thinkpad
State New
Series "md: propagate QUEUE_FLAG_SG_GAPS"
Headers show

Commit Message

Maxim Patlasov Dec. 9, 2016, 5:17 a.m.
NVMe disk driver disables sg gaps by setting the flag in its queue.
md-raids must propagate the flag to its queue as well. Otherwise, an upper
user will get OK from bio_add_page() even if a gap exists, and that bio
with a gap will crash NVMe driver who doesn't expect gaps.
(see bvec_gap_to_prev for the definotion of gap)

https://jira.sw.ru/browse/PSBM-56838

Signed-off-by: Maxim Patlasov <mpatlasov@virtuozzo.com>
---
 drivers/md/linear.c    |    9 +++++++++
 drivers/md/raid0.c     |    8 ++++++++
 drivers/md/raid1.c     |   11 +++++++++++
 drivers/md/raid10.c    |   12 ++++++++++++
 drivers/md/raid5.c     |   10 ++++++++++
 include/linux/blkdev.h |    1 +
 6 files changed, 51 insertions(+)

Patch hide | download patch | download mbox

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 3310b59..e57b8ff 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -128,6 +128,7 @@  static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
 	struct md_rdev *rdev;
 	int i, cnt;
 	bool discard_supported = false;
+	bool sg_gaps_disabled = false;
 
 	conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info),
 			GFP_KERNEL);
@@ -163,6 +164,9 @@  static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
 
 		if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
 			discard_supported = true;
+
+		if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
+			sg_gaps_disabled = true;
 	}
 	if (cnt != raid_disks) {
 		printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n",
@@ -175,6 +179,11 @@  static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
 	else
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
 
+	if (!sg_gaps_disabled)
+		queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
+	else
+		queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
+
 	/*
 	 * Here we calculate the device offsets.
 	 */
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 100ef23..2b77d0f 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -435,6 +435,7 @@  static int raid0_run(struct mddev *mddev)
 	if (mddev->queue) {
 		struct md_rdev *rdev;
 		bool discard_supported = false;
+		bool sg_gaps_disabled = false;
 
 		blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
 		blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
@@ -449,6 +450,8 @@  static int raid0_run(struct mddev *mddev)
 					  rdev->data_offset << 9);
 			if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
 				discard_supported = true;
+			if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
+				sg_gaps_disabled = true;
 		}
 
 		/* Unfortunately, some devices have awful discard performance,
@@ -470,6 +473,11 @@  static int raid0_run(struct mddev *mddev)
 			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
 		else
 			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+
+		if (!sg_gaps_disabled)
+			queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
+		else
+			queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
 	}
 
 	/* calculate array device size */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index b45b64c..a1763b5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1663,6 +1663,8 @@  static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	md_integrity_add_rdev(rdev, mddev);
 	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+	if (mddev->queue && blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
+		queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
 	print_conf(conf);
 	return err;
 }
@@ -2905,6 +2907,7 @@  static int run(struct mddev *mddev)
 	struct md_rdev *rdev;
 	int ret;
 	bool discard_supported = false;
+	bool sg_gaps_disabled = false;
 
 	if (mddev->level != 1) {
 		printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
@@ -2939,6 +2942,8 @@  static int run(struct mddev *mddev)
 				  rdev->data_offset << 9);
 		if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
 			discard_supported = true;
+		if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
+			sg_gaps_disabled = true;
 	}
 
 	mddev->degraded = 0;
@@ -2976,6 +2981,12 @@  static int run(struct mddev *mddev)
 		else
 			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
 						  mddev->queue);
+		if (sg_gaps_disabled)
+			queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS,
+						mddev->queue);
+		else
+			queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS,
+						  mddev->queue);
 	}
 
 	ret =  md_integrity_register(mddev);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index f0c7f3b..52f8d73 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1865,6 +1865,8 @@  static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	md_integrity_add_rdev(rdev, mddev);
 	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+	if (mddev->queue && blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
+		queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
 
 	print_conf(conf);
 	return err;
@@ -3650,6 +3652,7 @@  static int run(struct mddev *mddev)
 	sector_t min_offset_diff = 0;
 	int first = 1;
 	bool discard_supported = false;
+	bool sg_gaps_disabled = false;
 
 	if (mddev->private == NULL) {
 		conf = setup_conf(mddev);
@@ -3717,6 +3720,9 @@  static int run(struct mddev *mddev)
 
 		if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
 			discard_supported = true;
+
+		if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
+			sg_gaps_disabled = true;
 	}
 
 	if (mddev->queue) {
@@ -3726,6 +3732,12 @@  static int run(struct mddev *mddev)
 		else
 			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
 						  mddev->queue);
+		if (sg_gaps_disabled)
+			queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS,
+						mddev->queue);
+		else
+			queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS,
+						  mddev->queue);
 	}
 	/* need to check that every block has at least one working mirror */
 	if (!enough(conf, -1)) {
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 50902ad1..862b86f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6814,6 +6814,7 @@  static int run(struct mddev *mddev)
 	if (mddev->queue) {
 		int chunk_size;
 		bool discard_supported = true;
+		bool sg_gaps_disabled = false;
 		/* read-ahead size must cover two whole stripes, which
 		 * is 2 * (datadisks) * chunksize where 'n' is the
 		 * number of raid devices
@@ -6878,6 +6879,8 @@  static int run(struct mddev *mddev)
 				}
 				discard_supported = false;
 			}
+			if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
+				sg_gaps_disabled = true;
 		}
 
 		if (discard_supported &&
@@ -6888,6 +6891,13 @@  static int run(struct mddev *mddev)
 		else
 			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
 						mddev->queue);
+
+		if (sg_gaps_disabled)
+			queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS,
+						mddev->queue);
+		else
+			queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS,
+						mddev->queue);
 	}
 
 	return 0;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8b400c6..e3752b1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -639,6 +639,7 @@  static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 #define blk_queue_discard(q)	test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
 #define blk_queue_secdiscard(q)	(blk_queue_discard(q) && \
 	test_bit(QUEUE_FLAG_SECDISCARD, &(q)->queue_flags))
+#define blk_queue_sg_gaps(q)	test_bit(QUEUE_FLAG_SG_GAPS, &(q)->queue_flags)
 
 #define blk_noretry_request(rq) \
 	((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \

Comments

Vasily Averin Dec. 9, 2016, 5:21 a.m.
Maxim,
will you sent this to mainline too ?

thank you,
	Vasily Averin

On 12/09/2016 08:17 AM, Maxim Patlasov wrote:
> NVMe disk driver disables sg gaps by setting the flag in its queue.
> md-raids must propagate the flag to its queue as well. Otherwise, an upper
> user will get OK from bio_add_page() even if a gap exists, and that bio
> with a gap will crash NVMe driver who doesn't expect gaps.
> (see bvec_gap_to_prev for the definotion of gap)
> 
> https://jira.sw.ru/browse/PSBM-56838
> 
> Signed-off-by: Maxim Patlasov <mpatlasov@virtuozzo.com>
> ---
>  drivers/md/linear.c    |    9 +++++++++
>  drivers/md/raid0.c     |    8 ++++++++
>  drivers/md/raid1.c     |   11 +++++++++++
>  drivers/md/raid10.c    |   12 ++++++++++++
>  drivers/md/raid5.c     |   10 ++++++++++
>  include/linux/blkdev.h |    1 +
>  6 files changed, 51 insertions(+)
> 
> diff --git a/drivers/md/linear.c b/drivers/md/linear.c
> index 3310b59..e57b8ff 100644
> --- a/drivers/md/linear.c
> +++ b/drivers/md/linear.c
> @@ -128,6 +128,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
>  	struct md_rdev *rdev;
>  	int i, cnt;
>  	bool discard_supported = false;
> +	bool sg_gaps_disabled = false;
>  
>  	conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info),
>  			GFP_KERNEL);
> @@ -163,6 +164,9 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
>  
>  		if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
>  			discard_supported = true;
> +
> +		if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
> +			sg_gaps_disabled = true;
>  	}
>  	if (cnt != raid_disks) {
>  		printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n",
> @@ -175,6 +179,11 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
>  	else
>  		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
>  
> +	if (!sg_gaps_disabled)
> +		queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
> +	else
> +		queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
> +
>  	/*
>  	 * Here we calculate the device offsets.
>  	 */
> diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
> index 100ef23..2b77d0f 100644
> --- a/drivers/md/raid0.c
> +++ b/drivers/md/raid0.c
> @@ -435,6 +435,7 @@ static int raid0_run(struct mddev *mddev)
>  	if (mddev->queue) {
>  		struct md_rdev *rdev;
>  		bool discard_supported = false;
> +		bool sg_gaps_disabled = false;
>  
>  		blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
>  		blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
> @@ -449,6 +450,8 @@ static int raid0_run(struct mddev *mddev)
>  					  rdev->data_offset << 9);
>  			if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
>  				discard_supported = true;
> +			if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
> +				sg_gaps_disabled = true;
>  		}
>  
>  		/* Unfortunately, some devices have awful discard performance,
> @@ -470,6 +473,11 @@ static int raid0_run(struct mddev *mddev)
>  			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
>  		else
>  			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
> +
> +		if (!sg_gaps_disabled)
> +			queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
> +		else
> +			queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>  	}
>  
>  	/* calculate array device size */
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index b45b64c..a1763b5 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -1663,6 +1663,8 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
>  	md_integrity_add_rdev(rdev, mddev);
>  	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
>  		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
> +	if (mddev->queue && blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
> +		queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>  	print_conf(conf);
>  	return err;
>  }
> @@ -2905,6 +2907,7 @@ static int run(struct mddev *mddev)
>  	struct md_rdev *rdev;
>  	int ret;
>  	bool discard_supported = false;
> +	bool sg_gaps_disabled = false;
>  
>  	if (mddev->level != 1) {
>  		printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
> @@ -2939,6 +2942,8 @@ static int run(struct mddev *mddev)
>  				  rdev->data_offset << 9);
>  		if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
>  			discard_supported = true;
> +		if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
> +			sg_gaps_disabled = true;
>  	}
>  
>  	mddev->degraded = 0;
> @@ -2976,6 +2981,12 @@ static int run(struct mddev *mddev)
>  		else
>  			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
>  						  mddev->queue);
> +		if (sg_gaps_disabled)
> +			queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS,
> +						mddev->queue);
> +		else
> +			queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS,
> +						  mddev->queue);
>  	}
>  
>  	ret =  md_integrity_register(mddev);
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index f0c7f3b..52f8d73 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -1865,6 +1865,8 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
>  	md_integrity_add_rdev(rdev, mddev);
>  	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
>  		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
> +	if (mddev->queue && blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
> +		queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>  
>  	print_conf(conf);
>  	return err;
> @@ -3650,6 +3652,7 @@ static int run(struct mddev *mddev)
>  	sector_t min_offset_diff = 0;
>  	int first = 1;
>  	bool discard_supported = false;
> +	bool sg_gaps_disabled = false;
>  
>  	if (mddev->private == NULL) {
>  		conf = setup_conf(mddev);
> @@ -3717,6 +3720,9 @@ static int run(struct mddev *mddev)
>  
>  		if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
>  			discard_supported = true;
> +
> +		if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
> +			sg_gaps_disabled = true;
>  	}
>  
>  	if (mddev->queue) {
> @@ -3726,6 +3732,12 @@ static int run(struct mddev *mddev)
>  		else
>  			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
>  						  mddev->queue);
> +		if (sg_gaps_disabled)
> +			queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS,
> +						mddev->queue);
> +		else
> +			queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS,
> +						  mddev->queue);
>  	}
>  	/* need to check that every block has at least one working mirror */
>  	if (!enough(conf, -1)) {
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 50902ad1..862b86f 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -6814,6 +6814,7 @@ static int run(struct mddev *mddev)
>  	if (mddev->queue) {
>  		int chunk_size;
>  		bool discard_supported = true;
> +		bool sg_gaps_disabled = false;
>  		/* read-ahead size must cover two whole stripes, which
>  		 * is 2 * (datadisks) * chunksize where 'n' is the
>  		 * number of raid devices
> @@ -6878,6 +6879,8 @@ static int run(struct mddev *mddev)
>  				}
>  				discard_supported = false;
>  			}
> +			if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
> +				sg_gaps_disabled = true;
>  		}
>  
>  		if (discard_supported &&
> @@ -6888,6 +6891,13 @@ static int run(struct mddev *mddev)
>  		else
>  			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
>  						mddev->queue);
> +
> +		if (sg_gaps_disabled)
> +			queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS,
> +						mddev->queue);
> +		else
> +			queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS,
> +						mddev->queue);
>  	}
>  
>  	return 0;
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 8b400c6..e3752b1 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -639,6 +639,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
>  #define blk_queue_discard(q)	test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
>  #define blk_queue_secdiscard(q)	(blk_queue_discard(q) && \
>  	test_bit(QUEUE_FLAG_SECDISCARD, &(q)->queue_flags))
> +#define blk_queue_sg_gaps(q)	test_bit(QUEUE_FLAG_SG_GAPS, &(q)->queue_flags)
>  
>  #define blk_noretry_request(rq) \
>  	((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
> 
>
Maxim Patlasov Dec. 9, 2016, 5:35 a.m.
Mainline is not affected. (they re-worked the whole engine completely so 
the problem vanished auto-magically)


On 12/08/2016 09:21 PM, Vasily Averin wrote:
> Maxim,
> will you sent this to mainline too ?
>
> thank you,
> 	Vasily Averin
>
> On 12/09/2016 08:17 AM, Maxim Patlasov wrote:
>> NVMe disk driver disables sg gaps by setting the flag in its queue.
>> md-raids must propagate the flag to its queue as well. Otherwise, an upper
>> user will get OK from bio_add_page() even if a gap exists, and that bio
>> with a gap will crash NVMe driver who doesn't expect gaps.
>> (see bvec_gap_to_prev for the definotion of gap)
>>
>> https://jira.sw.ru/browse/PSBM-56838
>>
>> Signed-off-by: Maxim Patlasov <mpatlasov@virtuozzo.com>
>> ---
>>   drivers/md/linear.c    |    9 +++++++++
>>   drivers/md/raid0.c     |    8 ++++++++
>>   drivers/md/raid1.c     |   11 +++++++++++
>>   drivers/md/raid10.c    |   12 ++++++++++++
>>   drivers/md/raid5.c     |   10 ++++++++++
>>   include/linux/blkdev.h |    1 +
>>   6 files changed, 51 insertions(+)
>>
>> diff --git a/drivers/md/linear.c b/drivers/md/linear.c
>> index 3310b59..e57b8ff 100644
>> --- a/drivers/md/linear.c
>> +++ b/drivers/md/linear.c
>> @@ -128,6 +128,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
>>   	struct md_rdev *rdev;
>>   	int i, cnt;
>>   	bool discard_supported = false;
>> +	bool sg_gaps_disabled = false;
>>   
>>   	conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info),
>>   			GFP_KERNEL);
>> @@ -163,6 +164,9 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
>>   
>>   		if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
>>   			discard_supported = true;
>> +
>> +		if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>> +			sg_gaps_disabled = true;
>>   	}
>>   	if (cnt != raid_disks) {
>>   		printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n",
>> @@ -175,6 +179,11 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
>>   	else
>>   		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
>>   
>> +	if (!sg_gaps_disabled)
>> +		queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>> +	else
>> +		queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>> +
>>   	/*
>>   	 * Here we calculate the device offsets.
>>   	 */
>> diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
>> index 100ef23..2b77d0f 100644
>> --- a/drivers/md/raid0.c
>> +++ b/drivers/md/raid0.c
>> @@ -435,6 +435,7 @@ static int raid0_run(struct mddev *mddev)
>>   	if (mddev->queue) {
>>   		struct md_rdev *rdev;
>>   		bool discard_supported = false;
>> +		bool sg_gaps_disabled = false;
>>   
>>   		blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
>>   		blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
>> @@ -449,6 +450,8 @@ static int raid0_run(struct mddev *mddev)
>>   					  rdev->data_offset << 9);
>>   			if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
>>   				discard_supported = true;
>> +			if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>> +				sg_gaps_disabled = true;
>>   		}
>>   
>>   		/* Unfortunately, some devices have awful discard performance,
>> @@ -470,6 +473,11 @@ static int raid0_run(struct mddev *mddev)
>>   			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
>>   		else
>>   			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
>> +
>> +		if (!sg_gaps_disabled)
>> +			queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>> +		else
>> +			queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>>   	}
>>   
>>   	/* calculate array device size */
>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>> index b45b64c..a1763b5 100644
>> --- a/drivers/md/raid1.c
>> +++ b/drivers/md/raid1.c
>> @@ -1663,6 +1663,8 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
>>   	md_integrity_add_rdev(rdev, mddev);
>>   	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
>>   		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
>> +	if (mddev->queue && blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>> +		queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>>   	print_conf(conf);
>>   	return err;
>>   }
>> @@ -2905,6 +2907,7 @@ static int run(struct mddev *mddev)
>>   	struct md_rdev *rdev;
>>   	int ret;
>>   	bool discard_supported = false;
>> +	bool sg_gaps_disabled = false;
>>   
>>   	if (mddev->level != 1) {
>>   		printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
>> @@ -2939,6 +2942,8 @@ static int run(struct mddev *mddev)
>>   				  rdev->data_offset << 9);
>>   		if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
>>   			discard_supported = true;
>> +		if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>> +			sg_gaps_disabled = true;
>>   	}
>>   
>>   	mddev->degraded = 0;
>> @@ -2976,6 +2981,12 @@ static int run(struct mddev *mddev)
>>   		else
>>   			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
>>   						  mddev->queue);
>> +		if (sg_gaps_disabled)
>> +			queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS,
>> +						mddev->queue);
>> +		else
>> +			queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS,
>> +						  mddev->queue);
>>   	}
>>   
>>   	ret =  md_integrity_register(mddev);
>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>> index f0c7f3b..52f8d73 100644
>> --- a/drivers/md/raid10.c
>> +++ b/drivers/md/raid10.c
>> @@ -1865,6 +1865,8 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
>>   	md_integrity_add_rdev(rdev, mddev);
>>   	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
>>   		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
>> +	if (mddev->queue && blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>> +		queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>>   
>>   	print_conf(conf);
>>   	return err;
>> @@ -3650,6 +3652,7 @@ static int run(struct mddev *mddev)
>>   	sector_t min_offset_diff = 0;
>>   	int first = 1;
>>   	bool discard_supported = false;
>> +	bool sg_gaps_disabled = false;
>>   
>>   	if (mddev->private == NULL) {
>>   		conf = setup_conf(mddev);
>> @@ -3717,6 +3720,9 @@ static int run(struct mddev *mddev)
>>   
>>   		if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
>>   			discard_supported = true;
>> +
>> +		if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>> +			sg_gaps_disabled = true;
>>   	}
>>   
>>   	if (mddev->queue) {
>> @@ -3726,6 +3732,12 @@ static int run(struct mddev *mddev)
>>   		else
>>   			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
>>   						  mddev->queue);
>> +		if (sg_gaps_disabled)
>> +			queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS,
>> +						mddev->queue);
>> +		else
>> +			queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS,
>> +						  mddev->queue);
>>   	}
>>   	/* need to check that every block has at least one working mirror */
>>   	if (!enough(conf, -1)) {
>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>> index 50902ad1..862b86f 100644
>> --- a/drivers/md/raid5.c
>> +++ b/drivers/md/raid5.c
>> @@ -6814,6 +6814,7 @@ static int run(struct mddev *mddev)
>>   	if (mddev->queue) {
>>   		int chunk_size;
>>   		bool discard_supported = true;
>> +		bool sg_gaps_disabled = false;
>>   		/* read-ahead size must cover two whole stripes, which
>>   		 * is 2 * (datadisks) * chunksize where 'n' is the
>>   		 * number of raid devices
>> @@ -6878,6 +6879,8 @@ static int run(struct mddev *mddev)
>>   				}
>>   				discard_supported = false;
>>   			}
>> +			if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>> +				sg_gaps_disabled = true;
>>   		}
>>   
>>   		if (discard_supported &&
>> @@ -6888,6 +6891,13 @@ static int run(struct mddev *mddev)
>>   		else
>>   			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
>>   						mddev->queue);
>> +
>> +		if (sg_gaps_disabled)
>> +			queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS,
>> +						mddev->queue);
>> +		else
>> +			queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS,
>> +						mddev->queue);
>>   	}
>>   
>>   	return 0;
>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>> index 8b400c6..e3752b1 100644
>> --- a/include/linux/blkdev.h
>> +++ b/include/linux/blkdev.h
>> @@ -639,6 +639,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
>>   #define blk_queue_discard(q)	test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
>>   #define blk_queue_secdiscard(q)	(blk_queue_discard(q) && \
>>   	test_bit(QUEUE_FLAG_SECDISCARD, &(q)->queue_flags))
>> +#define blk_queue_sg_gaps(q)	test_bit(QUEUE_FLAG_SG_GAPS, &(q)->queue_flags)
>>   
>>   #define blk_noretry_request(rq) \
>>   	((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
>>
>>
Vasily Averin Dec. 9, 2016, 5:35 a.m.
Maxim,

as far as I understand it cannot be used for live-patching:
it ssems for me .run is called on start of mdraid thread only.
Can you propose some live-patch-ready variant of this fix?

thank you,
	Vasily Averin


On 12/09/2016 08:21 AM, Vasily Averin wrote:
> Maxim,
> will you sent this to mainline too ?
> 
> thank you,
> 	Vasily Averin
> 
> On 12/09/2016 08:17 AM, Maxim Patlasov wrote:
>> NVMe disk driver disables sg gaps by setting the flag in its queue.
>> md-raids must propagate the flag to its queue as well. Otherwise, an upper
>> user will get OK from bio_add_page() even if a gap exists, and that bio
>> with a gap will crash NVMe driver who doesn't expect gaps.
>> (see bvec_gap_to_prev for the definotion of gap)
>>
>> https://jira.sw.ru/browse/PSBM-56838
>>
>> Signed-off-by: Maxim Patlasov <mpatlasov@virtuozzo.com>
>> ---
>>  drivers/md/linear.c    |    9 +++++++++
>>  drivers/md/raid0.c     |    8 ++++++++
>>  drivers/md/raid1.c     |   11 +++++++++++
>>  drivers/md/raid10.c    |   12 ++++++++++++
>>  drivers/md/raid5.c     |   10 ++++++++++
>>  include/linux/blkdev.h |    1 +
>>  6 files changed, 51 insertions(+)
>>
>> diff --git a/drivers/md/linear.c b/drivers/md/linear.c
>> index 3310b59..e57b8ff 100644
>> --- a/drivers/md/linear.c
>> +++ b/drivers/md/linear.c
>> @@ -128,6 +128,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
>>  	struct md_rdev *rdev;
>>  	int i, cnt;
>>  	bool discard_supported = false;
>> +	bool sg_gaps_disabled = false;
>>  
>>  	conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info),
>>  			GFP_KERNEL);
>> @@ -163,6 +164,9 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
>>  
>>  		if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
>>  			discard_supported = true;
>> +
>> +		if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>> +			sg_gaps_disabled = true;
>>  	}
>>  	if (cnt != raid_disks) {
>>  		printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n",
>> @@ -175,6 +179,11 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
>>  	else
>>  		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
>>  
>> +	if (!sg_gaps_disabled)
>> +		queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>> +	else
>> +		queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>> +
>>  	/*
>>  	 * Here we calculate the device offsets.
>>  	 */
>> diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
>> index 100ef23..2b77d0f 100644
>> --- a/drivers/md/raid0.c
>> +++ b/drivers/md/raid0.c
>> @@ -435,6 +435,7 @@ static int raid0_run(struct mddev *mddev)
>>  	if (mddev->queue) {
>>  		struct md_rdev *rdev;
>>  		bool discard_supported = false;
>> +		bool sg_gaps_disabled = false;
>>  
>>  		blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
>>  		blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
>> @@ -449,6 +450,8 @@ static int raid0_run(struct mddev *mddev)
>>  					  rdev->data_offset << 9);
>>  			if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
>>  				discard_supported = true;
>> +			if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>> +				sg_gaps_disabled = true;
>>  		}
>>  
>>  		/* Unfortunately, some devices have awful discard performance,
>> @@ -470,6 +473,11 @@ static int raid0_run(struct mddev *mddev)
>>  			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
>>  		else
>>  			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
>> +
>> +		if (!sg_gaps_disabled)
>> +			queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>> +		else
>> +			queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>>  	}
>>  
>>  	/* calculate array device size */
>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>> index b45b64c..a1763b5 100644
>> --- a/drivers/md/raid1.c
>> +++ b/drivers/md/raid1.c
>> @@ -1663,6 +1663,8 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
>>  	md_integrity_add_rdev(rdev, mddev);
>>  	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
>>  		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
>> +	if (mddev->queue && blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>> +		queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>>  	print_conf(conf);
>>  	return err;
>>  }
>> @@ -2905,6 +2907,7 @@ static int run(struct mddev *mddev)
>>  	struct md_rdev *rdev;
>>  	int ret;
>>  	bool discard_supported = false;
>> +	bool sg_gaps_disabled = false;
>>  
>>  	if (mddev->level != 1) {
>>  		printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
>> @@ -2939,6 +2942,8 @@ static int run(struct mddev *mddev)
>>  				  rdev->data_offset << 9);
>>  		if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
>>  			discard_supported = true;
>> +		if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>> +			sg_gaps_disabled = true;
>>  	}
>>  
>>  	mddev->degraded = 0;
>> @@ -2976,6 +2981,12 @@ static int run(struct mddev *mddev)
>>  		else
>>  			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
>>  						  mddev->queue);
>> +		if (sg_gaps_disabled)
>> +			queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS,
>> +						mddev->queue);
>> +		else
>> +			queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS,
>> +						  mddev->queue);
>>  	}
>>  
>>  	ret =  md_integrity_register(mddev);
>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>> index f0c7f3b..52f8d73 100644
>> --- a/drivers/md/raid10.c
>> +++ b/drivers/md/raid10.c
>> @@ -1865,6 +1865,8 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
>>  	md_integrity_add_rdev(rdev, mddev);
>>  	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
>>  		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
>> +	if (mddev->queue && blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>> +		queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>>  
>>  	print_conf(conf);
>>  	return err;
>> @@ -3650,6 +3652,7 @@ static int run(struct mddev *mddev)
>>  	sector_t min_offset_diff = 0;
>>  	int first = 1;
>>  	bool discard_supported = false;
>> +	bool sg_gaps_disabled = false;
>>  
>>  	if (mddev->private == NULL) {
>>  		conf = setup_conf(mddev);
>> @@ -3717,6 +3720,9 @@ static int run(struct mddev *mddev)
>>  
>>  		if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
>>  			discard_supported = true;
>> +
>> +		if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>> +			sg_gaps_disabled = true;
>>  	}
>>  
>>  	if (mddev->queue) {
>> @@ -3726,6 +3732,12 @@ static int run(struct mddev *mddev)
>>  		else
>>  			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
>>  						  mddev->queue);
>> +		if (sg_gaps_disabled)
>> +			queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS,
>> +						mddev->queue);
>> +		else
>> +			queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS,
>> +						  mddev->queue);
>>  	}
>>  	/* need to check that every block has at least one working mirror */
>>  	if (!enough(conf, -1)) {
>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>> index 50902ad1..862b86f 100644
>> --- a/drivers/md/raid5.c
>> +++ b/drivers/md/raid5.c
>> @@ -6814,6 +6814,7 @@ static int run(struct mddev *mddev)
>>  	if (mddev->queue) {
>>  		int chunk_size;
>>  		bool discard_supported = true;
>> +		bool sg_gaps_disabled = false;
>>  		/* read-ahead size must cover two whole stripes, which
>>  		 * is 2 * (datadisks) * chunksize where 'n' is the
>>  		 * number of raid devices
>> @@ -6878,6 +6879,8 @@ static int run(struct mddev *mddev)
>>  				}
>>  				discard_supported = false;
>>  			}
>> +			if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>> +				sg_gaps_disabled = true;
>>  		}
>>  
>>  		if (discard_supported &&
>> @@ -6888,6 +6891,13 @@ static int run(struct mddev *mddev)
>>  		else
>>  			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
>>  						mddev->queue);
>> +
>> +		if (sg_gaps_disabled)
>> +			queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS,
>> +						mddev->queue);
>> +		else
>> +			queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS,
>> +						mddev->queue);
>>  	}
>>  
>>  	return 0;
>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>> index 8b400c6..e3752b1 100644
>> --- a/include/linux/blkdev.h
>> +++ b/include/linux/blkdev.h
>> @@ -639,6 +639,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
>>  #define blk_queue_discard(q)	test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
>>  #define blk_queue_secdiscard(q)	(blk_queue_discard(q) && \
>>  	test_bit(QUEUE_FLAG_SECDISCARD, &(q)->queue_flags))
>> +#define blk_queue_sg_gaps(q)	test_bit(QUEUE_FLAG_SG_GAPS, &(q)->queue_flags)
>>  
>>  #define blk_noretry_request(rq) \
>>  	((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
>>
>>
>
Vasily Averin Dec. 9, 2016, 6:12 a.m.
how about RHEL7.3 ?
if they are affected too -- it makes sense to report them.

On 12/09/2016 08:35 AM, Maxim Patlasov wrote:
> Mainline is not affected. (they re-worked the whole engine completely so the problem vanished auto-magically)
> 
> On 12/08/2016 09:21 PM, Vasily Averin wrote:
>> Maxim,
>> will you sent this to mainline too ?
>>
>> thank you,
>>     Vasily Averin
>>
>> On 12/09/2016 08:17 AM, Maxim Patlasov wrote:
>>> NVMe disk driver disables sg gaps by setting the flag in its queue.
>>> md-raids must propagate the flag to its queue as well. Otherwise, an upper
>>> user will get OK from bio_add_page() even if a gap exists, and that bio
>>> with a gap will crash NVMe driver who doesn't expect gaps.
>>> (see bvec_gap_to_prev for the definotion of gap)
>>>
>>> https://jira.sw.ru/browse/PSBM-56838
>>>
>>> Signed-off-by: Maxim Patlasov <mpatlasov@virtuozzo.com>
>>> ---
>>>   drivers/md/linear.c    |    9 +++++++++
>>>   drivers/md/raid0.c     |    8 ++++++++
>>>   drivers/md/raid1.c     |   11 +++++++++++
>>>   drivers/md/raid10.c    |   12 ++++++++++++
>>>   drivers/md/raid5.c     |   10 ++++++++++
>>>   include/linux/blkdev.h |    1 +
>>>   6 files changed, 51 insertions(+)
>>>
>>> diff --git a/drivers/md/linear.c b/drivers/md/linear.c
>>> index 3310b59..e57b8ff 100644
>>> --- a/drivers/md/linear.c
>>> +++ b/drivers/md/linear.c
>>> @@ -128,6 +128,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
>>>       struct md_rdev *rdev;
>>>       int i, cnt;
>>>       bool discard_supported = false;
>>> +    bool sg_gaps_disabled = false;
>>>         conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info),
>>>               GFP_KERNEL);
>>> @@ -163,6 +164,9 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
>>>             if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
>>>               discard_supported = true;
>>> +
>>> +        if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>>> +            sg_gaps_disabled = true;
>>>       }
>>>       if (cnt != raid_disks) {
>>>           printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n",
>>> @@ -175,6 +179,11 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
>>>       else
>>>           queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
>>>   +    if (!sg_gaps_disabled)
>>> +        queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>>> +    else
>>> +        queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>>> +
>>>       /*
>>>        * Here we calculate the device offsets.
>>>        */
>>> diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
>>> index 100ef23..2b77d0f 100644
>>> --- a/drivers/md/raid0.c
>>> +++ b/drivers/md/raid0.c
>>> @@ -435,6 +435,7 @@ static int raid0_run(struct mddev *mddev)
>>>       if (mddev->queue) {
>>>           struct md_rdev *rdev;
>>>           bool discard_supported = false;
>>> +        bool sg_gaps_disabled = false;
>>>             blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
>>>           blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
>>> @@ -449,6 +450,8 @@ static int raid0_run(struct mddev *mddev)
>>>                         rdev->data_offset << 9);
>>>               if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
>>>                   discard_supported = true;
>>> +            if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>>> +                sg_gaps_disabled = true;
>>>           }
>>>             /* Unfortunately, some devices have awful discard performance,
>>> @@ -470,6 +473,11 @@ static int raid0_run(struct mddev *mddev)
>>>               queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
>>>           else
>>>               queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
>>> +
>>> +        if (!sg_gaps_disabled)
>>> +            queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>>> +        else
>>> +            queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>>>       }
>>>         /* calculate array device size */
>>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>>> index b45b64c..a1763b5 100644
>>> --- a/drivers/md/raid1.c
>>> +++ b/drivers/md/raid1.c
>>> @@ -1663,6 +1663,8 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
>>>       md_integrity_add_rdev(rdev, mddev);
>>>       if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
>>>           queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
>>> +    if (mddev->queue && blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>>> +        queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>>>       print_conf(conf);
>>>       return err;
>>>   }
>>> @@ -2905,6 +2907,7 @@ static int run(struct mddev *mddev)
>>>       struct md_rdev *rdev;
>>>       int ret;
>>>       bool discard_supported = false;
>>> +    bool sg_gaps_disabled = false;
>>>         if (mddev->level != 1) {
>>>           printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
>>> @@ -2939,6 +2942,8 @@ static int run(struct mddev *mddev)
>>>                     rdev->data_offset << 9);
>>>           if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
>>>               discard_supported = true;
>>> +        if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>>> +            sg_gaps_disabled = true;
>>>       }
>>>         mddev->degraded = 0;
>>> @@ -2976,6 +2981,12 @@ static int run(struct mddev *mddev)
>>>           else
>>>               queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
>>>                             mddev->queue);
>>> +        if (sg_gaps_disabled)
>>> +            queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS,
>>> +                        mddev->queue);
>>> +        else
>>> +            queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS,
>>> +                          mddev->queue);
>>>       }
>>>         ret =  md_integrity_register(mddev);
>>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>>> index f0c7f3b..52f8d73 100644
>>> --- a/drivers/md/raid10.c
>>> +++ b/drivers/md/raid10.c
>>> @@ -1865,6 +1865,8 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
>>>       md_integrity_add_rdev(rdev, mddev);
>>>       if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
>>>           queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
>>> +    if (mddev->queue && blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>>> +        queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>>>         print_conf(conf);
>>>       return err;
>>> @@ -3650,6 +3652,7 @@ static int run(struct mddev *mddev)
>>>       sector_t min_offset_diff = 0;
>>>       int first = 1;
>>>       bool discard_supported = false;
>>> +    bool sg_gaps_disabled = false;
>>>         if (mddev->private == NULL) {
>>>           conf = setup_conf(mddev);
>>> @@ -3717,6 +3720,9 @@ static int run(struct mddev *mddev)
>>>             if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
>>>               discard_supported = true;
>>> +
>>> +        if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>>> +            sg_gaps_disabled = true;
>>>       }
>>>         if (mddev->queue) {
>>> @@ -3726,6 +3732,12 @@ static int run(struct mddev *mddev)
>>>           else
>>>               queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
>>>                             mddev->queue);
>>> +        if (sg_gaps_disabled)
>>> +            queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS,
>>> +                        mddev->queue);
>>> +        else
>>> +            queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS,
>>> +                          mddev->queue);
>>>       }
>>>       /* need to check that every block has at least one working mirror */
>>>       if (!enough(conf, -1)) {
>>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>>> index 50902ad1..862b86f 100644
>>> --- a/drivers/md/raid5.c
>>> +++ b/drivers/md/raid5.c
>>> @@ -6814,6 +6814,7 @@ static int run(struct mddev *mddev)
>>>       if (mddev->queue) {
>>>           int chunk_size;
>>>           bool discard_supported = true;
>>> +        bool sg_gaps_disabled = false;
>>>           /* read-ahead size must cover two whole stripes, which
>>>            * is 2 * (datadisks) * chunksize where 'n' is the
>>>            * number of raid devices
>>> @@ -6878,6 +6879,8 @@ static int run(struct mddev *mddev)
>>>                   }
>>>                   discard_supported = false;
>>>               }
>>> +            if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>>> +                sg_gaps_disabled = true;
>>>           }
>>>             if (discard_supported &&
>>> @@ -6888,6 +6891,13 @@ static int run(struct mddev *mddev)
>>>           else
>>>               queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
>>>                           mddev->queue);
>>> +
>>> +        if (sg_gaps_disabled)
>>> +            queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS,
>>> +                        mddev->queue);
>>> +        else
>>> +            queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS,
>>> +                        mddev->queue);
>>>       }
>>>         return 0;
>>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>>> index 8b400c6..e3752b1 100644
>>> --- a/include/linux/blkdev.h
>>> +++ b/include/linux/blkdev.h
>>> @@ -639,6 +639,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
>>>   #define blk_queue_discard(q)    test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
>>>   #define blk_queue_secdiscard(q)    (blk_queue_discard(q) && \
>>>       test_bit(QUEUE_FLAG_SECDISCARD, &(q)->queue_flags))
>>> +#define blk_queue_sg_gaps(q)    test_bit(QUEUE_FLAG_SG_GAPS, &(q)->queue_flags)
>>>     #define blk_noretry_request(rq) \
>>>       ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
>>>
>>>
> 
>
Maxim Patlasov Dec. 9, 2016, 9:11 p.m.
It's fixed in RHEL7.3 since 3.10.0-514.el7. So all our newer kernels 
(3.10.0-514.vz7.27.*) are not affected.


On 12/08/2016 10:12 PM, Vasily Averin wrote:
> how about RHEL7.3 ?
> if they are affected too -- it makes sense to report them.
>
> On 12/09/2016 08:35 AM, Maxim Patlasov wrote:
>> Mainline is not affected. (they re-worked the whole engine completely so the problem vanished auto-magically)
>>
>> On 12/08/2016 09:21 PM, Vasily Averin wrote:
>>> Maxim,
>>> will you sent this to mainline too ?
>>>
>>> thank you,
>>>      Vasily Averin
>>>
>>> On 12/09/2016 08:17 AM, Maxim Patlasov wrote:
>>>> NVMe disk driver disables sg gaps by setting the flag in its queue.
>>>> md-raids must propagate the flag to its queue as well. Otherwise, an upper
>>>> user will get OK from bio_add_page() even if a gap exists, and that bio
>>>> with a gap will crash NVMe driver who doesn't expect gaps.
>>>> (see bvec_gap_to_prev for the definotion of gap)
>>>>
>>>> https://jira.sw.ru/browse/PSBM-56838
>>>>
>>>> Signed-off-by: Maxim Patlasov <mpatlasov@virtuozzo.com>
>>>> ---
>>>>    drivers/md/linear.c    |    9 +++++++++
>>>>    drivers/md/raid0.c     |    8 ++++++++
>>>>    drivers/md/raid1.c     |   11 +++++++++++
>>>>    drivers/md/raid10.c    |   12 ++++++++++++
>>>>    drivers/md/raid5.c     |   10 ++++++++++
>>>>    include/linux/blkdev.h |    1 +
>>>>    6 files changed, 51 insertions(+)
>>>>
>>>> diff --git a/drivers/md/linear.c b/drivers/md/linear.c
>>>> index 3310b59..e57b8ff 100644
>>>> --- a/drivers/md/linear.c
>>>> +++ b/drivers/md/linear.c
>>>> @@ -128,6 +128,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
>>>>        struct md_rdev *rdev;
>>>>        int i, cnt;
>>>>        bool discard_supported = false;
>>>> +    bool sg_gaps_disabled = false;
>>>>          conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info),
>>>>                GFP_KERNEL);
>>>> @@ -163,6 +164,9 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
>>>>              if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
>>>>                discard_supported = true;
>>>> +
>>>> +        if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>>>> +            sg_gaps_disabled = true;
>>>>        }
>>>>        if (cnt != raid_disks) {
>>>>            printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n",
>>>> @@ -175,6 +179,11 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
>>>>        else
>>>>            queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
>>>>    +    if (!sg_gaps_disabled)
>>>> +        queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>>>> +    else
>>>> +        queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>>>> +
>>>>        /*
>>>>         * Here we calculate the device offsets.
>>>>         */
>>>> diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
>>>> index 100ef23..2b77d0f 100644
>>>> --- a/drivers/md/raid0.c
>>>> +++ b/drivers/md/raid0.c
>>>> @@ -435,6 +435,7 @@ static int raid0_run(struct mddev *mddev)
>>>>        if (mddev->queue) {
>>>>            struct md_rdev *rdev;
>>>>            bool discard_supported = false;
>>>> +        bool sg_gaps_disabled = false;
>>>>              blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
>>>>            blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
>>>> @@ -449,6 +450,8 @@ static int raid0_run(struct mddev *mddev)
>>>>                          rdev->data_offset << 9);
>>>>                if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
>>>>                    discard_supported = true;
>>>> +            if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>>>> +                sg_gaps_disabled = true;
>>>>            }
>>>>              /* Unfortunately, some devices have awful discard performance,
>>>> @@ -470,6 +473,11 @@ static int raid0_run(struct mddev *mddev)
>>>>                queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
>>>>            else
>>>>                queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
>>>> +
>>>> +        if (!sg_gaps_disabled)
>>>> +            queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>>>> +        else
>>>> +            queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>>>>        }
>>>>          /* calculate array device size */
>>>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>>>> index b45b64c..a1763b5 100644
>>>> --- a/drivers/md/raid1.c
>>>> +++ b/drivers/md/raid1.c
>>>> @@ -1663,6 +1663,8 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
>>>>        md_integrity_add_rdev(rdev, mddev);
>>>>        if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
>>>>            queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
>>>> +    if (mddev->queue && blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>>>> +        queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>>>>        print_conf(conf);
>>>>        return err;
>>>>    }
>>>> @@ -2905,6 +2907,7 @@ static int run(struct mddev *mddev)
>>>>        struct md_rdev *rdev;
>>>>        int ret;
>>>>        bool discard_supported = false;
>>>> +    bool sg_gaps_disabled = false;
>>>>          if (mddev->level != 1) {
>>>>            printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
>>>> @@ -2939,6 +2942,8 @@ static int run(struct mddev *mddev)
>>>>                      rdev->data_offset << 9);
>>>>            if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
>>>>                discard_supported = true;
>>>> +        if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>>>> +            sg_gaps_disabled = true;
>>>>        }
>>>>          mddev->degraded = 0;
>>>> @@ -2976,6 +2981,12 @@ static int run(struct mddev *mddev)
>>>>            else
>>>>                queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
>>>>                              mddev->queue);
>>>> +        if (sg_gaps_disabled)
>>>> +            queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS,
>>>> +                        mddev->queue);
>>>> +        else
>>>> +            queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS,
>>>> +                          mddev->queue);
>>>>        }
>>>>          ret =  md_integrity_register(mddev);
>>>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>>>> index f0c7f3b..52f8d73 100644
>>>> --- a/drivers/md/raid10.c
>>>> +++ b/drivers/md/raid10.c
>>>> @@ -1865,6 +1865,8 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
>>>>        md_integrity_add_rdev(rdev, mddev);
>>>>        if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
>>>>            queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
>>>> +    if (mddev->queue && blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>>>> +        queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
>>>>          print_conf(conf);
>>>>        return err;
>>>> @@ -3650,6 +3652,7 @@ static int run(struct mddev *mddev)
>>>>        sector_t min_offset_diff = 0;
>>>>        int first = 1;
>>>>        bool discard_supported = false;
>>>> +    bool sg_gaps_disabled = false;
>>>>          if (mddev->private == NULL) {
>>>>            conf = setup_conf(mddev);
>>>> @@ -3717,6 +3720,9 @@ static int run(struct mddev *mddev)
>>>>              if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
>>>>                discard_supported = true;
>>>> +
>>>> +        if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>>>> +            sg_gaps_disabled = true;
>>>>        }
>>>>          if (mddev->queue) {
>>>> @@ -3726,6 +3732,12 @@ static int run(struct mddev *mddev)
>>>>            else
>>>>                queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
>>>>                              mddev->queue);
>>>> +        if (sg_gaps_disabled)
>>>> +            queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS,
>>>> +                        mddev->queue);
>>>> +        else
>>>> +            queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS,
>>>> +                          mddev->queue);
>>>>        }
>>>>        /* need to check that every block has at least one working mirror */
>>>>        if (!enough(conf, -1)) {
>>>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>>>> index 50902ad1..862b86f 100644
>>>> --- a/drivers/md/raid5.c
>>>> +++ b/drivers/md/raid5.c
>>>> @@ -6814,6 +6814,7 @@ static int run(struct mddev *mddev)
>>>>        if (mddev->queue) {
>>>>            int chunk_size;
>>>>            bool discard_supported = true;
>>>> +        bool sg_gaps_disabled = false;
>>>>            /* read-ahead size must cover two whole stripes, which
>>>>             * is 2 * (datadisks) * chunksize where 'n' is the
>>>>             * number of raid devices
>>>> @@ -6878,6 +6879,8 @@ static int run(struct mddev *mddev)
>>>>                    }
>>>>                    discard_supported = false;
>>>>                }
>>>> +            if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
>>>> +                sg_gaps_disabled = true;
>>>>            }
>>>>              if (discard_supported &&
>>>> @@ -6888,6 +6891,13 @@ static int run(struct mddev *mddev)
>>>>            else
>>>>                queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
>>>>                            mddev->queue);
>>>> +
>>>> +        if (sg_gaps_disabled)
>>>> +            queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS,
>>>> +                        mddev->queue);
>>>> +        else
>>>> +            queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS,
>>>> +                        mddev->queue);
>>>>        }
>>>>          return 0;
>>>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>>>> index 8b400c6..e3752b1 100644
>>>> --- a/include/linux/blkdev.h
>>>> +++ b/include/linux/blkdev.h
>>>> @@ -639,6 +639,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
>>>>    #define blk_queue_discard(q)    test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
>>>>    #define blk_queue_secdiscard(q)    (blk_queue_discard(q) && \
>>>>        test_bit(QUEUE_FLAG_SECDISCARD, &(q)->queue_flags))
>>>> +#define blk_queue_sg_gaps(q)    test_bit(QUEUE_FLAG_SG_GAPS, &(q)->queue_flags)
>>>>      #define blk_noretry_request(rq) \
>>>>        ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
>>>>
>>>>
>>