[RH7,8/9] block: hook up writeback throttling

Submitted by Pavel Tikhomirov on Sept. 21, 2019, 8:32 a.m.

Details

Message ID 20190921083227.8990-9-ptikhomirov@virtuozzo.com
State New
Series "block: backport writeback throttling"
Headers show

Commit Message

Pavel Tikhomirov Sept. 21, 2019, 8:32 a.m.
From: Jens Axboe <axboe@fb.com>

Enable throttling of buffered writeback to make it a lot
smoother, and to have way less impact on other system activity.
Background writeback should be, by definition, background
activity. The fact that we flush huge bundles of it at a time
means that it potentially has heavy impacts on foreground workloads,
which isn't ideal. We can't easily limit the sizes of writes that
we do, since that would impact file system layout in the presence
of delayed allocation. So just throttle back buffered writeback,
unless someone is waiting for it.

The algorithm for when to throttle takes its inspiration from the
CoDel networking scheduling algorithm. Like CoDel, blk-wb monitors
the minimum latencies of requests over a window of time. In that
window of time, if the minimum latency of any request exceeds a
given target, then a scale count is incremented and the queue depth
is shrunk. The next monitoring window is shrunk accordingly. Unlike
CoDel, if we hit a window that exhibits good behavior, then we
simply decrement the scale count and re-calculate the limits for that
scale value. This prevents us from oscillating between a
close-to-ideal value and max all the time, instead remaining in the
windows where we get good behavior.

Unlike CoDel, blk-wb allows the scale count to go negative. This
happens if we primarily have writes going on. Unlike positive
scale counts, this doesn't change the size of the monitoring window.
When the heavy writers finish, blk-wb quickly snaps back to its
stable state of a zero scale count.

The patch registers a sysfs entry, 'wbt_lat_usec'. This sets the latency
target to be met. It defaults to 2 msec for non-rotational storage, and
75 msec for rotational storage. Setting this value to '0' disables
blk-wb. Generally, a user would not have to touch this setting.

We don't enable WBT on devices that are managed with CFQ, and have
a non-root block cgroup attached. If we have a proportional share setup
on this particular disk, then the wbt throttling will interfere with
that. We don't have a strong need for wbt for that case, since we will
rely on CFQ doing that for us.

Signed-off-by: Jens Axboe <axboe@fb.com>

https://jira.sw.ru/browse/PSBM-96243

(cherry picked from commit 87760e5eef359788047d6fd54fc12eec74ce0d27)

Also merged non block/blk-wbt* hunks of previous patch from:
8054b89f8fca ("blk-wbt: remove stat ops"),
fa224eed2b5e ("blk-wbt: cleanup disable-by-default for CFQ"),
a8a45941706b ("block: pass struct request instead of struct blk_issue_stat to wbt")

Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
---
 Documentation/block/queue-sysfs.txt |  7 +++
 block/Kconfig                       | 26 ++++++++++++
 block/blk-core.c                    | 20 ++++++---
 block/blk-mq.c                      | 17 +++++++-
 block/blk-settings.c                |  4 ++
 block/blk-sysfs.c                   | 66 +++++++++++++++++++++++++++++
 block/cfq-iosched.c                 | 11 +++++
 include/linux/blkdev.h              |  3 ++
 8 files changed, 148 insertions(+), 6 deletions(-)

Patch hide | download patch | download mbox

diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index e54ac1d53403..9a19a9ab9774 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -134,5 +134,12 @@  an IO scheduler name to this file will attempt to load that IO scheduler
 module, if it isn't already present in the system.
 
 
+wbt_lat_usec (RW)
+-----------------
+If the device is registered for writeback throttling, then this file shows
+the target minimum read latency. If this latency is exceeded in a given
+window of time (see wb_window_usec), then the writeback throttling will start
+scaling back writes.
+
 
 Jens Axboe <jens.axboe@oracle.com>, February 2009
diff --git a/block/Kconfig b/block/Kconfig
index bccfb7014208..18dfd93ef8b2 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -112,6 +112,32 @@  config BLK_DEBUG_FS
 	Unless you are building a kernel for a tiny system, you should
 	say Y here.
 
+config BLK_WBT
+	bool "Enable support for block device writeback throttling"
+	default n
+	---help---
+	Enabling this option enables the block layer to throttle buffered
+	background writeback from the VM, making it more smooth and having
+	less impact on foreground operations. The throttling is done
+	dynamically on an algorithm loosely based on CoDel, factoring in
+	the realtime performance of the disk.
+
+config BLK_WBT_SQ
+	bool "Single queue writeback throttling"
+	default n
+	depends on BLK_WBT
+	---help---
+	Enable writeback throttling by default on legacy single queue devices
+
+config BLK_WBT_MQ
+	bool "Multiqueue writeback throttling"
+	default y
+	depends on BLK_WBT
+	---help---
+	Enable writeback throttling by default on multiqueue devices.
+	Multiqueue currently doesn't have support for IO scheduling,
+	enabling this option is recommended.
+
 menu "Partition Types"
 
 source "block/partitions/Kconfig"
diff --git a/block/blk-core.c b/block/blk-core.c
index 40c313db03f5..2fc2a6247bf0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -42,6 +42,7 @@ 
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
 #include "blk-mq-sched.h"
+#include "blk-wbt.h"
 
 #ifdef CONFIG_DEBUG_FS
 struct dentry *blk_debugfs_root;
@@ -1016,6 +1017,7 @@  int blk_init_allocated_queue(struct request_queue *q)
 		q->exit_rq_fn(q, q->fq->flush_rq);
 out_free_flush_queue:
 	blk_free_flush_queue(q->fq);
+	wbt_exit(q);
 	return -ENOMEM;
 }
 EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1573,6 +1575,7 @@  void blk_requeue_request(struct request_queue *q, struct request *rq)
 	blk_delete_timer(rq);
 	blk_clear_rq_complete(rq);
 	trace_block_rq_requeue(q, rq);
+	wbt_requeue(q->rq_wb, rq);
 
 	if (rq->cmd_flags & REQ_QUEUED)
 		blk_queue_end_tag(q, rq);
@@ -1677,6 +1680,8 @@  void __blk_put_request(struct request_queue *q, struct request *req)
 	/* this is a bio leak */
 	WARN_ON(req->bio != NULL);
 
+	wbt_done(q->rq_wb, req);
+
 	/*
 	 * Request may not have originated from ll_rw_blk. if not,
 	 * it didn't come out of our reserved rq pools
@@ -1912,6 +1917,7 @@  void blk_queue_bio(struct request_queue *q, struct bio *bio)
 	int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
 	struct request *req, *free;
 	unsigned int request_count = 0;
+	unsigned int wb_acct;
 
 	/*
 	 * low level driver can indicate that it wants pages above a
@@ -1967,6 +1973,8 @@  void blk_queue_bio(struct request_queue *q, struct bio *bio)
 	}
 
 get_rq:
+	wb_acct = wbt_wait(q->rq_wb, bio, q->queue_lock);
+
 	/*
 	 * This sync check and mask will be re-done in init_request_from_bio(),
 	 * but we need to set it earlier to expose the sync flag to the
@@ -1983,11 +1991,14 @@  void blk_queue_bio(struct request_queue *q, struct bio *bio)
 	blk_queue_enter_live(q);
 	req = get_request(q, rw_flags, bio, 0);
 	if (IS_ERR(req)) {
+		__wbt_done(q->rq_wb, wb_acct);
 		blk_queue_exit(q);
 		bio_endio(bio, PTR_ERR(req));	/* @q is dead */
 		goto out_unlock;
 	}
 
+	wbt_track(req, wb_acct);
+
 	/*
 	 * After dropping the lock and possibly sleeping here, our request
 	 * may now be mergeable after it had proven unmergeable (above).
@@ -2767,13 +2778,11 @@  void blk_start_request(struct request *req)
 {
 	blk_dequeue_request(req);
 
-	/* blk-stat isn't used on non-mq now, so disable it until it is needed */
-#if 0
 	if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) {
 		req->io_start_time_ns = ktime_get_ns();
 		req->cmd_flags |= REQ_STATS;
+		wbt_issue(req->q->rq_wb, req);
 	}
-#endif
 
 	/*
 	 * We are now handing the request to the hardware, initialize
@@ -3012,9 +3021,10 @@  void blk_finish_request(struct request *req, int error)
 
 	blk_account_io_done(req);
 
-	if (req->end_io)
+	if (req->end_io) {
+		wbt_done(req->q->rq_wb, req);
 		req->end_io(req, error);
-	else {
+	} else {
 		if (blk_bidi_rq(req))
 			__blk_put_request(req->next_rq->q, req->next_rq);
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8e230b979b23..d199069b01fe 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -32,6 +32,7 @@ 
 #include "blk-mq-tag.h"
 #include "blk-mq-sched.h"
 #include "blk-stat.h"
+#include "blk-wbt.h"
 
 static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
@@ -441,6 +442,8 @@  void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 
 	if (rq->cmd_flags & REQ_MQ_INFLIGHT)
 		atomic_dec(&hctx->nr_active);
+
+	wbt_done(q->rq_wb, rq);
 	rq->cmd_flags = 0;
 
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
@@ -478,6 +481,7 @@  inline void __blk_mq_end_request(struct request *rq, int error)
 	blk_account_io_done(rq);
 
 	if (rq->end_io) {
+		wbt_done(rq->q->rq_wb, rq);
 		rq->end_io(rq, error);
 	} else {
 		if (unlikely(blk_bidi_rq(rq)))
@@ -624,6 +628,7 @@  void blk_mq_start_request(struct request *rq)
 	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
 		rq->io_start_time_ns = ktime_get_ns();
 		rq->cmd_flags |= REQ_STATS;
+		wbt_issue(q->rq_wb, rq);
 	}
 
 	blk_add_timer(rq);
@@ -672,6 +677,7 @@  static void __blk_mq_requeue_request(struct request *rq)
 	blk_mq_put_driver_tag(rq);
 
 	trace_block_rq_requeue(q, rq);
+	wbt_requeue(q->rq_wb, rq);
 
 	if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
 		if (q->dma_drain_size && blk_rq_bytes(rq))
@@ -1826,6 +1832,7 @@  static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	unsigned int request_count = 0;
 	struct blk_plug *plug;
 	struct request *same_queue_rq = NULL;
+	unsigned int wb_acct;
 
 	blk_queue_bounce(q, &bio);
 
@@ -1844,11 +1851,17 @@  static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	if (blk_mq_merge_bio(q, bio))
 		return;
 
+	wb_acct = wbt_wait(q->rq_wb, bio, NULL);
+
 	trace_block_getrq(q, bio, bio->bi_rw);
 
 	rq = blk_mq_sched_get_request(q, bio, bio->bi_rw, &data);
-	if (unlikely(!rq))
+	if (unlikely(!rq)) {
+		__wbt_done(q->rq_wb, wb_acct);
 		return;
+	}
+
+	wbt_track(rq, wb_acct);
 
 	plug = current->plug;
 	if (unlikely(is_flush_fua)) {
@@ -2668,6 +2681,8 @@  void blk_mq_free_queue(struct request_queue *q)
 	list_del_init(&q->all_q_node);
 	mutex_unlock(&all_q_mutex);
 
+	wbt_exit(q);
+
 	blk_mq_del_queue_tag_set(q);
 
 	blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 5b9441e7c6a8..b8925b777270 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -13,6 +13,7 @@ 
 #include <linux/gfp.h>
 
 #include "blk.h"
+#include "blk-wbt.h"
 
 unsigned long blk_max_low_pfn;
 EXPORT_SYMBOL(blk_max_low_pfn);
@@ -873,6 +874,8 @@  void blk_queue_flush(struct request_queue *q, unsigned int flush)
 		flush &= ~REQ_FUA;
 
 	q->flush_flags = flush & (REQ_FLUSH | REQ_FUA);
+
+	wbt_set_write_cache(q->rq_wb, q->flush_flags & REQ_FLUSH);
 }
 EXPORT_SYMBOL_GPL(blk_queue_flush);
 
@@ -891,6 +894,7 @@  EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
 void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
 {
 	q->queue_depth = depth;
+	wbt_set_queue_depth(q->rq_wb, depth);
 }
 EXPORT_SYMBOL(blk_set_queue_depth);
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 7d58cb02adeb..ed5c71a8e84f 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -13,6 +13,7 @@ 
 #include "blk-cgroup.h"
 #include "blk-mq.h"
 #include "blk-mq-debugfs.h"
+#include "blk-wbt.h"
 
 struct queue_sysfs_entry {
 	struct attribute attr;
@@ -41,6 +42,19 @@  queue_var_store(unsigned long *var, const char *page, size_t count)
 	return count;
 }
 
+static ssize_t queue_var_store64(u64 *var, const char *page)
+{
+	int err;
+	u64 v;
+
+	err = kstrtou64(page, 10, &v);
+	if (err < 0)
+		return err;
+
+	*var = v;
+	return 0;
+}
+
 static ssize_t queue_requests_show(struct request_queue *q, char *page)
 {
 	return queue_var_show(q->nr_requests, (page));
@@ -318,6 +332,31 @@  queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
 	return ret;
 }
 
+static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
+{
+	if (!q->rq_wb)
+		return -EINVAL;
+
+	return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000));
+}
+
+static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
+				  size_t count)
+{
+	ssize_t ret;
+	u64 val;
+
+	if (!q->rq_wb)
+		return -EINVAL;
+
+	ret = queue_var_store64(&val, page);
+	if (ret < 0)
+		return ret;
+
+	q->rq_wb->min_lat_nsec = val * 1000ULL;
+	wbt_update_limits(q->rq_wb);
+	return count;
+}
 static struct queue_sysfs_entry queue_requests_entry = {
 	.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_requests_show,
@@ -443,6 +482,12 @@  static struct queue_sysfs_entry queue_random_entry = {
 	.store = queue_store_random,
 };
 
+static struct queue_sysfs_entry queue_wb_lat_entry = {
+	.attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR },
+	.show = queue_wb_lat_show,
+	.store = queue_wb_lat_store,
+};
+
 static struct attribute *default_attrs[] = {
 	&queue_requests_entry.attr,
 	&queue_ra_entry.attr,
@@ -467,6 +512,7 @@  static struct attribute *default_attrs[] = {
 	&queue_rq_affinity_entry.attr,
 	&queue_iostats_entry.attr,
 	&queue_random_entry.attr,
+	&queue_wb_lat_entry.attr,
 	NULL,
 };
 
@@ -541,6 +587,7 @@  static void blk_release_queue(struct kobject *kobj)
 	struct request_queue *q =
 		container_of(kobj, struct request_queue, kobj);
 
+	wbt_exit(q);
 	if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
 		blk_stat_remove_callback(q, q->poll_cb);
 	blk_stat_free_callback(q->poll_cb);
@@ -583,6 +630,23 @@  struct kobj_type blk_queue_ktype = {
 	.release	= blk_release_queue,
 };
 
+static void blk_wb_init(struct request_queue *q)
+{
+#ifndef CONFIG_BLK_WBT_MQ
+	if (q->mq_ops)
+		return;
+#endif
+#ifndef CONFIG_BLK_WBT_SQ
+	if (q->request_fn)
+		return;
+#endif
+
+	/*
+	 * If this fails, we don't get throttling
+	 */
+	wbt_init(q);
+}
+
 /**
  * blk_register_queue - register a block layer queue with sysfs
  * @disk: Disk of which the request queue should be registered with sysfs.
@@ -640,6 +704,8 @@  int blk_register_queue(struct gendisk *disk)
 
 	kobject_uevent(&q->kobj, KOBJ_ADD);
 
+	blk_wb_init(q);
+
 	if (q->request_fn || (q->mq_ops && q->elevator)) {
 		ret = elv_register_queue(q);
 		if (ret) {
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d4440202d7a9..1af4377ba938 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -19,6 +19,7 @@ 
 
 #include "blk.h"
 #include "blk-cgroup.h"
+#include "blk-wbt.h"
 
 /*
  * tunables
@@ -3618,9 +3619,11 @@  static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
 	struct cfq_data *cfqd = cic_to_cfqd(cic);
 	struct cfq_queue *sync_cfqq;
 	uint64_t id;
+	bool nonroot_cg;
 
 	rcu_read_lock();
 	id = bio_blkcg(bio)->id;
+	nonroot_cg = bio_blkcg(bio) != &blkcg_root;
 	rcu_read_unlock();
 
 	/*
@@ -3630,6 +3633,14 @@  static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
 	if (unlikely(!cfqd) || likely(cic->blkcg_id == id))
 		return;
 
+	/*
+	 * If we have a non-root cgroup, we can depend on that to
+	 * do proper throttling of writes. Turn off wbt for that
+	 * case, if it was enabled by default.
+	 */
+	if (nonroot_cg)
+		wbt_disable_default(cfqd->queue);
+
 	sync_cfqq = cic_to_cfqq(cic, 1);
 	if (sync_cfqq) {
 		/*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6ea3889d82e8..190b8461ac68 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -43,6 +43,7 @@  struct blk_flush_queue;
 struct pr_ops;
 struct blk_queue_stats;
 struct blk_stat_callback;
+struct rq_wb;
 
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
@@ -372,6 +373,8 @@  struct request_queue {
 	int			nr_rqs[2];	/* # allocated [a]sync rqs */
 	int			nr_rqs_elvpriv;	/* # allocated rqs w/ elvpriv */
 
+	struct rq_wb		*rq_wb;
+
 	/*
 	 * If blkcg is not used, @q->root_rl serves all requests.  If blkcg
 	 * is used, root blkg allocates from @q->root_rl and all other