From 60dc958d0a00d0ef7006bc8ee705c502dc1537f5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Apr 2016 12:32:46 -0600 Subject: [PATCH 01/28] block: add ability to flag write back caching on a device Add an internal helper and flag for setting whether a queue has write back caching, or write through (or none). Add a sysfs file to show this as well, and make it changeable from user space. This will replace the (awkward) blk_queue_flush() interface that drivers currently use to inform the block layer of write cache state and capabilities. Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- Documentation/block/queue-sysfs.txt | 9 +++++++ block/blk-settings.c | 26 +++++++++++++++++++ block/blk-sysfs.c | 39 +++++++++++++++++++++++++++++ include/linux/blkdev.h | 3 +++ 4 files changed, 77 insertions(+) diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt index e5d914845be6de..dce25d848d92c6 100644 --- a/Documentation/block/queue-sysfs.txt +++ b/Documentation/block/queue-sysfs.txt @@ -141,6 +141,15 @@ control of this block device to that new IO scheduler. Note that writing an IO scheduler name to this file will attempt to load that IO scheduler module, if it isn't already present in the system. +write_cache (RW) +---------------- +When read, this file will display whether the device has write back +caching enabled or not. It will return "write back" for the former +case, and "write through" for the latter. Writing to this file can +change the kernels view of the device, but it doesn't alter the +device state. This means that it might not be safe to toggle the +setting from "write back" to "write through", since that will also +eliminate cache flushes issued by the kernel. Jens Axboe , February 2009 diff --git a/block/blk-settings.c b/block/blk-settings.c index 331e4eee0dda0c..c903bee43cf852 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -846,6 +846,32 @@ void blk_queue_flush_queueable(struct request_queue *q, bool queueable) } EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); +/** + * blk_queue_write_cache - configure queue's write cache + * @q: the request queue for the device + * @wc: write back cache on or off + * @fua: device supports FUA writes, if true + * + * Tell the block layer about the write cache of @q. 
+ */ +void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) +{ + spin_lock_irq(q->queue_lock); + if (wc) { + queue_flag_set(QUEUE_FLAG_WC, q); + q->flush_flags = REQ_FLUSH; + } else + queue_flag_clear(QUEUE_FLAG_WC, q); + if (fua) { + if (wc) + q->flush_flags |= REQ_FUA; + queue_flag_set(QUEUE_FLAG_FUA, q); + } else + queue_flag_clear(QUEUE_FLAG_FUA, q); + spin_unlock_irq(q->queue_lock); +} +EXPORT_SYMBOL_GPL(blk_queue_write_cache); + static int __init blk_settings_init(void) { blk_max_low_pfn = max_low_pfn - 1; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 995b58d46ed109..99205965f5596c 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -347,6 +347,38 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, return ret; } +static ssize_t queue_wc_show(struct request_queue *q, char *page) +{ + if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + return sprintf(page, "write back\n"); + + return sprintf(page, "write through\n"); +} + +static ssize_t queue_wc_store(struct request_queue *q, const char *page, + size_t count) +{ + int set = -1; + + if (!strncmp(page, "write back", 10)) + set = 1; + else if (!strncmp(page, "write through", 13) || + !strncmp(page, "none", 4)) + set = 0; + + if (set == -1) + return -EINVAL; + + spin_lock_irq(q->queue_lock); + if (set) + queue_flag_set(QUEUE_FLAG_WC, q); + else + queue_flag_clear(QUEUE_FLAG_WC, q); + spin_unlock_irq(q->queue_lock); + + return count; +} + static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .show = queue_requests_show, @@ -478,6 +510,12 @@ static struct queue_sysfs_entry queue_poll_entry = { .store = queue_poll_store, }; +static struct queue_sysfs_entry queue_wc_entry = { + .attr = {.name = "write_cache", .mode = S_IRUGO | S_IWUSR }, + .show = queue_wc_show, + .store = queue_wc_store, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -503,6 +541,7 @@ static struct attribute *default_attrs[] = { &queue_iostats_entry.attr, &queue_random_entry.attr, &queue_poll_entry.attr, + &queue_wc_entry.attr, NULL, }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 669e419d62347e..18b7ff2184dd7d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -491,6 +491,8 @@ struct request_queue { #define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */ #define QUEUE_FLAG_NO_SG_MERGE 21 /* don't attempt to merge SG segments*/ #define QUEUE_FLAG_POLL 22 /* IO polling enabled if set */ +#define QUEUE_FLAG_WC 23 /* Write back caching */ +#define QUEUE_FLAG_FUA 24 /* device supports FUA writes */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ @@ -1009,6 +1011,7 @@ extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern void blk_queue_flush(struct request_queue *q, unsigned int flush); extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); +extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); From ee1cf78fa4ab1057cf5b9b672a4deb4cd65e2d67 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:06:11 -0600 Subject: [PATCH 02/28] sd: switch to using blk_queue_write_cache() Switch to the 
newer interface, instead of using blk_queue_flush() directly. Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig Acked-by: Martin K. Petersen --- drivers/scsi/sd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index f52b74cf8d1e69..ff3b7e65843ce0 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -137,15 +137,15 @@ static const char *sd_cache_types[] = { static void sd_set_flush_flag(struct scsi_disk *sdkp) { - unsigned flush = 0; + bool wc = false, fua = false; if (sdkp->WCE) { - flush |= REQ_FLUSH; + wc = true; if (sdkp->DPOFUA) - flush |= REQ_FUA; + fua = true; } - blk_queue_flush(sdkp->disk->queue, flush); + blk_queue_write_cache(sdkp->disk->queue, wc, fua); } static ssize_t From a614eb70932a9e52395dcc12f1e09d60a236db2b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Apr 2016 15:43:09 -0600 Subject: [PATCH 03/28] NVMe: switch to using blk_queue_write_cache() Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 643f457131c24f..c4f9feb0cd5179 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -894,6 +894,8 @@ EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl); static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, struct request_queue *q) { + bool vwc = false; + if (ctrl->max_hw_sectors) { u32 max_segments = (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1; @@ -903,9 +905,10 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, } if (ctrl->stripe_size) blk_queue_chunk_sectors(q, ctrl->stripe_size >> 9); - if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) - blk_queue_flush(q, REQ_FLUSH | REQ_FUA); blk_queue_virt_boundary(q, ctrl->page_size - 1); + if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) + vwc = true; + blk_queue_write_cache(q, vwc, vwc); } /* From 49c6b62a376b87c4978517fad426f364b51f5b94 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:09:01 -0600 Subject: [PATCH 04/28] drbd: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- drivers/block/drbd/drbd_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index fa209773d494a4..2ba1494b279997 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2761,7 +2761,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig q->backing_dev_info.congested_data = device; blk_queue_make_request(q, drbd_make_request); - blk_queue_flush(q, REQ_FLUSH | REQ_FUA); + blk_queue_write_cache(q, true, true); /* Setting the max_hw_sectors to an odd value of 8kibyte here This triggers a max_bio_size message upon first attach or connect */ blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8); From 33b7035bb9037af39bb1a66aa7b09e34670f32a5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:09:35 -0600 Subject: [PATCH 05/28] loop: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- drivers/block/loop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 80cf8add46ff36..1fa8cc235977f4 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -943,7 +943,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, mapping_set_gfp_mask(mapping, lo->old_gfp_mask & 
~(__GFP_IO|__GFP_FS)); if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) - blk_queue_flush(lo->lo_queue, REQ_FLUSH); + blk_queue_write_cache(lo->lo_queue, true, false); loop_update_dio(lo); set_capacity(lo->lo_disk, size); From 14937a3cb3da4304f24ef1c2a67c518c9d618e24 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Apr 2016 15:45:39 -0600 Subject: [PATCH 06/28] mtip32xx: remove call to blk_queue_flush() The driver calls it with 0 for flags, since it doesn't have a writeback cache. Just remove the call, as it's a no-op right now. Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- drivers/block/mtip32xx/mtip32xx.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 25824c1697c508..c383e90ec69c02 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -4023,12 +4023,6 @@ static int mtip_block_initialize(struct driver_data *dd) blk_queue_io_min(dd->queue, 4096); blk_queue_bounce_limit(dd->queue, dd->pdev->dma_mask); - /* - * write back cache is not supported in the device. FUA depends on - * write back cache support, hence setting flush support to zero. - */ - blk_queue_flush(dd->queue, 0); - /* Signal trim support */ if (dd->trim_supp == true) { set_bit(QUEUE_FLAG_DISCARD, &dd->queue->queue_flags); From 488c6bb7f3b3c2b5322cf4169634be8472201d63 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:10:53 -0600 Subject: [PATCH 07/28] nbd: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- drivers/block/nbd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 08afbc7a2bb871..31e73a7a40f20e 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -693,9 +693,9 @@ static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev) if (nbd->flags & NBD_FLAG_SEND_TRIM) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); if (nbd->flags & NBD_FLAG_SEND_FLUSH) - blk_queue_flush(nbd->disk->queue, REQ_FLUSH); + blk_queue_write_cache(nbd->disk->queue, true, false); else - blk_queue_flush(nbd->disk->queue, 0); + blk_queue_write_cache(nbd->disk->queue, false, false); } static int nbd_dev_dbg_init(struct nbd_device *nbd); From ba09f79f076f47a8dd723d73c0e5fc00892173f1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:11:15 -0600 Subject: [PATCH 08/28] osdblk: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- drivers/block/osdblk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c index 1b709a4e3b5ec6..c2854a2bfdb0bd 100644 --- a/drivers/block/osdblk.c +++ b/drivers/block/osdblk.c @@ -437,7 +437,7 @@ static int osdblk_init_disk(struct osdblk_device *osdev) blk_queue_stack_limits(q, osd_request_queue(osdev->osd)); blk_queue_prep_rq(q, blk_queue_start_tag); - blk_queue_flush(q, REQ_FLUSH); + blk_queue_write_cache(q, true, false); disk->queue = q; From 9810844d3bf7d363930cb50acf1680476374cf31 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:11:42 -0600 Subject: [PATCH 09/28] skd_main: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- drivers/block/skd_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 586f9168ffa482..35276becd24e94 100644 --- 
a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -4412,7 +4412,7 @@ static int skd_cons_disk(struct skd_device *skdev) disk->queue = q; q->queuedata = skdev; - blk_queue_flush(q, REQ_FLUSH | REQ_FUA); + blk_queue_write_cache(q, true, true); blk_queue_max_segments(q, skdev->sgs_per_request); blk_queue_max_hw_sectors(q, SKD_N_MAX_SECTORS); From af9a3b67b7c0b51af29e43db3a54eda0d16d210b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:12:13 -0600 Subject: [PATCH 10/28] ps3disk: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- drivers/block/ps3disk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index c120d70d3fb3b3..4b7e405830d7ec 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -468,7 +468,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) blk_queue_dma_alignment(queue, dev->blk_size-1); blk_queue_logical_block_size(queue, dev->blk_size); - blk_queue_flush(queue, REQ_FLUSH); + blk_queue_write_cache(queue, true, false); blk_queue_max_segments(queue, -1); blk_queue_max_segment_size(queue, dev->bounce_size); From 100a546f15d3886d5519f61001b4705d8af859fc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:12:58 -0600 Subject: [PATCH 11/28] virtio_blk: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- drivers/block/virtio_blk.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 28cff0d23d829e..42758b52768cf8 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -493,11 +493,7 @@ static void virtblk_update_cache_mode(struct virtio_device *vdev) u8 writeback = virtblk_get_cache_mode(vdev); struct virtio_blk *vblk = vdev->priv; - if (writeback) - blk_queue_flush(vblk->disk->queue, REQ_FLUSH); - else - blk_queue_flush(vblk->disk->queue, 0); - + blk_queue_write_cache(vblk->disk->queue, writeback, false); revalidate_disk(vblk->disk); } From 59037cfd0390bd643c8a1d405274553f2a64b3e6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:13:22 -0600 Subject: [PATCH 12/28] bcache: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- drivers/md/bcache/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a296425a7270df..f5dbb4e884d893 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -816,7 +816,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags); set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags); - blk_queue_flush(q, REQ_FLUSH|REQ_FUA); + blk_queue_write_cache(q, true, true); return 0; } From 10b73dadf1e5cce397688409e7e60cdff5fff5e2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:14:14 -0600 Subject: [PATCH 13/28] dm: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- drivers/md/dm-table.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index f9e8f0bef3323e..4b1ffc0abe111e 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1506,7 +1506,7 @@ static bool dm_table_supports_discards(struct dm_table *t) void dm_table_set_restrictions(struct 
dm_table *t, struct request_queue *q, struct queue_limits *limits) { - unsigned flush = 0; + bool wc = false, fua = false; /* * Copy table's limits to the DM device's request_queue @@ -1519,11 +1519,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); if (dm_table_supports_flush(t, REQ_FLUSH)) { - flush |= REQ_FLUSH; + wc = true; if (dm_table_supports_flush(t, REQ_FUA)) - flush |= REQ_FUA; + fua = true; } - blk_queue_flush(q, flush); + blk_queue_write_cache(q, wc, fua); if (!dm_table_discard_zeroes_data(t)) q->limits.discard_zeroes_data = 0; From 2ddefd2dbdac9a604a1d63041e6013873cc9d5d5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:15:21 -0600 Subject: [PATCH 14/28] xen-blkfront: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- drivers/block/xen-blkfront.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 6405b655779268..ca13df8546396c 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -998,7 +998,8 @@ static const char *flush_info(unsigned int feature_flush) static void xlvbd_flush(struct blkfront_info *info) { - blk_queue_flush(info->rq, info->feature_flush); + blk_queue_write_cache(info->rq, info->feature_flush & REQ_FLUSH, + info->feature_flush & REQ_FUA); pr_info("blkfront: %s: %s %s %s %s %s\n", info->gd->disk_name, flush_info(info->feature_flush), "persistent grants:", info->feature_persistent ? From 31717643ba0a645959f7a8e9f754bdb396d04dfb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:16:25 -0600 Subject: [PATCH 15/28] ide-disk: update to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- drivers/ide/ide-disk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 37a8a907febeb2..05dbcce70b0e33 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -522,7 +522,7 @@ static int ide_do_setfeature(ide_drive_t *drive, u8 feature, u8 nsect) static void update_flush(ide_drive_t *drive) { u16 *id = drive->id; - unsigned flush = 0; + bool wc = false; if (drive->dev_flags & IDE_DFLAG_WCACHE) { unsigned long long capacity; @@ -546,12 +546,12 @@ static void update_flush(ide_drive_t *drive) drive->name, barrier ? "" : "not "); if (barrier) { - flush = REQ_FLUSH; + wc = true; blk_queue_prep_rq(drive->queue, idedisk_prep_fn); } } - blk_queue_flush(drive->queue, flush); + blk_queue_write_cache(drive->queue, wc, false); } ide_devset_get_flag(wcache, IDE_DFLAG_WCACHE); From a88a43c1ebe46d443ad64a2b381cf0516d4c8ff3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:16:53 -0600 Subject: [PATCH 16/28] md: update to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 194580fba7fd8b..5d61e76cec343d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5037,7 +5037,7 @@ static int md_alloc(dev_t dev, char *name) disk->fops = &md_fops; disk->private_data = mddev; disk->queue = mddev->queue; - blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA); + blk_queue_write_cache(mddev->queue, true, true); /* Allow extended partitions. This makes the * 'mdp' device redundant, but we can't really * remove it now. 
From 6762c30d28db251e761e5c601a46ea38ea5cdfaa Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:17:20 -0600 Subject: [PATCH 17/28] mmc/block: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- drivers/mmc/card/block.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index 8a0147dfed27d8..8db2bf0a89df4c 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -2271,7 +2271,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, ((card->ext_csd.rel_param & EXT_CSD_WR_REL_PARAM_EN) || card->ext_csd.rel_sectors)) { md->flags |= MMC_BLK_REL_WR; - blk_queue_flush(md->queue.queue, REQ_FLUSH | REQ_FUA); + blk_queue_write_cache(md->queue.queue, true, true); } if (mmc_card_mmc(card) && From 793e3923965bd3364a5ba4b6e12c0d20b35fd61c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:17:47 -0600 Subject: [PATCH 18/28] mtd: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- drivers/mtd/mtd_blkdevs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index f4701182b558ec..74ae24364a8db3 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -409,7 +409,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) goto error3; if (tr->flush) - blk_queue_flush(new->rq, REQ_FLUSH); + blk_queue_write_cache(new->rq, true, false); new->rq->queuedata = new; blk_queue_logical_block_size(new->rq, tr->blksize); From f7a85c4a446a1a8d2e1cc239de86823490e72230 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:19:17 -0600 Subject: [PATCH 19/28] um: switch to using blk_queue_write_cache() Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- arch/um/drivers/ubd_kern.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 39ba20755e0356..17e96dc29596cc 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -862,7 +862,7 @@ static int ubd_add(int n, char **error_out) goto out; } ubd_dev->queue->queuedata = ubd_dev; - blk_queue_flush(ubd_dev->queue, REQ_FLUSH); + blk_queue_write_cache(ubd_dev->queue, true, false); blk_queue_max_segments(ubd_dev->queue, MAX_SG); err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, &ubd_gendisk[n]); From d7709d65bd839a323e77283c6c633ec42d1ff187 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:19:30 -0600 Subject: [PATCH 20/28] block: kill blk_queue_flush() We don't have any drivers left using it, so kill it off. Update documentation to use the newer blk_queue_write_cache(). Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- .../block/writeback_cache_control.txt | 4 ++-- block/blk-settings.c | 20 ------------------- include/linux/blkdev.h | 1 - 3 files changed, 2 insertions(+), 23 deletions(-) diff --git a/Documentation/block/writeback_cache_control.txt b/Documentation/block/writeback_cache_control.txt index 83407d36630a6d..59e0516cbf6b68 100644 --- a/Documentation/block/writeback_cache_control.txt +++ b/Documentation/block/writeback_cache_control.txt @@ -71,7 +71,7 @@ requests that have a payload. 
For devices with volatile write caches the driver needs to tell the block layer that it supports flushing caches by doing: - blk_queue_flush(sdkp->disk->queue, REQ_FLUSH); + blk_queue_write_cache(sdkp->disk->queue, true, false); and handle empty REQ_FLUSH requests in its prep_fn/request_fn. Note that REQ_FLUSH requests with a payload are automatically turned into a sequence @@ -79,7 +79,7 @@ of an empty REQ_FLUSH request followed by the actual write by the block layer. For devices that also support the FUA bit the block layer needs to be told to pass through the REQ_FUA bit using: - blk_queue_flush(sdkp->disk->queue, REQ_FLUSH | REQ_FUA); + blk_queue_write_cache(sdkp->disk->queue, true, true); and the driver must handle write requests that have the REQ_FUA bit set in prep_fn/request_fn. If the FUA bit is not natively supported the block diff --git a/block/blk-settings.c b/block/blk-settings.c index c903bee43cf852..80d9327a214bac 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -820,26 +820,6 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask) } EXPORT_SYMBOL(blk_queue_update_dma_alignment); -/** - * blk_queue_flush - configure queue's cache flush capability - * @q: the request queue for the device - * @flush: 0, REQ_FLUSH or REQ_FLUSH | REQ_FUA - * - * Tell block layer cache flush capability of @q. If it supports - * flushing, REQ_FLUSH should be set. If it supports bypassing - * write cache for individual writes, REQ_FUA should be set. - */ -void blk_queue_flush(struct request_queue *q, unsigned int flush) -{ - WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA)); - - if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA))) - flush &= ~REQ_FUA; - - q->flush_flags = flush & (REQ_FLUSH | REQ_FUA); -} -EXPORT_SYMBOL_GPL(blk_queue_flush); - void blk_queue_flush_queueable(struct request_queue *q, bool queueable) { q->flush_not_queueable = !queueable; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 18b7ff2184dd7d..2a4cfce5c66b6a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1009,7 +1009,6 @@ extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *); extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); -extern void blk_queue_flush(struct request_queue *q, unsigned int flush); extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); From c48fcd8ac284d485afaa9bb51934f64b152f59e3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 13 Apr 2016 13:33:19 -0600 Subject: [PATCH 21/28] block: kill off q->flush_flags Now that we converted everything to the newer block write cache interface, kill off the queue flush_flags and queueable flush entries. 
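As an illustration only (the helper names below are hypothetical and not part of this change), code that used to inspect q->flush_flags now reads the same information from the queue flags, exactly as the xen-blkback and target_core_iblock conversions in this patch do:

    static bool example_cache_flush_needed(struct request_queue *q)
    {
            /* previously: q->flush_flags & REQ_FLUSH */
            return test_bit(QUEUE_FLAG_WC, &q->queue_flags);
    }

    static bool example_fua_supported(struct request_queue *q)
    {
            /* previously: q->flush_flags & REQ_FUA */
            return test_bit(QUEUE_FLAG_FUA, &q->queue_flags);
    }
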
Signed-off-by: Jens Axboe --- block/blk-core.c | 3 ++- block/blk-flush.c | 11 ++++++----- block/blk-settings.c | 18 ++++++++++-------- drivers/block/xen-blkback/xenbus.c | 2 +- drivers/md/dm-table.c | 12 ++++++------ drivers/md/raid5-cache.c | 3 ++- drivers/target/target_core_iblock.c | 6 +++--- include/linux/blkdev.h | 5 ++--- 8 files changed, 32 insertions(+), 28 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index b60537b2c35b41..74c16fd8995d41 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1963,7 +1963,8 @@ generic_make_request_checks(struct bio *bio) * drivers without flush support don't have to worry * about them. */ - if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) { + if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && + !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); if (!nr_sectors) { err = 0; diff --git a/block/blk-flush.c b/block/blk-flush.c index 9c423e53324a29..b1c91d229e5ed9 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -95,17 +95,18 @@ enum { static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq); -static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq) +static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq) { unsigned int policy = 0; if (blk_rq_sectors(rq)) policy |= REQ_FSEQ_DATA; - if (fflags & REQ_FLUSH) { + if (fflags & (1UL << QUEUE_FLAG_WC)) { if (rq->cmd_flags & REQ_FLUSH) policy |= REQ_FSEQ_PREFLUSH; - if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA)) + if (!(fflags & (1UL << QUEUE_FLAG_FUA)) && + (rq->cmd_flags & REQ_FUA)) policy |= REQ_FSEQ_POSTFLUSH; } return policy; @@ -384,7 +385,7 @@ static void mq_flush_data_end_io(struct request *rq, int error) void blk_insert_flush(struct request *rq) { struct request_queue *q = rq->q; - unsigned int fflags = q->flush_flags; /* may change, cache */ + unsigned long fflags = q->queue_flags; /* may change, cache */ unsigned int policy = blk_flush_policy(fflags, rq); struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); @@ -393,7 +394,7 @@ void blk_insert_flush(struct request *rq) * REQ_FLUSH and FUA for the driver. 
*/ rq->cmd_flags &= ~REQ_FLUSH; - if (!(fflags & REQ_FUA)) + if (!(fflags & (1UL << QUEUE_FLAG_FUA))) rq->cmd_flags &= ~REQ_FUA; /* diff --git a/block/blk-settings.c b/block/blk-settings.c index 80d9327a214bac..f679ae12284351 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -822,7 +822,12 @@ EXPORT_SYMBOL(blk_queue_update_dma_alignment); void blk_queue_flush_queueable(struct request_queue *q, bool queueable) { - q->flush_not_queueable = !queueable; + spin_lock_irq(q->queue_lock); + if (queueable) + clear_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags); + else + set_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags); + spin_unlock_irq(q->queue_lock); } EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); @@ -837,16 +842,13 @@ EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) { spin_lock_irq(q->queue_lock); - if (wc) { + if (wc) queue_flag_set(QUEUE_FLAG_WC, q); - q->flush_flags = REQ_FLUSH; - } else + else queue_flag_clear(QUEUE_FLAG_WC, q); - if (fua) { - if (wc) - q->flush_flags |= REQ_FUA; + if (fua) queue_flag_set(QUEUE_FLAG_FUA, q); - } else + else queue_flag_clear(QUEUE_FLAG_FUA, q); spin_unlock_irq(q->queue_lock); } diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 26aa080e243cb5..3355f1cdd4e5d6 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -477,7 +477,7 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, vbd->type |= VDISK_REMOVABLE; q = bdev_get_queue(bdev); - if (q && q->flush_flags) + if (q && test_bit(QUEUE_FLAG_WC, &q->queue_flags)) vbd->flush_support = true; if (q && blk_queue_secdiscard(q)) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 4b1ffc0abe111e..626a5ec044667a 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1348,13 +1348,13 @@ static void dm_table_verify_integrity(struct dm_table *t) static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { - unsigned flush = (*(unsigned *)data); + unsigned long flush = (unsigned long) data; struct request_queue *q = bdev_get_queue(dev->bdev); - return q && (q->flush_flags & flush); + return q && (q->queue_flags & flush); } -static bool dm_table_supports_flush(struct dm_table *t, unsigned flush) +static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) { struct dm_target *ti; unsigned i = 0; @@ -1375,7 +1375,7 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush) return true; if (ti->type->iterate_devices && - ti->type->iterate_devices(ti, device_flush_capable, &flush)) + ti->type->iterate_devices(ti, device_flush_capable, (void *) flush)) return true; } @@ -1518,9 +1518,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); - if (dm_table_supports_flush(t, REQ_FLUSH)) { + if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { wc = true; - if (dm_table_supports_flush(t, REQ_FUA)) + if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA))) fua = true; } blk_queue_write_cache(q, wc, fua); diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 9531f5f05b93df..26f14970a858c9 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1188,6 +1188,7 @@ static int r5l_load_log(struct r5l_log *log) int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) { + struct request_queue *q = bdev_get_queue(rdev->bdev); struct r5l_log 
*log; if (PAGE_SIZE != 4096) @@ -1197,7 +1198,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) return -ENOMEM; log->rdev = rdev; - log->need_cache_flush = (rdev->bdev->bd_disk->queue->flush_flags != 0); + log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0; log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid, sizeof(rdev->mddev->uuid)); diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 026a758e577867..7c4efb4417b0fc 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -687,10 +687,10 @@ iblock_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, * Force writethrough using WRITE_FUA if a volatile write cache * is not enabled, or if initiator set the Force Unit Access bit. */ - if (q->flush_flags & REQ_FUA) { + if (test_bit(QUEUE_FLAG_FUA, &q->queue_flags)) { if (cmd->se_cmd_flags & SCF_FUA) rw = WRITE_FUA; - else if (!(q->flush_flags & REQ_FLUSH)) + else if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) rw = WRITE_FUA; else rw = WRITE; @@ -836,7 +836,7 @@ static bool iblock_get_write_cache(struct se_device *dev) struct block_device *bd = ib_dev->ibd_bd; struct request_queue *q = bdev_get_queue(bd); - return q->flush_flags & REQ_FLUSH; + return test_bit(QUEUE_FLAG_WC, &q->queue_flags); } static const struct target_backend_ops iblock_ops = { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2a4cfce5c66b6a..fc1894996b12ba 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -433,8 +433,6 @@ struct request_queue { /* * for flush operations */ - unsigned int flush_flags; - unsigned int flush_not_queueable:1; struct blk_flush_queue *fq; struct list_head requeue_list; @@ -493,6 +491,7 @@ struct request_queue { #define QUEUE_FLAG_POLL 22 /* IO polling enabled if set */ #define QUEUE_FLAG_WC 23 /* Write back caching */ #define QUEUE_FLAG_FUA 24 /* device supports FUA writes */ +#define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ @@ -1365,7 +1364,7 @@ static inline unsigned int block_size(struct block_device *bdev) static inline bool queue_flush_queueable(struct request_queue *q) { - return !q->flush_not_queueable; + return !test_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags); } typedef struct {struct page *v;} Sector; From 462ca58af90f301528d7e77eac347146254da90c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Apr 2016 09:51:10 -0600 Subject: [PATCH 22/28] block: add WRITE_BG This adds a new request flag, REQ_BG, that callers can use to tell the block layer that this is background (non-urgent) IO. 
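A minimal usage sketch, illustrative only and not part of this change: with the WRITE_BG write command defined below in fs.h, a submitter can tag non-urgent writes while urgent ones keep using WRITE_SYNC. In this series the flag is actually routed through wbc_to_write_cmd() rather than open-coded like this:

    /* hypothetical caller, shown for illustration */
    static void example_submit_write(struct bio *bio, bool background)
    {
            if (background)
                    submit_bio(WRITE_BG, bio);      /* WRITE | REQ_NOIDLE | REQ_BG */
            else
                    submit_bio(WRITE_SYNC, bio);
    }
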
Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 4 +++- include/linux/fs.h | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 86a38ea1823f33..223012451c7a97 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -161,6 +161,7 @@ enum rq_flag_bits { __REQ_INTEGRITY, /* I/O includes block integrity payload */ __REQ_FUA, /* forced unit access */ __REQ_FLUSH, /* request for cache flush */ + __REQ_BG, /* background activity */ /* bio only flags */ __REQ_RAHEAD, /* read ahead, can fail anytime */ @@ -208,7 +209,7 @@ enum rq_flag_bits { #define REQ_COMMON_MASK \ (REQ_WRITE | REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | \ REQ_DISCARD | REQ_WRITE_SAME | REQ_NOIDLE | REQ_FLUSH | REQ_FUA | \ - REQ_SECURE | REQ_INTEGRITY) + REQ_SECURE | REQ_INTEGRITY | REQ_BG) #define REQ_CLONE_MASK REQ_COMMON_MASK #define BIO_NO_ADVANCE_ITER_MASK (REQ_DISCARD|REQ_WRITE_SAME) @@ -235,6 +236,7 @@ enum rq_flag_bits { #define REQ_COPY_USER (1ULL << __REQ_COPY_USER) #define REQ_FLUSH (1ULL << __REQ_FLUSH) #define REQ_FLUSH_SEQ (1ULL << __REQ_FLUSH_SEQ) +#define REQ_BG (1ULL << __REQ_BG) #define REQ_IO_STAT (1ULL << __REQ_IO_STAT) #define REQ_MIXED_MERGE (1ULL << __REQ_MIXED_MERGE) #define REQ_SECURE (1ULL << __REQ_SECURE) diff --git a/include/linux/fs.h b/include/linux/fs.h index 70e61b58baaf66..bb8f951cc619c5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -192,6 +192,9 @@ typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate); * WRITE_FLUSH_FUA Combination of WRITE_FLUSH and FUA. The IO is preceded * by a cache flush and data is guaranteed to be on * non-volatile media on completion. + * WRITE_BG Background write. This is for background activity like + * the periodic flush and background threshold writeback + * * */ #define RW_MASK REQ_WRITE @@ -207,6 +210,7 @@ typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate); #define WRITE_FLUSH (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH) #define WRITE_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FUA) #define WRITE_FLUSH_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH | REQ_FUA) +#define WRITE_BG (WRITE | REQ_NOIDLE | REQ_BG) /* * Attribute flags. These should be or-ed together to figure out what From cb9ee03e2c6dc40e046a6d331406888303153614 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Apr 2016 09:33:14 -0600 Subject: [PATCH 23/28] writeback: add wbc_to_write_cmd() Add wbc_to_write_cmd(), which returns the write type to use, based on a struct writeback_control. No functional changes in this patch, but it prepares us for factoring other wbc fields for write type. Signed-off-by: Jens Axboe Reviewed-by: Jan Kara --- fs/block_dev.c | 2 +- fs/buffer.c | 2 +- fs/f2fs/data.c | 2 +- fs/f2fs/node.c | 2 +- fs/gfs2/meta_io.c | 3 +-- fs/mpage.c | 9 ++++----- fs/xfs/xfs_aops.c | 2 +- include/linux/writeback.h | 8 ++++++++ 8 files changed, 18 insertions(+), 12 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 20a2c02b77c452..8662da6aa07cfc 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -432,7 +432,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, struct page *page, struct writeback_control *wbc) { int result; - int rw = (wbc->sync_mode == WB_SYNC_ALL) ? 
WRITE_SYNC : WRITE; + int rw = wbc_to_write_cmd(wbc); const struct block_device_operations *ops = bdev->bd_disk->fops; if (!ops->rw_page || bdev_get_integrity(bdev)) diff --git a/fs/buffer.c b/fs/buffer.c index af0d9a82a8edff..46763c58e786a6 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1697,7 +1697,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, struct buffer_head *bh, *head; unsigned int blocksize, bbits; int nr_underway = 0; - int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + int write_op = wbc_to_write_cmd(wbc); head = create_page_buffers(page, inode, (1 << BH_Dirty)|(1 << BH_Uptodate)); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5dafb9cef12e71..e4e81ce663c577 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1153,7 +1153,7 @@ static int f2fs_write_data_page(struct page *page, struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + .rw = wbc_to_write_cmd(wbc), .page = page, .encrypted_page = NULL, }; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1a33de9d84b16a..3b377258dc0936 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1397,7 +1397,7 @@ static int f2fs_write_node_page(struct page *page, struct f2fs_io_info fio = { .sbi = sbi, .type = NODE, - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + .rw = wbc_to_write_cmd(wbc), .page = page, .encrypted_page = NULL, }; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 0448524c11bcfc..3fdfa3848f18b5 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -37,8 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb { struct buffer_head *bh, *head; int nr_underway = 0; - int write_op = REQ_META | REQ_PRIO | - (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + int write_op = REQ_META | REQ_PRIO | wbc_to_write_cmd(wbc); BUG_ON(!PageLocked(page)); BUG_ON(!page_has_buffers(page)); diff --git a/fs/mpage.c b/fs/mpage.c index eedc644b78d783..bcbdb61b24f1ef 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -486,7 +486,6 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; - int wr = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); if (page_has_buffers(page)) { struct buffer_head *head = page_buffers(page); @@ -595,7 +594,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, * This page will go to BIO. Do we need to send this BIO off first? 
*/ if (bio && mpd->last_block_in_bio != blocks[0] - 1) - bio = mpage_bio_submit(wr, bio); + bio = mpage_bio_submit(wbc_to_write_cmd(wbc), bio); alloc_new: if (bio == NULL) { @@ -622,7 +621,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, wbc_account_io(wbc, page, PAGE_SIZE); length = first_unmapped << blkbits; if (bio_add_page(bio, page, length, 0) < length) { - bio = mpage_bio_submit(wr, bio); + bio = mpage_bio_submit(wbc_to_write_cmd(wbc), bio); goto alloc_new; } @@ -632,7 +631,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, set_page_writeback(page); unlock_page(page); if (boundary || (first_unmapped != blocks_per_page)) { - bio = mpage_bio_submit(wr, bio); + bio = mpage_bio_submit(wbc_to_write_cmd(wbc), bio); if (boundary_block) { write_boundary_block(boundary_bdev, boundary_block, 1 << blkbits); @@ -644,7 +643,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, confused: if (bio) - bio = mpage_bio_submit(wr, bio); + bio = mpage_bio_submit(wbc_to_write_cmd(wbc), bio); if (mpd->use_writepage) { ret = mapping->a_ops->writepage(page, wbc); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e49b2406d15d20..e6c721f4153b62 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -393,7 +393,7 @@ xfs_submit_ioend_bio( atomic_inc(&ioend->io_remaining); bio->bi_private = ioend; bio->bi_end_io = xfs_end_bio; - submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio); + submit_bio(wbc_to_write_cmd(wbc), bio); } STATIC struct bio * diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d0b5ca5d4e0803..aa66fa05ff0d02 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -100,6 +100,14 @@ struct writeback_control { #endif }; +static inline int wbc_to_write_cmd(struct writeback_control *wbc) +{ + if (wbc->sync_mode == WB_SYNC_ALL) + return WRITE_SYNC; + + return WRITE; +} + /* * A wb_domain represents a domain that wb's (bdi_writeback's) belong to * and are measured against each other in. There always is one global From 047b239711f877886d5bcd8c43a97748a879b702 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Apr 2016 09:53:24 -0600 Subject: [PATCH 24/28] writeback: use WRITE_BG for kupdate and background writeback If we're doing background type writes, then use the appropriate write command for that. Signed-off-by: Jens Axboe --- include/linux/writeback.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index aa66fa05ff0d02..6e4a35acaa3e2c 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -104,6 +104,8 @@ static inline int wbc_to_write_cmd(struct writeback_control *wbc) { if (wbc->sync_mode == WB_SYNC_ALL) return WRITE_SYNC; + else if (wbc->for_kupdate || wbc->for_background) + return WRITE_BG; return WRITE; } From a9b30274338565b1e0ae3f883f99999bf8ccda10 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Apr 2016 09:35:44 -0600 Subject: [PATCH 25/28] writeback: track if we're sleeping on progress in balance_dirty_pages() Note in the bdi_writeback structure if a task is currently being limited in balance_dirty_pages(), waiting for writeback to proceed. 
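For illustration (the helper name is hypothetical), a consumer such as the buffered writeback throttling added later in this series can check whether any task is currently blocked in balance_dirty_pages() for this bdi_writeback:

    static inline bool example_wb_has_dirty_waiters(struct bdi_writeback *wb)
    {
            return atomic_read(&wb->dirty_sleeping) != 0;
    }
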
Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 2 ++ mm/backing-dev.c | 1 + mm/page-writeback.c | 2 ++ 3 files changed, 5 insertions(+) diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 3f103076d0bfdc..1212c374b9288e 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -116,6 +116,8 @@ struct bdi_writeback { struct list_head work_list; struct delayed_work dwork; /* work item used for writeback */ + atomic_t dirty_sleeping; /* waiting on dirty limit exceeded */ + struct list_head bdi_node; /* anchored at bdi->wb_list */ #ifdef CONFIG_CGROUP_WRITEBACK diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0c6317b7db38a0..41db7dff11d027 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -310,6 +310,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, spin_lock_init(&wb->work_lock); INIT_LIST_HEAD(&wb->work_list); INIT_DELAYED_WORK(&wb->dwork, wb_workfn); + atomic_set(&wb->dirty_sleeping, 0); wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp); if (!wb->congested) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 999792d35ccc0f..028a3d4d712976 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1746,7 +1746,9 @@ static void balance_dirty_pages(struct address_space *mapping, pause, start_time); __set_current_state(TASK_KILLABLE); + atomic_inc(&wb->dirty_sleeping); io_schedule_timeout(pause); + atomic_dec(&wb->dirty_sleeping); current->dirty_paused_when = now + pause; current->nr_dirtied = 0; From 944f7bd5c9bf45057cb8e99c382b0b599292c8c8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:21:08 -0600 Subject: [PATCH 26/28] block: add code to track actual device queue depth For blk-mq, ->nr_requests does track queue depth, at least at init time. But for the older queue paths, it's simply a soft setting. On top of that, it's generally larger than the hardware setting on purpose, to allow backup of requests for merging. Fill a hole in struct request with a 'queue_depth' member, that drivers can call to more closely inform the block layer of the real queue depth. 
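As a sketch (the driver hook and tag count below are hypothetical), a driver with a fixed number of hardware tags reports it once at setup time, and block layer code then prefers that value over the soft nr_requests setting through blk_queue_depth():

    #define EXAMPLE_HW_TAGS 64

    static void example_setup_queue(struct request_queue *q)
    {
            blk_set_queue_depth(q, EXAMPLE_HW_TAGS);
    }

    static unsigned int example_effective_depth(struct request_queue *q)
    {
            /* q->queue_depth if a driver set it, otherwise q->nr_requests */
            return blk_queue_depth(q);
    }
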
Signed-off-by: Jens Axboe --- block/blk-settings.c | 12 ++++++++++++ drivers/scsi/scsi.c | 3 +++ include/linux/blkdev.h | 11 +++++++++++ 3 files changed, 26 insertions(+) diff --git a/block/blk-settings.c b/block/blk-settings.c index f679ae12284351..f7e122e717e804 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -831,6 +831,18 @@ void blk_queue_flush_queueable(struct request_queue *q, bool queueable) } EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); +/** + * blk_set_queue_depth - tell the block layer about the device queue depth + * @q: the request queue for the device + * @depth: queue depth + * + */ +void blk_set_queue_depth(struct request_queue *q, unsigned int depth) +{ + q->queue_depth = depth; +} +EXPORT_SYMBOL(blk_set_queue_depth); + /** * blk_queue_write_cache - configure queue's write cache * @q: the request queue for the device diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 1deb6adc411f79..75455d4dab68b5 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -621,6 +621,9 @@ int scsi_change_queue_depth(struct scsi_device *sdev, int depth) wmb(); } + if (sdev->request_queue) + blk_set_queue_depth(sdev->request_queue, depth); + return sdev->queue_depth; } EXPORT_SYMBOL(scsi_change_queue_depth); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fc1894996b12ba..eee94bd6de52e5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -315,6 +315,8 @@ struct request_queue { struct blk_mq_ctx __percpu *queue_ctx; unsigned int nr_queues; + unsigned int queue_depth; + /* hw dispatch queues */ struct blk_mq_hw_ctx **queue_hw_ctx; unsigned int nr_hw_queues; @@ -681,6 +683,14 @@ static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) return false; } +static inline unsigned int blk_queue_depth(struct request_queue *q) +{ + if (q->queue_depth) + return q->queue_depth; + + return q->nr_requests; +} + /* * q->prep_rq_fn return values */ @@ -984,6 +994,7 @@ extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); extern void blk_queue_io_min(struct request_queue *q, unsigned int min); extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt); +extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_default_limits(struct queue_limits *lim); extern void blk_set_stacking_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, From 1d591a3196cb733168eb3c598141cf5641c0d1b3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Apr 2016 18:56:45 -0600 Subject: [PATCH 27/28] block: add scalable completion tracking of requests For legacy block, we simply track them in the request queue. For blk-mq, we track them on a per-sw queue basis, which we can then sum up through the hardware queues and finally to a per device state. The stats are tracked in, roughly, 0.1s interval windows. Add sysfs files to display the stats. 
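A sketch of an in-kernel consumer (the function name is hypothetical), mirroring what the new sysfs 'stats' attribute prints; the latencies are in nanoseconds, measured from request issue to completion:

    static void example_dump_queue_latencies(struct request_queue *q)
    {
            struct blk_rq_stat stat[2];     /* [0] = reads, [1] = writes */

            blk_queue_stat_get(q, stat);
            pr_info("read : samples=%lld mean=%lld min=%llu max=%llu\n",
                    (long long) stat[0].nr_samples, (long long) stat[0].mean,
                    (unsigned long long) stat[0].min, (unsigned long long) stat[0].max);
            pr_info("write: samples=%lld mean=%lld min=%llu max=%llu\n",
                    (long long) stat[1].nr_samples, (long long) stat[1].mean,
                    (unsigned long long) stat[1].min, (unsigned long long) stat[1].max);
    }
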
Signed-off-by: Jens Axboe --- block/Makefile | 2 +- block/blk-core.c | 4 + block/blk-mq-sysfs.c | 47 ++++++++++ block/blk-mq.c | 14 +++ block/blk-mq.h | 3 + block/blk-stat.c | 184 ++++++++++++++++++++++++++++++++++++++ block/blk-stat.h | 17 ++++ block/blk-sysfs.c | 26 ++++++ include/linux/blk_types.h | 8 ++ include/linux/blkdev.h | 4 + 10 files changed, 308 insertions(+), 1 deletion(-) create mode 100644 block/blk-stat.c create mode 100644 block/blk-stat.h diff --git a/block/Makefile b/block/Makefile index 9eda2322b2d40a..3446e0472df0b7 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-lib.o blk-mq.o blk-mq-tag.o \ + blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ badblocks.o partitions/ diff --git a/block/blk-core.c b/block/blk-core.c index 74c16fd8995d41..40b57bf4852c32 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2514,6 +2514,8 @@ void blk_start_request(struct request *req) { blk_dequeue_request(req); + req->issue_time = ktime_to_ns(ktime_get()); + /* * We are now handing the request to the hardware, initialize * resid_len to full count and add the timeout handler. @@ -2581,6 +2583,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) trace_block_rq_complete(req->q, req, nr_bytes); + blk_stat_add(&req->q->rq_stats[rq_data_dir(req)], req); + if (!req->bio) return false; diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 4ea4dd8a1eed5a..2f68015f8616f2 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -247,6 +247,47 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) return ret; } +static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_ctx *ctx; + unsigned int i; + + hctx_for_each_ctx(hctx, ctx, i) { + blk_stat_init(&ctx->stat[0]); + blk_stat_init(&ctx->stat[1]); + } +} + +static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx, + const char *page, size_t count) +{ + blk_mq_stat_clear(hctx); + return count; +} + +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre) +{ + return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n", + pre, (long long) stat->nr_samples, + (long long) stat->mean, (long long) stat->min, + (long long) stat->max); +} + +static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + struct blk_rq_stat stat[2]; + ssize_t ret; + + blk_stat_init(&stat[0]); + blk_stat_init(&stat[1]); + + blk_hctx_stat_get(hctx, stat); + + ret = print_stat(page, &stat[0], "read :"); + ret += print_stat(page + ret, &stat[1], "write:"); + return ret; +} + static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = { .attr = {.name = "dispatched", .mode = S_IRUGO }, .show = blk_mq_sysfs_dispatched_show, @@ -304,6 +345,11 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = { .attr = {.name = "io_poll", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_poll_show, }; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = { + .attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR }, + .show = blk_mq_hw_sysfs_stat_show, + .store = blk_mq_hw_sysfs_stat_store, +}; static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_queued.attr, @@ -314,6 +360,7 @@ static struct attribute 
*default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_cpus.attr, &blk_mq_hw_sysfs_active.attr, &blk_mq_hw_sysfs_poll.attr, + &blk_mq_hw_sysfs_stat.attr, NULL, }; diff --git a/block/blk-mq.c b/block/blk-mq.c index 1699baf39b78a8..71b4a13fbf9478 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -29,6 +29,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" +#include "blk-stat.h" static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); @@ -356,10 +357,19 @@ static void blk_mq_ipi_complete_request(struct request *rq) put_cpu(); } +static void blk_mq_stat_add(struct request *rq) +{ + struct blk_rq_stat *stat = &rq->mq_ctx->stat[rq_data_dir(rq)]; + + blk_stat_add(stat, rq); +} + static void __blk_mq_complete_request(struct request *rq) { struct request_queue *q = rq->q; + blk_mq_stat_add(rq); + if (!q->softirq_done_fn) blk_mq_end_request(rq, rq->errors); else @@ -403,6 +413,8 @@ void blk_mq_start_request(struct request *rq) if (unlikely(blk_bidi_rq(rq))) rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); + rq->issue_time = ktime_to_ns(ktime_get()); + blk_add_timer(rq); /* @@ -1761,6 +1773,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, spin_lock_init(&__ctx->lock); INIT_LIST_HEAD(&__ctx->rq_list); __ctx->queue = q; + blk_stat_init(&__ctx->stat[0]); + blk_stat_init(&__ctx->stat[1]); /* If the cpu isn't online, the cpu is mapped to first hctx */ if (!cpu_online(i)) diff --git a/block/blk-mq.h b/block/blk-mq.h index 9087b11037b70a..e107f700ff1710 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -1,6 +1,8 @@ #ifndef INT_BLK_MQ_H #define INT_BLK_MQ_H +#include "blk-stat.h" + struct blk_mq_tag_set; struct blk_mq_ctx { @@ -20,6 +22,7 @@ struct blk_mq_ctx { /* incremented at completion time */ unsigned long ____cacheline_aligned_in_smp rq_completed[2]; + struct blk_rq_stat stat[2]; struct request_queue *queue; struct kobject kobj; diff --git a/block/blk-stat.c b/block/blk-stat.c new file mode 100644 index 00000000000000..b38776a8317347 --- /dev/null +++ b/block/blk-stat.c @@ -0,0 +1,184 @@ +/* + * Block stat tracking code + * + * Copyright (C) 2016 Jens Axboe + */ +#include +#include + +#include "blk-stat.h" +#include "blk-mq.h" + +void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) +{ + if (!src->nr_samples) + return; + + dst->min = min(dst->min, src->min); + dst->max = max(dst->max, src->max); + + if (!dst->nr_samples) + dst->mean = src->mean; + else { + dst->mean = div64_s64((src->mean * src->nr_samples) + + (dst->mean * dst->nr_samples), + dst->nr_samples + src->nr_samples); + } + dst->nr_samples += src->nr_samples; +} + +static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + int i, j, nr; + + blk_stat_init(&dst[0]); + blk_stat_init(&dst[1]); + + nr = 0; + do { + uint64_t newest = 0; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + if (!ctx->stat[0].nr_samples && + !ctx->stat[1].nr_samples) + continue; + if (ctx->stat[0].time > newest) + newest = ctx->stat[0].time; + if (ctx->stat[1].time > newest) + newest = ctx->stat[1].time; + } + } + + /* + * No samples + */ + if (!newest) + break; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + if (ctx->stat[0].time == newest) { + blk_stat_sum(&dst[0], &ctx->stat[0]); + nr++; + } + if (ctx->stat[1].time == newest) { + blk_stat_sum(&dst[1], &ctx->stat[1]); + nr++; + } + } + } + /* + * If we race on finding an entry, just loop back again. + * Should be very rare. 
+ */ + } while (!nr); +} + +void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst) +{ + if (q->mq_ops) + blk_mq_stat_get(q, dst); + else { + memcpy(&dst[0], &q->rq_stats[0], sizeof(struct blk_rq_stat)); + memcpy(&dst[1], &q->rq_stats[1], sizeof(struct blk_rq_stat)); + } +} + +void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst) +{ + struct blk_mq_ctx *ctx; + unsigned int i, nr; + + nr = 0; + do { + uint64_t newest = 0; + + hctx_for_each_ctx(hctx, ctx, i) { + if (!ctx->stat[0].nr_samples && + !ctx->stat[1].nr_samples) + continue; + + if (ctx->stat[0].time > newest) + newest = ctx->stat[0].time; + if (ctx->stat[1].time > newest) + newest = ctx->stat[1].time; + } + + if (!newest) + break; + + hctx_for_each_ctx(hctx, ctx, i) { + if (ctx->stat[0].time == newest) { + blk_stat_sum(&dst[0], &ctx->stat[0]); + nr++; + } + if (ctx->stat[1].time == newest) { + blk_stat_sum(&dst[1], &ctx->stat[1]); + nr++; + } + } + /* + * If we race on finding an entry, just loop back again. + * Should be very rare, as the window is only updated + * occasionally + */ + } while (!nr); +} + +static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now) +{ + stat->min = -1ULL; + stat->max = stat->nr_samples = stat->mean = 0; + stat->time = time_now & BLK_STAT_MASK; +} + +void blk_stat_init(struct blk_rq_stat *stat) +{ + __blk_stat_init(stat, ktime_to_ns(ktime_get())); +} + +void blk_stat_add(struct blk_rq_stat *stat, struct request *rq) +{ + s64 delta, now, value; + + now = ktime_to_ns(ktime_get()); + if (now < rq->issue_time) + return; + + if ((now & BLK_STAT_MASK) != (stat->time & BLK_STAT_MASK)) + __blk_stat_init(stat, now); + + value = now - rq->issue_time; + if (value > stat->max) + stat->max = value; + if (value < stat->min) + stat->min = value; + + delta = value - stat->mean; + if (delta) + stat->mean += div64_s64(delta, stat->nr_samples + 1); + + stat->nr_samples++; +} + +void blk_stat_clear(struct request_queue *q) +{ + if (q->mq_ops) { + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + int i, j; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + blk_stat_init(&ctx->stat[0]); + blk_stat_init(&ctx->stat[1]); + } + } + } else { + blk_stat_init(&q->rq_stats[0]); + blk_stat_init(&q->rq_stats[1]); + } +} diff --git a/block/blk-stat.h b/block/blk-stat.h new file mode 100644 index 00000000000000..d77548dbf19603 --- /dev/null +++ b/block/blk-stat.h @@ -0,0 +1,17 @@ +#ifndef BLK_STAT_H +#define BLK_STAT_H + +/* + * ~0.13s window as a power-of-2 (2^27 nsecs) + */ +#define BLK_STAT_NSEC 134217728ULL +#define BLK_STAT_MASK ~(BLK_STAT_NSEC - 1) + +void blk_stat_add(struct blk_rq_stat *, struct request *); +void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *); +void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *); +void blk_stat_clear(struct request_queue *q); +void blk_stat_init(struct blk_rq_stat *); +void blk_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *); + +#endif diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 99205965f5596c..6e516cc0d3d071 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -379,6 +379,26 @@ static ssize_t queue_wc_store(struct request_queue *q, const char *page, return count; } +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre) +{ + return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n", + pre, (long long) stat->nr_samples, + (long long) stat->mean, (long long) stat->min, + (long long) stat->max); +} + +static ssize_t 
queue_stats_show(struct request_queue *q, char *page) +{ + struct blk_rq_stat stat[2]; + ssize_t ret; + + blk_queue_stat_get(q, stat); + + ret = print_stat(page, &stat[0], "read :"); + ret += print_stat(page + ret, &stat[1], "write:"); + return ret; +} + static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .show = queue_requests_show, @@ -516,6 +536,11 @@ static struct queue_sysfs_entry queue_wc_entry = { .store = queue_wc_store, }; +static struct queue_sysfs_entry queue_stats_entry = { + .attr = {.name = "stats", .mode = S_IRUGO }, + .show = queue_stats_show, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -542,6 +567,7 @@ static struct attribute *default_attrs[] = { &queue_random_entry.attr, &queue_poll_entry.attr, &queue_wc_entry.attr, + &queue_stats_entry.attr, NULL, }; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 223012451c7a97..2b4414fb4d8e8c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -268,4 +268,12 @@ static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie) return cookie & ((1u << BLK_QC_T_SHIFT) - 1); } +struct blk_rq_stat { + s64 mean; + u64 min; + u64 max; + s64 nr_samples; + s64 time; +}; + #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index eee94bd6de52e5..87f6703ced71d7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -153,6 +153,7 @@ struct request { struct gendisk *rq_disk; struct hd_struct *part; unsigned long start_time; + s64 issue_time; #ifdef CONFIG_BLK_CGROUP struct request_list *rl; /* rl this rq is alloced from */ unsigned long long start_time_ns; @@ -402,6 +403,9 @@ struct request_queue { unsigned int nr_sorted; unsigned int in_flight[2]; + + struct blk_rq_stat rq_stats[2]; + /* * Number of active block driver functions for which blk_drain_queue() * must wait. Must be incremented around functions that unlock the From 605f106a1b881fda334681a726103b49d6021c61 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 19 Apr 2016 15:43:23 -0400 Subject: [PATCH 28/28] writeback: throttle buffered writeback Test patch that throttles buffered writeback to make it a lot more smooth, and has way less impact on other system activity. Background writeback should be, by definition, background activity. The fact that we flush huge bundles of it at the time means that it potentially has heavy impacts on foreground workloads, which isn't ideal. We can't easily limit the sizes of writes that we do, since that would impact file system layout in the presence of delayed allocation. So just throttle back buffered writeback, unless someone is waiting for it. The algorithm for when to throttle takes its inspiration in the CoDel networking scheduling algorithm. Like CoDel, blk-wb monitors the minimum latencies of requests over a window of time. In that window of time, if the minimum latency of any request exceeds a given target, then a scale count is incremented and the queue depth is shrunk. The next monitoring window is shrunk accordingly. Unlike CoDel, if we hit a window that exhibits good behavior, then we simply increment the scale count and re-calculate the limits for that scale value. This prevents us from oscillating between a close-to-ideal value and max all the time, instead remaining in the windows where we get good behavior. The patch registers two sysfs entries. The first one, 'wb_lat_usec', sets the latency target for the window. 
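As a rough sketch of how that target drives the throttling decision (illustrative only, with invented names; the real logic is in blk-wb.c further down):

static unsigned int example_step(unsigned int step,
				 unsigned long long min_lat_nsec,
				 unsigned long long target_nsec)
{
	if (min_lat_nsec > target_nsec)
		return step + 1;	/* window violated: throttle harder */
	return step ? step - 1 : 0;	/* window clean: scale back toward full depth */
}

static unsigned int example_depth(unsigned int queue_depth, unsigned int step)
{
	/* each scaling step halves the allowed writeback depth, mirroring calc_wb_limits() */
	return 1 + ((queue_depth - 1) >> step);
}

The target that example_step() compares against is the one 'wb_lat_usec' configures (stored internally in nsecs).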
It defaults to 2 msec for non-rotational storage, and 75 msec for rotational storage. Setting this value to '0' disables blk-wb. The second entry, 'wb_stats', is a debug entry, that simply shows the current internal state of the throttling machine: $ cat /sys/block/nvme0n1/queue/wb_stats background=16, normal=32, max=64, inflight=0, wait=0, bdp_wait=0 'background' denotes how many requests we will allow in-flight for idle background buffered writeback, 'normal' for higher priority writeback, and 'max' for when it's urgent we clean pages. 'inflight' shows how many requests are currently in-flight for buffered writeback, 'wait' shows if anyone is currently waiting for access, and 'bdp_wait' shows if someone is currently throttled on this device in balance_dirty_pages(). blk-wb also registers a few trace events, that can be used to monitor the state changes: block_wb_lat: latency 2446318 block_wb_stat: rmean=2446318, rmin=2446318, rmax=2446318, rsamples=1, wmean=518866, wmin=15522, wmax=5330353, wsamples=57 block_wb_step: step down: step=1, window=72727272, background=8, normal=16, max=32 'block_wb_lat' logs a violation in sync issue latency, 'block_wb_stat' logs a window violation of latencies and dumps the stats that lead to that, and finally, 'block_wb_stat' logs a step up/down and the new limits associated with that state. Signed-off-by: Jens Axboe --- block/Makefile | 2 +- block/blk-core.c | 15 + block/blk-mq.c | 31 ++- block/blk-settings.c | 4 + block/blk-sysfs.c | 103 +++++++ block/blk-wb.c | 515 +++++++++++++++++++++++++++++++++++ block/blk-wb.h | 45 +++ include/linux/blk_types.h | 2 + include/linux/blkdev.h | 3 + include/trace/events/block.h | 101 +++++++ 10 files changed, 818 insertions(+), 3 deletions(-) create mode 100644 block/blk-wb.c create mode 100644 block/blk-wb.h diff --git a/block/Makefile b/block/Makefile index 3446e0472df0b7..7e4be7a56a59bf 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ + blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o blk-wb.o \ blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ badblocks.o partitions/ diff --git a/block/blk-core.c b/block/blk-core.c index 40b57bf4852c32..d941f69dfb4bc1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -39,6 +39,7 @@ #include "blk.h" #include "blk-mq.h" +#include "blk-wb.h" EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); @@ -880,6 +881,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, fail: blk_free_flush_queue(q->fq); + blk_wb_exit(q); return NULL; } EXPORT_SYMBOL(blk_init_allocated_queue); @@ -1395,6 +1397,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) blk_delete_timer(rq); blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); + blk_wb_requeue(q->rq_wb, rq); if (rq->cmd_flags & REQ_QUEUED) blk_queue_end_tag(q, rq); @@ -1485,6 +1488,8 @@ void __blk_put_request(struct request_queue *q, struct request *req) /* this is a bio leak */ WARN_ON(req->bio != NULL); + blk_wb_done(q->rq_wb, req); + /* * Request may not have originated from ll_rw_blk. 
if not, * it didn't come out of our reserved rq pools @@ -1714,6 +1719,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT; struct request *req; unsigned int request_count = 0; + bool wb_acct; /* * low level driver can indicate that it wants pages above a @@ -1766,6 +1772,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) } get_rq: + wb_acct = blk_wb_wait(q->rq_wb, bio, q->queue_lock); + /* * This sync check and mask will be re-done in init_request_from_bio(), * but we need to set it earlier to expose the sync flag to the @@ -1781,11 +1789,16 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) */ req = get_request(q, rw_flags, bio, GFP_NOIO); if (IS_ERR(req)) { + if (wb_acct) + __blk_wb_done(q->rq_wb); bio->bi_error = PTR_ERR(req); bio_endio(bio); goto out_unlock; } + if (wb_acct) + req->cmd_flags |= REQ_BUF_INFLIGHT; + /* * After dropping the lock and possibly sleeping here, our request * may now be mergeable after it had proven unmergeable (above). @@ -2515,6 +2528,7 @@ void blk_start_request(struct request *req) blk_dequeue_request(req); req->issue_time = ktime_to_ns(ktime_get()); + blk_wb_issue(req->q->rq_wb, req); /* * We are now handing the request to the hardware, initialize @@ -2751,6 +2765,7 @@ void blk_finish_request(struct request *req, int error) blk_unprep_request(req); blk_account_io_done(req); + blk_wb_done(req->q->rq_wb, req); if (req->end_io) req->end_io(req, error); diff --git a/block/blk-mq.c b/block/blk-mq.c index 71b4a13fbf9478..c0c5207fe7fdec 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -30,6 +30,7 @@ #include "blk-mq.h" #include "blk-mq-tag.h" #include "blk-stat.h" +#include "blk-wb.h" static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); @@ -275,6 +276,9 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, if (rq->cmd_flags & REQ_MQ_INFLIGHT) atomic_dec(&hctx->nr_active); + + blk_wb_done(q->rq_wb, rq); + rq->cmd_flags = 0; clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); @@ -305,6 +309,7 @@ EXPORT_SYMBOL_GPL(blk_mq_free_request); inline void __blk_mq_end_request(struct request *rq, int error) { blk_account_io_done(rq); + blk_wb_done(rq->q->rq_wb, rq); if (rq->end_io) { rq->end_io(rq, error); @@ -414,6 +419,7 @@ void blk_mq_start_request(struct request *rq) rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); rq->issue_time = ktime_to_ns(ktime_get()); + blk_wb_issue(q->rq_wb, rq); blk_add_timer(rq); @@ -450,6 +456,7 @@ static void __blk_mq_requeue_request(struct request *rq) struct request_queue *q = rq->q; trace_block_rq_requeue(q, rq); + blk_wb_requeue(q->rq_wb, rq); if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { if (q->dma_drain_size && blk_rq_bytes(rq)) @@ -1265,6 +1272,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) struct blk_plug *plug; struct request *same_queue_rq = NULL; blk_qc_t cookie; + bool wb_acct; blk_queue_bounce(q, &bio); @@ -1282,9 +1290,17 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) } else request_count = blk_plug_queued_count(q); + wb_acct = blk_wb_wait(q->rq_wb, bio, NULL); + rq = blk_mq_map_request(q, bio, &data); - if (unlikely(!rq)) + if (unlikely(!rq)) { + if (wb_acct) + __blk_wb_done(q->rq_wb); return BLK_QC_T_NONE; + } + + if (wb_acct) + rq->cmd_flags |= REQ_BUF_INFLIGHT; cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); @@ -1361,6 +1377,7 @@ static blk_qc_t blk_sq_make_request(struct 
request_queue *q, struct bio *bio) struct blk_map_ctx data; struct request *rq; blk_qc_t cookie; + bool wb_acct; blk_queue_bounce(q, &bio); @@ -1375,9 +1392,17 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) blk_attempt_plug_merge(q, bio, &request_count, NULL)) return BLK_QC_T_NONE; + wb_acct = blk_wb_wait(q->rq_wb, bio, NULL); + rq = blk_mq_map_request(q, bio, &data); - if (unlikely(!rq)) + if (unlikely(!rq)) { + if (wb_acct) + __blk_wb_done(q->rq_wb); return BLK_QC_T_NONE; + } + + if (wb_acct) + rq->cmd_flags |= REQ_BUF_INFLIGHT; cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); @@ -2111,6 +2136,8 @@ void blk_mq_free_queue(struct request_queue *q) list_del_init(&q->all_q_node); mutex_unlock(&all_q_mutex); + blk_wb_exit(q); + blk_mq_del_queue_tag_set(q); blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); diff --git a/block/blk-settings.c b/block/blk-settings.c index f7e122e717e804..84bcfc22e02097 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -13,6 +13,7 @@ #include #include "blk.h" +#include "blk-wb.h" unsigned long blk_max_low_pfn; EXPORT_SYMBOL(blk_max_low_pfn); @@ -840,6 +841,9 @@ EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); void blk_set_queue_depth(struct request_queue *q, unsigned int depth) { q->queue_depth = depth; + + if (q->rq_wb) + blk_wb_update_limits(q->rq_wb); } EXPORT_SYMBOL(blk_set_queue_depth); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 6e516cc0d3d071..9ddce5ea41c7a8 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -13,6 +13,7 @@ #include "blk.h" #include "blk-mq.h" +#include "blk-wb.h" struct queue_sysfs_entry { struct attribute attr; @@ -41,6 +42,19 @@ queue_var_store(unsigned long *var, const char *page, size_t count) return count; } +static ssize_t queue_var_store64(u64 *var, const char *page) +{ + int err; + u64 v; + + err = kstrtou64(page, 10, &v); + if (err < 0) + return err; + + *var = v; + return 0; +} + static ssize_t queue_requests_show(struct request_queue *q, char *page) { return queue_var_show(q->nr_requests, (page)); @@ -347,6 +361,73 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, return ret; } +static ssize_t queue_wb_stats_show(struct request_queue *q, char *page) +{ + struct rq_wb *rwb = q->rq_wb; + + if (!rwb) + return -EINVAL; + + return sprintf(page, "background=%d, normal=%d, max=%d, inflight=%d," + " wait=%d, bdp_wait=%d\n", rwb->wb_background, + rwb->wb_normal, rwb->wb_max, + atomic_read(&rwb->inflight), + waitqueue_active(&rwb->wait), + atomic_read(rwb->bdp_wait)); +} + +static ssize_t queue_wb_win_show(struct request_queue *q, char *page) +{ + if (!q->rq_wb) + return -EINVAL; + + return sprintf(page, "%llu\n", div_u64(q->rq_wb->win_nsec, 1000)); +} + +static ssize_t queue_wb_win_store(struct request_queue *q, const char *page, + size_t count) +{ + ssize_t ret; + u64 val; + + if (!q->rq_wb) + return -EINVAL; + + ret = queue_var_store64(&val, page); + if (ret < 0) + return ret; + + q->rq_wb->win_nsec = val * 1000ULL; + blk_wb_update_limits(q->rq_wb); + return count; +} + +static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) +{ + if (!q->rq_wb) + return -EINVAL; + + return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000)); +} + +static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, + size_t count) +{ + ssize_t ret; + u64 val; + + if (!q->rq_wb) + return -EINVAL; + + ret = queue_var_store64(&val, page); + if (ret < 0) + return ret; + + q->rq_wb->min_lat_nsec = val * 1000ULL; + 
blk_wb_update_limits(q->rq_wb); + return count; +} + static ssize_t queue_wc_show(struct request_queue *q, char *page) { if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) @@ -541,6 +622,23 @@ static struct queue_sysfs_entry queue_stats_entry = { .show = queue_stats_show, }; +static struct queue_sysfs_entry queue_wb_stats_entry = { + .attr = {.name = "wb_stats", .mode = S_IRUGO }, + .show = queue_wb_stats_show, +}; + +static struct queue_sysfs_entry queue_wb_lat_entry = { + .attr = {.name = "wb_lat_usec", .mode = S_IRUGO | S_IWUSR }, + .show = queue_wb_lat_show, + .store = queue_wb_lat_store, +}; + +static struct queue_sysfs_entry queue_wb_win_entry = { + .attr = {.name = "wb_win_usec", .mode = S_IRUGO | S_IWUSR }, + .show = queue_wb_win_show, + .store = queue_wb_win_store, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -568,6 +666,9 @@ static struct attribute *default_attrs[] = { &queue_poll_entry.attr, &queue_wc_entry.attr, &queue_stats_entry.attr, + &queue_wb_stats_entry.attr, + &queue_wb_lat_entry.attr, + &queue_wb_win_entry.attr, NULL, }; @@ -721,6 +822,8 @@ int blk_register_queue(struct gendisk *disk) if (q->mq_ops) blk_mq_register_disk(disk); + blk_wb_init(q); + if (!q->request_fn) return 0; diff --git a/block/blk-wb.c b/block/blk-wb.c new file mode 100644 index 00000000000000..859cbb219ac598 --- /dev/null +++ b/block/blk-wb.c @@ -0,0 +1,515 @@ +/* + * buffered writeback throttling. losely based on CoDel. We can't drop + * packets for IO scheduling, so the logic is something like this: + * + * - Monitor latencies in a defined window of time. + * - If the minimum latency in the above window exceeds some target, increment + * scaling step and scale down queue depth by a factor of 2x. The monitoring + * window is then shrunk to 100 / sqrt(scaling step + 1). + * - For any window where we don't have solid data on what the latencies + * look like, retain status quo. + * - If latencies look good, decrement scaling step. + * + * Copyright (C) 2016 Jens Axboe + * + * Things that (may) need changing: + * + * - Different scaling of background/normal/high priority writeback. + * We may have to violate guarantees for max. + * - We can have mismatches between the stat window and our window. + * + */ +#include +#include +#include +#include + +#include "blk.h" +#include "blk-wb.h" +#include "blk-stat.h" + +enum { + /* + * Might need to be higher + */ + RWB_MAX_DEPTH = 64, + + /* + * 100msec window + */ + RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL, + + /* + * Disregard stats, if we don't meet these minimums + */ + RWB_MIN_WRITE_SAMPLES = 3, + RWB_MIN_READ_SAMPLES = 1, + + RWB_UNKNOWN_BUMP = 5, + + /* + * Target min latencies, in nsecs + */ + RWB_ROT_LAT = 75000000ULL, /* 75 msec */ + RWB_NONROT_LAT = 2000000ULL, /* 2 msec */ +}; + +static inline bool rwb_enabled(struct rq_wb *rwb) +{ + return rwb && rwb->wb_normal != 0; +} + +/* + * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, + * false if 'v' + 1 would be bigger than 'below'. 
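+ * The read/check/cmpxchg loop makes the limit check and the increment
+ * atomic, so concurrent callers cannot push 'v' past 'below'.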
+ */ +static bool atomic_inc_below(atomic_t *v, int below) +{ + int cur = atomic_read(v); + + for (;;) { + int old; + + if (cur >= below) + return false; + old = atomic_cmpxchg(v, cur, cur + 1); + if (old == cur) + break; + cur = old; + } + + return true; +} + +static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) +{ + if (rwb_enabled(rwb)) { + const unsigned long cur = jiffies; + + if (cur != *var) + *var = cur; + } +} + +void __blk_wb_done(struct rq_wb *rwb) +{ + int inflight, limit = rwb->wb_normal; + + /* + * If the device does write back caching, drop further down + * before we wake people up. + */ + if (test_bit(QUEUE_FLAG_WC, &rwb->q->queue_flags) && + !atomic_read(rwb->bdp_wait)) + limit = 0; + else + limit = rwb->wb_normal; + + /* + * Don't wake anyone up if we are above the normal limit. If + * throttling got disabled (limit == 0) with waiters, ensure + * that we wake them up. + */ + inflight = atomic_dec_return(&rwb->inflight); + if (limit && inflight >= limit) { + if (!rwb->wb_max) + wake_up_all(&rwb->wait); + return; + } + + if (waitqueue_active(&rwb->wait)) { + int diff = limit - inflight; + + if (!inflight || diff >= rwb->wb_background / 2) + wake_up_nr(&rwb->wait, 1); + } +} + +/* + * Called on completion of a request. Note that it's also called when + * a request is merged, when the request gets freed. + */ +void blk_wb_done(struct rq_wb *rwb, struct request *rq) +{ + if (!rwb) + return; + + if (!(rq->cmd_flags & REQ_BUF_INFLIGHT)) { + if (rwb->sync_cookie == rq) { + rwb->sync_issue = 0; + rwb->sync_cookie = NULL; + } + + wb_timestamp(rwb, &rwb->last_comp); + } else { + WARN_ON_ONCE(rq == rwb->sync_cookie); + __blk_wb_done(rwb); + rq->cmd_flags &= ~REQ_BUF_INFLIGHT; + } +} + +static void calc_wb_limits(struct rq_wb *rwb) +{ + unsigned int depth; + + if (!rwb->min_lat_nsec) { + rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0; + return; + } + + depth = min_t(unsigned int, RWB_MAX_DEPTH, blk_queue_depth(rwb->q)); + + /* + * Reduce max depth by 50%, and re-calculate normal/bg based on that + */ + rwb->wb_max = 1 + ((depth - 1) >> min(31U, rwb->scale_step)); + rwb->wb_normal = (rwb->wb_max + 1) / 2; + rwb->wb_background = (rwb->wb_max + 3) / 4; +} + +static bool inline stat_sample_valid(struct blk_rq_stat *stat) +{ + /* + * We need at least one read sample, and a minimum of + * RWB_MIN_WRITE_SAMPLES. We require some write samples to know + * that it's writes impacting us, and not just some sole read on + * a device that is in a lower power state. + */ + return stat[0].nr_samples >= 1 && + stat[1].nr_samples >= RWB_MIN_WRITE_SAMPLES; +} + +static u64 rwb_sync_issue_lat(struct rq_wb *rwb) +{ + u64 now, issue = ACCESS_ONCE(rwb->sync_issue); + + if (!issue || !rwb->sync_cookie) + return 0; + + now = ktime_to_ns(ktime_get()); + return now - issue; +} + +enum { + LAT_OK, + LAT_UNKNOWN, + LAT_EXCEEDED, +}; + +static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) +{ + u64 thislat; + + if (!stat_sample_valid(stat)) + return LAT_UNKNOWN; + + /* + * If the 'min' latency exceeds our target, step down. + */ + if (stat[0].min > rwb->min_lat_nsec) { + trace_block_wb_lat(stat[0].min); + trace_block_wb_stat(stat); + return LAT_EXCEEDED; + } + + /* + * If our stored sync issue exceeds the window size, or it + * exceeds our min target AND we haven't logged any entries, + * flag the latency as exceeded. 
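+	 * This catches a sync request that has been outstanding so long
+	 * that it never completed within the window and thus never
+	 * contributed a latency sample of its own.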
+ */ + thislat = rwb_sync_issue_lat(rwb); + if (thislat > rwb->cur_win_nsec || + (thislat > rwb->min_lat_nsec && !stat[0].nr_samples)) { + trace_block_wb_lat(thislat); + return LAT_EXCEEDED; + } + + if (rwb->scale_step) + trace_block_wb_stat(stat); + + return LAT_OK; +} + +static int latency_exceeded(struct rq_wb *rwb) +{ + struct blk_rq_stat stat[2]; + + blk_queue_stat_get(rwb->q, stat); + + return __latency_exceeded(rwb, stat); +} + +static void rwb_trace_step(struct rq_wb *rwb, const char *msg) +{ + trace_block_wb_step(msg, rwb->scale_step, rwb->cur_win_nsec, + rwb->wb_background, rwb->wb_normal, rwb->wb_max); +} + +static void scale_up(struct rq_wb *rwb) +{ + /* + * If we're at 0, we can't go lower. + */ + if (!rwb->scale_step) + return; + + rwb->scale_step--; + rwb->unknown_cnt = 0; + blk_stat_clear(rwb->q); + calc_wb_limits(rwb); + + if (waitqueue_active(&rwb->wait)) + wake_up_all(&rwb->wait); + + rwb_trace_step(rwb, "step up"); +} + +static void scale_down(struct rq_wb *rwb) +{ + /* + * Stop scaling down when we've hit the limit. This also prevents + * ->scale_step from going to crazy values, if the device can't + * keep up. + */ + if (rwb->wb_max == 1) + return; + + rwb->scale_step++; + rwb->unknown_cnt = 0; + blk_stat_clear(rwb->q); + calc_wb_limits(rwb); + rwb_trace_step(rwb, "step down"); +} + +static void rwb_arm_timer(struct rq_wb *rwb) +{ + unsigned long expires; + + /* + * We should speed this up, using some variant of a fast integer + * inverse square root calculation. Since we only do this for + * every window expiration, it's not a huge deal, though. + */ + rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4, + int_sqrt((rwb->scale_step + 1) << 8)); + expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec); + mod_timer(&rwb->window_timer, expires); +} + +static void blk_wb_timer_fn(unsigned long data) +{ + struct rq_wb *rwb = (struct rq_wb *) data; + int status; + + /* + * If we exceeded the latency target, step down. If we did not, + * step one level up. If we don't know enough to say either exceeded + * or ok, then don't do anything. + */ + status = latency_exceeded(rwb); + switch (status) { + case LAT_EXCEEDED: + scale_down(rwb); + break; + case LAT_OK: + scale_up(rwb); + break; + case LAT_UNKNOWN: + /* + * We had no read samples, start bumping up the write + * depth slowly + */ + if (++rwb->unknown_cnt >= RWB_UNKNOWN_BUMP) + scale_up(rwb); + break; + default: + break; + } + + /* + * Re-arm timer, if we have IO in flight + */ + if (rwb->scale_step || atomic_read(&rwb->inflight)) + rwb_arm_timer(rwb); +} + +void blk_wb_update_limits(struct rq_wb *rwb) +{ + rwb->scale_step = 0; + calc_wb_limits(rwb); + + if (waitqueue_active(&rwb->wait)) + wake_up_all(&rwb->wait); +} + +static bool close_io(struct rq_wb *rwb) +{ + const unsigned long now = jiffies; + + return time_before(now, rwb->last_issue + HZ / 10) || + time_before(now, rwb->last_comp + HZ / 10); +} + +#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO) + +static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) +{ + unsigned int limit; + + /* + * At this point we know it's a buffered write. If REQ_SYNC is + * set, then it's WB_SYNC_ALL writeback, and we'll use the max + * limit for that. If the write is marked as a background write, + * then use the idle limit, or go to normal if we haven't had + * competing IO for a bit. 
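+	 * close_io() reports whether unrelated IO was issued or completed
+	 * within the last 100 msec.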
+ */ + if ((rw & REQ_HIPRIO) || atomic_read(rwb->bdp_wait)) + limit = rwb->wb_max; + else if ((rw & REQ_BG) || close_io(rwb)) { + /* + * If less than 100ms since we completed unrelated IO, + * limit us to half the depth for background writeback. + */ + limit = rwb->wb_background; + } else + limit = rwb->wb_normal; + + return limit; +} + +static inline bool may_queue(struct rq_wb *rwb, unsigned long rw) +{ + /* + * inc it here even if disabled, since we'll dec it at completion. + * this only happens if the task was sleeping in __blk_wb_wait(), + * and someone turned it off at the same time. + */ + if (!rwb_enabled(rwb)) { + atomic_inc(&rwb->inflight); + return true; + } + + return atomic_inc_below(&rwb->inflight, get_limit(rwb, rw)); +} + +/* + * Block if we will exceed our limit, or if we are currently waiting for + * the timer to kick off queuing again. + */ +static void __blk_wb_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock) +{ + DEFINE_WAIT(wait); + + if (may_queue(rwb, rw)) + return; + + do { + prepare_to_wait_exclusive(&rwb->wait, &wait, + TASK_UNINTERRUPTIBLE); + + if (may_queue(rwb, rw)) + break; + + if (lock) + spin_unlock_irq(lock); + + io_schedule(); + + if (lock) + spin_lock_irq(lock); + } while (1); + + finish_wait(&rwb->wait, &wait); +} + +/* + * Returns true if the IO request should be accounted, false if not. + * May sleep, if we have exceeded the writeback limits. Caller can pass + * in an irq held spinlock, if it holds one when calling this function. + * If we do sleep, we'll release and re-grab it. + */ +bool blk_wb_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock) +{ + /* + * If disabled, or not a WRITE (or a discard), do nothing + */ + if (!rwb_enabled(rwb) || !(bio->bi_rw & REQ_WRITE) || + (bio->bi_rw & REQ_DISCARD)) + goto no_q; + + /* + * Don't throttle WRITE_ODIRECT + */ + if ((bio->bi_rw & (REQ_SYNC | REQ_NOIDLE)) == REQ_SYNC) + goto no_q; + + __blk_wb_wait(rwb, bio->bi_rw, lock); + + if (!timer_pending(&rwb->window_timer)) + rwb_arm_timer(rwb); + + return true; + +no_q: + wb_timestamp(rwb, &rwb->last_issue); + return false; +} + +void blk_wb_issue(struct rq_wb *rwb, struct request *rq) +{ + if (!rwb_enabled(rwb)) + return; + if (!(rq->cmd_flags & REQ_BUF_INFLIGHT) && !rwb->sync_issue) { + rwb->sync_cookie = rq; + rwb->sync_issue = rq->issue_time; + } +} + +void blk_wb_requeue(struct rq_wb *rwb, struct request *rq) +{ + if (!rwb_enabled(rwb)) + return; + if (rq == rwb->sync_cookie) { + rwb->sync_issue = 0; + rwb->sync_cookie = NULL; + } +} + +void blk_wb_init(struct request_queue *q) +{ + struct rq_wb *rwb; + + /* + * If this fails, we don't get throttling + */ + rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); + if (!rwb) + return; + + atomic_set(&rwb->inflight, 0); + init_waitqueue_head(&rwb->wait); + setup_timer(&rwb->window_timer, blk_wb_timer_fn, (unsigned long) rwb); + rwb->last_comp = rwb->last_issue = jiffies; + rwb->bdp_wait = &q->backing_dev_info.wb.dirty_sleeping; + rwb->q = q; + + if (blk_queue_nonrot(q)) + rwb->min_lat_nsec = RWB_NONROT_LAT; + else + rwb->min_lat_nsec = RWB_ROT_LAT; + + rwb->win_nsec = RWB_WINDOW_NSEC; + blk_wb_update_limits(rwb); + q->rq_wb = rwb; +} + +void blk_wb_exit(struct request_queue *q) +{ + struct rq_wb *rwb = q->rq_wb; + + if (rwb) { + del_timer_sync(&rwb->window_timer); + kfree(q->rq_wb); + q->rq_wb = NULL; + } +} diff --git a/block/blk-wb.h b/block/blk-wb.h new file mode 100644 index 00000000000000..222d9ef478d8f3 --- /dev/null +++ b/block/blk-wb.h @@ -0,0 +1,45 @@ +#ifndef BLK_WB_H +#define BLK_WB_H + 
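+/*
+ * Per-queue writeback throttling state. blk_wb_init() allocates this when
+ * the queue is registered and blk_wb_exit() tears it down; a NULL q->rq_wb
+ * simply means buffered writeback throttling is disabled for that queue.
+ */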
+#include +#include +#include + +struct rq_wb { + /* + * Settings that govern how we throttle + */ + unsigned int wb_background; /* background writeback */ + unsigned int wb_normal; /* normal writeback */ + unsigned int wb_max; /* max throughput writeback */ + unsigned int scale_step; + + u64 win_nsec; /* default window size */ + u64 cur_win_nsec; /* current window size */ + + unsigned int unknown_cnt; + + struct timer_list window_timer; + + s64 sync_issue; + void *sync_cookie; + + unsigned long last_issue; /* last non-throttled issue */ + unsigned long last_comp; /* last non-throttled comp */ + unsigned long min_lat_nsec; + atomic_t *bdp_wait; + struct request_queue *q; + wait_queue_head_t wait; + atomic_t inflight; +}; + +void __blk_wb_done(struct rq_wb *); +void blk_wb_done(struct rq_wb *, struct request *); +bool blk_wb_wait(struct rq_wb *, struct bio *, spinlock_t *); +void blk_wb_init(struct request_queue *); +void blk_wb_exit(struct request_queue *); +void blk_wb_update_limits(struct rq_wb *); +void blk_wb_requeue(struct rq_wb *, struct request *); +void blk_wb_issue(struct rq_wb *, struct request *); + +#endif diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 2b4414fb4d8e8c..c41f8a303804df 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -189,6 +189,7 @@ enum rq_flag_bits { __REQ_PM, /* runtime pm request */ __REQ_HASHED, /* on IO scheduler merge hash */ __REQ_MQ_INFLIGHT, /* track inflight for MQ */ + __REQ_BUF_INFLIGHT, /* track inflight for buffered */ __REQ_NR_BITS, /* stops here */ }; @@ -243,6 +244,7 @@ enum rq_flag_bits { #define REQ_PM (1ULL << __REQ_PM) #define REQ_HASHED (1ULL << __REQ_HASHED) #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) +#define REQ_BUF_INFLIGHT (1ULL << __REQ_BUF_INFLIGHT) typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 87f6703ced71d7..230c55dc95aea9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -37,6 +37,7 @@ struct bsg_job; struct blkcg_gq; struct blk_flush_queue; struct pr_ops; +struct rq_wb; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ @@ -291,6 +292,8 @@ struct request_queue { int nr_rqs[2]; /* # allocated [a]sync rqs */ int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ + struct rq_wb *rq_wb; + /* * If blkcg is not used, @q->root_rl serves all requests. 
If blkcg * is used, root blkg allocates from @q->root_rl and all other diff --git a/include/trace/events/block.h b/include/trace/events/block.h index e8a5eca1dbe578..4b8b795c12d969 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -667,6 +667,107 @@ TRACE_EVENT(block_rq_remap, (unsigned long long)__entry->old_sector, __entry->nr_bios) ); +/** + * block_wb_stat - trace stats for blk_wb + * @stat: array of read/write stats + */ +TRACE_EVENT(block_wb_stat, + + TP_PROTO(struct blk_rq_stat *stat), + + TP_ARGS(stat), + + TP_STRUCT__entry( + __field( s64, rmean ) + __field( u64, rmin ) + __field( u64, rmax ) + __field( s64, rnr_samples ) + __field( s64, rtime ) + __field( s64, wmean ) + __field( u64, wmin ) + __field( u64, wmax ) + __field( s64, wnr_samples ) + __field( s64, wtime ) + ), + + TP_fast_assign( + __entry->rmean = stat[0].mean; + __entry->rmin = stat[0].min; + __entry->rmax = stat[0].max; + __entry->rnr_samples = stat[0].nr_samples; + __entry->wmean = stat[1].mean; + __entry->wmin = stat[1].min; + __entry->wmax = stat[1].max; + __entry->wnr_samples = stat[1].nr_samples; + ), + + TP_printk("rmean=%llu, rmin=%llu, rmax=%llu, rsamples=%llu, " + "wmean=%llu, wmin=%llu, wmax=%llu, wsamples=%llu\n", + __entry->rmean, __entry->rmin, __entry->rmax, + __entry->rnr_samples, __entry->wmean, __entry->wmin, + __entry->wmax, __entry->wnr_samples) +); + +/** + * block_wb_lat - trace latency event + * @lat: latency trigger + */ +TRACE_EVENT(block_wb_lat, + + TP_PROTO(unsigned long lat), + + TP_ARGS(lat), + + TP_STRUCT__entry( + __field( unsigned long, lat ) + ), + + TP_fast_assign( + __entry->lat = lat; + ), + + TP_printk("latency %llu\n", (unsigned long long) __entry->lat) +); + +/** + * block_wb_step - trace wb event step + * @msg: context message + * @step: the current scale step count + * @window: the current monitoring window + * @bg: the current background queue limit + * @normal: the current normal writeback limit + * @max: the current max throughput writeback limit + */ +TRACE_EVENT(block_wb_step, + + TP_PROTO(const char *msg, unsigned int step, unsigned long window, + unsigned int bg, unsigned int normal, unsigned int max), + + TP_ARGS(msg, step, window, bg, normal, max), + + TP_STRUCT__entry( + __field( const char *, msg ) + __field( unsigned int, step ) + __field( unsigned long, window ) + __field( unsigned int, bg ) + __field( unsigned int, normal ) + __field( unsigned int, max ) + ), + + TP_fast_assign( + __entry->msg = msg; + __entry->step = step; + __entry->window = window; + __entry->bg = bg; + __entry->normal = normal; + __entry->max = max; + ), + + TP_printk("%s: step=%u, window=%lu, background=%u, normal=%u, max=%u\n", + __entry->msg, __entry->step, __entry->window, __entry->bg, + __entry->normal, __entry->max) +); + #endif /* _TRACE_BLOCK_H */ /* This part must be outside protection */
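To make the windowed stats that the last two patches build on concrete, here is a small stand-alone user-space sketch of the same running min/mean/max update that blk_stat_add() performs (simplified and illustrative only; the struct and function names below are invented for the example):

#include <stdio.h>
#include <stdint.h>

/* simplified stand-in for struct blk_rq_stat */
struct rq_stat {
	int64_t mean;
	uint64_t min;
	uint64_t max;
	int64_t nr_samples;
};

static void stat_init(struct rq_stat *s)
{
	s->min = ~0ULL;
	s->max = 0;
	s->mean = 0;
	s->nr_samples = 0;
}

/* same incremental update as blk_stat_add(): mean += delta / (nr_samples + 1) */
static void stat_add(struct rq_stat *s, uint64_t value)
{
	int64_t delta;

	if (value > s->max)
		s->max = value;
	if (value < s->min)
		s->min = value;

	delta = (int64_t)value - s->mean;
	if (delta)
		s->mean += delta / (s->nr_samples + 1);
	s->nr_samples++;
}

int main(void)
{
	/* three made-up completion latencies, in nsecs */
	uint64_t lat[] = { 2400000, 1800000, 5300000 };
	struct rq_stat s;
	unsigned int i;

	stat_init(&s);
	for (i = 0; i < sizeof(lat) / sizeof(lat[0]); i++)
		stat_add(&s, lat[i]);

	printf("samples=%lld, mean=%lld, min=%llu, max=%llu\n",
	       (long long)s.nr_samples, (long long)s.mean,
	       (unsigned long long)s.min, (unsigned long long)s.max);
	return 0;
}

The kernel version additionally stamps each bucket with the current time masked by BLK_STAT_MASK, so blk_stat_add() re-initializes the stats whenever a sample lands in a new ~134 msec window.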