diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index e5d914845be6de..dce25d848d92c6 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -141,6 +141,15 @@ control of this block device to that new IO scheduler. Note that writing
 an IO scheduler name to this file will attempt to load that IO scheduler
 module, if it isn't already present in the system.
 
+write_cache (RW)
+----------------
+When read, this file will display whether the device has write back
+caching enabled or not. It will return "write back" for the former
+case, and "write through" for the latter. Writing to this file can
+change the kernel's view of the device, but it doesn't alter the
+device state. This means that it might not be safe to toggle the
+setting from "write back" to "write through", since that will also
+eliminate cache flushes issued by the kernel.
 
 
 Jens Axboe , February 2009
diff --git a/Documentation/block/writeback_cache_control.txt b/Documentation/block/writeback_cache_control.txt
index 83407d36630a6d..59e0516cbf6b68 100644
--- a/Documentation/block/writeback_cache_control.txt
+++ b/Documentation/block/writeback_cache_control.txt
@@ -71,7 +71,7 @@ requests that have a payload. For devices with volatile write caches the
 driver needs to tell the block layer that it supports flushing caches by
 doing:
 
-	blk_queue_flush(sdkp->disk->queue, REQ_FLUSH);
+	blk_queue_write_cache(sdkp->disk->queue, true, false);
 
 and handle empty REQ_FLUSH requests in its prep_fn/request_fn. Note that
 REQ_FLUSH requests with a payload are automatically turned into a sequence
@@ -79,7 +79,7 @@ of an empty REQ_FLUSH request followed by the actual write by the block
 layer. For devices that also support the FUA bit the block layer needs
 to be told to pass through the REQ_FUA bit using:
 
-	blk_queue_flush(sdkp->disk->queue, REQ_FLUSH | REQ_FUA);
+	blk_queue_write_cache(sdkp->disk->queue, true, true);
 
 and the driver must handle write requests that have the REQ_FUA bit set
 in prep_fn/request_fn.
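For illustration only, a minimal probe-time sketch of the pattern documented
above, mirroring the sd_set_flush_flag() conversion in the drivers/scsi/sd.c
hunk below; dev_has_writeback_cache() and dev_supports_fua() are placeholder
helpers, not functions from this patch:

	bool wc = false, fua = false;

	if (dev_has_writeback_cache(dev)) {	/* placeholder hardware query */
		wc = true;
		if (dev_supports_fua(dev))	/* FUA only matters with a write back cache */
			fua = true;
	}
	blk_queue_write_cache(disk->queue, wc, fua);	/* replaces blk_queue_flush() */
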
If the FUA bit is not natively supported the block diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 39ba20755e0356..17e96dc29596cc 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -862,7 +862,7 @@ static int ubd_add(int n, char **error_out) goto out; } ubd_dev->queue->queuedata = ubd_dev; - blk_queue_flush(ubd_dev->queue, REQ_FLUSH); + blk_queue_write_cache(ubd_dev->queue, true, false); blk_queue_max_segments(ubd_dev->queue, MAX_SG); err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, &ubd_gendisk[n]); diff --git a/block/Makefile b/block/Makefile index 9eda2322b2d40a..7e4be7a56a59bf 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-lib.o blk-mq.o blk-mq-tag.o \ + blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o blk-wb.o \ blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ badblocks.o partitions/ diff --git a/block/blk-core.c b/block/blk-core.c index b60537b2c35b41..d941f69dfb4bc1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -39,6 +39,7 @@ #include "blk.h" #include "blk-mq.h" +#include "blk-wb.h" EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); @@ -880,6 +881,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, fail: blk_free_flush_queue(q->fq); + blk_wb_exit(q); return NULL; } EXPORT_SYMBOL(blk_init_allocated_queue); @@ -1395,6 +1397,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) blk_delete_timer(rq); blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); + blk_wb_requeue(q->rq_wb, rq); if (rq->cmd_flags & REQ_QUEUED) blk_queue_end_tag(q, rq); @@ -1485,6 +1488,8 @@ void __blk_put_request(struct request_queue *q, struct request *req) /* this is a bio leak */ WARN_ON(req->bio != NULL); + blk_wb_done(q->rq_wb, req); + /* * Request may not have originated from ll_rw_blk. if not, * it didn't come out of our reserved rq pools @@ -1714,6 +1719,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT; struct request *req; unsigned int request_count = 0; + bool wb_acct; /* * low level driver can indicate that it wants pages above a @@ -1766,6 +1772,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) } get_rq: + wb_acct = blk_wb_wait(q->rq_wb, bio, q->queue_lock); + /* * This sync check and mask will be re-done in init_request_from_bio(), * but we need to set it earlier to expose the sync flag to the @@ -1781,11 +1789,16 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) */ req = get_request(q, rw_flags, bio, GFP_NOIO); if (IS_ERR(req)) { + if (wb_acct) + __blk_wb_done(q->rq_wb); bio->bi_error = PTR_ERR(req); bio_endio(bio); goto out_unlock; } + if (wb_acct) + req->cmd_flags |= REQ_BUF_INFLIGHT; + /* * After dropping the lock and possibly sleeping here, our request * may now be mergeable after it had proven unmergeable (above). @@ -1963,7 +1976,8 @@ generic_make_request_checks(struct bio *bio) * drivers without flush support don't have to worry * about them. 
*/ - if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) { + if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && + !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); if (!nr_sectors) { err = 0; @@ -2513,6 +2527,9 @@ void blk_start_request(struct request *req) { blk_dequeue_request(req); + req->issue_time = ktime_to_ns(ktime_get()); + blk_wb_issue(req->q->rq_wb, req); + /* * We are now handing the request to the hardware, initialize * resid_len to full count and add the timeout handler. @@ -2580,6 +2597,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) trace_block_rq_complete(req->q, req, nr_bytes); + blk_stat_add(&req->q->rq_stats[rq_data_dir(req)], req); + if (!req->bio) return false; @@ -2746,6 +2765,7 @@ void blk_finish_request(struct request *req, int error) blk_unprep_request(req); blk_account_io_done(req); + blk_wb_done(req->q->rq_wb, req); if (req->end_io) req->end_io(req, error); diff --git a/block/blk-flush.c b/block/blk-flush.c index 9c423e53324a29..b1c91d229e5ed9 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -95,17 +95,18 @@ enum { static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq); -static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq) +static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq) { unsigned int policy = 0; if (blk_rq_sectors(rq)) policy |= REQ_FSEQ_DATA; - if (fflags & REQ_FLUSH) { + if (fflags & (1UL << QUEUE_FLAG_WC)) { if (rq->cmd_flags & REQ_FLUSH) policy |= REQ_FSEQ_PREFLUSH; - if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA)) + if (!(fflags & (1UL << QUEUE_FLAG_FUA)) && + (rq->cmd_flags & REQ_FUA)) policy |= REQ_FSEQ_POSTFLUSH; } return policy; @@ -384,7 +385,7 @@ static void mq_flush_data_end_io(struct request *rq, int error) void blk_insert_flush(struct request *rq) { struct request_queue *q = rq->q; - unsigned int fflags = q->flush_flags; /* may change, cache */ + unsigned long fflags = q->queue_flags; /* may change, cache */ unsigned int policy = blk_flush_policy(fflags, rq); struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); @@ -393,7 +394,7 @@ void blk_insert_flush(struct request *rq) * REQ_FLUSH and FUA for the driver. 
*/ rq->cmd_flags &= ~REQ_FLUSH; - if (!(fflags & REQ_FUA)) + if (!(fflags & (1UL << QUEUE_FLAG_FUA))) rq->cmd_flags &= ~REQ_FUA; /* diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 4ea4dd8a1eed5a..2f68015f8616f2 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -247,6 +247,47 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) return ret; } +static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_ctx *ctx; + unsigned int i; + + hctx_for_each_ctx(hctx, ctx, i) { + blk_stat_init(&ctx->stat[0]); + blk_stat_init(&ctx->stat[1]); + } +} + +static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx, + const char *page, size_t count) +{ + blk_mq_stat_clear(hctx); + return count; +} + +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre) +{ + return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n", + pre, (long long) stat->nr_samples, + (long long) stat->mean, (long long) stat->min, + (long long) stat->max); +} + +static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + struct blk_rq_stat stat[2]; + ssize_t ret; + + blk_stat_init(&stat[0]); + blk_stat_init(&stat[1]); + + blk_hctx_stat_get(hctx, stat); + + ret = print_stat(page, &stat[0], "read :"); + ret += print_stat(page + ret, &stat[1], "write:"); + return ret; +} + static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = { .attr = {.name = "dispatched", .mode = S_IRUGO }, .show = blk_mq_sysfs_dispatched_show, @@ -304,6 +345,11 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = { .attr = {.name = "io_poll", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_poll_show, }; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = { + .attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR }, + .show = blk_mq_hw_sysfs_stat_show, + .store = blk_mq_hw_sysfs_stat_store, +}; static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_queued.attr, @@ -314,6 +360,7 @@ static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_cpus.attr, &blk_mq_hw_sysfs_active.attr, &blk_mq_hw_sysfs_poll.attr, + &blk_mq_hw_sysfs_stat.attr, NULL, }; diff --git a/block/blk-mq.c b/block/blk-mq.c index 1699baf39b78a8..c0c5207fe7fdec 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -29,6 +29,8 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" +#include "blk-stat.h" +#include "blk-wb.h" static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); @@ -274,6 +276,9 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, if (rq->cmd_flags & REQ_MQ_INFLIGHT) atomic_dec(&hctx->nr_active); + + blk_wb_done(q->rq_wb, rq); + rq->cmd_flags = 0; clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); @@ -304,6 +309,7 @@ EXPORT_SYMBOL_GPL(blk_mq_free_request); inline void __blk_mq_end_request(struct request *rq, int error) { blk_account_io_done(rq); + blk_wb_done(rq->q->rq_wb, rq); if (rq->end_io) { rq->end_io(rq, error); @@ -356,10 +362,19 @@ static void blk_mq_ipi_complete_request(struct request *rq) put_cpu(); } +static void blk_mq_stat_add(struct request *rq) +{ + struct blk_rq_stat *stat = &rq->mq_ctx->stat[rq_data_dir(rq)]; + + blk_stat_add(stat, rq); +} + static void __blk_mq_complete_request(struct request *rq) { struct request_queue *q = rq->q; + blk_mq_stat_add(rq); + if (!q->softirq_done_fn) blk_mq_end_request(rq, rq->errors); else @@ -403,6 +418,9 @@ void blk_mq_start_request(struct request *rq) if (unlikely(blk_bidi_rq(rq))) rq->next_rq->resid_len = 
blk_rq_bytes(rq->next_rq); + rq->issue_time = ktime_to_ns(ktime_get()); + blk_wb_issue(q->rq_wb, rq); + blk_add_timer(rq); /* @@ -438,6 +456,7 @@ static void __blk_mq_requeue_request(struct request *rq) struct request_queue *q = rq->q; trace_block_rq_requeue(q, rq); + blk_wb_requeue(q->rq_wb, rq); if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { if (q->dma_drain_size && blk_rq_bytes(rq)) @@ -1253,6 +1272,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) struct blk_plug *plug; struct request *same_queue_rq = NULL; blk_qc_t cookie; + bool wb_acct; blk_queue_bounce(q, &bio); @@ -1270,9 +1290,17 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) } else request_count = blk_plug_queued_count(q); + wb_acct = blk_wb_wait(q->rq_wb, bio, NULL); + rq = blk_mq_map_request(q, bio, &data); - if (unlikely(!rq)) + if (unlikely(!rq)) { + if (wb_acct) + __blk_wb_done(q->rq_wb); return BLK_QC_T_NONE; + } + + if (wb_acct) + rq->cmd_flags |= REQ_BUF_INFLIGHT; cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); @@ -1349,6 +1377,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) struct blk_map_ctx data; struct request *rq; blk_qc_t cookie; + bool wb_acct; blk_queue_bounce(q, &bio); @@ -1363,9 +1392,17 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) blk_attempt_plug_merge(q, bio, &request_count, NULL)) return BLK_QC_T_NONE; + wb_acct = blk_wb_wait(q->rq_wb, bio, NULL); + rq = blk_mq_map_request(q, bio, &data); - if (unlikely(!rq)) + if (unlikely(!rq)) { + if (wb_acct) + __blk_wb_done(q->rq_wb); return BLK_QC_T_NONE; + } + + if (wb_acct) + rq->cmd_flags |= REQ_BUF_INFLIGHT; cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); @@ -1761,6 +1798,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, spin_lock_init(&__ctx->lock); INIT_LIST_HEAD(&__ctx->rq_list); __ctx->queue = q; + blk_stat_init(&__ctx->stat[0]); + blk_stat_init(&__ctx->stat[1]); /* If the cpu isn't online, the cpu is mapped to first hctx */ if (!cpu_online(i)) @@ -2097,6 +2136,8 @@ void blk_mq_free_queue(struct request_queue *q) list_del_init(&q->all_q_node); mutex_unlock(&all_q_mutex); + blk_wb_exit(q); + blk_mq_del_queue_tag_set(q); blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); diff --git a/block/blk-mq.h b/block/blk-mq.h index 9087b11037b70a..e107f700ff1710 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -1,6 +1,8 @@ #ifndef INT_BLK_MQ_H #define INT_BLK_MQ_H +#include "blk-stat.h" + struct blk_mq_tag_set; struct blk_mq_ctx { @@ -20,6 +22,7 @@ struct blk_mq_ctx { /* incremented at completion time */ unsigned long ____cacheline_aligned_in_smp rq_completed[2]; + struct blk_rq_stat stat[2]; struct request_queue *queue; struct kobject kobj; diff --git a/block/blk-settings.c b/block/blk-settings.c index 331e4eee0dda0c..84bcfc22e02097 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -13,6 +13,7 @@ #include #include "blk.h" +#include "blk-wb.h" unsigned long blk_max_low_pfn; EXPORT_SYMBOL(blk_max_low_pfn); @@ -820,31 +821,54 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask) } EXPORT_SYMBOL(blk_queue_update_dma_alignment); +void blk_queue_flush_queueable(struct request_queue *q, bool queueable) +{ + spin_lock_irq(q->queue_lock); + if (queueable) + clear_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags); + else + set_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags); + spin_unlock_irq(q->queue_lock); +} +EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); + /** - * 
blk_queue_flush - configure queue's cache flush capability + * blk_set_queue_depth - tell the block layer about the device queue depth * @q: the request queue for the device - * @flush: 0, REQ_FLUSH or REQ_FLUSH | REQ_FUA + * @depth: queue depth * - * Tell block layer cache flush capability of @q. If it supports - * flushing, REQ_FLUSH should be set. If it supports bypassing - * write cache for individual writes, REQ_FUA should be set. */ -void blk_queue_flush(struct request_queue *q, unsigned int flush) +void blk_set_queue_depth(struct request_queue *q, unsigned int depth) { - WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA)); - - if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA))) - flush &= ~REQ_FUA; + q->queue_depth = depth; - q->flush_flags = flush & (REQ_FLUSH | REQ_FUA); + if (q->rq_wb) + blk_wb_update_limits(q->rq_wb); } -EXPORT_SYMBOL_GPL(blk_queue_flush); +EXPORT_SYMBOL(blk_set_queue_depth); -void blk_queue_flush_queueable(struct request_queue *q, bool queueable) -{ - q->flush_not_queueable = !queueable; -} -EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); +/** + * blk_queue_write_cache - configure queue's write cache + * @q: the request queue for the device + * @wc: write back cache on or off + * @fua: device supports FUA writes, if true + * + * Tell the block layer about the write cache of @q. + */ +void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) +{ + spin_lock_irq(q->queue_lock); + if (wc) + queue_flag_set(QUEUE_FLAG_WC, q); + else + queue_flag_clear(QUEUE_FLAG_WC, q); + if (fua) + queue_flag_set(QUEUE_FLAG_FUA, q); + else + queue_flag_clear(QUEUE_FLAG_FUA, q); + spin_unlock_irq(q->queue_lock); +} +EXPORT_SYMBOL_GPL(blk_queue_write_cache); static int __init blk_settings_init(void) { diff --git a/block/blk-stat.c b/block/blk-stat.c new file mode 100644 index 00000000000000..b38776a8317347 --- /dev/null +++ b/block/blk-stat.c @@ -0,0 +1,184 @@ +/* + * Block stat tracking code + * + * Copyright (C) 2016 Jens Axboe + */ +#include +#include + +#include "blk-stat.h" +#include "blk-mq.h" + +void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) +{ + if (!src->nr_samples) + return; + + dst->min = min(dst->min, src->min); + dst->max = max(dst->max, src->max); + + if (!dst->nr_samples) + dst->mean = src->mean; + else { + dst->mean = div64_s64((src->mean * src->nr_samples) + + (dst->mean * dst->nr_samples), + dst->nr_samples + src->nr_samples); + } + dst->nr_samples += src->nr_samples; +} + +static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + int i, j, nr; + + blk_stat_init(&dst[0]); + blk_stat_init(&dst[1]); + + nr = 0; + do { + uint64_t newest = 0; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + if (!ctx->stat[0].nr_samples && + !ctx->stat[1].nr_samples) + continue; + if (ctx->stat[0].time > newest) + newest = ctx->stat[0].time; + if (ctx->stat[1].time > newest) + newest = ctx->stat[1].time; + } + } + + /* + * No samples + */ + if (!newest) + break; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + if (ctx->stat[0].time == newest) { + blk_stat_sum(&dst[0], &ctx->stat[0]); + nr++; + } + if (ctx->stat[1].time == newest) { + blk_stat_sum(&dst[1], &ctx->stat[1]); + nr++; + } + } + } + /* + * If we race on finding an entry, just loop back again. + * Should be very rare. 
+ */ + } while (!nr); +} + +void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst) +{ + if (q->mq_ops) + blk_mq_stat_get(q, dst); + else { + memcpy(&dst[0], &q->rq_stats[0], sizeof(struct blk_rq_stat)); + memcpy(&dst[1], &q->rq_stats[1], sizeof(struct blk_rq_stat)); + } +} + +void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst) +{ + struct blk_mq_ctx *ctx; + unsigned int i, nr; + + nr = 0; + do { + uint64_t newest = 0; + + hctx_for_each_ctx(hctx, ctx, i) { + if (!ctx->stat[0].nr_samples && + !ctx->stat[1].nr_samples) + continue; + + if (ctx->stat[0].time > newest) + newest = ctx->stat[0].time; + if (ctx->stat[1].time > newest) + newest = ctx->stat[1].time; + } + + if (!newest) + break; + + hctx_for_each_ctx(hctx, ctx, i) { + if (ctx->stat[0].time == newest) { + blk_stat_sum(&dst[0], &ctx->stat[0]); + nr++; + } + if (ctx->stat[1].time == newest) { + blk_stat_sum(&dst[1], &ctx->stat[1]); + nr++; + } + } + /* + * If we race on finding an entry, just loop back again. + * Should be very rare, as the window is only updated + * occasionally + */ + } while (!nr); +} + +static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now) +{ + stat->min = -1ULL; + stat->max = stat->nr_samples = stat->mean = 0; + stat->time = time_now & BLK_STAT_MASK; +} + +void blk_stat_init(struct blk_rq_stat *stat) +{ + __blk_stat_init(stat, ktime_to_ns(ktime_get())); +} + +void blk_stat_add(struct blk_rq_stat *stat, struct request *rq) +{ + s64 delta, now, value; + + now = ktime_to_ns(ktime_get()); + if (now < rq->issue_time) + return; + + if ((now & BLK_STAT_MASK) != (stat->time & BLK_STAT_MASK)) + __blk_stat_init(stat, now); + + value = now - rq->issue_time; + if (value > stat->max) + stat->max = value; + if (value < stat->min) + stat->min = value; + + delta = value - stat->mean; + if (delta) + stat->mean += div64_s64(delta, stat->nr_samples + 1); + + stat->nr_samples++; +} + +void blk_stat_clear(struct request_queue *q) +{ + if (q->mq_ops) { + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + int i, j; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + blk_stat_init(&ctx->stat[0]); + blk_stat_init(&ctx->stat[1]); + } + } + } else { + blk_stat_init(&q->rq_stats[0]); + blk_stat_init(&q->rq_stats[1]); + } +} diff --git a/block/blk-stat.h b/block/blk-stat.h new file mode 100644 index 00000000000000..d77548dbf19603 --- /dev/null +++ b/block/blk-stat.h @@ -0,0 +1,17 @@ +#ifndef BLK_STAT_H +#define BLK_STAT_H + +/* + * ~0.13s window as a power-of-2 (2^27 nsecs) + */ +#define BLK_STAT_NSEC 134217728ULL +#define BLK_STAT_MASK ~(BLK_STAT_NSEC - 1) + +void blk_stat_add(struct blk_rq_stat *, struct request *); +void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *); +void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *); +void blk_stat_clear(struct request_queue *q); +void blk_stat_init(struct blk_rq_stat *); +void blk_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *); + +#endif diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 995b58d46ed109..9ddce5ea41c7a8 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -13,6 +13,7 @@ #include "blk.h" #include "blk-mq.h" +#include "blk-wb.h" struct queue_sysfs_entry { struct attribute attr; @@ -41,6 +42,19 @@ queue_var_store(unsigned long *var, const char *page, size_t count) return count; } +static ssize_t queue_var_store64(u64 *var, const char *page) +{ + int err; + u64 v; + + err = kstrtou64(page, 10, &v); + if (err < 0) + return err; + + *var = v; + return 0; 
+} + static ssize_t queue_requests_show(struct request_queue *q, char *page) { return queue_var_show(q->nr_requests, (page)); @@ -347,6 +361,125 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, return ret; } +static ssize_t queue_wb_stats_show(struct request_queue *q, char *page) +{ + struct rq_wb *rwb = q->rq_wb; + + if (!rwb) + return -EINVAL; + + return sprintf(page, "background=%d, normal=%d, max=%d, inflight=%d," + " wait=%d, bdp_wait=%d\n", rwb->wb_background, + rwb->wb_normal, rwb->wb_max, + atomic_read(&rwb->inflight), + waitqueue_active(&rwb->wait), + atomic_read(rwb->bdp_wait)); +} + +static ssize_t queue_wb_win_show(struct request_queue *q, char *page) +{ + if (!q->rq_wb) + return -EINVAL; + + return sprintf(page, "%llu\n", div_u64(q->rq_wb->win_nsec, 1000)); +} + +static ssize_t queue_wb_win_store(struct request_queue *q, const char *page, + size_t count) +{ + ssize_t ret; + u64 val; + + if (!q->rq_wb) + return -EINVAL; + + ret = queue_var_store64(&val, page); + if (ret < 0) + return ret; + + q->rq_wb->win_nsec = val * 1000ULL; + blk_wb_update_limits(q->rq_wb); + return count; +} + +static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) +{ + if (!q->rq_wb) + return -EINVAL; + + return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000)); +} + +static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, + size_t count) +{ + ssize_t ret; + u64 val; + + if (!q->rq_wb) + return -EINVAL; + + ret = queue_var_store64(&val, page); + if (ret < 0) + return ret; + + q->rq_wb->min_lat_nsec = val * 1000ULL; + blk_wb_update_limits(q->rq_wb); + return count; +} + +static ssize_t queue_wc_show(struct request_queue *q, char *page) +{ + if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + return sprintf(page, "write back\n"); + + return sprintf(page, "write through\n"); +} + +static ssize_t queue_wc_store(struct request_queue *q, const char *page, + size_t count) +{ + int set = -1; + + if (!strncmp(page, "write back", 10)) + set = 1; + else if (!strncmp(page, "write through", 13) || + !strncmp(page, "none", 4)) + set = 0; + + if (set == -1) + return -EINVAL; + + spin_lock_irq(q->queue_lock); + if (set) + queue_flag_set(QUEUE_FLAG_WC, q); + else + queue_flag_clear(QUEUE_FLAG_WC, q); + spin_unlock_irq(q->queue_lock); + + return count; +} + +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre) +{ + return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n", + pre, (long long) stat->nr_samples, + (long long) stat->mean, (long long) stat->min, + (long long) stat->max); +} + +static ssize_t queue_stats_show(struct request_queue *q, char *page) +{ + struct blk_rq_stat stat[2]; + ssize_t ret; + + blk_queue_stat_get(q, stat); + + ret = print_stat(page, &stat[0], "read :"); + ret += print_stat(page + ret, &stat[1], "write:"); + return ret; +} + static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .show = queue_requests_show, @@ -478,6 +611,34 @@ static struct queue_sysfs_entry queue_poll_entry = { .store = queue_poll_store, }; +static struct queue_sysfs_entry queue_wc_entry = { + .attr = {.name = "write_cache", .mode = S_IRUGO | S_IWUSR }, + .show = queue_wc_show, + .store = queue_wc_store, +}; + +static struct queue_sysfs_entry queue_stats_entry = { + .attr = {.name = "stats", .mode = S_IRUGO }, + .show = queue_stats_show, +}; + +static struct queue_sysfs_entry queue_wb_stats_entry = { + .attr = {.name = "wb_stats", .mode = S_IRUGO }, 
+ .show = queue_wb_stats_show, +}; + +static struct queue_sysfs_entry queue_wb_lat_entry = { + .attr = {.name = "wb_lat_usec", .mode = S_IRUGO | S_IWUSR }, + .show = queue_wb_lat_show, + .store = queue_wb_lat_store, +}; + +static struct queue_sysfs_entry queue_wb_win_entry = { + .attr = {.name = "wb_win_usec", .mode = S_IRUGO | S_IWUSR }, + .show = queue_wb_win_show, + .store = queue_wb_win_store, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -503,6 +664,11 @@ static struct attribute *default_attrs[] = { &queue_iostats_entry.attr, &queue_random_entry.attr, &queue_poll_entry.attr, + &queue_wc_entry.attr, + &queue_stats_entry.attr, + &queue_wb_stats_entry.attr, + &queue_wb_lat_entry.attr, + &queue_wb_win_entry.attr, NULL, }; @@ -656,6 +822,8 @@ int blk_register_queue(struct gendisk *disk) if (q->mq_ops) blk_mq_register_disk(disk); + blk_wb_init(q); + if (!q->request_fn) return 0; diff --git a/block/blk-wb.c b/block/blk-wb.c new file mode 100644 index 00000000000000..859cbb219ac598 --- /dev/null +++ b/block/blk-wb.c @@ -0,0 +1,515 @@ +/* + * buffered writeback throttling. losely based on CoDel. We can't drop + * packets for IO scheduling, so the logic is something like this: + * + * - Monitor latencies in a defined window of time. + * - If the minimum latency in the above window exceeds some target, increment + * scaling step and scale down queue depth by a factor of 2x. The monitoring + * window is then shrunk to 100 / sqrt(scaling step + 1). + * - For any window where we don't have solid data on what the latencies + * look like, retain status quo. + * - If latencies look good, decrement scaling step. + * + * Copyright (C) 2016 Jens Axboe + * + * Things that (may) need changing: + * + * - Different scaling of background/normal/high priority writeback. + * We may have to violate guarantees for max. + * - We can have mismatches between the stat window and our window. + * + */ +#include +#include +#include +#include + +#include "blk.h" +#include "blk-wb.h" +#include "blk-stat.h" + +enum { + /* + * Might need to be higher + */ + RWB_MAX_DEPTH = 64, + + /* + * 100msec window + */ + RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL, + + /* + * Disregard stats, if we don't meet these minimums + */ + RWB_MIN_WRITE_SAMPLES = 3, + RWB_MIN_READ_SAMPLES = 1, + + RWB_UNKNOWN_BUMP = 5, + + /* + * Target min latencies, in nsecs + */ + RWB_ROT_LAT = 75000000ULL, /* 75 msec */ + RWB_NONROT_LAT = 2000000ULL, /* 2 msec */ +}; + +static inline bool rwb_enabled(struct rq_wb *rwb) +{ + return rwb && rwb->wb_normal != 0; +} + +/* + * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, + * false if 'v' + 1 would be bigger than 'below'. + */ +static bool atomic_inc_below(atomic_t *v, int below) +{ + int cur = atomic_read(v); + + for (;;) { + int old; + + if (cur >= below) + return false; + old = atomic_cmpxchg(v, cur, cur + 1); + if (old == cur) + break; + cur = old; + } + + return true; +} + +static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) +{ + if (rwb_enabled(rwb)) { + const unsigned long cur = jiffies; + + if (cur != *var) + *var = cur; + } +} + +void __blk_wb_done(struct rq_wb *rwb) +{ + int inflight, limit = rwb->wb_normal; + + /* + * If the device does write back caching, drop further down + * before we wake people up. + */ + if (test_bit(QUEUE_FLAG_WC, &rwb->q->queue_flags) && + !atomic_read(rwb->bdp_wait)) + limit = 0; + else + limit = rwb->wb_normal; + + /* + * Don't wake anyone up if we are above the normal limit. 
If + * throttling got disabled (limit == 0) with waiters, ensure + * that we wake them up. + */ + inflight = atomic_dec_return(&rwb->inflight); + if (limit && inflight >= limit) { + if (!rwb->wb_max) + wake_up_all(&rwb->wait); + return; + } + + if (waitqueue_active(&rwb->wait)) { + int diff = limit - inflight; + + if (!inflight || diff >= rwb->wb_background / 2) + wake_up_nr(&rwb->wait, 1); + } +} + +/* + * Called on completion of a request. Note that it's also called when + * a request is merged, when the request gets freed. + */ +void blk_wb_done(struct rq_wb *rwb, struct request *rq) +{ + if (!rwb) + return; + + if (!(rq->cmd_flags & REQ_BUF_INFLIGHT)) { + if (rwb->sync_cookie == rq) { + rwb->sync_issue = 0; + rwb->sync_cookie = NULL; + } + + wb_timestamp(rwb, &rwb->last_comp); + } else { + WARN_ON_ONCE(rq == rwb->sync_cookie); + __blk_wb_done(rwb); + rq->cmd_flags &= ~REQ_BUF_INFLIGHT; + } +} + +static void calc_wb_limits(struct rq_wb *rwb) +{ + unsigned int depth; + + if (!rwb->min_lat_nsec) { + rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0; + return; + } + + depth = min_t(unsigned int, RWB_MAX_DEPTH, blk_queue_depth(rwb->q)); + + /* + * Reduce max depth by 50%, and re-calculate normal/bg based on that + */ + rwb->wb_max = 1 + ((depth - 1) >> min(31U, rwb->scale_step)); + rwb->wb_normal = (rwb->wb_max + 1) / 2; + rwb->wb_background = (rwb->wb_max + 3) / 4; +} + +static bool inline stat_sample_valid(struct blk_rq_stat *stat) +{ + /* + * We need at least one read sample, and a minimum of + * RWB_MIN_WRITE_SAMPLES. We require some write samples to know + * that it's writes impacting us, and not just some sole read on + * a device that is in a lower power state. + */ + return stat[0].nr_samples >= 1 && + stat[1].nr_samples >= RWB_MIN_WRITE_SAMPLES; +} + +static u64 rwb_sync_issue_lat(struct rq_wb *rwb) +{ + u64 now, issue = ACCESS_ONCE(rwb->sync_issue); + + if (!issue || !rwb->sync_cookie) + return 0; + + now = ktime_to_ns(ktime_get()); + return now - issue; +} + +enum { + LAT_OK, + LAT_UNKNOWN, + LAT_EXCEEDED, +}; + +static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) +{ + u64 thislat; + + if (!stat_sample_valid(stat)) + return LAT_UNKNOWN; + + /* + * If the 'min' latency exceeds our target, step down. + */ + if (stat[0].min > rwb->min_lat_nsec) { + trace_block_wb_lat(stat[0].min); + trace_block_wb_stat(stat); + return LAT_EXCEEDED; + } + + /* + * If our stored sync issue exceeds the window size, or it + * exceeds our min target AND we haven't logged any entries, + * flag the latency as exceeded. + */ + thislat = rwb_sync_issue_lat(rwb); + if (thislat > rwb->cur_win_nsec || + (thislat > rwb->min_lat_nsec && !stat[0].nr_samples)) { + trace_block_wb_lat(thislat); + return LAT_EXCEEDED; + } + + if (rwb->scale_step) + trace_block_wb_stat(stat); + + return LAT_OK; +} + +static int latency_exceeded(struct rq_wb *rwb) +{ + struct blk_rq_stat stat[2]; + + blk_queue_stat_get(rwb->q, stat); + + return __latency_exceeded(rwb, stat); +} + +static void rwb_trace_step(struct rq_wb *rwb, const char *msg) +{ + trace_block_wb_step(msg, rwb->scale_step, rwb->cur_win_nsec, + rwb->wb_background, rwb->wb_normal, rwb->wb_max); +} + +static void scale_up(struct rq_wb *rwb) +{ + /* + * If we're at 0, we can't go lower. 
+ */ + if (!rwb->scale_step) + return; + + rwb->scale_step--; + rwb->unknown_cnt = 0; + blk_stat_clear(rwb->q); + calc_wb_limits(rwb); + + if (waitqueue_active(&rwb->wait)) + wake_up_all(&rwb->wait); + + rwb_trace_step(rwb, "step up"); +} + +static void scale_down(struct rq_wb *rwb) +{ + /* + * Stop scaling down when we've hit the limit. This also prevents + * ->scale_step from going to crazy values, if the device can't + * keep up. + */ + if (rwb->wb_max == 1) + return; + + rwb->scale_step++; + rwb->unknown_cnt = 0; + blk_stat_clear(rwb->q); + calc_wb_limits(rwb); + rwb_trace_step(rwb, "step down"); +} + +static void rwb_arm_timer(struct rq_wb *rwb) +{ + unsigned long expires; + + /* + * We should speed this up, using some variant of a fast integer + * inverse square root calculation. Since we only do this for + * every window expiration, it's not a huge deal, though. + */ + rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4, + int_sqrt((rwb->scale_step + 1) << 8)); + expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec); + mod_timer(&rwb->window_timer, expires); +} + +static void blk_wb_timer_fn(unsigned long data) +{ + struct rq_wb *rwb = (struct rq_wb *) data; + int status; + + /* + * If we exceeded the latency target, step down. If we did not, + * step one level up. If we don't know enough to say either exceeded + * or ok, then don't do anything. + */ + status = latency_exceeded(rwb); + switch (status) { + case LAT_EXCEEDED: + scale_down(rwb); + break; + case LAT_OK: + scale_up(rwb); + break; + case LAT_UNKNOWN: + /* + * We had no read samples, start bumping up the write + * depth slowly + */ + if (++rwb->unknown_cnt >= RWB_UNKNOWN_BUMP) + scale_up(rwb); + break; + default: + break; + } + + /* + * Re-arm timer, if we have IO in flight + */ + if (rwb->scale_step || atomic_read(&rwb->inflight)) + rwb_arm_timer(rwb); +} + +void blk_wb_update_limits(struct rq_wb *rwb) +{ + rwb->scale_step = 0; + calc_wb_limits(rwb); + + if (waitqueue_active(&rwb->wait)) + wake_up_all(&rwb->wait); +} + +static bool close_io(struct rq_wb *rwb) +{ + const unsigned long now = jiffies; + + return time_before(now, rwb->last_issue + HZ / 10) || + time_before(now, rwb->last_comp + HZ / 10); +} + +#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO) + +static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) +{ + unsigned int limit; + + /* + * At this point we know it's a buffered write. If REQ_SYNC is + * set, then it's WB_SYNC_ALL writeback, and we'll use the max + * limit for that. If the write is marked as a background write, + * then use the idle limit, or go to normal if we haven't had + * competing IO for a bit. + */ + if ((rw & REQ_HIPRIO) || atomic_read(rwb->bdp_wait)) + limit = rwb->wb_max; + else if ((rw & REQ_BG) || close_io(rwb)) { + /* + * If less than 100ms since we completed unrelated IO, + * limit us to half the depth for background writeback. + */ + limit = rwb->wb_background; + } else + limit = rwb->wb_normal; + + return limit; +} + +static inline bool may_queue(struct rq_wb *rwb, unsigned long rw) +{ + /* + * inc it here even if disabled, since we'll dec it at completion. + * this only happens if the task was sleeping in __blk_wb_wait(), + * and someone turned it off at the same time. + */ + if (!rwb_enabled(rwb)) { + atomic_inc(&rwb->inflight); + return true; + } + + return atomic_inc_below(&rwb->inflight, get_limit(rwb, rw)); +} + +/* + * Block if we will exceed our limit, or if we are currently waiting for + * the timer to kick off queuing again. 
+ */ +static void __blk_wb_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock) +{ + DEFINE_WAIT(wait); + + if (may_queue(rwb, rw)) + return; + + do { + prepare_to_wait_exclusive(&rwb->wait, &wait, + TASK_UNINTERRUPTIBLE); + + if (may_queue(rwb, rw)) + break; + + if (lock) + spin_unlock_irq(lock); + + io_schedule(); + + if (lock) + spin_lock_irq(lock); + } while (1); + + finish_wait(&rwb->wait, &wait); +} + +/* + * Returns true if the IO request should be accounted, false if not. + * May sleep, if we have exceeded the writeback limits. Caller can pass + * in an irq held spinlock, if it holds one when calling this function. + * If we do sleep, we'll release and re-grab it. + */ +bool blk_wb_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock) +{ + /* + * If disabled, or not a WRITE (or a discard), do nothing + */ + if (!rwb_enabled(rwb) || !(bio->bi_rw & REQ_WRITE) || + (bio->bi_rw & REQ_DISCARD)) + goto no_q; + + /* + * Don't throttle WRITE_ODIRECT + */ + if ((bio->bi_rw & (REQ_SYNC | REQ_NOIDLE)) == REQ_SYNC) + goto no_q; + + __blk_wb_wait(rwb, bio->bi_rw, lock); + + if (!timer_pending(&rwb->window_timer)) + rwb_arm_timer(rwb); + + return true; + +no_q: + wb_timestamp(rwb, &rwb->last_issue); + return false; +} + +void blk_wb_issue(struct rq_wb *rwb, struct request *rq) +{ + if (!rwb_enabled(rwb)) + return; + if (!(rq->cmd_flags & REQ_BUF_INFLIGHT) && !rwb->sync_issue) { + rwb->sync_cookie = rq; + rwb->sync_issue = rq->issue_time; + } +} + +void blk_wb_requeue(struct rq_wb *rwb, struct request *rq) +{ + if (!rwb_enabled(rwb)) + return; + if (rq == rwb->sync_cookie) { + rwb->sync_issue = 0; + rwb->sync_cookie = NULL; + } +} + +void blk_wb_init(struct request_queue *q) +{ + struct rq_wb *rwb; + + /* + * If this fails, we don't get throttling + */ + rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); + if (!rwb) + return; + + atomic_set(&rwb->inflight, 0); + init_waitqueue_head(&rwb->wait); + setup_timer(&rwb->window_timer, blk_wb_timer_fn, (unsigned long) rwb); + rwb->last_comp = rwb->last_issue = jiffies; + rwb->bdp_wait = &q->backing_dev_info.wb.dirty_sleeping; + rwb->q = q; + + if (blk_queue_nonrot(q)) + rwb->min_lat_nsec = RWB_NONROT_LAT; + else + rwb->min_lat_nsec = RWB_ROT_LAT; + + rwb->win_nsec = RWB_WINDOW_NSEC; + blk_wb_update_limits(rwb); + q->rq_wb = rwb; +} + +void blk_wb_exit(struct request_queue *q) +{ + struct rq_wb *rwb = q->rq_wb; + + if (rwb) { + del_timer_sync(&rwb->window_timer); + kfree(q->rq_wb); + q->rq_wb = NULL; + } +} diff --git a/block/blk-wb.h b/block/blk-wb.h new file mode 100644 index 00000000000000..222d9ef478d8f3 --- /dev/null +++ b/block/blk-wb.h @@ -0,0 +1,45 @@ +#ifndef BLK_WB_H +#define BLK_WB_H + +#include +#include +#include + +struct rq_wb { + /* + * Settings that govern how we throttle + */ + unsigned int wb_background; /* background writeback */ + unsigned int wb_normal; /* normal writeback */ + unsigned int wb_max; /* max throughput writeback */ + unsigned int scale_step; + + u64 win_nsec; /* default window size */ + u64 cur_win_nsec; /* current window size */ + + unsigned int unknown_cnt; + + struct timer_list window_timer; + + s64 sync_issue; + void *sync_cookie; + + unsigned long last_issue; /* last non-throttled issue */ + unsigned long last_comp; /* last non-throttled comp */ + unsigned long min_lat_nsec; + atomic_t *bdp_wait; + struct request_queue *q; + wait_queue_head_t wait; + atomic_t inflight; +}; + +void __blk_wb_done(struct rq_wb *); +void blk_wb_done(struct rq_wb *, struct request *); +bool blk_wb_wait(struct rq_wb *, struct bio *, 
spinlock_t *); +void blk_wb_init(struct request_queue *); +void blk_wb_exit(struct request_queue *); +void blk_wb_update_limits(struct rq_wb *); +void blk_wb_requeue(struct rq_wb *, struct request *); +void blk_wb_issue(struct rq_wb *, struct request *); + +#endif diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index fa209773d494a4..2ba1494b279997 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2761,7 +2761,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig q->backing_dev_info.congested_data = device; blk_queue_make_request(q, drbd_make_request); - blk_queue_flush(q, REQ_FLUSH | REQ_FUA); + blk_queue_write_cache(q, true, true); /* Setting the max_hw_sectors to an odd value of 8kibyte here This triggers a max_bio_size message upon first attach or connect */ blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 80cf8add46ff36..1fa8cc235977f4 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -943,7 +943,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) - blk_queue_flush(lo->lo_queue, REQ_FLUSH); + blk_queue_write_cache(lo->lo_queue, true, false); loop_update_dio(lo); set_capacity(lo->lo_disk, size); diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 25824c1697c508..c383e90ec69c02 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -4023,12 +4023,6 @@ static int mtip_block_initialize(struct driver_data *dd) blk_queue_io_min(dd->queue, 4096); blk_queue_bounce_limit(dd->queue, dd->pdev->dma_mask); - /* - * write back cache is not supported in the device. FUA depends on - * write back cache support, hence setting flush support to zero. 
- */ - blk_queue_flush(dd->queue, 0); - /* Signal trim support */ if (dd->trim_supp == true) { set_bit(QUEUE_FLAG_DISCARD, &dd->queue->queue_flags); diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 08afbc7a2bb871..31e73a7a40f20e 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -693,9 +693,9 @@ static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev) if (nbd->flags & NBD_FLAG_SEND_TRIM) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); if (nbd->flags & NBD_FLAG_SEND_FLUSH) - blk_queue_flush(nbd->disk->queue, REQ_FLUSH); + blk_queue_write_cache(nbd->disk->queue, true, false); else - blk_queue_flush(nbd->disk->queue, 0); + blk_queue_write_cache(nbd->disk->queue, false, false); } static int nbd_dev_dbg_init(struct nbd_device *nbd); diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c index 1b709a4e3b5ec6..c2854a2bfdb0bd 100644 --- a/drivers/block/osdblk.c +++ b/drivers/block/osdblk.c @@ -437,7 +437,7 @@ static int osdblk_init_disk(struct osdblk_device *osdev) blk_queue_stack_limits(q, osd_request_queue(osdev->osd)); blk_queue_prep_rq(q, blk_queue_start_tag); - blk_queue_flush(q, REQ_FLUSH); + blk_queue_write_cache(q, true, false); disk->queue = q; diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index c120d70d3fb3b3..4b7e405830d7ec 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -468,7 +468,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) blk_queue_dma_alignment(queue, dev->blk_size-1); blk_queue_logical_block_size(queue, dev->blk_size); - blk_queue_flush(queue, REQ_FLUSH); + blk_queue_write_cache(queue, true, false); blk_queue_max_segments(queue, -1); blk_queue_max_segment_size(queue, dev->bounce_size); diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 586f9168ffa482..35276becd24e94 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -4412,7 +4412,7 @@ static int skd_cons_disk(struct skd_device *skdev) disk->queue = q; q->queuedata = skdev; - blk_queue_flush(q, REQ_FLUSH | REQ_FUA); + blk_queue_write_cache(q, true, true); blk_queue_max_segments(q, skdev->sgs_per_request); blk_queue_max_hw_sectors(q, SKD_N_MAX_SECTORS); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 28cff0d23d829e..42758b52768cf8 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -493,11 +493,7 @@ static void virtblk_update_cache_mode(struct virtio_device *vdev) u8 writeback = virtblk_get_cache_mode(vdev); struct virtio_blk *vblk = vdev->priv; - if (writeback) - blk_queue_flush(vblk->disk->queue, REQ_FLUSH); - else - blk_queue_flush(vblk->disk->queue, 0); - + blk_queue_write_cache(vblk->disk->queue, writeback, false); revalidate_disk(vblk->disk); } diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 26aa080e243cb5..3355f1cdd4e5d6 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -477,7 +477,7 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, vbd->type |= VDISK_REMOVABLE; q = bdev_get_queue(bdev); - if (q && q->flush_flags) + if (q && test_bit(QUEUE_FLAG_WC, &q->queue_flags)) vbd->flush_support = true; if (q && blk_queue_secdiscard(q)) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 6405b655779268..ca13df8546396c 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -998,7 +998,8 @@ static const char *flush_info(unsigned int feature_flush) static void 
xlvbd_flush(struct blkfront_info *info) { - blk_queue_flush(info->rq, info->feature_flush); + blk_queue_write_cache(info->rq, info->feature_flush & REQ_FLUSH, + info->feature_flush & REQ_FUA); pr_info("blkfront: %s: %s %s %s %s %s\n", info->gd->disk_name, flush_info(info->feature_flush), "persistent grants:", info->feature_persistent ? diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 37a8a907febeb2..05dbcce70b0e33 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -522,7 +522,7 @@ static int ide_do_setfeature(ide_drive_t *drive, u8 feature, u8 nsect) static void update_flush(ide_drive_t *drive) { u16 *id = drive->id; - unsigned flush = 0; + bool wc = false; if (drive->dev_flags & IDE_DFLAG_WCACHE) { unsigned long long capacity; @@ -546,12 +546,12 @@ static void update_flush(ide_drive_t *drive) drive->name, barrier ? "" : "not "); if (barrier) { - flush = REQ_FLUSH; + wc = true; blk_queue_prep_rq(drive->queue, idedisk_prep_fn); } } - blk_queue_flush(drive->queue, flush); + blk_queue_write_cache(drive->queue, wc, false); } ide_devset_get_flag(wcache, IDE_DFLAG_WCACHE); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a296425a7270df..f5dbb4e884d893 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -816,7 +816,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags); set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags); - blk_queue_flush(q, REQ_FLUSH|REQ_FUA); + blk_queue_write_cache(q, true, true); return 0; } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index f9e8f0bef3323e..626a5ec044667a 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1348,13 +1348,13 @@ static void dm_table_verify_integrity(struct dm_table *t) static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { - unsigned flush = (*(unsigned *)data); + unsigned long flush = (unsigned long) data; struct request_queue *q = bdev_get_queue(dev->bdev); - return q && (q->flush_flags & flush); + return q && (q->queue_flags & flush); } -static bool dm_table_supports_flush(struct dm_table *t, unsigned flush) +static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) { struct dm_target *ti; unsigned i = 0; @@ -1375,7 +1375,7 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush) return true; if (ti->type->iterate_devices && - ti->type->iterate_devices(ti, device_flush_capable, &flush)) + ti->type->iterate_devices(ti, device_flush_capable, (void *) flush)) return true; } @@ -1506,7 +1506,7 @@ static bool dm_table_supports_discards(struct dm_table *t) void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { - unsigned flush = 0; + bool wc = false, fua = false; /* * Copy table's limits to the DM device's request_queue @@ -1518,12 +1518,12 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); - if (dm_table_supports_flush(t, REQ_FLUSH)) { - flush |= REQ_FLUSH; - if (dm_table_supports_flush(t, REQ_FUA)) - flush |= REQ_FUA; + if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { + wc = true; + if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA))) + fua = true; } - blk_queue_flush(q, flush); + blk_queue_write_cache(q, wc, fua); if (!dm_table_discard_zeroes_data(t)) q->limits.discard_zeroes_data = 0; diff --git 
a/drivers/md/md.c b/drivers/md/md.c index 194580fba7fd8b..5d61e76cec343d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5037,7 +5037,7 @@ static int md_alloc(dev_t dev, char *name) disk->fops = &md_fops; disk->private_data = mddev; disk->queue = mddev->queue; - blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA); + blk_queue_write_cache(mddev->queue, true, true); /* Allow extended partitions. This makes the * 'mdp' device redundant, but we can't really * remove it now. diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 9531f5f05b93df..26f14970a858c9 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1188,6 +1188,7 @@ static int r5l_load_log(struct r5l_log *log) int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) { + struct request_queue *q = bdev_get_queue(rdev->bdev); struct r5l_log *log; if (PAGE_SIZE != 4096) @@ -1197,7 +1198,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) return -ENOMEM; log->rdev = rdev; - log->need_cache_flush = (rdev->bdev->bd_disk->queue->flush_flags != 0); + log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0; log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid, sizeof(rdev->mddev->uuid)); diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index 8a0147dfed27d8..8db2bf0a89df4c 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -2271,7 +2271,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, ((card->ext_csd.rel_param & EXT_CSD_WR_REL_PARAM_EN) || card->ext_csd.rel_sectors)) { md->flags |= MMC_BLK_REL_WR; - blk_queue_flush(md->queue.queue, REQ_FLUSH | REQ_FUA); + blk_queue_write_cache(md->queue.queue, true, true); } if (mmc_card_mmc(card) && diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index f4701182b558ec..74ae24364a8db3 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -409,7 +409,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) goto error3; if (tr->flush) - blk_queue_flush(new->rq, REQ_FLUSH); + blk_queue_write_cache(new->rq, true, false); new->rq->queuedata = new; blk_queue_logical_block_size(new->rq, tr->blksize); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 643f457131c24f..c4f9feb0cd5179 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -894,6 +894,8 @@ EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl); static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, struct request_queue *q) { + bool vwc = false; + if (ctrl->max_hw_sectors) { u32 max_segments = (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1; @@ -903,9 +905,10 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, } if (ctrl->stripe_size) blk_queue_chunk_sectors(q, ctrl->stripe_size >> 9); - if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) - blk_queue_flush(q, REQ_FLUSH | REQ_FUA); blk_queue_virt_boundary(q, ctrl->page_size - 1); + if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) + vwc = true; + blk_queue_write_cache(q, vwc, vwc); } /* diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 1deb6adc411f79..75455d4dab68b5 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -621,6 +621,9 @@ int scsi_change_queue_depth(struct scsi_device *sdev, int depth) wmb(); } + if (sdev->request_queue) + blk_set_queue_depth(sdev->request_queue, depth); + return sdev->queue_depth; } EXPORT_SYMBOL(scsi_change_queue_depth); diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index f52b74cf8d1e69..ff3b7e65843ce0 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -137,15 
+137,15 @@ static const char *sd_cache_types[] = { static void sd_set_flush_flag(struct scsi_disk *sdkp) { - unsigned flush = 0; + bool wc = false, fua = false; if (sdkp->WCE) { - flush |= REQ_FLUSH; + wc = true; if (sdkp->DPOFUA) - flush |= REQ_FUA; + fua = true; } - blk_queue_flush(sdkp->disk->queue, flush); + blk_queue_write_cache(sdkp->disk->queue, wc, fua); } static ssize_t diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 026a758e577867..7c4efb4417b0fc 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -687,10 +687,10 @@ iblock_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, * Force writethrough using WRITE_FUA if a volatile write cache * is not enabled, or if initiator set the Force Unit Access bit. */ - if (q->flush_flags & REQ_FUA) { + if (test_bit(QUEUE_FLAG_FUA, &q->queue_flags)) { if (cmd->se_cmd_flags & SCF_FUA) rw = WRITE_FUA; - else if (!(q->flush_flags & REQ_FLUSH)) + else if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) rw = WRITE_FUA; else rw = WRITE; @@ -836,7 +836,7 @@ static bool iblock_get_write_cache(struct se_device *dev) struct block_device *bd = ib_dev->ibd_bd; struct request_queue *q = bdev_get_queue(bd); - return q->flush_flags & REQ_FLUSH; + return test_bit(QUEUE_FLAG_WC, &q->queue_flags); } static const struct target_backend_ops iblock_ops = { diff --git a/fs/block_dev.c b/fs/block_dev.c index 20a2c02b77c452..8662da6aa07cfc 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -432,7 +432,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, struct page *page, struct writeback_control *wbc) { int result; - int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE; + int rw = wbc_to_write_cmd(wbc); const struct block_device_operations *ops = bdev->bd_disk->fops; if (!ops->rw_page || bdev_get_integrity(bdev)) diff --git a/fs/buffer.c b/fs/buffer.c index af0d9a82a8edff..46763c58e786a6 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1697,7 +1697,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, struct buffer_head *bh, *head; unsigned int blocksize, bbits; int nr_underway = 0; - int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + int write_op = wbc_to_write_cmd(wbc); head = create_page_buffers(page, inode, (1 << BH_Dirty)|(1 << BH_Uptodate)); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5dafb9cef12e71..e4e81ce663c577 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1153,7 +1153,7 @@ static int f2fs_write_data_page(struct page *page, struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + .rw = wbc_to_write_cmd(wbc), .page = page, .encrypted_page = NULL, }; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1a33de9d84b16a..3b377258dc0936 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1397,7 +1397,7 @@ static int f2fs_write_node_page(struct page *page, struct f2fs_io_info fio = { .sbi = sbi, .type = NODE, - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + .rw = wbc_to_write_cmd(wbc), .page = page, .encrypted_page = NULL, }; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 0448524c11bcfc..3fdfa3848f18b5 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -37,8 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb { struct buffer_head *bh, *head; int nr_underway = 0; - int write_op = REQ_META | REQ_PRIO | - (wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC : WRITE); + int write_op = REQ_META | REQ_PRIO | wbc_to_write_cmd(wbc); BUG_ON(!PageLocked(page)); BUG_ON(!page_has_buffers(page)); diff --git a/fs/mpage.c b/fs/mpage.c index eedc644b78d783..bcbdb61b24f1ef 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -486,7 +486,6 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; - int wr = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); if (page_has_buffers(page)) { struct buffer_head *head = page_buffers(page); @@ -595,7 +594,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, * This page will go to BIO. Do we need to send this BIO off first? */ if (bio && mpd->last_block_in_bio != blocks[0] - 1) - bio = mpage_bio_submit(wr, bio); + bio = mpage_bio_submit(wbc_to_write_cmd(wbc), bio); alloc_new: if (bio == NULL) { @@ -622,7 +621,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, wbc_account_io(wbc, page, PAGE_SIZE); length = first_unmapped << blkbits; if (bio_add_page(bio, page, length, 0) < length) { - bio = mpage_bio_submit(wr, bio); + bio = mpage_bio_submit(wbc_to_write_cmd(wbc), bio); goto alloc_new; } @@ -632,7 +631,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, set_page_writeback(page); unlock_page(page); if (boundary || (first_unmapped != blocks_per_page)) { - bio = mpage_bio_submit(wr, bio); + bio = mpage_bio_submit(wbc_to_write_cmd(wbc), bio); if (boundary_block) { write_boundary_block(boundary_bdev, boundary_block, 1 << blkbits); @@ -644,7 +643,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, confused: if (bio) - bio = mpage_bio_submit(wr, bio); + bio = mpage_bio_submit(wbc_to_write_cmd(wbc), bio); if (mpd->use_writepage) { ret = mapping->a_ops->writepage(page, wbc); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e49b2406d15d20..e6c721f4153b62 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -393,7 +393,7 @@ xfs_submit_ioend_bio( atomic_inc(&ioend->io_remaining); bio->bi_private = ioend; bio->bi_end_io = xfs_end_bio; - submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC : WRITE, bio); + submit_bio(wbc_to_write_cmd(wbc), bio); } STATIC struct bio * diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 3f103076d0bfdc..1212c374b9288e 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -116,6 +116,8 @@ struct bdi_writeback { struct list_head work_list; struct delayed_work dwork; /* work item used for writeback */ + atomic_t dirty_sleeping; /* waiting on dirty limit exceeded */ + struct list_head bdi_node; /* anchored at bdi->wb_list */ #ifdef CONFIG_CGROUP_WRITEBACK diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 86a38ea1823f33..c41f8a303804df 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -161,6 +161,7 @@ enum rq_flag_bits { __REQ_INTEGRITY, /* I/O includes block integrity payload */ __REQ_FUA, /* forced unit access */ __REQ_FLUSH, /* request for cache flush */ + __REQ_BG, /* background activity */ /* bio only flags */ __REQ_RAHEAD, /* read ahead, can fail anytime */ @@ -188,6 +189,7 @@ enum rq_flag_bits { __REQ_PM, /* runtime pm request */ __REQ_HASHED, /* on IO scheduler merge hash */ __REQ_MQ_INFLIGHT, /* track inflight for MQ */ + __REQ_BUF_INFLIGHT, /* track inflight for buffered */ __REQ_NR_BITS, /* stops here */ }; @@ -208,7 +210,7 @@ enum rq_flag_bits { #define REQ_COMMON_MASK \ (REQ_WRITE | REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | \ REQ_DISCARD | REQ_WRITE_SAME | REQ_NOIDLE | REQ_FLUSH | REQ_FUA | \ - REQ_SECURE | REQ_INTEGRITY) + REQ_SECURE | REQ_INTEGRITY | REQ_BG) #define REQ_CLONE_MASK REQ_COMMON_MASK #define BIO_NO_ADVANCE_ITER_MASK (REQ_DISCARD|REQ_WRITE_SAME) @@ -235,12 +237,14 @@ enum rq_flag_bits { #define REQ_COPY_USER (1ULL << __REQ_COPY_USER) #define REQ_FLUSH (1ULL << __REQ_FLUSH) #define REQ_FLUSH_SEQ (1ULL << __REQ_FLUSH_SEQ) +#define REQ_BG (1ULL << __REQ_BG) #define REQ_IO_STAT (1ULL << __REQ_IO_STAT) #define REQ_MIXED_MERGE (1ULL << __REQ_MIXED_MERGE) #define REQ_SECURE (1ULL << __REQ_SECURE) #define REQ_PM (1ULL << __REQ_PM) #define REQ_HASHED (1ULL << __REQ_HASHED) #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) +#define REQ_BUF_INFLIGHT (1ULL << __REQ_BUF_INFLIGHT) typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U @@ -266,4 +270,12 @@ static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie) return cookie & ((1u << BLK_QC_T_SHIFT) - 1); } +struct blk_rq_stat { + s64 mean; + u64 min; + u64 max; + s64 nr_samples; + s64 time; +}; + #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 669e419d62347e..230c55dc95aea9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -37,6 +37,7 @@ struct bsg_job; struct blkcg_gq; struct blk_flush_queue; struct pr_ops; +struct rq_wb; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ @@ -153,6 +154,7 @@ struct request { struct gendisk *rq_disk; struct hd_struct *part; unsigned long start_time; + s64 issue_time; #ifdef CONFIG_BLK_CGROUP struct request_list *rl; /* rl this rq is alloced from */ unsigned long long start_time_ns; @@ -290,6 +292,8 @@ struct request_queue { int nr_rqs[2]; /* # allocated [a]sync rqs */ int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ + struct rq_wb *rq_wb; + /* * If blkcg is not used, @q->root_rl serves all requests. 
If blkcg * is used, root blkg allocates from @q->root_rl and all other @@ -315,6 +319,8 @@ struct request_queue { struct blk_mq_ctx __percpu *queue_ctx; unsigned int nr_queues; + unsigned int queue_depth; + /* hw dispatch queues */ struct blk_mq_hw_ctx **queue_hw_ctx; unsigned int nr_hw_queues; @@ -400,6 +406,9 @@ struct request_queue { unsigned int nr_sorted; unsigned int in_flight[2]; + + struct blk_rq_stat rq_stats[2]; + /* * Number of active block driver functions for which blk_drain_queue() * must wait. Must be incremented around functions that unlock the @@ -433,8 +442,6 @@ struct request_queue { /* * for flush operations */ - unsigned int flush_flags; - unsigned int flush_not_queueable:1; struct blk_flush_queue *fq; struct list_head requeue_list; @@ -491,6 +498,9 @@ struct request_queue { #define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */ #define QUEUE_FLAG_NO_SG_MERGE 21 /* don't attempt to merge SG segments*/ #define QUEUE_FLAG_POLL 22 /* IO polling enabled if set */ +#define QUEUE_FLAG_WC 23 /* Write back caching */ +#define QUEUE_FLAG_FUA 24 /* device supports FUA writes */ +#define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueable */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ @@ -680,6 +690,14 @@ static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) return false; } +static inline unsigned int blk_queue_depth(struct request_queue *q) +{ + if (q->queue_depth) + return q->queue_depth; + + return q->nr_requests; +} + /* * q->prep_rq_fn return values */ @@ -983,6 +1001,7 @@ extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); extern void blk_queue_io_min(struct request_queue *q, unsigned int min); extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt); +extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_default_limits(struct queue_limits *lim); extern void blk_set_stacking_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, @@ -1007,8 +1026,8 @@ extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *); extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); -extern void blk_queue_flush(struct request_queue *q, unsigned int flush); extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); +extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); @@ -1363,7 +1382,7 @@ static inline unsigned int block_size(struct block_device *bdev) static inline bool queue_flush_queueable(struct request_queue *q) { - return !q->flush_not_queueable; + return !test_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags); } typedef struct {struct page *v;} Sector; diff --git a/include/linux/fs.h b/include/linux/fs.h index 70e61b58baaf66..bb8f951cc619c5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -192,6 +192,9 @@ typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate); * WRITE_FLUSH_FUA Combination of WRITE_FLUSH and FUA.
The IO is preceded * by a cache flush and data is guaranteed to be on * non-volatile media on completion. + * WRITE_BG Background write. This is for background activity like + * the periodic flush and background threshold writeback + * + * */ #define RW_MASK REQ_WRITE @@ -207,6 +210,7 @@ typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate); #define WRITE_FLUSH (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH) #define WRITE_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FUA) #define WRITE_FLUSH_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH | REQ_FUA) +#define WRITE_BG (WRITE | REQ_NOIDLE | REQ_BG) /* * Attribute flags. These should be or-ed together to figure out what diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d0b5ca5d4e0803..6e4a35acaa3e2c 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -100,6 +100,16 @@ struct writeback_control { #endif }; +static inline int wbc_to_write_cmd(struct writeback_control *wbc) +{ + if (wbc->sync_mode == WB_SYNC_ALL) + return WRITE_SYNC; + else if (wbc->for_kupdate || wbc->for_background) + return WRITE_BG; + + return WRITE; +} + /* * A wb_domain represents a domain that wb's (bdi_writeback's) belong to * and are measured against each other in. There always is one global diff --git a/include/trace/events/block.h b/include/trace/events/block.h index e8a5eca1dbe578..4b8b795c12d969 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -667,6 +667,109 @@ TRACE_EVENT(block_rq_remap, (unsigned long long)__entry->old_sector, __entry->nr_bios) ); +/** + * block_wb_stat - trace stats for blk_wb + * @stat: array of read/write stats + */ +TRACE_EVENT(block_wb_stat, + + TP_PROTO(struct blk_rq_stat *stat), + + TP_ARGS(stat), + + TP_STRUCT__entry( + __field( s64, rmean ) + __field( u64, rmin ) + __field( u64, rmax ) + __field( s64, rnr_samples ) + __field( s64, rtime ) + __field( s64, wmean ) + __field( u64, wmin ) + __field( u64, wmax ) + __field( s64, wnr_samples ) + __field( s64, wtime ) + ), + + TP_fast_assign( + __entry->rmean = stat[0].mean; + __entry->rmin = stat[0].min; + __entry->rmax = stat[0].max; + __entry->rnr_samples = stat[0].nr_samples; + __entry->rtime = stat[0].time; + __entry->wmean = stat[1].mean; + __entry->wmin = stat[1].min; + __entry->wmax = stat[1].max; + __entry->wnr_samples = stat[1].nr_samples; + __entry->wtime = stat[1].time; + ), + + TP_printk("rmean=%llu, rmin=%llu, rmax=%llu, rsamples=%llu, " + "wmean=%llu, wmin=%llu, wmax=%llu, wsamples=%llu", + __entry->rmean, __entry->rmin, __entry->rmax, + __entry->rnr_samples, __entry->wmean, __entry->wmin, + __entry->wmax, __entry->wnr_samples) +); + +/** + * block_wb_lat - trace latency event + * @lat: latency trigger + */ +TRACE_EVENT(block_wb_lat, + + TP_PROTO(unsigned long lat), + + TP_ARGS(lat), + + TP_STRUCT__entry( + __field( unsigned long, lat ) + ), + + TP_fast_assign( + __entry->lat = lat; + ), + + TP_printk("latency %llu", (unsigned long long) __entry->lat) +); + +/** + * block_wb_step - trace wb event step + * @msg: context message + * @step: the current scale step count + * @window: the current monitoring window + * @bg: the current background queue limit + * @normal: the current normal writeback limit + * @max: the current max throughput writeback limit + */ +TRACE_EVENT(block_wb_step, + + TP_PROTO(const char *msg, unsigned int step, unsigned long window, + unsigned int bg, unsigned int normal, unsigned int max), + + TP_ARGS(msg, step, window, bg, normal, max), + + TP_STRUCT__entry( + __field( const char *, msg ) + __field( unsigned int, step ) + __field( unsigned
long, window ) + __field( unsigned int, bg ) + __field( unsigned int, normal ) + __field( unsigned int, max ) + ), + + TP_fast_assign( + __entry->msg = msg; + __entry->step = step; + __entry->window = window; + __entry->bg = bg; + __entry->normal = normal; + __entry->max = max; + ), + + TP_printk("%s: step=%u, window=%lu, background=%u, normal=%u, max=%u", + __entry->msg, __entry->step, __entry->window, __entry->bg, + __entry->normal, __entry->max) +); + #endif /* _TRACE_BLOCK_H */ /* This part must be outside protection */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0c6317b7db38a0..41db7dff11d027 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -310,6 +310,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, spin_lock_init(&wb->work_lock); INIT_LIST_HEAD(&wb->work_list); INIT_DELAYED_WORK(&wb->dwork, wb_workfn); + atomic_set(&wb->dirty_sleeping, 0); wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp); if (!wb->congested) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 999792d35ccc0f..028a3d4d712976 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1746,7 +1746,9 @@ static void balance_dirty_pages(struct address_space *mapping, pause, start_time); __set_current_state(TASK_KILLABLE); + atomic_inc(&wb->dirty_sleeping); io_schedule_timeout(pause); + atomic_dec(&wb->dirty_sleeping); current->dirty_paused_when = now + pause; current->nr_dirtied = 0;
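
The driver-side conversions above reduce to one pattern: instead of packing REQ_FLUSH/REQ_FUA into the removed flush_flags, a driver now describes its cache behaviour with blk_queue_write_cache() and, if it knows the real hardware limit, reports it with blk_set_queue_depth() so blk_queue_depth() no longer falls back to q->nr_requests. The fragment below is a minimal sketch of that setup for a hypothetical driver; the mydev_* name and the capability arguments are illustrative only and not part of this series.

#include <linux/blkdev.h>

/*
 * Hypothetical probe-time helper: advertise write cache behaviour and
 * the device's real queue depth using the interfaces added above.
 */
static void mydev_setup_queue(struct request_queue *q, bool volatile_cache,
			      bool supports_fua, unsigned int hw_depth)
{
	/*
	 * Replaces the old blk_queue_flush(q, REQ_FLUSH | REQ_FUA):
	 * QUEUE_FLAG_WC is set for a volatile write cache, and
	 * QUEUE_FLAG_FUA only when such a cache is present, mirroring
	 * the sd.c conversion.
	 */
	blk_queue_write_cache(q, volatile_cache, volatile_cache && supports_fua);

	/*
	 * Optional: record the real device depth so blk_queue_depth()
	 * returns it instead of q->nr_requests.
	 */
	blk_set_queue_depth(q, hw_depth);
}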