Commit 29d1280
Merge patch series "enable bs > ps for block devices"
Luis Chamberlain <mcgrof@kernel.org> says:

This v3 series addresses the feedback from the v2 series [0]. The only patch which was modified was the patch titled "fs/mpage: use blocks_per_folio instead of blocks_per_page".

The motivation for this series is mainly to start supporting block devices with logical block sizes larger than 4k; we do this by addressing the buffer-head support required for the block device cache. In the future these changes can be leveraged to also start experimenting with LBS support for filesystems which support only buffer-heads. This paves the way for that work.

It is perhaps surprising to some, but since this also lifts the block device cache sector size support to 64k, devices which support sector sizes of up to 64k can leverage this to enable filesystems created with larger sector sizes, up to 64k. The filesystem sector size is used or documented somewhat obscurely except by a few filesystems, but in short it ensures that the filesystem itself will not generate writes smaller than the specified sector size. In practice this means you can constrain metadata writes as well to a minimum size, and so be completely deterministic with regard to the specified sector size for minimum IO writes. For example, since XFS supports sector sizes of up to 32k, these changes enable filesystems to be created on x86_64 with both the filesystem block size and the sector size set to 32k, now that the block device cache limitation is lifted.

Since this touches buffer-heads, I've run this through fstests on ext4 and found no new regressions. I've also used blktests against a kernel built with these changes to test block devices with logical block sizes larger than 4k on x86_64. All changes needed to test block devices with a logical block size > 4k are now merged upstream in blktests. I've tested the block layer with blktests on block devices with logical block sizes up to 64k, which is the max we currently support, and found no new regressions.

Detailed changes in this series:

- Modifies the commit log for "fs/buffer: remove batching from async read" as per Willy's request and collects his SOB.
- Collects Reviewed-by tags.
- The patch titled "fs/mpage: use blocks_per_folio instead of blocks_per_page" received more love to account for Willy's point that we should keep the nr_pages accounting on mpage in order. It does this by using folio_nr_pages() on the args passed and adjusting the last_block accounting accordingly.
- Through code inspection, fixed folio_zero_segment() to use folio_size() as we move to support large folios for unmapped folio segments in do_mpage_readpage(). This is dealt with in the patch titled "fs/mpage: use blocks_per_folio instead of blocks_per_page", as that's when large folios enter the picture.
[0] https://lkml.kernel.org/r/20250204231209.429356-1-mcgrof@kernel.org

* patches from https://lore.kernel.org/r/20250221223823.1680616-1-mcgrof@kernel.org:
  bdev: use bdev_io_min() for statx block size
  block/bdev: lift block size restrictions to 64k
  block/bdev: enable large folio support for large logical block sizes
  fs/buffer fs/mpage: remove large folio restriction
  fs/mpage: use blocks_per_folio instead of blocks_per_page
  fs/mpage: avoid negative shift for large blocksize
  fs/buffer: remove batching from async read
  fs/buffer: simplify block_read_full_folio() with bh_offset()

Link: https://lore.kernel.org/r/20250221223823.1680616-1-mcgrof@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
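A minimal userspace sketch of the statx-visible effect of "bdev: use bdev_io_min() for statx block size": after this series, stx_blksize on a block device node reports the device's minimum I/O size. Assumes a glibc that exposes the statx() wrapper; the device path is whatever LBS device is under test.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
        struct statx stx;

        if (argc < 2) {
                fprintf(stderr, "usage: %s <block-device>\n", argv[0]);
                return 1;
        }
        /* stx_blksize is part of the basic stats statx() always fills */
        if (statx(AT_FDCWD, argv[1], 0, STATX_BASIC_STATS, &stx) != 0) {
                perror("statx");
                return 1;
        }
        printf("%s: stx_blksize = %u\n", argv[1], stx.stx_blksize);
        return 0;
}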
2 parents 2014c95 + 425fbcd commit 29d1280

4 files changed: +59 -67 lines changed

block/bdev.c

Lines changed: 6 additions & 5 deletions

@@ -148,6 +148,8 @@ static void set_init_blocksize(struct block_device *bdev)
                 bsize <<= 1;
         }
         BD_INODE(bdev)->i_blkbits = blksize_bits(bsize);
+        mapping_set_folio_min_order(BD_INODE(bdev)->i_mapping,
+                                    get_order(bsize));
 }

 int set_blocksize(struct file *file, int size)
@@ -169,6 +171,7 @@ int set_blocksize(struct file *file, int size)
         if (inode->i_blkbits != blksize_bits(size)) {
                 sync_blockdev(bdev);
                 inode->i_blkbits = blksize_bits(size);
+                mapping_set_folio_min_order(inode->i_mapping, get_order(size));
                 kill_bdev(bdev);
         }
         return 0;
@@ -180,8 +183,7 @@ int sb_set_blocksize(struct super_block *sb, int size)
 {
         if (set_blocksize(sb->s_bdev_file, size))
                 return 0;
-        /* If we get here, we know size is power of two
-         * and it's value is between 512 and PAGE_SIZE */
+        /* If we get here, we know size is validated */
         sb->s_blocksize = size;
         sb->s_blocksize_bits = blksize_bits(size);
         return sb->s_blocksize;
@@ -1274,9 +1276,6 @@ void bdev_statx(struct path *path, struct kstat *stat,
         struct inode *backing_inode;
         struct block_device *bdev;

-        if (!(request_mask & (STATX_DIOALIGN | STATX_WRITE_ATOMIC)))
-                return;
-
         backing_inode = d_backing_inode(path->dentry);

         /*
@@ -1303,6 +1302,8 @@ void bdev_statx(struct path *path, struct kstat *stat,
                         queue_atomic_write_unit_max_bytes(bd_queue));
         }

+        stat->blksize = bdev_io_min(bdev);
+
         blkdev_put_no_open(bdev);
 }
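The two mapping_set_folio_min_order() hunks above establish a single invariant: the bdev page cache never allocates a folio smaller than one logical block. A standalone sketch of the size-to-order mapping, with order_for() as a hypothetical stand-in for the kernel's get_order(), and PAGE_SIZE assumed to be 4k:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* smallest order such that (PAGE_SIZE << order) >= size, as get_order() does */
static unsigned int order_for(unsigned long size)
{
        unsigned int order = 0;

        while ((PAGE_SIZE << order) < size)
                order++;
        return order;
}

int main(void)
{
        unsigned long sizes[] = { 512, 4096, 16384, 65536 };

        for (int i = 0; i < 4; i++)
                printf("bsize %6lu -> min folio order %u\n",
                       sizes[i], order_for(sizes[i]));
        return 0;
}

A 64k logical block size thus forces order-4 folios (on 4k pages), which is why the block size cap and the min folio order are lifted together.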

fs/buffer.c

Lines changed: 23 additions & 35 deletions

@@ -2361,9 +2361,8 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
 {
         struct inode *inode = folio->mapping->host;
         sector_t iblock, lblock;
-        struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+        struct buffer_head *bh, *head, *prev = NULL;
         size_t blocksize;
-        int nr, i;
         int fully_mapped = 1;
         bool page_error = false;
         loff_t limit = i_size_read(inode);
@@ -2372,16 +2371,12 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
         if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
                 limit = inode->i_sb->s_maxbytes;

-        VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
-
         head = folio_create_buffers(folio, inode, 0);
         blocksize = head->b_size;

         iblock = div_u64(folio_pos(folio), blocksize);
         lblock = div_u64(limit + blocksize - 1, blocksize);
         bh = head;
-        nr = 0;
-        i = 0;

         do {
                 if (buffer_uptodate(bh))
@@ -2398,7 +2393,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
                                 page_error = true;
                         }
                         if (!buffer_mapped(bh)) {
-                                folio_zero_range(folio, i * blocksize,
+                                folio_zero_range(folio, bh_offset(bh),
                                                 blocksize);
                                 if (!err)
                                         set_buffer_uptodate(bh);
@@ -2411,40 +2406,33 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
                         if (buffer_uptodate(bh))
                                 continue;
                 }
-                arr[nr++] = bh;
-        } while (i++, iblock++, (bh = bh->b_this_page) != head);
-
-        if (fully_mapped)
-                folio_set_mappedtodisk(folio);
-
-        if (!nr) {
-                /*
-                 * All buffers are uptodate or get_block() returned an
-                 * error when trying to map them - we can finish the read.
-                 */
-                folio_end_read(folio, !page_error);
-                return 0;
-        }

-        /* Stage two: lock the buffers */
-        for (i = 0; i < nr; i++) {
-                bh = arr[i];
                 lock_buffer(bh);
+                if (buffer_uptodate(bh)) {
+                        unlock_buffer(bh);
+                        continue;
+                }
+
                 mark_buffer_async_read(bh);
-        }
+                if (prev)
+                        submit_bh(REQ_OP_READ, prev);
+                prev = bh;
+        } while (iblock++, (bh = bh->b_this_page) != head);
+
+        if (fully_mapped)
+                folio_set_mappedtodisk(folio);

         /*
-         * Stage 3: start the IO.  Check for uptodateness
-         * inside the buffer lock in case another process reading
-         * the underlying blockdev brought it uptodate (the sct fix).
+         * All buffers are uptodate or get_block() returned an error
+         * when trying to map them - we must finish the read because
+         * end_buffer_async_read() will never be called on any buffer
+         * in this folio.
          */
-        for (i = 0; i < nr; i++) {
-                bh = arr[i];
-                if (buffer_uptodate(bh))
-                        end_buffer_async_read(bh, 1);
-                else
-                        submit_bh(REQ_OP_READ, bh);
-        }
+        if (prev)
+                submit_bh(REQ_OP_READ, prev);
+        else
+                folio_end_read(folio, !page_error);
+
         return 0;
 }
 EXPORT_SYMBOL(block_read_full_folio);
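The rewritten loop replaces the three-stage array walk with a submit-the-previous, hold-the-current pattern: the last submission is deferred until every buffer has been marked async, so an early I/O completion cannot declare the folio read finished prematurely. A standalone sketch of that pattern, with a hypothetical item/submit() pair standing in for buffer_head/submit_bh():

#include <stdio.h>

struct item {
        int id;
        struct item *next;      /* circular, like bh->b_this_page */
};

static void submit(struct item *it)
{
        printf("submit item %d\n", it->id);
}

static void submit_all_hold_last(struct item *head)
{
        struct item *it = head, *prev = NULL;

        do {
                /* ...mark 'it' async here... */
                if (prev)
                        submit(prev);   /* safe: 'it' is still outstanding */
                prev = it;
        } while ((it = it->next) != head);

        if (prev)
                submit(prev);           /* last submission may complete the set */
}

int main(void)
{
        struct item a = { 1 }, b = { 2 }, c = { 3 };

        a.next = &b; b.next = &c; c.next = &a;  /* circular buffer ring */
        submit_all_hold_last(&a);
        return 0;
}

This also removes the arr[MAX_BUF_PER_PAGE] array, which could not scale once a folio may hold more blocks than a single page.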

fs/mpage.c

Lines changed: 23 additions & 26 deletions

@@ -107,7 +107,7 @@ static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh,
          * don't make any buffers if there is only one buffer on
          * the folio and the folio just needs to be set up to date
          */
-        if (inode->i_blkbits == PAGE_SHIFT &&
+        if (inode->i_blkbits == folio_shift(folio) &&
             buffer_uptodate(bh)) {
                 folio_mark_uptodate(folio);
                 return;
@@ -153,15 +153,15 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
         struct folio *folio = args->folio;
         struct inode *inode = folio->mapping->host;
         const unsigned blkbits = inode->i_blkbits;
-        const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
+        const unsigned blocks_per_folio = folio_size(folio) >> blkbits;
         const unsigned blocksize = 1 << blkbits;
         struct buffer_head *map_bh = &args->map_bh;
         sector_t block_in_file;
         sector_t last_block;
         sector_t last_block_in_file;
         sector_t first_block;
         unsigned page_block;
-        unsigned first_hole = blocks_per_page;
+        unsigned first_hole = blocks_per_folio;
         struct block_device *bdev = NULL;
         int length;
         int fully_mapped = 1;
@@ -170,9 +170,6 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
         unsigned relative_block;
         gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);

-        /* MAX_BUF_PER_PAGE, for example */
-        VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
-
         if (args->is_readahead) {
                 opf |= REQ_RAHEAD;
                 gfp |= __GFP_NORETRY | __GFP_NOWARN;
@@ -181,8 +178,8 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
         if (folio_buffers(folio))
                 goto confused;

-        block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
-        last_block = block_in_file + args->nr_pages * blocks_per_page;
+        block_in_file = folio_pos(folio) >> blkbits;
+        last_block = block_in_file + ((args->nr_pages * PAGE_SIZE) >> blkbits);
         last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
         if (last_block > last_block_in_file)
                 last_block = last_block_in_file;
@@ -204,7 +201,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
                                 clear_buffer_mapped(map_bh);
                                 break;
                         }
-                        if (page_block == blocks_per_page)
+                        if (page_block == blocks_per_folio)
                                 break;
                         page_block++;
                         block_in_file++;
@@ -216,7 +213,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
          * Then do more get_blocks calls until we are done with this folio.
          */
         map_bh->b_folio = folio;
-        while (page_block < blocks_per_page) {
+        while (page_block < blocks_per_folio) {
                 map_bh->b_state = 0;
                 map_bh->b_size = 0;

@@ -229,7 +226,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)

                 if (!buffer_mapped(map_bh)) {
                         fully_mapped = 0;
-                        if (first_hole == blocks_per_page)
+                        if (first_hole == blocks_per_folio)
                                 first_hole = page_block;
                         page_block++;
                         block_in_file++;
@@ -247,7 +244,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
                         goto confused;
                 }

-                if (first_hole != blocks_per_page)
+                if (first_hole != blocks_per_folio)
                         goto confused;  /* hole -> non-hole */

                 /* Contiguous blocks? */
@@ -260,16 +257,16 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
                         if (relative_block == nblocks) {
                                 clear_buffer_mapped(map_bh);
                                 break;
-                        } else if (page_block == blocks_per_page)
+                        } else if (page_block == blocks_per_folio)
                                 break;
                         page_block++;
                         block_in_file++;
                 }
                 bdev = map_bh->b_bdev;
         }

-        if (first_hole != blocks_per_page) {
-                folio_zero_segment(folio, first_hole << blkbits, PAGE_SIZE);
+        if (first_hole != blocks_per_folio) {
+                folio_zero_segment(folio, first_hole << blkbits, folio_size(folio));
                 if (first_hole == 0) {
                         folio_mark_uptodate(folio);
                         folio_unlock(folio);
@@ -303,10 +300,10 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
                 relative_block = block_in_file - args->first_logical_block;
                 nblocks = map_bh->b_size >> blkbits;
                 if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
-                    (first_hole != blocks_per_page))
+                    (first_hole != blocks_per_folio))
                         args->bio = mpage_bio_submit_read(args->bio);
                 else
-                        args->last_block_in_bio = first_block + blocks_per_page - 1;
+                        args->last_block_in_bio = first_block + blocks_per_folio - 1;
 out:
         return args->bio;

@@ -385,7 +382,7 @@ int mpage_read_folio(struct folio *folio, get_block_t get_block)
 {
         struct mpage_readpage_args args = {
                 .folio = folio,
-                .nr_pages = 1,
+                .nr_pages = folio_nr_pages(folio),
                 .get_block = get_block,
         };

@@ -456,12 +453,12 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
         struct address_space *mapping = folio->mapping;
         struct inode *inode = mapping->host;
         const unsigned blkbits = inode->i_blkbits;
-        const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
+        const unsigned blocks_per_folio = folio_size(folio) >> blkbits;
         sector_t last_block;
         sector_t block_in_file;
         sector_t first_block;
         unsigned page_block;
-        unsigned first_unmapped = blocks_per_page;
+        unsigned first_unmapped = blocks_per_folio;
         struct block_device *bdev = NULL;
         int boundary = 0;
         sector_t boundary_block = 0;
@@ -486,12 +483,12 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
                          */
                         if (buffer_dirty(bh))
                                 goto confused;
-                        if (first_unmapped == blocks_per_page)
+                        if (first_unmapped == blocks_per_folio)
                                 first_unmapped = page_block;
                         continue;
                 }

-                if (first_unmapped != blocks_per_page)
+                if (first_unmapped != blocks_per_folio)
                         goto confused;  /* hole -> non-hole */

                 if (!buffer_dirty(bh) || !buffer_uptodate(bh))
@@ -527,7 +524,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
          * The page has no buffers: map it to disk
          */
         BUG_ON(!folio_test_uptodate(folio));
-        block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
+        block_in_file = folio_pos(folio) >> blkbits;
         /*
          * Whole page beyond EOF? Skip allocating blocks to avoid leaking
          * space.
@@ -536,7 +533,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
                 goto page_is_mapped;
         last_block = (i_size - 1) >> blkbits;
         map_bh.b_folio = folio;
-        for (page_block = 0; page_block < blocks_per_page; ) {
+        for (page_block = 0; page_block < blocks_per_folio; ) {

                 map_bh.b_state = 0;
                 map_bh.b_size = 1 << blkbits;
@@ -618,14 +615,14 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
         BUG_ON(folio_test_writeback(folio));
         folio_start_writeback(folio);
         folio_unlock(folio);
-        if (boundary || (first_unmapped != blocks_per_page)) {
+        if (boundary || (first_unmapped != blocks_per_folio)) {
                 bio = mpage_bio_submit_write(bio);
                 if (boundary_block) {
                         write_boundary_block(boundary_bdev,
                                         boundary_block, 1 << blkbits);
                 }
         } else {
-                mpd->last_block_in_bio = first_block + blocks_per_page - 1;
+                mpd->last_block_in_bio = first_block + blocks_per_folio - 1;
         }
         goto out;
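The conversion is largely mechanical: each per-page constant becomes a per-folio value. A standalone sketch of the two replaced computations (hypothetical helpers, not kernel API):

#include <stdio.h>

/* replaces: (sector_t)folio->index << (PAGE_SHIFT - blkbits) */
static unsigned long long first_block(unsigned long long folio_pos_bytes,
                                      unsigned int blkbits)
{
        return folio_pos_bytes >> blkbits;
}

/* replaces: PAGE_SIZE >> blkbits */
static unsigned int blocks_per_folio(unsigned long folio_bytes,
                                     unsigned int blkbits)
{
        return folio_bytes >> blkbits;
}

int main(void)
{
        /* a 64KiB folio at byte offset 1MiB, 16KiB blocks (blkbits = 14) */
        printf("first block:  %llu\n", first_block(1ULL << 20, 14));    /* 64 */
        printf("blocks/folio: %u\n", blocks_per_folio(1UL << 16, 14));  /*  4 */
        return 0;
}

Shifting folio_pos() also sidesteps the negative shift that (PAGE_SHIFT - blkbits) would produce once blkbits exceeds PAGE_SHIFT, which is what "fs/mpage: avoid negative shift for large blocksize" addresses.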

include/linux/blkdev.h

Lines changed: 7 additions & 1 deletion

@@ -267,10 +267,16 @@ static inline dev_t disk_devt(struct gendisk *disk)
         return MKDEV(disk->major, disk->first_minor);
 }

+/*
+ * We should strive for 1 << (PAGE_SHIFT + MAX_PAGECACHE_ORDER)
+ * however we constrain this to what we can validate and test.
+ */
+#define BLK_MAX_BLOCK_SIZE      SZ_64K
+
 /* blk_validate_limits() validates bsize, so drivers don't usually need to */
 static inline int blk_validate_block_size(unsigned long bsize)
 {
-        if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize))
+        if (bsize < 512 || bsize > BLK_MAX_BLOCK_SIZE || !is_power_of_2(bsize))
                 return -EINVAL;

         return 0;
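What the relaxed check accepts, as a standalone demo; the validation logic is reproduced from the hunk above, with SZ_64K and is_power_of_2() inlined so it runs in userspace:

#include <stdio.h>

#define BLK_MAX_BLOCK_SIZE (64 * 1024)  /* SZ_64K */

static int is_power_of_2(unsigned long n)
{
        return n != 0 && (n & (n - 1)) == 0;
}

static int blk_validate_block_size(unsigned long bsize)
{
        if (bsize < 512 || bsize > BLK_MAX_BLOCK_SIZE || !is_power_of_2(bsize))
                return -22;     /* -EINVAL */
        return 0;
}

int main(void)
{
        unsigned long sizes[] = { 256, 512, 3000, 4096, 32768, 65536, 131072 };

        for (int i = 0; i < 7; i++)
                printf("%6lu -> %s\n", sizes[i],
                       blk_validate_block_size(sizes[i]) ? "EINVAL" : "ok");
        return 0;
}

Powers of two from 512 through 65536 now validate; anything larger, smaller, or non-power-of-two still fails, exactly as before except for the raised cap.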
