From 5614293e320823c8107513d6ad55a34339f736d9 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 7 Nov 2025 19:54:21 +0100 Subject: [PATCH 001/161] kbuild: create .prelink.mod files for modpost When modpost processes .prelink.o files during module linking, it looks for corresponding .mod files that list the object files comprising each module. However, the build system only creates .mod files for the original module names, not for the .prelink.o filenames. This causes modpost to fail with "No such file or directory" errors when trying to open the .prelink.mod files, breaking module compilation for any module using the prelink extension. Fix this by adding a Makefile rule that automatically creates .prelink.mod files by copying the original .mod files, and add these files as dependencies to the modpost rule. Fixes: modpost failures during module compilation with .prelink.o files Signed-off-by: Bernd Schubert --- scripts/Makefile.modpost | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index eef56d629799a0..b4ddbab0daedeb 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -130,7 +130,14 @@ endif quiet_cmd_modpost = MODPOST $@ cmd_modpost = sed 's/\.ko$$/$(mod-prelink-ext)\.o/' $< | $(MODPOST) -T - -$(output-symdump): $(MODORDER) $(input-symdump) $(modules:.ko=$(mod-prelink-ext).o) FORCE +# Create .prelink.mod files for modpost to find +quiet_cmd_prelink_mod = GEN $@ + cmd_prelink_mod = cp $(patsubst %.prelink.mod,%.mod,$@) $@ + +$(modules:.ko=$(mod-prelink-ext).mod): %$(mod-prelink-ext).mod: %.mod + $(call if_changed,prelink_mod) + +$(output-symdump): $(MODORDER) $(input-symdump) $(modules:.ko=$(mod-prelink-ext).o) $(modules:.ko=$(mod-prelink-ext).mod) FORCE $(call if_changed,modpost) targets += $(output-symdump) From 150947d064cfcf6ba4bd071dfeb4357e8876f86f Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Mon, 15 Apr 2024 19:04:21 -0600 Subject: [PATCH 002/161] fuse: remove reliance on bdi congestion commit 670d21c6e17f67535fcf16e14c772209220da9ae Author: NeilBrown Date: Tue Mar 22 14:38:58 2022 -0700 fuse: remove reliance on bdi congestion The bdi congestion tracking in not widely used and will be removed. Fuse is one of a small number of filesystems that uses it, setting both the sync (read) and async (write) congestion flags at what it determines are appropriate times. The only remaining effect of the sync flag is to cause read-ahead to be skipped. The only remaining effect of the async flag is to cause (some) WB_SYNC_NONE writes to be skipped. So instead of setting the flags, change: - .readahead to stop when it has submitted all non-async pages for read. - .writepages to do nothing if WB_SYNC_NONE and the flag would be set - .writepage to return AOP_WRITEPAGE_ACTIVATE if WB_SYNC_NONE and the flag would be set. The writepages change causes a behavioural change in that pageout() can now return PAGE_ACTIVATE instead of PAGE_KEEP, so SetPageActive() will be called on the page which (I think) will further delay the next attempt at writeout. This might be a good thing. Link: https://lkml.kernel.org/r/164549983737.9187.2627117501000365074.stgit@noble.brown Signed-off-by: NeilBrown Cc: Anna Schumaker Cc: Chao Yu Cc: Darrick J. Wong Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jan Kara Cc: Jeff Layton Cc: Jens Axboe Cc: Lars Ellenberg Cc: Miklos Szeredi Cc: Paolo Valente Cc: Philipp Reisner Cc: Ryusuke Konishi Cc: Trond Myklebust Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds JIRA: https://issues.redhat.com/browse/RHEL-5619 Signed-off-by: Nico Pache (cherry picked from commit cab0fc3481597dc4ce5a7f15662f2d77faefcb79) --- fs/fuse/control.c | 17 ----------------- fs/fuse/dev.c | 8 -------- fs/fuse/file.c | 17 +++++++++++++++++ 3 files changed, 17 insertions(+), 25 deletions(-) diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 000d2e5627e99c..7cede9a3bc9623 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -164,7 +164,6 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, { unsigned val; struct fuse_conn *fc; - struct fuse_mount *fm; ssize_t ret; ret = fuse_conn_limit_write(file, buf, count, ppos, &val, @@ -178,22 +177,6 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, down_read(&fc->killsb); spin_lock(&fc->bg_lock); fc->congestion_threshold = val; - - /* - * Get any fuse_mount belonging to this fuse_conn; s_bdi is - * shared between all of them - */ - - if (!list_empty(&fc->mounts)) { - fm = list_first_entry(&fc->mounts, struct fuse_mount, fc_entry); - if (fc->num_background < fc->congestion_threshold) { - clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); - clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); - } else { - set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); - set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); - } - } spin_unlock(&fc->bg_lock); up_read(&fc->killsb); fuse_conn_put(fc); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b9e79f0f66b3d9..3569b670fea9b5 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -315,10 +315,6 @@ void fuse_request_end(struct fuse_req *req) wake_up(&fc->blocked_waitq); } - if (fc->num_background == fc->congestion_threshold && fm->sb) { - clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); - clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); - } fc->num_background--; fc->active_background--; flush_bg_queue(fc); @@ -542,10 +538,6 @@ static bool fuse_request_queue_background(struct fuse_req *req) fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; - if (fc->num_background == fc->congestion_threshold && fm->sb) { - set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); - set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); - } list_add_tail(&req->list, &fc->bg_queue); flush_bg_queue(fc); queued = true; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ceb50463e4bb24..08ef9b2bd3e17a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -978,6 +978,14 @@ static void fuse_readahead(struct readahead_control *rac) struct fuse_io_args *ia; struct fuse_args_pages *ap; + if (fc->num_background >= fc->congestion_threshold && + rac->ra->async_size >= readahead_count(rac)) + /* + * Congested and only async pages left, so skip the + * rest. + */ + break; + nr_pages = readahead_count(rac) - nr_pages; if (nr_pages > max_pages) nr_pages = max_pages; @@ -2002,6 +2010,7 @@ static int fuse_writepage_locked(struct page *page) static int fuse_writepage(struct page *page, struct writeback_control *wbc) { + struct fuse_conn *fc = get_fuse_conn(page->mapping->host); int err; if (fuse_page_is_writeback(page->mapping->host, page->index)) { @@ -2017,6 +2026,10 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc) return 0; } + if (wbc->sync_mode == WB_SYNC_NONE && + fc->num_background >= fc->congestion_threshold) + return AOP_WRITEPAGE_ACTIVATE; + err = fuse_writepage_locked(page); unlock_page(page); @@ -2270,6 +2283,10 @@ static int fuse_writepages(struct address_space *mapping, if (fuse_is_bad(inode)) goto out; + if (wbc->sync_mode == WB_SYNC_NONE && + fc->num_background >= fc->congestion_threshold) + return 0; + data.inode = inode; data.wpa = NULL; data.ff = NULL; From dddde6ee90d6f8a7799b39cc9c756e2b41d939e2 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Tue, 16 Apr 2024 17:45:20 -0600 Subject: [PATCH 003/161] fuse: remove unneeded lock which protecting update of congestion_threshold commit efc4105a4cf9e300b8e9150147415fa235059293 Author: Kemeng Shi Date: Sat Oct 7 23:39:56 2023 +0800 fuse: remove unneeded lock which protecting update of congestion_threshold Commit 670d21c6e17f6 ("fuse: remove reliance on bdi congestion") change how congestion_threshold is used and lock in fuse_conn_congestion_threshold_write is not needed anymore. 1. Access to supe_block is removed along with removing of bdi congestion. Then down_read(&fc->killsb) which protecting access to super_block is no needed. 2. Compare num_background and congestion_threshold without holding bg_lock. Then there is no need to hold bg_lock to update congestion_threshold. Signed-off-by: Kemeng Shi Signed-off-by: Miklos Szeredi JIRA: https://issues.redhat.com/browse/RHEL-5619 Signed-off-by: Nico Pache (cherry picked from commit 168fb63082f782582230f1101ebdab249d05e67f) --- fs/fuse/control.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 7cede9a3bc9623..4632e7e84ba61c 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -174,11 +174,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, if (!fc) goto out; - down_read(&fc->killsb); - spin_lock(&fc->bg_lock); - fc->congestion_threshold = val; - spin_unlock(&fc->bg_lock); - up_read(&fc->killsb); + WRITE_ONCE(fc->congestion_threshold, val); fuse_conn_put(fc); out: return ret; From 30e5a69a2d411d21efb1817a5e928946de06c24f Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Wed, 11 Dec 2024 18:06:00 -0500 Subject: [PATCH 004/161] fuse: Initialize beyond-EOF page contents before setting uptodate jira LE-2157 cve CVE-2024-44947 Rebuild_History Non-Buildable kernel-5.14.0-503.14.1.el9_5 commit-author Jann Horn commit 3c0da3d163eb32f1f91891efaade027fa9b245b9 fuse_notify_store(), unlike fuse_do_readpage(), does not enable page zeroing (because it can be used to change partial page contents). So fuse_notify_store() must be more careful to fully initialize page contents (including parts of the page that are beyond end-of-file) before marking the page uptodate. The current code can leave beyond-EOF page contents uninitialized, which makes these uninitialized page contents visible to userspace via mmap(). This is an information leak, but only affects systems which do not enable init-on-alloc (via CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y or the corresponding kernel command line parameter). Link: https://bugs.chromium.org/p/project-zero/issues/detail?id=2574 Cc: stable@kernel.org Fixes: a1d75f258230 ("fuse: add store request") Signed-off-by: Jann Horn Signed-off-by: Linus Torvalds (cherry picked from commit 3c0da3d163eb32f1f91891efaade027fa9b245b9) Signed-off-by: Jonathan Maple (cherry picked from commit 04ad3640e1a699d576c9a9d34a00dc8cfade8cae) --- fs/fuse/dev.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 3569b670fea9b5..c014a59f27e61f 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1612,9 +1612,11 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, this_num = min_t(unsigned, num, PAGE_SIZE - offset); err = fuse_copy_page(cs, &page, offset, this_num, 0); - if (!err && offset == 0 && - (this_num == PAGE_SIZE || file_size == end)) + if (!PageUptodate(page) && !err && offset == 0 && + (this_num == PAGE_SIZE || file_size == end)) { + zero_user_segment(page, this_num, PAGE_SIZE); SetPageUptodate(page); + } unlock_page(page); put_page(page); From 460d6faf795486c9bd4d06bcc00ed94a6bcbbe44 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 1 Jun 2023 16:59:02 +0200 Subject: [PATCH 005/161] fuse: update ki_pos in fuse_perform_write JIRA: https://issues.redhat.com/browse/RHEL-29564 commit 70e986c3b4f43ae7096be6de9a39c947f182e0c0 Author: Christoph Hellwig Date: Thu Jun 1 16:59:02 2023 +0200 fuse: update ki_pos in fuse_perform_write Both callers of fuse_perform_write need to updated ki_pos, move it into common code. Link: https://lkml.kernel.org/r/20230601145904.1385409-11-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Cc: Al Viro Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Chao Yu Cc: Christian Brauner Cc: "Darrick J. Wong" Cc: Hannes Reinecke Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Miklos Szeredi Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Xiubo Li Signed-off-by: Andrew Morton Signed-off-by: Ming Lei (cherry picked from commit 466935797247dbcd55a2e757d376b031621320d6) --- fs/fuse/file.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 08ef9b2bd3e17a..4b4ae41057e9cb 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1290,7 +1290,10 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); fuse_invalidate_attr(inode); - return res > 0 ? res : err; + if (!res) + return err; + iocb->ki_pos += res; + return res; } static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) @@ -1302,7 +1305,6 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = mapping->host; ssize_t err; struct fuse_conn *fc = get_fuse_conn(inode); - loff_t endbyte = 0; if (fc->writeback_cache) { /* Update size (EOF optimization) and mode (SUID clearing) */ @@ -1337,19 +1339,20 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; if (iocb->ki_flags & IOCB_DIRECT) { - loff_t pos = iocb->ki_pos; + loff_t pos, endbyte; + written = generic_file_direct_write(iocb, from); if (written < 0 || !iov_iter_count(from)) goto out; - pos += written; - - written_buffered = fuse_perform_write(iocb, mapping, from, pos); + written_buffered = fuse_perform_write(iocb, mapping, from, + iocb->ki_pos); if (written_buffered < 0) { err = written_buffered; goto out; } - endbyte = pos + written_buffered - 1; + pos = iocb->ki_pos - written_buffered; + endbyte = iocb->ki_pos - 1; err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); @@ -1361,11 +1364,8 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) endbyte >> PAGE_SHIFT); written += written_buffered; - iocb->ki_pos = pos + written_buffered; } else { written = fuse_perform_write(iocb, mapping, from, iocb->ki_pos); - if (written >= 0) - iocb->ki_pos += written; } out: current->backing_dev_info = NULL; From 00950fd864f0be2946d11dbc1104928b1ca37e41 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 1 Jun 2023 16:59:03 +0200 Subject: [PATCH 006/161] fuse: drop redundant arguments to fuse_perform_write JIRA: https://issues.redhat.com/browse/RHEL-29564 commit 596df33d673d9d816b60b95088e59a76d845c254 Author: Christoph Hellwig Date: Thu Jun 1 16:59:03 2023 +0200 fuse: drop redundant arguments to fuse_perform_write pos is always equal to iocb->ki_pos, and mapping is always equal to iocb->ki_filp->f_mapping. Link: https://lkml.kernel.org/r/20230601145904.1385409-12-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Acked-by: Miklos Szeredi Cc: Al Viro Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Chao Yu Cc: Christian Brauner Cc: "Darrick J. Wong" Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Xiubo Li Signed-off-by: Andrew Morton Signed-off-by: Ming Lei (cherry picked from commit 69700d9e97f6d33c15aa0841e7bc111905f4342c) --- fs/fuse/file.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4b4ae41057e9cb..2febab42e73883 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1238,13 +1238,13 @@ static inline unsigned int fuse_wr_pages(loff_t pos, size_t len, max_pages); } -static ssize_t fuse_perform_write(struct kiocb *iocb, - struct address_space *mapping, - struct iov_iter *ii, loff_t pos) +static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) { + struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); + loff_t pos = iocb->ki_pos; int err = 0; ssize_t res = 0; @@ -1345,8 +1345,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) if (written < 0 || !iov_iter_count(from)) goto out; - written_buffered = fuse_perform_write(iocb, mapping, from, - iocb->ki_pos); + written_buffered = fuse_perform_write(iocb, from); if (written_buffered < 0) { err = written_buffered; goto out; @@ -1365,7 +1364,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) written += written_buffered; } else { - written = fuse_perform_write(iocb, mapping, from, iocb->ki_pos); + written = fuse_perform_write(iocb, from); } out: current->backing_dev_info = NULL; From ed05787b32da9c2e149151c583213abfd462fdd2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 21 Apr 2021 17:18:39 +0200 Subject: [PATCH 007/161] fuse: Convert to using invalidate_lock Use invalidate_lock instead of fuse's private i_mmap_sem. The intended purpose is exactly the same. By this conversion we fix a long standing race between hole punching and read(2) / readahead(2) paths that can lead to stale page cache contents. CC: Miklos Szeredi Reviewed-by: Miklos Szeredi Signed-off-by: Jan Kara (cherry picked from commit 8bcbbe9c7c8e49281fc2e0a6c5455b87c85a9c2a) (cherry picked from commit 23d6d8374bb9b51fd89b13802f59fb5c72bd9a98) --- fs/fuse/dax.c | 50 +++++++++++++++++++++++------------------------- fs/fuse/dir.c | 11 ++++++----- fs/fuse/file.c | 10 +++++----- fs/fuse/fuse_i.h | 7 ------- fs/fuse/inode.c | 1 - 5 files changed, 35 insertions(+), 44 deletions(-) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 578484c3bdc27e..7d29b8817f1926 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -444,12 +444,12 @@ static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos, /* * Can't do inline reclaim in fault path. We call * dax_layout_busy_page() before we free a range. And - * fuse_wait_dax_page() drops fi->i_mmap_sem lock and requires it. - * In fault path we enter with fi->i_mmap_sem held and can't drop - * it. Also in fault path we hold fi->i_mmap_sem shared and not - * exclusive, so that creates further issues with fuse_wait_dax_page(). - * Hence return -EAGAIN and fuse_dax_fault() will wait for a memory - * range to become free and retry. + * fuse_wait_dax_page() drops mapping->invalidate_lock and requires it. + * In fault path we enter with mapping->invalidate_lock held and can't + * drop it. Also in fault path we hold mapping->invalidate_lock shared + * and not exclusive, so that creates further issues with + * fuse_wait_dax_page(). Hence return -EAGAIN and fuse_dax_fault() + * will wait for a memory range to become free and retry. */ if (flags & IOMAP_FAULT) { alloc_dmap = alloc_dax_mapping(fcd); @@ -513,7 +513,7 @@ static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos, down_write(&fi->dax->sem); node = interval_tree_iter_first(&fi->dax->tree, idx, idx); - /* We are holding either inode lock or i_mmap_sem, and that should + /* We are holding either inode lock or invalidate_lock, and that should * ensure that dmap can't be truncated. We are holding a reference * on dmap and that should make sure it can't be reclaimed. So dmap * should still be there in tree despite the fact we dropped and @@ -660,14 +660,12 @@ static const struct iomap_ops fuse_iomap_ops = { static void fuse_wait_dax_page(struct inode *inode) { - struct fuse_inode *fi = get_fuse_inode(inode); - - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); schedule(); - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); } -/* Should be called with fi->i_mmap_sem lock held exclusively */ +/* Should be called with mapping->invalidate_lock held exclusively */ static int __fuse_dax_break_layouts(struct inode *inode, bool *retry, loff_t start, loff_t end) { @@ -813,18 +811,18 @@ static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, * we do not want any read/write/mmap to make progress and try * to populate page cache or access memory we are trying to free. */ - down_read(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_lock_shared(inode->i_mapping); ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops); if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) { error = 0; retry = true; - up_read(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(inode->i_mapping); goto retry; } if (ret & VM_FAULT_NEEDDSYNC) ret = dax_finish_sync_fault(vmf, pe_size, pfn); - up_read(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(inode->i_mapping); if (write) sb_end_pagefault(sb); @@ -960,7 +958,7 @@ inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode, int ret; struct interval_tree_node *node; - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); /* Lookup a dmap and corresponding file offset to reclaim. */ down_read(&fi->dax->sem); @@ -1021,7 +1019,7 @@ inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode, out_write_dmap_sem: up_write(&fi->dax->sem); out_mmap_sem: - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); return dmap; } @@ -1050,10 +1048,10 @@ alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode) * had a reference or some other temporary failure, * Try again. We want to give up inline reclaim only * if there is no range assigned to this node. Otherwise - * if a deadlock is possible if we sleep with fi->i_mmap_sem - * held and worker to free memory can't make progress due - * to unavailability of fi->i_mmap_sem lock. So sleep - * only if fi->dax->nr=0 + * if a deadlock is possible if we sleep with + * mapping->invalidate_lock held and worker to free memory + * can't make progress due to unavailability of + * mapping->invalidate_lock. So sleep only if fi->dax->nr=0 */ if (retry) continue; @@ -1061,8 +1059,8 @@ alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode) * There are no mappings which can be reclaimed. Wait for one. * We are not holding fi->dax->sem. So it is possible * that range gets added now. But as we are not holding - * fi->i_mmap_sem, worker should still be able to free up - * a range and wake us up. + * mapping->invalidate_lock, worker should still be able to + * free up a range and wake us up. */ if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) { if (wait_event_killable_exclusive(fcd->range_waitq, @@ -1108,7 +1106,7 @@ static int lookup_and_reclaim_dmap_locked(struct fuse_conn_dax *fcd, /* * Free a range of memory. * Locking: - * 1. Take fi->i_mmap_sem to block dax faults. + * 1. Take mapping->invalidate_lock to block dax faults. * 2. Take fi->dax->sem to protect interval tree and also to make sure * read/write can not reuse a dmap which we might be freeing. */ @@ -1122,7 +1120,7 @@ static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd, loff_t dmap_start = start_idx << FUSE_DAX_SHIFT; loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1; - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end); if (ret) { pr_debug("virtio_fs: fuse_dax_break_layouts() failed. err=%d\n", @@ -1134,7 +1132,7 @@ static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd, ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx); up_write(&fi->dax->sem); out_mmap_sem: - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); return ret; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index a61d339056856a..ee021ec097002a 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1722,6 +1722,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_conn *fc = fm->fc; struct fuse_inode *fi = get_fuse_inode(inode); + struct address_space *mapping = inode->i_mapping; FUSE_ARGS(args); struct fuse_setattr_in inarg; struct fuse_attr_out outarg; @@ -1746,11 +1747,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, } if (FUSE_IS_DAX(inode) && is_truncate) { - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(mapping); fault_blocked = true; err = fuse_dax_break_layouts(inode, 0, 0); if (err) { - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(mapping); return err; } } @@ -1860,13 +1861,13 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if ((is_truncate || !is_wb) && S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { truncate_pagecache(inode, outarg.attr.size); - invalidate_inode_pages2(inode->i_mapping); + invalidate_inode_pages2(mapping); } clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); out: if (fault_blocked) - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(mapping); return 0; @@ -1877,7 +1878,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); if (fault_blocked) - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(mapping); return err; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 2febab42e73883..f3b125afc72037 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -240,7 +240,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) inode_lock(inode); if (dax_truncate) { - down_write(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); err = fuse_dax_break_layouts(inode, 0, 0); if (err) goto out_inode_unlock; @@ -264,7 +264,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) invalidate_inode_pages2(inode->i_mapping); } if (dax_truncate) - up_write(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); out_inode_unlock: if (is_wb_truncate || dax_truncate) inode_unlock(inode); @@ -3001,7 +3001,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, inode_lock(inode); if (block_faults) { - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); err = fuse_dax_break_layouts(inode, 0, 0); if (err) goto out; @@ -3060,7 +3060,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); if (block_faults) - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); inode_unlock(inode); @@ -3128,7 +3128,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, * modifications. Yet this does give less guarantees than if the * copying was performed with write(2). * - * To fix this a i_mmap_sem style lock could be used to prevent new + * To fix this a mapping->invalidate_lock could be used to prevent new * faults while the copy is ongoing. */ err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 9de4011c2f4d5b..492ec6754675d5 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -149,13 +149,6 @@ struct fuse_inode { /** Lock to protect write related fields */ spinlock_t lock; - /** - * Can't take inode lock in fault path (leads to circular dependency). - * Introduce another semaphore which can be taken in fault path and - * then other filesystem paths can take this to block faults. - */ - struct rw_semaphore i_mmap_sem; - #ifdef CONFIG_FUSE_DAX /* * Dax specific inode data diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 157c443d4c51b7..530b9bfe717448 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -85,7 +85,6 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi->orig_ino = 0; fi->state = 0; mutex_init(&fi->mutex); - init_rwsem(&fi->i_mmap_sem); spin_lock_init(&fi->lock); fi->forget = fuse_alloc_forget(); if (!fi->forget) From 06ad2f7110db18f9bb25d7082603ccc90b5e3787 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 4 Aug 2021 13:22:58 +0200 Subject: [PATCH 008/161] fuse: fix use after free in fuse_read_interrupt() There is a potential race between fuse_read_interrupt() and fuse_request_end(). TASK1 in fuse_read_interrupt(): delete req->intr_entry (while holding fiq->lock) TASK2 in fuse_request_end(): req->intr_entry is empty -> skip fiq->lock wake up TASK3 TASK3 request is freed TASK1 in fuse_read_interrupt(): dereference req->in.h.unique ***BAM*** Fix by always grabbing fiq->lock if the request was ever interrupted (FR_INTERRUPTED set) thereby serializing with concurrent fuse_read_interrupt() calls. FR_INTERRUPTED is set before the request is queued on fiq->interrupts. Dequeing the request is done with list_del_init() but FR_INTERRUPTED is not cleared in this case. Reported-by: lijiazi Signed-off-by: Miklos Szeredi (cherry picked from commit e1e71c168813564be0f6ea3d6740a059ca42d177) (cherry picked from commit fefb72bc7fbb62eef6c09bf4a3dcce9d3418a15e) --- fs/fuse/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c014a59f27e61f..d6651895185098 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -288,10 +288,10 @@ void fuse_request_end(struct fuse_req *req) /* * test_and_set_bit() implies smp_mb() between bit - * changing and below intr_entry check. Pairs with + * changing and below FR_INTERRUPTED check. Pairs with * smp_mb() from queue_interrupt(). */ - if (!list_empty(&req->intr_entry)) { + if (test_bit(FR_INTERRUPTED, &req->flags)) { spin_lock(&fiq->lock); list_del_init(&req->intr_entry); spin_unlock(&fiq->lock); From d9eb09386ff5db2715de16ce3d29b1be63174102 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 31 Aug 2021 14:18:08 +0200 Subject: [PATCH 009/161] fuse: flush extending writes Callers of fuse_writeback_range() assume that the file is ready for modification by the server in the supplied byte range after the call returns. If there's a write that extends the file beyond the end of the supplied range, then the file needs to be extended to at least the end of the range, but currently that's not done. There are at least two cases where this can cause problems: - copy_file_range() will return short count if the file is not extended up to end of the source range. - FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE will not extend the file, hence the region may not be fully allocated. Fix by flushing writes from the start of the range up to the end of the file. This could be optimized if the writes are non-extending, etc, but it's probably not worth the trouble. Fixes: a2bc92362941 ("fuse: fix copy_file_range() in the writeback case") Fixes: 6b1bdb56b17c ("fuse: allow fallocate(FALLOC_FL_ZERO_RANGE)") Cc: # v5.2 Signed-off-by: Miklos Szeredi (cherry picked from commit 59bda8ecee2ffc6a602b7bf2b9e43ca669cdbdcd) (cherry picked from commit 00445ea2d0e051a6fa92e1a60b7a9abf387e6218) --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f3b125afc72037..419fcdefba25e3 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2965,7 +2965,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end) { - int err = filemap_write_and_wait_range(inode->i_mapping, start, end); + int err = filemap_write_and_wait_range(inode->i_mapping, start, -1); if (!err) fuse_sync_writes(inode); From 0f48fbc9be1ab02e84a1689a7147da5e950574ee Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 1 Sep 2021 12:39:02 +0200 Subject: [PATCH 010/161] fuse: remove unused arg in fuse_write_file_get() The struct fuse_conn argument is not used and can be removed. Signed-off-by: Miklos Szeredi (cherry picked from commit a9667ac88e2b20f6426e09945e9dbf555fb86ff0) (cherry picked from commit 6e78684218b748ef3e3dd6fc53e3f90e0ea62352) --- fs/fuse/file.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 419fcdefba25e3..b8fa891cd7a071 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1871,8 +1871,7 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, fuse_writepage_free(wpa); } -static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, - struct fuse_inode *fi) +static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi) { struct fuse_file *ff = NULL; @@ -1887,22 +1886,20 @@ static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, return ff; } -static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, - struct fuse_inode *fi) +static struct fuse_file *fuse_write_file_get(struct fuse_inode *fi) { - struct fuse_file *ff = __fuse_write_file_get(fc, fi); + struct fuse_file *ff = __fuse_write_file_get(fi); WARN_ON(!ff); return ff; } int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) { - struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_file *ff; int err; - ff = __fuse_write_file_get(fc, fi); + ff = __fuse_write_file_get(fi); err = fuse_flush_times(inode, ff); if (ff) fuse_file_put(ff, false, false); @@ -1966,7 +1963,7 @@ static int fuse_writepage_locked(struct page *page) goto err_free; error = -EIO; - wpa->ia.ff = fuse_write_file_get(fc, fi); + wpa->ia.ff = fuse_write_file_get(fi); if (!wpa->ia.ff) goto err_nofile; @@ -2191,7 +2188,7 @@ static int fuse_writepages_fill(struct page *page, if (!data->ff) { err = -EIO; - data->ff = fuse_write_file_get(fc, fi); + data->ff = fuse_write_file_get(fi); if (!data->ff) goto out_unlock; } From 97b1a0970fce469ce8e313920c6f4378e5f96ad7 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 22 Oct 2021 17:03:01 +0200 Subject: [PATCH 011/161] fuse: make sure reclaim doesn't write the inode In writeback cache mode mtime/ctime updates are cached, and flushed to the server using the ->write_inode() callback. Closing the file will result in a dirty inode being immediately written, but in other cases the inode can remain dirty after all references are dropped. This result in the inode being written back from reclaim, which can deadlock on a regular allocation while the request is being served. The usual mechanisms (GFP_NOFS/PF_MEMALLOC*) don't work for FUSE, because serving a request involves unrelated userspace process(es). Instead do the same as for dirty pages: make sure the inode is written before the last reference is gone. - fallocate(2)/copy_file_range(2): these call file_update_time() or file_modified(), so flush the inode before returning from the call - unlink(2), link(2) and rename(2): these call fuse_update_ctime(), so flush the ctime directly from this helper Reported-by: chenguanyou Signed-off-by: Miklos Szeredi (cherry picked from commit 5c791fe1e2a4f401f819065ea4fc0450849f1818) (cherry picked from commit 46a668dfd3560bffd24ffe763b29d25e0a1b351a) --- fs/fuse/dir.c | 8 ++++++++ fs/fuse/file.c | 15 +++++++++++++++ fs/fuse/fuse_i.h | 1 + fs/fuse/inode.c | 3 +++ 4 files changed, 27 insertions(+) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index ee021ec097002a..b4fb994bac14f5 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -902,11 +902,19 @@ static int fuse_symlink(struct user_namespace *mnt_userns, struct inode *dir, return create_new_entry(fm, &args, dir, entry, S_IFLNK); } +void fuse_flush_time_update(struct inode *inode) +{ + int err = sync_inode_metadata(inode, 1); + + mapping_set_error(inode->i_mapping, err); +} + void fuse_update_ctime(struct inode *inode) { if (!IS_NOCMTIME(inode)) { inode->i_ctime = current_time(inode); mark_inode_dirty_sync(inode); + fuse_flush_time_update(inode); } } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index b8fa891cd7a071..2924cae3d72a3e 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1899,6 +1899,17 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) struct fuse_file *ff; int err; + /* + * Inode is always written before the last reference is dropped and + * hence this should not be reached from reclaim. + * + * Writing back the inode from reclaim can deadlock if the request + * processing itself needs an allocation. Allocations triggering + * reclaim while serving a request can't be prevented, because it can + * involve any number of unrelated userspace processes. + */ + WARN_ON(wbc->for_reclaim); + ff = __fuse_write_file_get(fi); err = fuse_flush_times(inode, ff); if (ff) @@ -3061,6 +3072,8 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, inode_unlock(inode); + fuse_flush_time_update(inode); + return err; } @@ -3170,6 +3183,8 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, inode_unlock(inode_out); file_accessed(file_in); + fuse_flush_time_update(inode_out); + return err; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 492ec6754675d5..a997042801b46f 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1157,6 +1157,7 @@ int fuse_allow_current_process(struct fuse_conn *fc); u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); +void fuse_flush_time_update(struct inode *inode); void fuse_update_ctime(struct inode *inode); int fuse_update_attributes(struct inode *inode, struct file *file); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 530b9bfe717448..48466267e55ba5 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -118,6 +118,9 @@ static void fuse_evict_inode(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); + /* Will write inode on close/munmap and in all other dirtiers */ + WARN_ON(inode->i_state & I_DIRTY_INODE); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (inode->i_sb->s_flags & SB_ACTIVE) { From 06f6410eac334720c75f4f836f5f04c27e665f18 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 22 Oct 2021 17:03:01 +0200 Subject: [PATCH 012/161] fuse: write inode in fuse_vma_close() instead of fuse_release() Fuse ->release() is otherwise asynchronous for the reason that it can happen in contexts unrelated to close/munmap. Inode is already written back from fuse_flush(). Add it to fuse_vma_close() as well to make sure inode dirtying from mmaps also get written out before the file is released. Also add error handling. Signed-off-by: Miklos Szeredi (cherry picked from commit 36ea23374d1f7b6a9d96a2b61d38830fdf23e45d) (cherry picked from commit f451e1b25a2f6abe4478f233d1d045432a15c0a0) --- fs/fuse/file.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 2924cae3d72a3e..9e4f4c3a947bc4 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -343,12 +343,6 @@ static int fuse_open(struct inode *inode, struct file *file) static int fuse_release(struct inode *inode, struct file *file) { - struct fuse_conn *fc = get_fuse_conn(inode); - - /* see fuse_vma_close() for !writeback_cache case */ - if (fc->writeback_cache) - write_inode_now(inode, 1); - fuse_release_common(file, false); /* return value is ignored by VFS */ @@ -2410,12 +2404,15 @@ static int fuse_launder_folio(struct folio *folio) } /* - * Write back dirty pages now, because there may not be any suitable - * open files later + * Write back dirty data/metadata now (there may not be any suitable + * open files later for data) */ static void fuse_vma_close(struct vm_area_struct *vma) { - filemap_write_and_wait(vma->vm_file->f_mapping); + int err; + + err = write_inode_now(vma->vm_file->f_mapping->host, 1); + mapping_set_error(vma->vm_file->f_mapping, err); } /* From 2040539c1ea889dab237ccd187bdbe968ddcda99 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 22 Oct 2021 17:03:01 +0200 Subject: [PATCH 013/161] fuse: annotate lock in fuse_reverse_inval_entry() Add missing inode lock annotatation; found by syzbot. Reported-and-tested-by: syzbot+9f747458f5990eaa8d43@syzkaller.appspotmail.com Signed-off-by: Miklos Szeredi (cherry picked from commit bda9a71980e083699a0360963c0135657b73f47a) (cherry picked from commit 66cf13d649ac6a9ec96444618fad36bdf541b378) --- fs/fuse/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b4fb994bac14f5..3b5a6b05b4afbb 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1243,7 +1243,7 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, if (!parent) return -ENOENT; - inode_lock(parent); + inode_lock_nested(parent, I_MUTEX_PARENT); if (!S_ISDIR(parent->i_mode)) goto unlock; From e4589223402e2d2785be3c553c4c5dff2f54bcf1 Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Wed, 8 Sep 2021 16:38:28 +0800 Subject: [PATCH 014/161] fuse: use kmap_local_page() Due to the introduction of kmap_local_*, the storage of slots used for short-term mapping has changed from per-CPU to per-thread. kmap_atomic() disable preemption, while kmap_local_*() only disable migration. There is no need to disable preemption in several kamp_atomic places used in fuse. Link: https://lwn.net/Articles/836144/ Signed-off-by: Peng Hao Signed-off-by: Miklos Szeredi (cherry picked from commit 5fe0fc9f1de63de748b87f121c038d39192a271d) (cherry picked from commit 90f0b8ab332ec29c60ecb1ad76291c63dd16dc09) --- fs/fuse/dev.c | 8 ++++---- fs/fuse/ioctl.c | 4 ++-- fs/fuse/readdir.c | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index d6651895185098..97e19b0a27b9ca 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -749,7 +749,7 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) { unsigned ncpy = min(*size, cs->len); if (val) { - void *pgaddr = kmap_atomic(cs->pg); + void *pgaddr = kmap_local_page(cs->pg); void *buf = pgaddr + cs->offset; if (cs->write) @@ -757,7 +757,7 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) else memcpy(*val, buf, ncpy); - kunmap_atomic(pgaddr); + kunmap_local(pgaddr); *val += ncpy; } *size -= ncpy; @@ -954,10 +954,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, } } if (page) { - void *mapaddr = kmap_atomic(page); + void *mapaddr = kmap_local_page(page); void *buf = mapaddr + offset; offset += fuse_copy_do(cs, &buf, &count); - kunmap_atomic(mapaddr); + kunmap_local(mapaddr); } else offset += fuse_copy_do(cs, NULL, &count); } diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index d181e094060173..cfa0ecf844297f 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -304,11 +304,11 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) goto out; - vaddr = kmap_atomic(ap.pages[0]); + vaddr = kmap_local_page(ap.pages[0]); err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr, transferred, in_iovs + out_iovs, (flags & FUSE_IOCTL_COMPAT) != 0); - kunmap_atomic(vaddr); + kunmap_local(vaddr); if (err) goto out; diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index d5294e663df501..c40ec53f425381 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -76,13 +76,13 @@ static void fuse_add_dirent_to_cache(struct file *file, WARN_ON(fi->rdc.pos != pos)) goto unlock; - addr = kmap_atomic(page); + addr = kmap_local_page(page); if (!offset) { clear_page(addr); SetPageUptodate(page); } memcpy(addr + offset, dirent, reclen); - kunmap_atomic(addr); + kunmap_local(addr); fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen; fi->rdc.pos = dirent->off; unlock: From a61d7306039df65d2c04457b254d97d59a5e1867 Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Mon, 6 Sep 2021 20:31:48 +0800 Subject: [PATCH 015/161] fuse: delete redundant code 'ia->io=io' has been set in fuse_io_alloc. Signed-off-by: Peng Hao Signed-off-by: Miklos Szeredi (cherry picked from commit b5d9758297858288f1d8cd9b24a4e2f899f169e0) (cherry picked from commit 051005294ea44eda0155be25f22ca598d5bbbb99) --- fs/fuse/file.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 9e4f4c3a947bc4..e7e75d102d868d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1460,7 +1460,6 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (!ia) return -ENOMEM; - ia->io = io; if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { if (!write) inode_lock(inode); From 5074a3da4ab413950623552b51b00f3cb938d1e1 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 22 Oct 2021 17:03:01 +0200 Subject: [PATCH 016/161] fuse: move fuse_invalidate_attr() into fuse_update_ctime() Logically it belongs there since attributes are invalidated due to the updated ctime. This is a cleanup and should not change behavior. Signed-off-by: Miklos Szeredi (cherry picked from commit 371e8fd02969383204b1f6023451125dbc20dfbd) (cherry picked from commit dba6ae45c94262c57d8ae652c660e400522a61e7) --- fs/fuse/dir.c | 9 ++------- fs/fuse/xattr.c | 10 ++++------ 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 3b5a6b05b4afbb..53756181dc5097 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -911,6 +911,7 @@ void fuse_flush_time_update(struct inode *inode) void fuse_update_ctime(struct inode *inode) { + fuse_invalidate_attr(inode); if (!IS_NOCMTIME(inode)) { inode->i_ctime = current_time(inode); mark_inode_dirty_sync(inode); @@ -948,7 +949,6 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) if (inode->i_nlink > 0) drop_nlink(inode); spin_unlock(&fi->lock); - fuse_invalidate_attr(inode); fuse_dir_changed(dir); fuse_invalidate_entry_cache(entry); fuse_update_ctime(inode); @@ -1005,13 +1005,10 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, err = fuse_simple_request(fm, &args); if (!err) { /* ctime changes */ - fuse_invalidate_attr(d_inode(oldent)); fuse_update_ctime(d_inode(oldent)); - if (flags & RENAME_EXCHANGE) { - fuse_invalidate_attr(d_inode(newent)); + if (flags & RENAME_EXCHANGE) fuse_update_ctime(d_inode(newent)); - } fuse_dir_changed(olddir); if (olddir != newdir) @@ -1019,7 +1016,6 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, /* newent will end up negative */ if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) { - fuse_invalidate_attr(d_inode(newent)); fuse_invalidate_entry_cache(newent); fuse_update_ctime(d_inode(newent)); } @@ -1102,7 +1098,6 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, if (likely(inode->i_nlink < UINT_MAX)) inc_nlink(inode); spin_unlock(&fi->lock); - fuse_invalidate_attr(inode); fuse_update_ctime(inode); } else if (err == -EINTR) { fuse_invalidate_attr(inode); diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 61dfaf7b7d20c0..0d3e7177fce0a7 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -42,10 +42,9 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value, fm->fc->no_setxattr = 1; err = -EOPNOTSUPP; } - if (!err) { - fuse_invalidate_attr(inode); + if (!err) fuse_update_ctime(inode); - } + return err; } @@ -173,10 +172,9 @@ int fuse_removexattr(struct inode *inode, const char *name) fm->fc->no_removexattr = 1; err = -EOPNOTSUPP; } - if (!err) { - fuse_invalidate_attr(inode); + if (!err) fuse_update_ctime(inode); - } + return err; } From b9b6ab06ad8f9f097d69daec9efab327a816882e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 22 Oct 2021 17:03:02 +0200 Subject: [PATCH 017/161] fuse: simplify __fuse_write_file_get() Use list_first_entry_or_null() instead of list_empty() + list_entry(). Signed-off-by: Miklos Szeredi (cherry picked from commit 84840efc3c0f225ee5597e0a013e9da03afc73c6) (cherry picked from commit 5d488846297ce8c9ebbc6d66477121568f5d371f) --- fs/fuse/file.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e7e75d102d868d..04e7e2a1f42f91 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1866,14 +1866,13 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi) { - struct fuse_file *ff = NULL; + struct fuse_file *ff; spin_lock(&fi->lock); - if (!list_empty(&fi->write_files)) { - ff = list_entry(fi->write_files.next, struct fuse_file, - write_entry); + ff = list_first_entry_or_null(&fi->write_files, struct fuse_file, + write_entry); + if (ff) fuse_file_get(ff); - } spin_unlock(&fi->lock); return ff; From 2fc0ccf0d883ed69e7ef300b678aea714b3189ec Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 22 Oct 2021 17:03:02 +0200 Subject: [PATCH 018/161] fuse: decrement nlink on overwriting rename Rename didn't decrement/clear nlink on overwritten target inode. Create a common helper fuse_entry_unlinked() that handles this for unlink, rmdir and rename. Signed-off-by: Miklos Szeredi (cherry picked from commit cefd1b83275d4c587bdeb2fe7aed07908642f875) (cherry picked from commit 02342953222f1989fcb8c4e96db89131cf34b2fd) --- fs/fuse/dir.c | 49 +++++++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 53756181dc5097..918cbcf8987072 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -919,6 +919,29 @@ void fuse_update_ctime(struct inode *inode) } } +static void fuse_entry_unlinked(struct dentry *entry) +{ + struct inode *inode = d_inode(entry); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + /* + * If i_nlink == 0 then unlink doesn't make sense, yet this can + * happen if userspace filesystem is careless. It would be + * difficult to enforce correct nlink usage so just ignore this + * condition here + */ + if (S_ISDIR(inode->i_mode)) + clear_nlink(inode); + else if (inode->i_nlink > 0) + drop_nlink(inode); + spin_unlock(&fi->lock); + fuse_invalidate_entry_cache(entry); + fuse_update_ctime(inode); +} + static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; @@ -935,23 +958,8 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) args.in_args[0].value = entry->d_name.name; err = fuse_simple_request(fm, &args); if (!err) { - struct inode *inode = d_inode(entry); - struct fuse_inode *fi = get_fuse_inode(inode); - - spin_lock(&fi->lock); - fi->attr_version = atomic64_inc_return(&fm->fc->attr_version); - /* - * If i_nlink == 0 then unlink doesn't make sense, yet this can - * happen if userspace filesystem is careless. It would be - * difficult to enforce correct nlink usage so just ignore this - * condition here - */ - if (inode->i_nlink > 0) - drop_nlink(inode); - spin_unlock(&fi->lock); fuse_dir_changed(dir); - fuse_invalidate_entry_cache(entry); - fuse_update_ctime(inode); + fuse_entry_unlinked(entry); } else if (err == -EINTR) fuse_invalidate_entry(entry); return err; @@ -973,9 +981,8 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) args.in_args[0].value = entry->d_name.name; err = fuse_simple_request(fm, &args); if (!err) { - clear_nlink(d_inode(entry)); fuse_dir_changed(dir); - fuse_invalidate_entry_cache(entry); + fuse_entry_unlinked(entry); } else if (err == -EINTR) fuse_invalidate_entry(entry); return err; @@ -1015,10 +1022,8 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, fuse_dir_changed(newdir); /* newent will end up negative */ - if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) { - fuse_invalidate_entry_cache(newent); - fuse_update_ctime(d_inode(newent)); - } + if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) + fuse_entry_unlinked(newent); } else if (err == -EINTR) { /* If request was interrupted, DEITY only knows if the rename actually took place. If the invalidation From dc95694846fd4899afaf5eb3ff5d95453a5c7e17 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 22 Oct 2021 17:03:02 +0200 Subject: [PATCH 019/161] fuse: don't increment nlink in link() The fuse_iget() call in create_new_entry() already updated the inode with all the new attributes and incremented the attribute version. Incrementing the nlink will result in the wrong count. This wasn't noticed because the attributes were invalidated right after this. Updating ctime is still needed for the writeback case when the ctime is not refreshed. Signed-off-by: Miklos Szeredi (cherry picked from commit 97f044f690bac2b094bfb7fb2d177ef946c85880) (cherry picked from commit 0ac01b56904844c9fc0fd152b11ba058df77ded0) --- fs/fuse/dir.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 918cbcf8987072..45ebd0179e4c1e 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -909,9 +909,8 @@ void fuse_flush_time_update(struct inode *inode) mapping_set_error(inode->i_mapping, err); } -void fuse_update_ctime(struct inode *inode) +static void fuse_update_ctime_in_cache(struct inode *inode) { - fuse_invalidate_attr(inode); if (!IS_NOCMTIME(inode)) { inode->i_ctime = current_time(inode); mark_inode_dirty_sync(inode); @@ -919,6 +918,12 @@ void fuse_update_ctime(struct inode *inode) } } +void fuse_update_ctime(struct inode *inode) +{ + fuse_invalidate_attr(inode); + fuse_update_ctime_in_cache(inode); +} + static void fuse_entry_unlinked(struct dentry *entry) { struct inode *inode = d_inode(entry); @@ -1089,24 +1094,11 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, args.in_args[1].size = newent->d_name.len + 1; args.in_args[1].value = newent->d_name.name; err = create_new_entry(fm, &args, newdir, newent, inode->i_mode); - /* Contrary to "normal" filesystems it can happen that link - makes two "logical" inodes point to the same "physical" - inode. We invalidate the attributes of the old one, so it - will reflect changes in the backing inode (link count, - etc.) - */ - if (!err) { - struct fuse_inode *fi = get_fuse_inode(inode); - - spin_lock(&fi->lock); - fi->attr_version = atomic64_inc_return(&fm->fc->attr_version); - if (likely(inode->i_nlink < UINT_MAX)) - inc_nlink(inode); - spin_unlock(&fi->lock); - fuse_update_ctime(inode); - } else if (err == -EINTR) { + if (!err) + fuse_update_ctime_in_cache(inode); + else if (err == -EINTR) fuse_invalidate_attr(inode); - } + return err; } From 87b01363aba96994a2a34bf9346a19024f1f94eb Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 22 Oct 2021 17:03:02 +0200 Subject: [PATCH 020/161] fuse: selective attribute invalidation Only invalidate attributes that the operation might have changed. Introduce two constants for common combinations of changed attributes: FUSE_STATX_MODIFY: file contents are modified but not size FUSE_STATX_MODSIZE: size and/or file contents modified Signed-off-by: Miklos Szeredi (cherry picked from commit fa5eee57e33e79b71b40e6950c29cc46f5cc5cb7) (cherry picked from commit 5f061d1664bcd70884e2cf318718e58bf3d04a73) --- fs/fuse/dax.c | 2 +- fs/fuse/dir.c | 4 ++-- fs/fuse/file.c | 16 ++++++++-------- fs/fuse/fuse_i.h | 8 ++++++++ 4 files changed, 19 insertions(+), 11 deletions(-) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 7d29b8817f1926..ae3f62f2b4fafe 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -735,7 +735,7 @@ static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from) if (ret < 0) return ret; - fuse_invalidate_attr(inode); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); fuse_write_update_size(inode, iocb->ki_pos); return ret; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 45ebd0179e4c1e..093f08909d9bcf 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -119,7 +119,7 @@ u64 entry_attr_timeout(struct fuse_entry_out *o) return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); } -static void fuse_invalidate_attr_mask(struct inode *inode, u32 mask) +void fuse_invalidate_attr_mask(struct inode *inode, u32 mask) { set_mask_bits(&get_fuse_inode(inode)->inval_mask, 0, mask); } @@ -920,7 +920,7 @@ static void fuse_update_ctime_in_cache(struct inode *inode) void fuse_update_ctime(struct inode *inode) { - fuse_invalidate_attr(inode); + fuse_invalidate_attr_mask(inode, STATX_CTIME); fuse_update_ctime_in_cache(inode); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 04e7e2a1f42f91..ff57bd6aa1106e 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -210,7 +210,7 @@ void fuse_finish_open(struct inode *inode, struct file *file) fi->attr_version = atomic64_inc_return(&fc->attr_version); i_size_write(inode, 0); spin_unlock(&fi->lock); - fuse_invalidate_attr(inode); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); if (fc->writeback_cache) file_update_time(file); } @@ -522,7 +522,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) * enabled, i_blocks from cached attr may not be accurate. */ if (!err && fm->fc->writeback_cache) - fuse_invalidate_attr(inode); + fuse_invalidate_attr_mask(inode, STATX_BLOCKS); return err; } @@ -1282,7 +1282,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) fuse_write_update_size(inode, pos); clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); - fuse_invalidate_attr(inode); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); if (!res) return err; @@ -1604,7 +1604,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) FUSE_DIO_WRITE); } } - fuse_invalidate_attr(inode); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); if (res > 0) fuse_write_update_size(inode, iocb->ki_pos); if (exclusive_lock) @@ -1820,7 +1820,7 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, * is enabled, we trust local ctime/mtime. */ if (!fc->writeback_cache) - fuse_invalidate_attr(inode); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY); spin_lock(&fi->lock); rb_erase(&wpa->writepages_entry, &fi->writepages); while (wpa->next) { @@ -2934,7 +2934,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (iov_iter_rw(iter) == WRITE) { ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE); - fuse_invalidate_attr(inode); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); } else { ret = __fuse_direct_read(io, iter, &pos); } @@ -3056,7 +3056,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) truncate_pagecache_range(inode, offset, offset + length - 1); - fuse_invalidate_attr(inode); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); out: if (!(mode & FALLOC_FL_KEEP_SIZE)) @@ -3168,7 +3168,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, file_update_time(file_out); } - fuse_invalidate_attr(inode_out); + fuse_invalidate_attr_mask(inode_out, FUSE_STATX_MODSIZE); err = outarg.size; out: diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index a997042801b46f..81bd1991e6cb00 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1074,7 +1074,15 @@ void fuse_wait_aborted(struct fuse_conn *fc); /** * Invalidate inode attributes */ + +/* Attributes possibly changed on data modification */ +#define FUSE_STATX_MODIFY (STATX_MTIME | STATX_CTIME | STATX_BLOCKS) + +/* Attributes possibly changed on data and/or size modification */ +#define FUSE_STATX_MODSIZE (FUSE_STATX_MODIFY | STATX_SIZE) + void fuse_invalidate_attr(struct inode *inode); +void fuse_invalidate_attr_mask(struct inode *inode, u32 mask); void fuse_invalidate_entry_cache(struct dentry *entry); From 5de8b5edcf37cdb1a9b851cd8d2ed95702964ed2 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 22 Oct 2021 17:03:02 +0200 Subject: [PATCH 021/161] fuse: don't bump attr_version in cached write The attribute version in fuse_inode should be updated whenever the attributes might have changed on the server. In case of cached writes this is not the case, so updating the attr_version is unnecessary and could possibly affect performance. Open code the remaining part of fuse_write_update_size(). Signed-off-by: Miklos Szeredi (cherry picked from commit 8c56e03d2e08d83776c89e4b6563ca8cfdf7da54) (cherry picked from commit 5b3e1e37d5ea324a2b07ec4250cc30097ac6e45f) --- fs/fuse/file.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ff57bd6aa1106e..5e372fa3fa09db 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2368,15 +2368,18 @@ static int fuse_write_end(struct file *file, struct address_space *mapping, if (!copied) goto unlock; + pos += copied; if (!PageUptodate(page)) { /* Zero any unwritten bytes at the end of the page */ - size_t endoff = (pos + copied) & ~PAGE_MASK; + size_t endoff = pos & ~PAGE_MASK; if (endoff) zero_user_segment(page, endoff, PAGE_SIZE); SetPageUptodate(page); } - fuse_write_update_size(inode, pos + copied); + if (pos > inode->i_size) + i_size_write(inode, pos); + set_page_dirty(page); unlock: From d069161382f0811a7625cfaa3e5f1f51a7e7fab8 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 22 Oct 2021 17:03:02 +0200 Subject: [PATCH 022/161] fuse: rename fuse_write_update_size() This function already updates the attr_version in fuse_inode, regardless of whether the size was changed or not. Rename the helper to fuse_write_update_attr() to reflect the more generic nature. Signed-off-by: Miklos Szeredi (cherry picked from commit 27ae449ba26eb6c1cd217fa28339841c55bc79e1) (cherry picked from commit 3480f8757c334a48525e43458f4eac173f50155d) --- fs/fuse/dax.c | 2 +- fs/fuse/dev.c | 2 +- fs/fuse/file.c | 12 ++++++------ fs/fuse/fuse_i.h | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index ae3f62f2b4fafe..8f0c5174c2e6ba 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -736,7 +736,7 @@ static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from) return ret; fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); - fuse_write_update_size(inode, iocb->ki_pos); + fuse_write_update_attr(inode, iocb->ki_pos); return ret; } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 97e19b0a27b9ca..a7eb43e26d20c5 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1596,7 +1596,7 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, end = outarg.offset + outarg.size; if (end > file_size) { file_size = end; - fuse_write_update_size(inode, file_size); + fuse_write_update_attr(inode, file_size); } num = outarg.size; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5e372fa3fa09db..c2b7e2dc62b006 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1082,7 +1082,7 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, return err ?: ia->write.out.size; } -bool fuse_write_update_size(struct inode *inode, loff_t pos) +bool fuse_write_update_attr(struct inode *inode, loff_t pos) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -1279,7 +1279,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) } while (!err && iov_iter_count(ii)); if (res > 0) - fuse_write_update_size(inode, pos); + fuse_write_update_attr(inode, pos); clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); @@ -1606,7 +1606,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) } fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); if (res > 0) - fuse_write_update_size(inode, iocb->ki_pos); + fuse_write_update_attr(inode, iocb->ki_pos); if (exclusive_lock) inode_unlock(inode); else @@ -2960,7 +2960,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (iov_iter_rw(iter) == WRITE) { if (ret > 0) - fuse_write_update_size(inode, pos); + fuse_write_update_attr(inode, pos); /* For extending writes we already hold exclusive lock */ else if (ret < 0 && offset + count > i_size) fuse_do_truncate(file); @@ -3050,7 +3050,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, /* we could have extended the file */ if (!(mode & FALLOC_FL_KEEP_SIZE)) { - bool changed = fuse_write_update_size(inode, offset + length); + bool changed = fuse_write_update_attr(inode, offset + length); if (changed && fm->fc->writeback_cache) file_update_time(file); @@ -3167,7 +3167,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1); if (fc->writeback_cache) { - fuse_write_update_size(inode_out, pos_out + outarg.size); + fuse_write_update_attr(inode_out, pos_out + outarg.size); file_update_time(file_out); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 81bd1991e6cb00..7c2a6db29ffaf6 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1226,7 +1226,7 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd, __poll_t fuse_file_poll(struct file *file, poll_table *wait); int fuse_dev_release(struct inode *inode, struct file *file); -bool fuse_write_update_size(struct inode *inode, loff_t pos); +bool fuse_write_update_attr(struct inode *inode, loff_t pos); int fuse_flush_times(struct inode *inode, struct fuse_file *ff); int fuse_write_inode(struct inode *inode, struct writeback_control *wbc); From 3df70f4f8b33e3431613c79aa5d2b87f9aebda1f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 22 Oct 2021 17:03:02 +0200 Subject: [PATCH 023/161] fuse: always invalidate attributes after writes Extend the fuse_write_update_attr() helper to invalidate cached attributes after a write. This has already been done in all cases except in fuse_notify_store(), so this is mostly a cleanup. fuse_direct_write_iter() calls fuse_direct_IO() which already calls fuse_write_update_attr(), so don't repeat that again in the former. Signed-off-by: Miklos Szeredi (cherry picked from commit d347739a0e760e9f370aa021da3feacc37d3e511) (cherry picked from commit 4905303042e2aa18dcebc451e8ac8a7ac6b3bcea) --- fs/fuse/dax.c | 5 +---- fs/fuse/dev.c | 2 +- fs/fuse/file.c | 26 ++++++++++++-------------- fs/fuse/fuse_i.h | 2 +- 4 files changed, 15 insertions(+), 20 deletions(-) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 8f0c5174c2e6ba..13fc6bc952e486 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -732,11 +732,8 @@ static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from) ssize_t ret; ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); - if (ret < 0) - return ret; - fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); - fuse_write_update_attr(inode, iocb->ki_pos); + fuse_write_update_attr(inode, iocb->ki_pos, ret); return ret; } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a7eb43e26d20c5..da944b1c93c1bb 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1596,7 +1596,7 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, end = outarg.offset + outarg.size; if (end > file_size) { file_size = end; - fuse_write_update_attr(inode, file_size); + fuse_write_update_attr(inode, file_size, outarg.size); } num = outarg.size; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c2b7e2dc62b006..de3cb9bd7aa701 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1082,7 +1082,7 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, return err ?: ia->write.out.size; } -bool fuse_write_update_attr(struct inode *inode, loff_t pos) +bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -1090,12 +1090,14 @@ bool fuse_write_update_attr(struct inode *inode, loff_t pos) spin_lock(&fi->lock); fi->attr_version = atomic64_inc_return(&fc->attr_version); - if (pos > inode->i_size) { + if (written > 0 && pos > inode->i_size) { i_size_write(inode, pos); ret = true; } spin_unlock(&fi->lock); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); + return ret; } @@ -1278,11 +1280,8 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) kfree(ap->pages); } while (!err && iov_iter_count(ii)); - if (res > 0) - fuse_write_update_attr(inode, pos); - + fuse_write_update_attr(inode, pos, res); clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); - fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); if (!res) return err; @@ -1602,11 +1601,9 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) } else { res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); + fuse_write_update_attr(inode, iocb->ki_pos, res); } } - fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); - if (res > 0) - fuse_write_update_attr(inode, iocb->ki_pos); if (exclusive_lock) inode_unlock(inode); else @@ -2959,10 +2956,9 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) kref_put(&io->refcnt, fuse_io_release); if (iov_iter_rw(iter) == WRITE) { - if (ret > 0) - fuse_write_update_attr(inode, pos); + fuse_write_update_attr(inode, pos, ret); /* For extending writes we already hold exclusive lock */ - else if (ret < 0 && offset + count > i_size) + if (ret < 0 && offset + count > i_size) fuse_do_truncate(file); } @@ -3050,7 +3046,8 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, /* we could have extended the file */ if (!(mode & FALLOC_FL_KEEP_SIZE)) { - bool changed = fuse_write_update_attr(inode, offset + length); + bool changed = fuse_write_update_attr(inode, offset + length, + length); if (changed && fm->fc->writeback_cache) file_update_time(file); @@ -3167,7 +3164,8 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1); if (fc->writeback_cache) { - fuse_write_update_attr(inode_out, pos_out + outarg.size); + fuse_write_update_attr(inode_out, pos_out + outarg.size, + outarg.size); file_update_time(file_out); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 7c2a6db29ffaf6..fbb63d7a076035 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1226,7 +1226,7 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd, __poll_t fuse_file_poll(struct file *file, poll_table *wait); int fuse_dev_release(struct inode *inode, struct file *file); -bool fuse_write_update_attr(struct inode *inode, loff_t pos); +bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written); int fuse_flush_times(struct inode *inode, struct fuse_file *ff); int fuse_write_inode(struct inode *inode, struct writeback_control *wbc); From 31f2f131e77a80a04200b667552dd1101ee65829 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 22 Oct 2021 17:03:03 +0200 Subject: [PATCH 024/161] fuse: fix attr version comparison in fuse_read_update_size() A READ request returning a short count is taken as indication of EOF, and the cached file size is modified accordingly. Fix the attribute version checking to allow for changes to fc->attr_version on other inodes. Signed-off-by: Miklos Szeredi (cherry picked from commit 484ce65715b06aead8c4901f01ca32c5a240bc71) (cherry picked from commit 9a48447c28252adbd14bb38e6542a7e29aba1c70) --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index de3cb9bd7aa701..2a5865a6336e33 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -794,7 +794,7 @@ static void fuse_read_update_size(struct inode *inode, loff_t size, struct fuse_inode *fi = get_fuse_inode(inode); spin_lock(&fi->lock); - if (attr_ver == fi->attr_version && size < inode->i_size && + if (attr_ver >= fi->attr_version && size < inode->i_size && !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { fi->attr_version = atomic64_inc_return(&fc->attr_version); i_size_write(inode, size); From 0c20f8291b27036372cc892afef50762786e8f7c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 22 Oct 2021 17:03:03 +0200 Subject: [PATCH 025/161] fuse: cleanup code conditional on fc->writeback_cache It's safe to call file_update_time() if writeback cache is not enabled, since S_NOCMTIME is set in this case. This part is purely a cleanup. __fuse_copy_file_range() also calls fuse_write_update_attr() only in the writeback cache case. This is inconsistent with other callers, where it's called unconditionally. Signed-off-by: Miklos Szeredi (cherry picked from commit 20235b435a5c8897e46d094454408b6ab7157dbd) (cherry picked from commit f2abe85368454f98f424990616072df595610f74) --- fs/fuse/file.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 2a5865a6336e33..73d2dfc1106092 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -210,9 +210,8 @@ void fuse_finish_open(struct inode *inode, struct file *file) fi->attr_version = atomic64_inc_return(&fc->attr_version); i_size_write(inode, 0); spin_unlock(&fi->lock); + file_update_time(file); fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); - if (fc->writeback_cache) - file_update_time(file); } if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); @@ -3046,10 +3045,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, /* we could have extended the file */ if (!(mode & FALLOC_FL_KEEP_SIZE)) { - bool changed = fuse_write_update_attr(inode, offset + length, - length); - - if (changed && fm->fc->writeback_cache) + if (fuse_write_update_attr(inode, offset + length, length)) file_update_time(file); } @@ -3163,13 +3159,8 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, ALIGN_DOWN(pos_out, PAGE_SIZE), ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1); - if (fc->writeback_cache) { - fuse_write_update_attr(inode_out, pos_out + outarg.size, - outarg.size); - file_update_time(file_out); - } - - fuse_invalidate_attr_mask(inode_out, FUSE_STATX_MODSIZE); + file_update_time(file_out); + fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size); err = outarg.size; out: From 1005dee21ac7808e3b56af5d438cb5461b89c426 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 22 Oct 2021 17:03:03 +0200 Subject: [PATCH 026/161] fuse: simplify local variables holding writeback cache state There are two instances of "bool is_wb = fc->writeback_cache" where the actual use mostly involves checking "is_wb && S_ISREG(inode->i_mode)". Clean up these cases by storing the second condition in the local variable. Signed-off-by: Miklos Szeredi (cherry picked from commit c15016b7ae1caf77f80ae87a71745368ef651ba6) (cherry picked from commit 8712ae1d399a2cb886cde412276d6b1fa1e7cbda) --- fs/fuse/dir.c | 8 ++++---- fs/fuse/inode.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 093f08909d9bcf..fe96ded24d99ec 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1727,10 +1727,10 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, struct fuse_setattr_in inarg; struct fuse_attr_out outarg; bool is_truncate = false; - bool is_wb = fc->writeback_cache; + bool is_wb = fc->writeback_cache && S_ISREG(inode->i_mode); loff_t oldsize; int err; - bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode); + bool trust_local_cmtime = is_wb; bool fault_blocked = false; if (!fc->default_permissions) @@ -1774,7 +1774,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, } /* Flush dirty data/metadata before non-truncate SETATTR */ - if (is_wb && S_ISREG(inode->i_mode) && + if (is_wb && attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_MTIME_SET | ATTR_TIMES_SET)) { @@ -1845,7 +1845,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, attr_timeout(&outarg)); oldsize = inode->i_size; /* see the comment in fuse_change_attributes() */ - if (!is_wb || is_truncate || !S_ISREG(inode->i_mode)) + if (!is_wb || is_truncate) i_size_write(inode, outarg.attr.size); if (is_truncate) { diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 48466267e55ba5..0f29a84b783019 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -229,7 +229,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - bool is_wb = fc->writeback_cache; + bool is_wb = fc->writeback_cache && S_ISREG(inode->i_mode); loff_t oldsize; struct timespec64 old_mtime; @@ -249,7 +249,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, * extend local i_size without keeping userspace server in sync. So, * attr->size coming from server can be stale. We cannot trust it. */ - if (!is_wb || !S_ISREG(inode->i_mode)) + if (!is_wb) i_size_write(inode, attr->size); spin_unlock(&fi->lock); From d75037d0ad78ec0e5e5c24caff7b9268dd91d9f0 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 22 Oct 2021 17:03:03 +0200 Subject: [PATCH 027/161] fuse: move reverting attributes to fuse_change_attributes() In case of writeback_cache fuse_fillattr() would revert the queried attributes to the cached version. Move this to fuse_change_attributes() in order to manage the writeback logic in a central helper. This will be necessary for patches that follow. Only fuse_do_getattr() -> fuse_fillattr() uses the attributes after calling fuse_change_attributes(), so this should not change behavior. Signed-off-by: Miklos Szeredi (cherry picked from commit 04d82db0c557e074a5d898b43de81fe659b9cc5a) (cherry picked from commit 48562071ab3aec6be43bc30bcb1d18b7e8c1d019) --- fs/fuse/dir.c | 9 --------- fs/fuse/inode.c | 13 +++++++++++++ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index fe96ded24d99ec..e7245f7d68cf8f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1108,15 +1108,6 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, unsigned int blkbits; struct fuse_conn *fc = get_fuse_conn(inode); - /* see the comment in fuse_change_attributes() */ - if (fc->writeback_cache && S_ISREG(inode->i_mode)) { - attr->size = i_size_read(inode); - attr->mtime = inode->i_mtime.tv_sec; - attr->mtimensec = inode->i_mtime.tv_nsec; - attr->ctime = inode->i_ctime.tv_sec; - attr->ctimensec = inode->i_ctime.tv_nsec; - } - stat->dev = inode->i_sb->s_dev; stat->ino = attr->ino; stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 0f29a84b783019..ebf4aef9e613a9 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -234,6 +234,19 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, struct timespec64 old_mtime; spin_lock(&fi->lock); + /* + * In case of writeback_cache enabled, writes update mtime, ctime and + * may update i_size. In these cases trust the cached value in the + * inode. + */ + if (is_wb) { + attr->size = i_size_read(inode); + attr->mtime = inode->i_mtime.tv_sec; + attr->mtimensec = inode->i_mtime.tv_nsec; + attr->ctime = inode->i_ctime.tv_sec; + attr->ctimensec = inode->i_ctime.tv_nsec; + } + if ((attr_version != 0 && fi->attr_version > attr_version) || test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { spin_unlock(&fi->lock); From b474dbfe06818afffee3e4eb6bf8d8126403d15e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 22 Oct 2021 17:03:03 +0200 Subject: [PATCH 028/161] fuse: add cache_mask If writeback_cache is enabled, then the size, mtime and ctime attributes of regular files are always valid in the kernel's cache. They are retrieved from userspace only when the inode is freshly looked up. Add a more generic "cache_mask", that indicates which attributes are currently valid in cache. This patch doesn't change behavior. Signed-off-by: Miklos Szeredi (cherry picked from commit 4b52f059b5ddbb364d35f2bcc3d267a009078db7) (cherry picked from commit 10c2fe3f42cd7e6bc636b5d4e04d7b7c167817c6) --- fs/fuse/dir.c | 3 ++- fs/fuse/fuse_i.h | 4 +++- fs/fuse/inode.c | 31 ++++++++++++++++++++++++------- 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index e7245f7d68cf8f..c93fa1b83536d8 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1833,7 +1833,8 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, } fuse_change_attributes_common(inode, &outarg.attr, - attr_timeout(&outarg)); + attr_timeout(&outarg), + fuse_get_cache_mask(inode)); oldsize = inode->i_size; /* see the comment in fuse_change_attributes() */ if (!is_wb || is_truncate) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index fbb63d7a076035..551bdae6568d04 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1040,7 +1040,9 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, u64 attr_valid, u64 attr_version); void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, - u64 attr_valid); + u64 attr_valid, u32 cache_mask); + +u32 fuse_get_cache_mask(struct inode *inode); /** * Initialize the client device diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index ebf4aef9e613a9..0335bb4e0432e9 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -164,7 +164,7 @@ static ino_t fuse_squash_ino(u64 ino64) } void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, - u64 attr_valid) + u64 attr_valid, u32 cache_mask) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -190,9 +190,11 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_atime.tv_sec = attr->atime; inode->i_atime.tv_nsec = attr->atimensec; /* mtime from server may be stale due to local buffered write */ - if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) { + if (!(cache_mask & STATX_MTIME)) { inode->i_mtime.tv_sec = attr->mtime; inode->i_mtime.tv_nsec = attr->mtimensec; + } + if (!(cache_mask & STATX_CTIME)) { inode->i_ctime.tv_sec = attr->ctime; inode->i_ctime.tv_nsec = attr->ctimensec; } @@ -224,12 +226,22 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_flags &= ~S_NOSEC; } +u32 fuse_get_cache_mask(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) + return 0; + + return STATX_MTIME | STATX_CTIME | STATX_SIZE; +} + void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, u64 attr_valid, u64 attr_version) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - bool is_wb = fc->writeback_cache && S_ISREG(inode->i_mode); + u32 cache_mask; loff_t oldsize; struct timespec64 old_mtime; @@ -239,10 +251,15 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, * may update i_size. In these cases trust the cached value in the * inode. */ - if (is_wb) { + cache_mask = fuse_get_cache_mask(inode); + if (cache_mask & STATX_SIZE) attr->size = i_size_read(inode); + + if (cache_mask & STATX_MTIME) { attr->mtime = inode->i_mtime.tv_sec; attr->mtimensec = inode->i_mtime.tv_nsec; + } + if (cache_mask & STATX_CTIME) { attr->ctime = inode->i_ctime.tv_sec; attr->ctimensec = inode->i_ctime.tv_nsec; } @@ -254,7 +271,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, } old_mtime = inode->i_mtime; - fuse_change_attributes_common(inode, attr, attr_valid); + fuse_change_attributes_common(inode, attr, attr_valid, cache_mask); oldsize = inode->i_size; /* @@ -262,11 +279,11 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, * extend local i_size without keeping userspace server in sync. So, * attr->size coming from server can be stale. We cannot trust it. */ - if (!is_wb) + if (!(cache_mask & STATX_SIZE)) i_size_write(inode, attr->size); spin_unlock(&fi->lock); - if (!is_wb && S_ISREG(inode->i_mode)) { + if (!cache_mask && S_ISREG(inode->i_mode)) { bool inval = false; if (oldsize != attr->size) { From f6da76ec9e2a3c03a7c2e08bdf793ef1b1959e97 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 22 Oct 2021 17:03:03 +0200 Subject: [PATCH 029/161] fuse: take cache_mask into account in getattr When deciding to send a GETATTR request take into account the cache mask (which attributes are always valid). The cache mask takes precedence over the invalid mask. This results in the GETATTR request not being sent unnecessarily. Signed-off-by: Miklos Szeredi (cherry picked from commit ec85537519b330a0deb8fe742fd1b0efc40a1710) (cherry picked from commit 9da4e8f4496a4677d1ef25fc46cae188b3c81c90) --- fs/fuse/dir.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index c93fa1b83536d8..b2ac3fe9808185 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1185,12 +1185,14 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, struct fuse_inode *fi = get_fuse_inode(inode); int err = 0; bool sync; + u32 inval_mask = READ_ONCE(fi->inval_mask); + u32 cache_mask = fuse_get_cache_mask(inode); if (flags & AT_STATX_FORCE_SYNC) sync = true; else if (flags & AT_STATX_DONT_SYNC) sync = false; - else if (request_mask & READ_ONCE(fi->inval_mask)) + else if (request_mask & inval_mask & ~cache_mask) sync = true; else sync = time_before64(fi->i_time, get_jiffies_64()); From 079262adb6ce8ecaa27b77af695dae9a8deb05d4 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 22 Oct 2021 17:03:03 +0200 Subject: [PATCH 030/161] fuse: only update necessary attributes fuse_update_attributes() refreshes metadata for internal use. Each use needs a particular set of attributes to be refreshed, but currently that cannot be expressed and all but atime are refreshed. Add a mask argument, which lets fuse_update_get_attr() to decide based on the cache_mask and the inval_mask whether a GETATTR call is needed or not. Reported-by: Yongji Xie Signed-off-by: Miklos Szeredi (cherry picked from commit c6c745b81033a4c1f0e5f3b16398a10f2d000c29) (cherry picked from commit 4e2c49dad23d3ea4dc0876a5879d4f40521db9b6) --- fs/fuse/dir.c | 6 ++---- fs/fuse/file.c | 9 +++++---- fs/fuse/fuse_i.h | 2 +- fs/fuse/readdir.c | 2 +- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b2ac3fe9808185..095ea5458c8e56 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1209,11 +1209,9 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, return err; } -int fuse_update_attributes(struct inode *inode, struct file *file) +int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask) { - /* Do *not* need to get atime for internal purposes */ - return fuse_update_get_attr(inode, file, NULL, - STATX_BASIC_STATS & ~STATX_ATIME, 0); + return fuse_update_get_attr(inode, file, NULL, mask, 0); } int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 73d2dfc1106092..0e3d516e1ca6ec 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1012,7 +1012,7 @@ static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) if (fc->auto_inval_data || (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) { int err; - err = fuse_update_attributes(inode, iocb->ki_filp); + err = fuse_update_attributes(inode, iocb->ki_filp, STATX_SIZE); if (err) return err; } @@ -1300,7 +1300,8 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) if (fc->writeback_cache) { /* Update size (EOF optimization) and mode (SUID clearing) */ - err = fuse_update_attributes(mapping->host, file); + err = fuse_update_attributes(mapping->host, file, + STATX_SIZE | STATX_MODE); if (err) return err; @@ -2692,7 +2693,7 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes); fallback: - err = fuse_update_attributes(inode, file); + err = fuse_update_attributes(inode, file, STATX_SIZE); if (!err) return generic_file_llseek(file, offset, whence); else @@ -2712,7 +2713,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) break; case SEEK_END: inode_lock(inode); - retval = fuse_update_attributes(inode, file); + retval = fuse_update_attributes(inode, file, STATX_SIZE); if (!retval) retval = generic_file_llseek(file, offset, whence); inode_unlock(inode); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 551bdae6568d04..28db2f9eb4b721 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1170,7 +1170,7 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); void fuse_flush_time_update(struct inode *inode); void fuse_update_ctime(struct inode *inode); -int fuse_update_attributes(struct inode *inode, struct file *file); +int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask); void fuse_flush_writepages(struct inode *inode); diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index c40ec53f425381..e8deaacf1832a1 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -456,7 +456,7 @@ static int fuse_readdir_cached(struct file *file, struct dir_context *ctx) * cache; both cases require an up-to-date mtime value. */ if (!ctx->pos && fc->auto_inval_data) { - int err = fuse_update_attributes(inode, file); + int err = fuse_update_attributes(inode, file, STATX_MTIME); if (err) return err; From 679e5684931ef089a0dbe10cef3f8dde159a8ea6 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 2 Nov 2021 11:10:37 +0100 Subject: [PATCH 031/161] fuse: fix page stealing It is possible to trigger a crash by splicing anon pipe bufs to the fuse device. The reason for this is that anon_pipe_buf_release() will reuse buf->page if the refcount is 1, but that page might have already been stolen and its flags modified (e.g. PG_lru added). This happens in the unlikely case of fuse_dev_splice_write() getting around to calling pipe_buf_release() after a page has been stolen, added to the page cache and removed from the page cache. Fix by calling pipe_buf_release() right after the page was inserted into the page cache. In this case the page has an elevated refcount so any release function will know that the page isn't reusable. Reported-by: Frank Dinoff Link: https://lore.kernel.org/r/CAAmZXrsGg2xsP1CK+cbuEMumtrqdvD-NKnWzhNcvn71RV3c1yw@mail.gmail.com/ Fixes: dd3bb14f44a6 ("fuse: support splice() writing to fuse device") Cc: # v2.6.35 Signed-off-by: Miklos Szeredi (cherry picked from commit 712a951025c0667ff00b25afc360f74e639dfabe) (cherry picked from commit 31b7e735ab175f0dae90fc5dcad1513d9a7d8ffd) --- fs/fuse/dev.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index da944b1c93c1bb..38623e13d172a3 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -842,6 +842,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) replace_page_cache_folio(oldfolio, newfolio); + /* + * Release while we have extra ref on stolen page. Otherwise + * anon_pipe_buf_release() might think the page can be reused. + */ + pipe_buf_release(cs->pipe, buf); + folio_get(newfolio); if (!(buf->flags & PIPE_BUF_FLAG_LRU)) @@ -2038,8 +2044,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, pipe_lock(pipe); out_free: - for (idx = 0; idx < nbuf; idx++) - pipe_buf_release(pipe, &bufs[idx]); + for (idx = 0; idx < nbuf; idx++) { + struct pipe_buffer *buf = &bufs[idx]; + + if (buf->ops) + pipe_buf_release(pipe, buf); + } pipe_unlock(pipe); kvfree(bufs); From c18f81f2d6b59767642ad88800d886adbf6c1216 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 25 Nov 2021 14:05:18 +0100 Subject: [PATCH 032/161] fuse: release pipe buf after last use Checking buf->flags should be done before the pipe_buf_release() is called on the pipe buffer, since releasing the buffer might modify the flags. This is exactly what page_cache_pipe_buf_release() does, and which results in the same VM_BUG_ON_PAGE(PageLRU(page)) that the original patch was trying to fix. Reported-by: Justin Forbes Fixes: 712a951025c0 ("fuse: fix page stealing") Cc: # v2.6.35 Signed-off-by: Miklos Szeredi (cherry picked from commit 473441720c8616dfaf4451f9c7ea14f0eb5e5d65) (cherry picked from commit 0ebc0c83eb7b90da9607b8c55f32905b51f15861) --- fs/fuse/dev.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 38623e13d172a3..8bdb09a221b58a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -842,17 +842,17 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) replace_page_cache_folio(oldfolio, newfolio); + folio_get(newfolio); + + if (!(buf->flags & PIPE_BUF_FLAG_LRU)) + folio_add_lru(newfolio); + /* * Release while we have extra ref on stolen page. Otherwise * anon_pipe_buf_release() might think the page can be reused. */ pipe_buf_release(cs->pipe, buf); - folio_get(newfolio); - - if (!(buf->flags & PIPE_BUF_FLAG_LRU)) - folio_add_lru(newfolio); - err = 0; spin_lock(&cs->req->waitq.lock); if (test_bit(FR_ABORTED, &cs->req->flags)) From aef5362ee503ae3e29e29a23987736ff74a2a6ca Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Mon, 22 Nov 2021 17:05:31 +0800 Subject: [PATCH 033/161] fuse: Pass correct lend value to filemap_write_and_wait_range() The acceptable maximum value of lend parameter in filemap_write_and_wait_range() is LLONG_MAX rather than -1. And there is also some logic depending on LLONG_MAX check in write_cache_pages(). So let's pass LLONG_MAX to filemap_write_and_wait_range() in fuse_writeback_range() instead. Fixes: 59bda8ecee2f ("fuse: flush extending writes") Signed-off-by: Xie Yongji Cc: # v5.15 Signed-off-by: Miklos Szeredi (cherry picked from commit e388164ea385f04666c4633f5dc4f951fca71890) (cherry picked from commit ead6ce34d70856b0e191ddae882370b1e33e860f) --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 0e3d516e1ca6ec..97ced8bddf59a3 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2967,7 +2967,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end) { - int err = filemap_write_and_wait_range(inode->i_mapping, start, -1); + int err = filemap_write_and_wait_range(inode->i_mapping, start, LLONG_MAX); if (!err) fuse_sync_writes(inode); From 11526029809aa6f7349db2e1bb0e8287c01f65bd Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Thu, 25 Nov 2021 15:05:24 +0800 Subject: [PATCH 034/161] fuse: add fuse_should_enable_dax() helper This is in prep for following per inode DAX checking. Signed-off-by: Jeffle Xu Reviewed-by: Vivek Goyal Signed-off-by: Miklos Szeredi (cherry picked from commit cecd491641c23f3c63958a62efb74cdaf3c93d7b) (cherry picked from commit 68b258d66fbd4e22d32dba22b5f8cd46b34e69cc) --- fs/fuse/dax.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 13fc6bc952e486..7ac070a48f712b 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -1326,11 +1326,19 @@ static const struct address_space_operations fuse_dax_file_aops = { .dirty_folio = noop_dirty_folio, }; -void fuse_dax_inode_init(struct inode *inode) +static bool fuse_should_enable_dax(struct inode *inode) { struct fuse_conn *fc = get_fuse_conn(inode); if (!fc->dax) + return false; + + return true; +} + +void fuse_dax_inode_init(struct inode *inode) +{ + if (!fuse_should_enable_dax(inode)) return; inode->i_flags |= S_DAX; From 629d569f56820bf3e3f07add6aaac7f0d61f68af Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Thu, 25 Nov 2021 15:05:25 +0800 Subject: [PATCH 035/161] fuse: make DAX mount option a tri-state We add 'always', 'never', and 'inode' (default). '-o dax' continues to operate the same which is equivalent to 'always'. The following behavior is consistent with that on ext4/xfs: - The default behavior (when neither '-o dax' nor '-o dax=always|never|inode' option is specified) is equal to 'inode' mode, while 'dax=inode' won't be printed among the mount option list. - The 'inode' mode is only advisory. It will silently fallback to 'never' mode if fuse server doesn't support that. Also noted that by the time of this commit, 'inode' mode is actually equal to 'always' mode, before the per inode DAX flag is introduced in the following patch. Signed-off-by: Jeffle Xu Reviewed-by: Vivek Goyal Signed-off-by: Miklos Szeredi (cherry picked from commit 780b1b959f9bd959e1aca450e9fee0e2c00b31ad) (cherry picked from commit bf9214a3dd230c6978ae93315f2d934e56fe90bd) --- fs/fuse/dax.c | 13 ++++++++++++- fs/fuse/fuse_i.h | 20 ++++++++++++++++++-- fs/fuse/inode.c | 10 +++++++--- fs/fuse/virtio_fs.c | 18 +++++++++++++++--- 4 files changed, 52 insertions(+), 9 deletions(-) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 7ac070a48f712b..7d7e4fdbff5ec7 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -1279,11 +1279,14 @@ static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd) return ret; } -int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev) +int fuse_dax_conn_alloc(struct fuse_conn *fc, enum fuse_dax_mode dax_mode, + struct dax_device *dax_dev) { struct fuse_conn_dax *fcd; int err; + fc->dax_mode = dax_mode; + if (!dax_dev) return 0; @@ -1329,7 +1332,15 @@ static const struct address_space_operations fuse_dax_file_aops = { static bool fuse_should_enable_dax(struct inode *inode) { struct fuse_conn *fc = get_fuse_conn(inode); + enum fuse_dax_mode dax_mode = fc->dax_mode; + + if (dax_mode == FUSE_DAX_NEVER) + return false; + /* + * fc->dax may be NULL in 'inode' mode when filesystem device doesn't + * support DAX, in which case it will silently fallback to 'never' mode. + */ if (!fc->dax) return false; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 28db2f9eb4b721..336606fb95c63f 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -483,6 +483,18 @@ struct fuse_dev { struct list_head entry; }; +enum fuse_dax_mode { + FUSE_DAX_INODE_DEFAULT, /* default */ + FUSE_DAX_ALWAYS, /* "-o dax=always" */ + FUSE_DAX_NEVER, /* "-o dax=never" */ + FUSE_DAX_INODE_USER, /* "-o dax=inode" */ +}; + +static inline bool fuse_is_inode_dax_mode(enum fuse_dax_mode mode) +{ + return mode == FUSE_DAX_INODE_DEFAULT || mode == FUSE_DAX_INODE_USER; +} + struct fuse_fs_context { int fd; struct file *file; @@ -500,7 +512,7 @@ struct fuse_fs_context { bool no_control:1; bool no_force_umount:1; bool legacy_opts_show:1; - bool dax:1; + enum fuse_dax_mode dax_mode; unsigned int max_read; unsigned int blksize; const char *subtype; @@ -811,6 +823,9 @@ struct fuse_conn { struct list_head devices; #ifdef CONFIG_FUSE_DAX + /* Dax mode */ + enum fuse_dax_mode dax_mode; + /* Dax specific conn data, non-NULL if DAX is enabled */ struct fuse_conn_dax *dax; #endif @@ -1278,7 +1293,8 @@ ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to); ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from); int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma); int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 dmap_end); -int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev); +int fuse_dax_conn_alloc(struct fuse_conn *fc, enum fuse_dax_mode mode, + struct dax_device *dax_dev); void fuse_dax_conn_free(struct fuse_conn *fc); bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi); void fuse_dax_inode_init(struct inode *inode); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 0335bb4e0432e9..0bfbd8865e7468 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -773,8 +773,12 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",blksize=%lu", sb->s_blocksize); } #ifdef CONFIG_FUSE_DAX - if (fc->dax) - seq_puts(m, ",dax"); + if (fc->dax_mode == FUSE_DAX_ALWAYS) + seq_puts(m, ",dax=always"); + else if (fc->dax_mode == FUSE_DAX_NEVER) + seq_puts(m, ",dax=never"); + else if (fc->dax_mode == FUSE_DAX_INODE_USER) + seq_puts(m, ",dax=inode"); #endif return 0; @@ -1532,7 +1536,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) sb->s_subtype = ctx->subtype; ctx->subtype = NULL; if (IS_ENABLED(CONFIG_FUSE_DAX)) { - err = fuse_dax_conn_alloc(fc, ctx->dax_dev); + err = fuse_dax_conn_alloc(fc, ctx->dax_mode, ctx->dax_dev); if (err) goto err; } diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index c485d0102e8aac..4d8d4f16c727b9 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -89,12 +89,21 @@ struct virtio_fs_req_work { static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, struct fuse_req *req, bool in_flight); +static const struct constant_table dax_param_enums[] = { + {"always", FUSE_DAX_ALWAYS }, + {"never", FUSE_DAX_NEVER }, + {"inode", FUSE_DAX_INODE_USER }, + {} +}; + enum { OPT_DAX, + OPT_DAX_ENUM, }; static const struct fs_parameter_spec virtio_fs_parameters[] = { fsparam_flag("dax", OPT_DAX), + fsparam_enum("dax", OPT_DAX_ENUM, dax_param_enums), {} }; @@ -111,7 +120,10 @@ static int virtio_fs_parse_param(struct fs_context *fsc, switch (opt) { case OPT_DAX: - ctx->dax = 1; + ctx->dax_mode = FUSE_DAX_ALWAYS; + break; + case OPT_DAX_ENUM: + ctx->dax_mode = result.uint_32; break; default: return -EINVAL; @@ -1312,8 +1324,8 @@ static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc) /* virtiofs allocates and installs its own fuse devices */ ctx->fudptr = NULL; - if (ctx->dax) { - if (!fs->dax_dev) { + if (ctx->dax_mode != FUSE_DAX_NEVER) { + if (ctx->dax_mode == FUSE_DAX_ALWAYS && !fs->dax_dev) { err = -EINVAL; pr_err("virtio-fs: dax can't be enabled as filesystem" " device does not support it.\n"); From 73f68243f925ce9e2e62612c9d51c19fe50b5beb Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Thu, 25 Nov 2021 15:05:26 +0800 Subject: [PATCH 036/161] fuse: support per inode DAX in fuse protocol Expand the fuse protocol to support per inode DAX. FUSE_HAS_INODE_DAX flag is added indicating if fuse server/client supporting per inode DAX. It can be conveyed in both FUSE_INIT request and reply. FUSE_ATTR_DAX flag is added indicating if DAX shall be enabled for corresponding file. It is conveyed in FUSE_LOOKUP reply. Signed-off-by: Jeffle Xu Reviewed-by: Vivek Goyal Signed-off-by: Miklos Szeredi (cherry picked from commit 98046f7486db723ec8bb99a950a4fa5f5be55cd1) (cherry picked from commit cc1f9f9e09ff486162233456e7d7199d45574a36) --- include/uapi/linux/fuse.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index cc4f26382ee4d3..dffc61ac594cae 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -193,6 +193,9 @@ * - add flags2 to fuse_init_in and fuse_init_out * - add FUSE_SECURITY_CTX init flag * - add security context to create, mkdir, symlink, and mknod requests + * - add FUSE_HAS_INODE_DAX, FUSE_ATTR_DAX + * + * 7.38 * - add FUSE_EXPIRE_ONLY flag to fuse_notify_inval_entry * - add FUSE_HAS_EXPIRE_ONLY */ @@ -355,6 +358,7 @@ struct fuse_file_lock { * FUSE_INIT_RESERVED: reserved, do not use * FUSE_SECURITY_CTX: add security context to create, mkdir, symlink, and * mknod + * FUSE_HAS_INODE_DAX: use per inode DAX * FUSE_CREATE_SUPP_GROUP: add supplementary group info to create, mkdir, * symlink and mknod (single group that matches parent) * FUSE_HAS_EXPIRE_ONLY: kernel supports expiry-only entry invalidation @@ -393,6 +397,7 @@ struct fuse_file_lock { #define FUSE_INIT_RESERVED (1 << 31) /* bits 32..63 get shifted down 32 bits into the flags2 field */ #define FUSE_SECURITY_CTX (1ULL << 32) +#define FUSE_HAS_INODE_DAX (1ULL << 33) #define FUSE_CREATE_SUPP_GROUP (1ULL << 34) #define FUSE_HAS_EXPIRE_ONLY (1ULL << 35) @@ -477,8 +482,10 @@ struct fuse_file_lock { * fuse_attr flags * * FUSE_ATTR_SUBMOUNT: Object is a submount root + * FUSE_ATTR_DAX: Enable DAX for this file in per inode DAX mode */ #define FUSE_ATTR_SUBMOUNT (1 << 0) +#define FUSE_ATTR_DAX (1 << 1) /** * Open flags From fe10fb6803c2d8b2b79de3f825a8d1d5e95b8bf5 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Thu, 25 Nov 2021 15:05:27 +0800 Subject: [PATCH 037/161] fuse: enable per inode DAX DAX may be limited in some specific situation. When the number of usable DAX windows is under watermark, the recalim routine will be triggered to reclaim some DAX windows. It may have a negative impact on the performance, since some processes may need to wait for DAX windows to be recalimed and reused then. To mitigate the performance degradation, the overall DAX window need to be expanded larger. However, simply expanding the DAX window may not be a good deal in some scenario. To maintain one DAX window chunk (i.e., 2MB in size), 32KB (512 * 64 bytes) memory footprint will be consumed for page descriptors inside guest, which is greater than the memory footprint if it uses guest page cache when DAX disabled. Thus it'd better disable DAX for those files smaller than 32KB, to reduce the demand for DAX window and thus avoid the unworthy memory overhead. Per inode DAX feature is introduced to address this issue, by offering a finer grained control for dax to users, trying to achieve a balance between performance and memory overhead. The FUSE_ATTR_DAX flag in FUSE_LOOKUP reply is used to indicate whether DAX should be enabled or not for corresponding file. Currently the state whether DAX is enabled or not for the file is initialized only when inode is instantiated. Signed-off-by: Jeffle Xu Reviewed-by: Vivek Goyal Signed-off-by: Miklos Szeredi (cherry picked from commit 93a497b9ad695bb2f38a302c5b29dbc9b555ff3f) (cherry picked from commit 4dc1ce8b22fe7ec28ade009fe23728869ed46654) --- fs/fuse/dax.c | 12 ++++++++---- fs/fuse/file.c | 4 ++-- fs/fuse/fuse_i.h | 4 ++-- fs/fuse/inode.c | 2 +- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 7d7e4fdbff5ec7..c346927b8c95fa 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -1329,7 +1329,7 @@ static const struct address_space_operations fuse_dax_file_aops = { .dirty_folio = noop_dirty_folio, }; -static bool fuse_should_enable_dax(struct inode *inode) +static bool fuse_should_enable_dax(struct inode *inode, unsigned int flags) { struct fuse_conn *fc = get_fuse_conn(inode); enum fuse_dax_mode dax_mode = fc->dax_mode; @@ -1344,12 +1344,16 @@ static bool fuse_should_enable_dax(struct inode *inode) if (!fc->dax) return false; - return true; + if (dax_mode == FUSE_DAX_ALWAYS) + return true; + + /* dax_mode is FUSE_DAX_INODE* */ + return flags & FUSE_ATTR_DAX; } -void fuse_dax_inode_init(struct inode *inode) +void fuse_dax_inode_init(struct inode *inode, unsigned int flags) { - if (!fuse_should_enable_dax(inode)) + if (!fuse_should_enable_dax(inode, flags)) return; inode->i_flags |= S_DAX; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 97ced8bddf59a3..4a059927314d47 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -3225,7 +3225,7 @@ static const struct address_space_operations fuse_file_aops = { .write_end = fuse_write_end, }; -void fuse_init_file_inode(struct inode *inode) +void fuse_init_file_inode(struct inode *inode, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); @@ -3239,5 +3239,5 @@ void fuse_init_file_inode(struct inode *inode) fi->writepages = RB_ROOT; if (IS_ENABLED(CONFIG_FUSE_DAX)) - fuse_dax_inode_init(inode); + fuse_dax_inode_init(inode, flags); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 336606fb95c63f..07d9548c237884 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1031,7 +1031,7 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc, /** * Initialize file operations on a regular file */ -void fuse_init_file_inode(struct inode *inode); +void fuse_init_file_inode(struct inode *inode, unsigned int flags); /** * Initialize inode operations on regular files and special files @@ -1297,7 +1297,7 @@ int fuse_dax_conn_alloc(struct fuse_conn *fc, enum fuse_dax_mode mode, struct dax_device *dax_dev); void fuse_dax_conn_free(struct fuse_conn *fc); bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi); -void fuse_dax_inode_init(struct inode *inode); +void fuse_dax_inode_init(struct inode *inode, unsigned int flags); void fuse_dax_inode_cleanup(struct inode *inode); bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment); void fuse_dax_cancel_work(struct fuse_conn *fc); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 0bfbd8865e7468..0c20add2fdfc33 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -319,7 +319,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) inode->i_ctime.tv_nsec = attr->ctimensec; if (S_ISREG(inode->i_mode)) { fuse_init_common(inode); - fuse_init_file_inode(inode); + fuse_init_file_inode(inode, attr->flags); } else if (S_ISDIR(inode->i_mode)) fuse_init_dir(inode); else if (S_ISLNK(inode->i_mode)) From 8ba7c54817ded35b8294e557e4b15eca45717b4f Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Thu, 25 Nov 2021 15:05:28 +0800 Subject: [PATCH 038/161] fuse: negotiate per inode DAX in FUSE_INIT Among the FUSE_INIT phase, client shall advertise per inode DAX if it's mounted with "dax=inode". Then server is aware that client is in per inode DAX mode, and will construct per-inode DAX attribute accordingly. Server shall also advertise support for per inode DAX. If server doesn't support it while client is mounted with "dax=inode", client will silently fallback to "dax=never" since "dax=inode" is advisory only. Signed-off-by: Jeffle Xu Reviewed-by: Vivek Goyal Signed-off-by: Miklos Szeredi (cherry picked from commit 2ee019fadcca343c3deea6a1767965bdf23fc3d0) (cherry picked from commit af31393a32193ea877e4b57d2f3d31ce87bc957a) --- fs/fuse/dax.c | 2 +- fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 13 +++++++++---- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index c346927b8c95fa..48c2a4312b3205 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -1348,7 +1348,7 @@ static bool fuse_should_enable_dax(struct inode *inode, unsigned int flags) return true; /* dax_mode is FUSE_DAX_INODE* */ - return flags & FUSE_ATTR_DAX; + return fc->inode_dax && (flags & FUSE_ATTR_DAX); } void fuse_dax_inode_init(struct inode *inode, unsigned int flags) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 07d9548c237884..af72e92c5403f3 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -786,6 +786,9 @@ struct fuse_conn { /* Add supplementary group info when creating a new inode */ unsigned int create_supp_group:1; + /* Does the filesystem support per inode DAX? */ + unsigned int inode_dax:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 0c20add2fdfc33..8cea42628b8e55 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1177,10 +1177,13 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, min_t(unsigned int, fc->max_pages_limit, max_t(unsigned int, arg->max_pages, 1)); } - if (IS_ENABLED(CONFIG_FUSE_DAX) && - flags & FUSE_MAP_ALIGNMENT && - !fuse_dax_check_alignment(fc, arg->map_alignment)) { - ok = false; + if (IS_ENABLED(CONFIG_FUSE_DAX)) { + if (flags & FUSE_MAP_ALIGNMENT && + !fuse_dax_check_alignment(fc, arg->map_alignment)) { + ok = false; + } + if (flags & FUSE_HAS_INODE_DAX) + fc->inode_dax = 1; } if (flags & FUSE_HANDLE_KILLPRIV_V2) { fc->handle_killpriv_v2 = 1; @@ -1242,6 +1245,8 @@ void fuse_send_init(struct fuse_mount *fm) #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; + if (fuse_is_inode_dax_mode(fm->fc->dax_mode)) + flags |= FUSE_HAS_INODE_DAX; #endif if (fm->fc->auto_submounts) flags |= FUSE_SUBMOUNTS; From 809c8cf49412d7e29ac67650ddb55238307f263e Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Thu, 25 Nov 2021 15:05:29 +0800 Subject: [PATCH 039/161] fuse: mark inode DONT_CACHE when per inode DAX hint changes When the per inode DAX hint changes while the file is still *opened*, it is quite complicated and maybe fragile to dynamically change the DAX state. Hence mark the inode and corresponding dentries as DONE_CACHE once the per inode DAX hint changes, so that the inode instance will be evicted and freed as soon as possible once the file is closed and the last reference to the inode is put. And then when the file gets reopened next time, the new instantiated inode will reflect the new DAX state. In summary, when the per inode DAX hint changes for an *opened* file, the DAX state of the file won't be updated until this file is closed and reopened later. Signed-off-by: Jeffle Xu Reviewed-by: Vivek Goyal Signed-off-by: Miklos Szeredi (cherry picked from commit c3cb6f935e322fa183988032e318b293d9e4fe53) (cherry picked from commit 1ea73aadd1e30e70e57ac0104dc8a43cce37a916) --- fs/fuse/dax.c | 9 +++++++++ fs/fuse/fuse_i.h | 1 + fs/fuse/inode.c | 3 +++ 3 files changed, 13 insertions(+) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 48c2a4312b3205..10eb50cbf398c4 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -1360,6 +1360,15 @@ void fuse_dax_inode_init(struct inode *inode, unsigned int flags) inode->i_data.a_ops = &fuse_dax_file_aops; } +void fuse_dax_dontcache(struct inode *inode, unsigned int flags) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + if (fuse_is_inode_dax_mode(fc->dax_mode) && + ((bool) IS_DAX(inode) != (bool) (flags & FUSE_ATTR_DAX))) + d_mark_dontcache(inode); +} + bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment) { if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index af72e92c5403f3..fedaed77ef831f 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1302,6 +1302,7 @@ void fuse_dax_conn_free(struct fuse_conn *fc); bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi); void fuse_dax_inode_init(struct inode *inode, unsigned int flags); void fuse_dax_inode_cleanup(struct inode *inode); +void fuse_dax_dontcache(struct inode *inode, unsigned int flags); bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment); void fuse_dax_cancel_work(struct fuse_conn *fc); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 8cea42628b8e55..f5002039169f64 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -307,6 +307,9 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, if (inval) invalidate_inode_pages2(inode->i_mapping); } + + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_dontcache(inode, attr->flags); } static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) From 6c5dcdfe7dd9fc512f9352b434a48431aab1017d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 20 Apr 2022 16:05:41 +0200 Subject: [PATCH 040/161] fuse: write inode in fuse_release() A race between write(2) and close(2) allows pages to be dirtied after fuse_flush -> write_inode_now(). If these pages are not flushed from fuse_release(), then there might not be a writable open file later. So any remaining dirty pages must be written back before the file is released. This is a partial revert of the blamed commit. Reported-by: syzbot+6e1efbd8efaaa6860e91@syzkaller.appspotmail.com Fixes: 36ea23374d1f ("fuse: write inode in fuse_vma_close() instead of fuse_release()") Cc: # v5.16 Signed-off-by: Miklos Szeredi (cherry picked from commit 035ff33cf4db101250fb980a3941bf078f37a544) (cherry picked from commit 3595c77bce5f085dc45a2430d9d0e3a3d635e983) --- fs/fuse/file.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4a059927314d47..3fe05126ff70df 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -342,6 +342,15 @@ static int fuse_open(struct inode *inode, struct file *file) static int fuse_release(struct inode *inode, struct file *file) { + struct fuse_conn *fc = get_fuse_conn(inode); + + /* + * Dirty pages might remain despite write_inode_now() call from + * fuse_flush() due to writes racing with the close. + */ + if (fc->writeback_cache) + write_inode_now(inode, 1); + fuse_release_common(file, false); /* return value is ignored by VFS */ From 7d3272fd3cd203d5c7588b462526d584f29d2c87 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Sat, 2 Apr 2022 18:32:50 +0800 Subject: [PATCH 041/161] fuse: avoid unnecessary spinlock bump Move dmap free worker kicker inside the critical region, so that extra spinlock lock/unlock could be avoided. Suggested-by: Liu Jiang Signed-off-by: Jeffle Xu Reviewed-by: Stefan Hajnoczi Reviewed-by: Vivek Goyal Signed-off-by: Miklos Szeredi (cherry picked from commit 47e301491c4f52ca744c7660cf57492b902261f6) (cherry picked from commit 37629c4c17f5bb8e6663a104c1a9bb8bdd3cb836) --- fs/fuse/dax.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 10eb50cbf398c4..e23e802a80130a 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -138,9 +138,9 @@ static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd) WARN_ON(fcd->nr_free_ranges <= 0); fcd->nr_free_ranges--; } + __kick_dmap_free_worker(fcd, 0); spin_unlock(&fcd->lock); - kick_dmap_free_worker(fcd, 0); return dmap; } From 246d540afbca31480964c5b78e39b8a0720f30b8 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Mon, 18 Jul 2022 16:50:12 +0800 Subject: [PATCH 042/161] fuse: Remove the control interface for virtio-fs The commit 15c8e72e88e0 ("fuse: allow skipping control interface and forced unmount") tries to remove the control interface for virtio-fs since it does not support aborting requests which are being processed. But it doesn't work now. This patch fixes it by skipping creating the control interface if fuse_conn->no_control is set. Fixes: 15c8e72e88e0 ("fuse: allow skipping control interface and forced unmount") Signed-off-by: Xie Yongji Signed-off-by: Miklos Szeredi (cherry picked from commit c64797809a64c73497082aa05e401a062ec1af34) (cherry picked from commit 169e8726cf40022218946bcabd5a1a556c737da1) --- fs/fuse/control.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 4632e7e84ba61c..45b584a9ecbfd5 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -254,7 +254,7 @@ int fuse_ctl_add_conn(struct fuse_conn *fc) struct dentry *parent; char name[32]; - if (!fuse_control_sb) + if (!fuse_control_sb || fc->no_control) return 0; parent = fuse_control_sb->s_root; @@ -292,7 +292,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc) { int i; - if (!fuse_control_sb) + if (!fuse_control_sb || fc->no_control) return; for (i = fc->ctl_ndents - 1; i >= 0; i--) { From 9cc1f85493eeb111e0a729bc7af6aafa7a7e24cc Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 11 Jul 2022 10:48:08 -0700 Subject: [PATCH 043/161] fuse: Add module param for CAP_SYS_ADMIN access bypassing allow_other Since commit 73f03c2b4b52 ("fuse: Restrict allow_other to the superblock's namespace or a descendant"), access to allow_other FUSE filesystems has been limited to users in the mounting user namespace or descendants. This prevents a process that is privileged in its userns - but not its parent namespaces - from mounting a FUSE fs w/ allow_other that is accessible to processes in parent namespaces. While this restriction makes sense overall it breaks a legitimate usecase: I have a tracing daemon which needs to peek into process' open files in order to symbolicate - similar to 'perf'. The daemon is a privileged process in the root userns, but is unable to peek into FUSE filesystems mounted by processes in child namespaces. This patch adds a module param, allow_sys_admin_access, to act as an escape hatch for this descendant userns logic and for the allow_other mount option in general. Setting allow_sys_admin_access allows processes with CAP_SYS_ADMIN in the initial userns to access FUSE filesystems irrespective of the mounting userns or whether allow_other was set. A sysadmin setting this param must trust FUSEs on the host to not DoS processes as described in 73f03c2b4b52. Signed-off-by: Dave Marchevsky Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi (cherry picked from commit 9ccf47b26b73ecf5b7278a4cb8d487d8ebb4c095) (cherry picked from commit 6dd15c166914ec7531c66cb3ca69f2824c0879c3) --- Documentation/filesystems/fuse.rst | 29 ++++++++++++++++++++++++----- fs/fuse/dir.c | 9 +++++++++ 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/Documentation/filesystems/fuse.rst b/Documentation/filesystems/fuse.rst index 8120c3c0cb4e03..1e31e87aee68c5 100644 --- a/Documentation/filesystems/fuse.rst +++ b/Documentation/filesystems/fuse.rst @@ -279,7 +279,7 @@ How are requirements fulfilled? the filesystem or not. Note that the *ptrace* check is not strictly necessary to - prevent B/2/i, it is enough to check if mount owner has enough + prevent C/2/i, it is enough to check if mount owner has enough privilege to send signal to the process accessing the filesystem, since *SIGSTOP* can be used to get a similar effect. @@ -288,10 +288,29 @@ I think these limitations are unacceptable? If a sysadmin trusts the users enough, or can ensure through other measures, that system processes will never enter non-privileged -mounts, it can relax the last limitation with a 'user_allow_other' -config option. If this config option is set, the mounting user can -add the 'allow_other' mount option which disables the check for other -users' processes. +mounts, it can relax the last limitation in several ways: + + - With the 'user_allow_other' config option. If this config option is + set, the mounting user can add the 'allow_other' mount option which + disables the check for other users' processes. + + User namespaces have an unintuitive interaction with 'allow_other': + an unprivileged user - normally restricted from mounting with + 'allow_other' - could do so in a user namespace where they're + privileged. If any process could access such an 'allow_other' mount + this would give the mounting user the ability to manipulate + processes in user namespaces where they're unprivileged. For this + reason 'allow_other' restricts access to users in the same userns + or a descendant. + + - With the 'allow_sys_admin_access' module option. If this option is + set, super user's processes have unrestricted access to mounts + irrespective of allow_other setting or user namespace of the + mounting user. + +Note that both of these relaxations expose the system to potential +information leak or *DoS* as described in points B and C/2/i-ii in the +preceding section. Kernel - userspace interface ============================ diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 095ea5458c8e56..ad16b2260a6fe0 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -21,6 +22,11 @@ #include #include +static bool __read_mostly allow_sys_admin_access; +module_param(allow_sys_admin_access, bool, 0644); +MODULE_PARM_DESC(allow_sys_admin_access, + "Allow users with CAP_SYS_ADMIN in initial userns to bypass allow_other access check"); + static void fuse_advise_use_readdirplus(struct inode *dir) { struct fuse_inode *fi = get_fuse_inode(dir); @@ -1299,6 +1305,9 @@ int fuse_allow_current_process(struct fuse_conn *fc) { const struct cred *cred; + if (allow_sys_admin_access && capable(CAP_SYS_ADMIN)) + return 1; + if (fc->allow_other) return current_in_userns(fc->user_ns); From 370fb40fb44a3b7baf13ccb5a08054e5c80f818e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 24 Sep 2022 07:00:00 +0200 Subject: [PATCH 044/161] fuse: implement ->tmpfile() This is basically equivalent to the FUSE_CREATE operation which creates and opens a regular file. Add a new FUSE_TMPFILE operation, otherwise just reuse the protocol and the code for FUSE_CREATE. Acked-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi (cherry picked from commit 7d37539037c2fca70346fbedc219f655253d5cff) (cherry picked from commit 568bb891fed7e8bfa3044e79e809b7b6baedc8c9) --- fs/fuse/dir.c | 24 +++++++++++++++++++++--- fs/fuse/fuse_i.h | 3 +++ include/uapi/linux/fuse.h | 6 +++++- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index ad16b2260a6fe0..67cee6223e3f7b 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -615,7 +615,7 @@ static void free_ext_value(struct fuse_args *args) */ static int fuse_create_open(struct inode *dir, struct dentry *entry, struct file *file, unsigned int flags, - umode_t mode) + umode_t mode, u32 opcode) { int err; struct inode *inode; @@ -657,7 +657,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; } - args.opcode = FUSE_CREATE; + args.opcode = opcode; args.nodeid = get_node_id(dir); args.in_numargs = 2; args.in_args[0].size = sizeof(inarg); @@ -753,7 +753,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, if (fc->no_create) goto mknod; - err = fuse_create_open(dir, entry, file, flags, mode); + err = fuse_create_open(dir, entry, file, flags, mode, FUSE_CREATE); if (err == -ENOSYS) { fc->no_create = 1; goto mknod; @@ -870,6 +870,23 @@ static int fuse_create(struct user_namespace *mnt_userns, struct inode *dir, return fuse_mknod(&init_user_ns, dir, entry, mode, 0); } +static int fuse_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct file *file, umode_t mode) +{ + struct fuse_conn *fc = get_fuse_conn(dir); + int err; + + if (fc->no_tmpfile) + return -EOPNOTSUPP; + + err = fuse_create_open(dir, file->f_path.dentry, file, file->f_flags, mode, FUSE_TMPFILE); + if (err == -ENOSYS) { + fc->no_tmpfile = 1; + err = -EOPNOTSUPP; + } + return err; +} + static int fuse_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *entry, umode_t mode) { @@ -1983,6 +2000,7 @@ static const struct inode_operations fuse_dir_inode_operations = { .setattr = fuse_setattr, .create = fuse_create, .atomic_open = fuse_atomic_open, + .tmpfile = fuse_tmpfile, .mknod = fuse_mknod, .permission = fuse_permission, .getattr = fuse_getattr, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index fedaed77ef831f..01985c601fb591 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -789,6 +789,9 @@ struct fuse_conn { /* Does the filesystem support per inode DAX? */ unsigned int inode_dax:1; + /* Is tmpfile not implemented by fs? */ + unsigned int no_tmpfile:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index dffc61ac594cae..8255b2e4b16429 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -195,6 +195,9 @@ * - add security context to create, mkdir, symlink, and mknod requests * - add FUSE_HAS_INODE_DAX, FUSE_ATTR_DAX * + * 7.37 + * - add FUSE_TMPFILE + * * 7.38 * - add FUSE_EXPIRE_ONLY flag to fuse_notify_inval_entry * - add FUSE_HAS_EXPIRE_ONLY @@ -233,7 +236,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 36 +#define FUSE_KERNEL_MINOR_VERSION 37 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -565,6 +568,7 @@ enum fuse_opcode { FUSE_SETUPMAPPING = 48, FUSE_REMOVEMAPPING = 49, FUSE_SYNCFS = 50, + FUSE_TMPFILE = 51, /* CUSE specific operations */ CUSE_INIT = 4096, From 9e27e2e472a51ab69a981da8982c6e82e419818f Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Wed, 12 Oct 2022 13:23:23 +0200 Subject: [PATCH 045/161] fs/fuse: Replace kmap() with kmap_local_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The use of kmap() is being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as the mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap’s pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and still valid. Therefore, replace kmap() with kmap_local_page() in fuse_readdir_cached(), it being the only call site of kmap() currently left in fs/fuse. Cc: "Venkataramanan, Anirudh" Suggested-by: Ira Weiny Signed-off-by: Fabio M. De Francesco Signed-off-by: Miklos Szeredi (cherry picked from commit a1db2f7edef095a385d477ab81e780694d63eebd) (cherry picked from commit 44b36054ae068a3764d01b704555fffc380e82a5) --- fs/fuse/readdir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index e8deaacf1832a1..dc603479b30efa 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -547,9 +547,9 @@ static int fuse_readdir_cached(struct file *file, struct dir_context *ctx) * Contents of the page are now protected against changing by holding * the page lock. */ - addr = kmap(page); + addr = kmap_local_page(page); res = fuse_parse_cache(ff, addr, size, ctx); - kunmap(page); + kunmap_local(addr); unlock_page(page); put_page(page); From c2a20a910a1467bca27e4ae7733d27ee7942cd13 Mon Sep 17 00:00:00 2001 From: Jiachen Zhang Date: Wed, 28 Sep 2022 20:19:34 +0800 Subject: [PATCH 046/161] fuse: always revalidate rename target dentry The previous commit df8629af2934 ("fuse: always revalidate if exclusive create") ensures that the dentries are revalidated on O_EXCL creates. This commit complements it by also performing revalidation for rename target dentries. Otherwise, a rename target file that only exists in kernel dentry cache but not in the filesystem will result in EEXIST if RENAME_NOREPLACE flag is used. Signed-off-by: Jiachen Zhang Signed-off-by: Zhang Tianci Signed-off-by: Miklos Szeredi (cherry picked from commit ccc031e26afe60d2a5a3d93dabd9c978210825fb) (cherry picked from commit c08a7ab65763ae1cc628826e017f76cbf39beb63) --- fs/fuse/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 67cee6223e3f7b..d9b496162e5aeb 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -214,7 +214,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (inode && fuse_is_bad(inode)) goto invalid; else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || - (flags & (LOOKUP_EXCL | LOOKUP_REVAL))) { + (flags & (LOOKUP_EXCL | LOOKUP_REVAL | LOOKUP_RENAME_TARGET))) { struct fuse_entry_out outarg; FUSE_ARGS(args); struct fuse_forget_link *forget; From 36ed6aaeb0ee40109fd0e737fad15a0838c9b30e Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 14 Sep 2022 16:26:32 +0200 Subject: [PATCH 047/161] fuse: Remove user_ns check for FUSE_DEV_IOC_CLONE Commit 8ed1f0e22f49e ("fs/fuse: fix ioctl type confusion") fixed a type confusion bug by adding an ->f_op comparison. Based on some off-list discussion back then, another check was added to compare the f_cred->user_ns. This is not for security reasons, but was based on the idea that a FUSE device FD should be using the UID/GID mappings of its f_cred->user_ns, and those translations are done using fc->user_ns, which matches the f_cred->user_ns of the initial FUSE device FD thanks to the check in fuse_fill_super(). See also commit 8cb08329b0809 ("fuse: Support fuse filesystems outside of init_user_ns"). But FUSE_DEV_IOC_CLONE is, at a higher level, a *cloning* operation that copies an existing context (with a weird API that involves first opening /dev/fuse, then tying the resulting new FUSE device FD to an existing FUSE instance). So if an application is already passing FUSE FDs across userns boundaries and dealing with the resulting ID mapping complications somehow, it doesn't make much sense to block this cloning operation. I've heard that this check is an obstacle for some folks, and I don't see a good reason to keep it, so remove it. Signed-off-by: Jann Horn Signed-off-by: Miklos Szeredi (cherry picked from commit 0618021e34c6b1edc4fc754ec53ab7fdcc98aaec) (cherry picked from commit d235f3892b8a15aa04c1cab9aba49788213dfc0f) --- fs/fuse/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8bdb09a221b58a..3446706c35cd0a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2272,8 +2272,7 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, * Check against file->f_op because CUSE * uses the same ioctl handler. */ - if (old->f_op == file->f_op && - old->f_cred->user_ns == file->f_cred->user_ns) + if (old->f_op == file->f_op) fud = fuse_get_dev(old); if (fud) { From cae3db12cfb1d03fff94ad2b0c6da8b43f2a8cc5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 9 Sep 2022 11:40:21 +0200 Subject: [PATCH 048/161] fuse: port to vfs{g,u}id_t and associated helpers A while ago we introduced a dedicated vfs{g,u}id_t type in commit 1e5267cd0895 ("mnt_idmapping: add vfs{g,u}id_t"). We already switched over a good part of the VFS. Ultimately we will remove all legacy idmapped mount helpers that operate only on k{g,u}id_t in favor of the new type safe helpers that operate on vfs{g,u}id_t. Cc: Seth Forshee (Digital Ocean) Cc: Christoph Hellwig Signed-off-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi (cherry picked from commit 00d369bc2de5986504fc0d60bfeae4c342b2cad5) (cherry picked from commit 643d9ba6fbd2875e77e1461a552c8be0b2913b8d) --- fs/fuse/acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 52b165319be1fd..3449dfdf1a7d66 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -95,7 +95,7 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, return ret; } - if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) && + if (!vfsgid_in_group_p(i_gid_into_vfsgid(&init_user_ns, inode)) && !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; From 4cbfa35a822579666d1a825e3f70862d759838ad Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Thu, 1 Sep 2022 07:42:59 +0000 Subject: [PATCH 049/161] fuse: remove the unneeded result variable Return the value fuse_dev_release() directly instead of storing it in another redundant variable. Reported-by: Zeal Robot Signed-off-by: ye xingchen Signed-off-by: Miklos Szeredi (cherry picked from commit e2283a736676554f72dbdcb62fdc1d23daf7044f) (cherry picked from commit e99769bb8f519ea9e8ac51d908ad2088434ba892) --- fs/fuse/cuse.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index be65eb5d4393b7..5884c872db1ec2 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -545,7 +545,6 @@ static int cuse_channel_release(struct inode *inode, struct file *file) { struct fuse_dev *fud = file->private_data; struct cuse_conn *cc = fc_to_cc(fud->fc); - int rc; /* remove from the conntbl, no more access from this point on */ mutex_lock(&cuse_lock); @@ -560,9 +559,7 @@ static int cuse_channel_release(struct inode *inode, struct file *file) cdev_del(cc->cdev); } - rc = fuse_dev_release(inode, file); /* puts the base reference */ - - return rc; + return fuse_dev_release(inode, file); } static struct file_operations cuse_channel_fops; /* initialized during init */ From b081e3903cb6dcb75a7da829f13e28ab8f5e4e70 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 25 Oct 2022 09:10:17 -0700 Subject: [PATCH 050/161] fuse: Rearrange fuse_allow_current_process checks This is a followup to a previous commit of mine [0], which added the allow_sys_admin_access && capable(CAP_SYS_ADMIN) check. This patch rearranges the order of checks in fuse_allow_current_process without changing functionality. Commit 9ccf47b26b73 ("fuse: Add module param for CAP_SYS_ADMIN access bypassing allow_other") added allow_sys_admin_access && capable(CAP_SYS_ADMIN) check to the beginning of the function, with the reasoning that allow_sys_admin_access should be an 'escape hatch' for users with CAP_SYS_ADMIN, allowing them to skip any subsequent checks. However, placing this new check first results in many capable() calls when allow_sys_admin_access is set, where another check would've also returned 1. This can be problematic when a BPF program is tracing capable() calls. At Meta we ran into such a scenario recently. On a host where allow_sys_admin_access is set but most of the FUSE access is from processes which would pass other checks - i.e. they don't need CAP_SYS_ADMIN 'escape hatch' - this results in an unnecessary capable() call for each fs op. We also have a daemon tracing capable() with BPF and doing some data collection, so tracing these extraneous capable() calls has the potential to regress performance for an application doing many FUSE ops. So rearrange the order of these checks such that CAP_SYS_ADMIN 'escape hatch' is checked last. Add a small helper, fuse_permissible_uidgid, to make the logic easier to understand. Previously, if allow_other is set on the fuse_conn, uid/git checking doesn't happen as current_in_userns result is returned. These semantics are maintained here: fuse_permissible_uidgid check only happens if allow_other is not set. Signed-off-by: Dave Marchevsky Suggested-by: Andrii Nakryiko Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi (cherry picked from commit b138777786f780b9d5ce1989032608acbede0493) (cherry picked from commit 1cd4785df291b1f85c04b7ffd9fb2a2062c3fd43) --- fs/fuse/dir.c | 35 ++++++++++++++++++++--------------- fs/fuse/fuse_i.h | 2 +- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index d9b496162e5aeb..ee3a46f24e2dfe 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1305,6 +1305,18 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, return err; } +static inline bool fuse_permissible_uidgid(struct fuse_conn *fc) +{ + const struct cred *cred = current_cred(); + + return (uid_eq(cred->euid, fc->user_id) && + uid_eq(cred->suid, fc->user_id) && + uid_eq(cred->uid, fc->user_id) && + gid_eq(cred->egid, fc->group_id) && + gid_eq(cred->sgid, fc->group_id) && + gid_eq(cred->gid, fc->group_id)); +} + /* * Calling into a user-controlled filesystem gives the filesystem * daemon ptrace-like capabilities over the current process. This @@ -1318,26 +1330,19 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, * for which the owner of the mount has ptrace privilege. This * excludes processes started by other users, suid or sgid processes. */ -int fuse_allow_current_process(struct fuse_conn *fc) +bool fuse_allow_current_process(struct fuse_conn *fc) { - const struct cred *cred; - - if (allow_sys_admin_access && capable(CAP_SYS_ADMIN)) - return 1; + bool allow; if (fc->allow_other) - return current_in_userns(fc->user_ns); + allow = current_in_userns(fc->user_ns); + else + allow = fuse_permissible_uidgid(fc); - cred = current_cred(); - if (uid_eq(cred->euid, fc->user_id) && - uid_eq(cred->suid, fc->user_id) && - uid_eq(cred->uid, fc->user_id) && - gid_eq(cred->egid, fc->group_id) && - gid_eq(cred->sgid, fc->group_id) && - gid_eq(cred->gid, fc->group_id)) - return 1; + if (!allow && allow_sys_admin_access && capable(CAP_SYS_ADMIN)) + allow = true; - return 0; + return allow; } static int fuse_access(struct inode *inode, int mask) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 01985c601fb591..2d63adcb2af8b2 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1184,7 +1184,7 @@ bool fuse_invalid_attr(struct fuse_attr *attr); /** * Is current process allowed to perform filesystem operation? */ -int fuse_allow_current_process(struct fuse_conn *fc); +bool fuse_allow_current_process(struct fuse_conn *fc); u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); From eaa08941b2632d8e7866953fdc975ec646399f36 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 26 Jan 2023 17:10:58 +0100 Subject: [PATCH 051/161] fuse: in fuse_flush only wait if someone wants the return code If a fuse filesystem is mounted inside a container, there is a problem during pid namespace destruction. The scenario is: 1. task (a thread in the fuse server, with a fuse file open) starts exiting, does exit_signals(), goes into fuse_flush() -> wait 2. fuse daemon gets killed, tries to wake everyone up 3. task from 1 is stuck because complete_signal() doesn't wake it up, since it has PF_EXITING. The result is that the thread will never be woken up, and pid namespace destruction will block indefinitely. To add insult to injury, nobody is waiting for these return codes, since the pid namespace is being destroyed. To fix this, let's not block on flush operations when the current task has PF_EXITING. This does change the semantics slightly: the wait here is for posix locks to be unlocked, so the task will exit before things are unlocked. To quote Miklos: "remote" posix locks are almost never used due to problems like this, so I think it's safe to do this. Signed-off-by: "Eric W. Biederman" Signed-off-by: Tycho Andersen Link: https://lore.kernel.org/all/YrShFXRLtRt6T%2Fj+@risky/ Tested-by: Tycho Andersen Signed-off-by: Miklos Szeredi (cherry picked from commit 5a8bee63b10f6f2f52f6d22e109a4a147409842a) (cherry picked from commit ee384bb37d582ce5269cd2caddf63f0d058830a0) --- fs/fuse/file.c | 89 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 63 insertions(+), 26 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 3fe05126ff70df..c768f9ea15e7a4 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -18,6 +18,7 @@ #include #include #include +#include static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, int opcode, @@ -477,48 +478,36 @@ static void fuse_sync_writes(struct inode *inode) fuse_release_nowrite(inode); } -static int fuse_flush(struct file *file, fl_owner_t id) -{ - struct inode *inode = file_inode(file); - struct fuse_mount *fm = get_fuse_mount(inode); - struct fuse_file *ff = file->private_data; +struct fuse_flush_args { + struct fuse_args args; struct fuse_flush_in inarg; - FUSE_ARGS(args); - int err; - - if (fuse_is_bad(inode)) - return -EIO; + struct work_struct work; + struct file *file; +}; - if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache) - return 0; +static int fuse_do_flush(struct fuse_flush_args *fa) +{ + int err; + struct inode *inode = file_inode(fa->file); + struct fuse_mount *fm = get_fuse_mount(inode); err = write_inode_now(inode, 1); if (err) - return err; + goto out; inode_lock(inode); fuse_sync_writes(inode); inode_unlock(inode); - err = filemap_check_errors(file->f_mapping); + err = filemap_check_errors(fa->file->f_mapping); if (err) - return err; + goto out; err = 0; if (fm->fc->no_flush) goto inval_attr_out; - memset(&inarg, 0, sizeof(inarg)); - inarg.fh = ff->fh; - inarg.lock_owner = fuse_lock_owner_id(fm->fc, id); - args.opcode = FUSE_FLUSH; - args.nodeid = get_node_id(inode); - args.in_numargs = 1; - args.in_args[0].size = sizeof(inarg); - args.in_args[0].value = &inarg; - args.force = true; - - err = fuse_simple_request(fm, &args); + err = fuse_simple_request(fm, &fa->args); if (err == -ENOSYS) { fm->fc->no_flush = 1; err = 0; @@ -531,9 +520,57 @@ static int fuse_flush(struct file *file, fl_owner_t id) */ if (!err && fm->fc->writeback_cache) fuse_invalidate_attr_mask(inode, STATX_BLOCKS); + +out: + fput(fa->file); + kfree(fa); return err; } +static void fuse_flush_async(struct work_struct *work) +{ + struct fuse_flush_args *fa = container_of(work, typeof(*fa), work); + + fuse_do_flush(fa); +} + +static int fuse_flush(struct file *file, fl_owner_t id) +{ + struct fuse_flush_args *fa; + struct inode *inode = file_inode(file); + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_file *ff = file->private_data; + + if (fuse_is_bad(inode)) + return -EIO; + + if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache) + return 0; + + fa = kzalloc(sizeof(*fa), GFP_KERNEL); + if (!fa) + return -ENOMEM; + + fa->inarg.fh = ff->fh; + fa->inarg.lock_owner = fuse_lock_owner_id(fm->fc, id); + fa->args.opcode = FUSE_FLUSH; + fa->args.nodeid = get_node_id(inode); + fa->args.in_numargs = 1; + fa->args.in_args[0].size = sizeof(fa->inarg); + fa->args.in_args[0].value = &fa->inarg; + fa->args.force = true; + fa->file = get_file(file); + + /* Don't wait if the task is exiting */ + if (current->flags & PF_EXITING) { + INIT_WORK(&fa->work, fuse_flush_async); + schedule_work(&fa->work); + return 0; + } + + return fuse_do_flush(fa); +} + int fuse_fsync_common(struct file *file, loff_t start, loff_t end, int datasync, int opcode) { From 788a6c66b637e5270e070f2f760d848de466a63e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 8 Jan 2023 17:00:23 -0800 Subject: [PATCH 052/161] fuse: fix all W=1 kernel-doc warnings Use correct function name in kernel-doc notation. (1) Don't use "/**" to begin non-kernel-doc comments. (3) Fixes these warnings: fs/fuse/cuse.c:272: warning: expecting prototype for cuse_parse_dev_info(). Prototype was for cuse_parse_devinfo() instead fs/fuse/dev.c:212: warning: expecting prototype for A new request is available, wake fiq(). Prototype was for fuse_dev_wake_and_unlock() instead fs/fuse/dir.c:149: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Mark the attributes as stale due to an atime change. Avoid the invalidate if fs/fuse/file.c:656: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * In case of short read, the caller sets 'pos' to the position of Signed-off-by: Randy Dunlap Signed-off-by: Miklos Szeredi (cherry picked from commit 06bbb761c12dd147e4499f4d7a187699c5a0391f) (cherry picked from commit 0d5977bfb8719f931d3a379c7e9ebd2a03d7b500) --- fs/fuse/cuse.c | 2 +- fs/fuse/dev.c | 2 +- fs/fuse/dir.c | 2 +- fs/fuse/file.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 5884c872db1ec2..91e89e68177ee4 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -256,7 +256,7 @@ static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp) } /** - * cuse_parse_dev_info - parse device info + * cuse_parse_devinfo - parse device info * @p: device info string * @len: length of device info string * @devinfo: out parameter for parsed device info diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 3446706c35cd0a..e25bc3ac84bd12 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -204,7 +204,7 @@ static unsigned int fuse_req_hash(u64 unique) return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS); } -/** +/* * A new request is available, wake fiq->waitq */ static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index ee3a46f24e2dfe..3894e1bc9dbb85 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -145,7 +145,7 @@ static void fuse_dir_changed(struct inode *dir) inode_maybe_inc_iversion(dir, false); } -/** +/* * Mark the attributes as stale due to an atime change. Avoid the invalidate if * atime is not used. */ diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c768f9ea15e7a4..a748fee8bf9116 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -689,7 +689,7 @@ static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io) return io->bytes < 0 ? io->size : io->bytes; } -/** +/* * In case of short read, the caller sets 'pos' to the position of * actual end of fuse request in IO request. Otherwise, if bytes_requested * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1. From 3e147bf068525fb07a40a49639c184abe77abb8a Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Thu, 26 Jan 2023 11:23:18 +0100 Subject: [PATCH 053/161] fuse: add inode/permission checks to fileattr_get/fileattr_set It looks like these checks were accidentally lost during the conversion to fileattr API. Fixes: 72227eac177d ("fuse: convert to fileattr") Cc: # v5.13 Signed-off-by: Alexander Mikhalitsyn Signed-off-by: Miklos Szeredi (cherry picked from commit 1cc4606d19e3710bfab3f6704b87ff9580493c69) (cherry picked from commit ca5d1a3d35c74c57e11c38bdd011dc4f1f368c31) --- fs/fuse/ioctl.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index cfa0ecf844297f..43dba5f81487d6 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -424,6 +424,12 @@ static struct fuse_file *fuse_priv_ioctl_prepare(struct inode *inode) struct fuse_mount *fm = get_fuse_mount(inode); bool isdir = S_ISDIR(inode->i_mode); + if (!fuse_allow_current_process(fm->fc)) + return ERR_PTR(-EACCES); + + if (fuse_is_bad(inode)) + return ERR_PTR(-EIO); + if (!S_ISREG(inode->i_mode) && !isdir) return ERR_PTR(-ENOTTY); From 9c451f383b4008f5dcff9cb86e6f3d9deedf3035 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 21 Aug 2022 14:02:15 -0400 Subject: [PATCH 054/161] fuse_dev_ioctl(): switch to fdget() Reviewed-by: Christian Brauner Signed-off-by: Al Viro (cherry picked from commit 4a892c0fe4bb0546d68a89fa595bd22cb4be2576) (cherry picked from commit 1c1bd63225573442584363cfbbda0c2faafd29d9) --- fs/fuse/dev.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index e25bc3ac84bd12..8573d79ef29c80 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2259,30 +2259,31 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, int res; int oldfd; struct fuse_dev *fud = NULL; + struct fd f; switch (cmd) { case FUSE_DEV_IOC_CLONE: - res = -EFAULT; - if (!get_user(oldfd, (__u32 __user *)arg)) { - struct file *old = fget(oldfd); - - res = -EINVAL; - if (old) { - /* - * Check against file->f_op because CUSE - * uses the same ioctl handler. - */ - if (old->f_op == file->f_op) - fud = fuse_get_dev(old); - - if (fud) { - mutex_lock(&fuse_mutex); - res = fuse_device_clone(fud->fc, file); - mutex_unlock(&fuse_mutex); - } - fput(old); - } + if (get_user(oldfd, (__u32 __user *)arg)) + return -EFAULT; + + f = fdget(oldfd); + if (!f.file) + return -EINVAL; + + /* + * Check against file->f_op because CUSE + * uses the same ioctl handler. + */ + if (f.file->f_op == file->f_op) + fud = fuse_get_dev(f.file); + + res = -EINVAL; + if (fud) { + mutex_lock(&fuse_mutex); + res = fuse_device_clone(fud->fc, file); + mutex_unlock(&fuse_mutex); } + fdput(f); break; default: res = -ENOTTY; From 6ea1dcfe78dd7270233067687b0e5a280c5a5c9b Mon Sep 17 00:00:00 2001 From: zyfjeff Date: Tue, 13 Dec 2022 19:51:47 +0800 Subject: [PATCH 055/161] fuse: remove duplicate check for nodeid before this check, the nodeid has already been checked once, so the check here doesn't make an sense, so remove the check for nodeid here. if (err || !outarg->nodeid) goto out_put_forget; err = -EIO; >>> if (!outarg->nodeid) goto out_put_forget; Signed-off-by: zyfjeff Signed-off-by: Miklos Szeredi (cherry picked from commit 1c3610d30e5c15f4db7eb8465e311b582aa50ebe) (cherry picked from commit 5e4bed78815f3e02a0b48f4b7e7a5973c6f8826e) --- fs/fuse/dir.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 3894e1bc9dbb85..8eba92e7d92e5f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -395,8 +395,6 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name goto out_put_forget; err = -EIO; - if (!outarg->nodeid) - goto out_put_forget; if (fuse_invalid_attr(&outarg->attr)) goto out_put_forget; From f5f9b70d77cbfd3ae7582a23285c8fd8a03a2f7c Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 15 Apr 2022 13:53:56 +0200 Subject: [PATCH 056/161] fuse: Apply flags2 only when userspace set the FUSE_INIT_EXT This is just a safety precaution to avoid checking flags on memory that was initialized on the user space side. libfuse zeroes struct fuse_init_out outarg, but this is not guranteed to be done in all implementations. Better is to act on flags and to only apply flags2 when FUSE_INIT_EXT is set. There is a risk with this change, though - it might break existing user space libraries, which are already using flags2 without setting FUSE_INIT_EXT. The corresponding libfuse patch is here https://github.com/libfuse/libfuse/pull/662 Signed-off-by: Bernd Schubert Fixes: 53db28933e95 ("fuse: extend init flags") Cc: # v5.17 Signed-off-by: Miklos Szeredi (cherry picked from commit 3066ff93476c35679cb07a97cce37d9bb07632ff) (cherry picked from commit 12136574e4b42aa612fa392f599f9919ba5a42b8) --- fs/fuse/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index f5002039169f64..d3f39e4aac3471 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1122,7 +1122,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, process_init_limits(fc, arg); if (arg->minor >= 6) { - u64 flags = arg->flags | (u64) arg->flags2 << 32; + u64 flags = arg->flags; + + if (flags & FUSE_INIT_EXT) + flags |= (u64) arg->flags2 << 32; ra_pages = arg->max_readahead / PAGE_SIZE; if (flags & FUSE_ASYNC_READ) From 3564b861526617434454bb0b47982e85026ae78b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 7 Jun 2023 17:49:20 +0200 Subject: [PATCH 057/161] fuse: revalidate: don't invalidate if interrupted If the LOOKUP request triggered from fuse_dentry_revalidate() is interrupted, then the dentry will be invalidated, possibly resulting in submounts being unmounted. Reported-by: Xu Rongbo Closes: https://lore.kernel.org/all/CAJfpegswN_CJJ6C3RZiaK6rpFmNyWmXfaEpnQUJ42KCwNF5tWw@mail.gmail.com/ Fixes: 9e6268db496a ("[PATCH] FUSE - read-write operations") Cc: Signed-off-by: Miklos Szeredi (cherry picked from commit a9d1c4c6df0e568207907c04aed9e7beb1294c42) (cherry picked from commit 395b184b66646cc80472fefc78c1396daa818350) --- fs/fuse/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 8eba92e7d92e5f..0609fde3ace505 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -258,7 +258,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) spin_unlock(&fi->lock); } kfree(forget); - if (ret == -ENOMEM) + if (ret == -ENOMEM || ret == -EINTR) goto out; if (ret || fuse_invalid_attr(&outarg.attr) || fuse_stale_inode(inode, outarg.generation, &outarg.attr)) From 89cccdc4a91c025c0e4c291358d38efe52a7e741 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 14 Aug 2023 13:05:30 +0200 Subject: [PATCH 058/161] Revert "fuse: in fuse_flush only wait if someone wants the return code" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 5a8bee63b10f6f2f52f6d22e109a4a147409842a. Jürg Billeter reports the following regression: Since v6.3-rc1 commit 5a8bee63b1 ("fuse: in fuse_flush only wait if someone wants the return code") `fput()` is called asynchronously if a file is closed as part of a process exiting, i.e., if there was no explicit `close()` before exit. If the file was open for writing, also `put_write_access()` is called asynchronously as part of the async `fput()`. If that newly written file is an executable, attempting to `execve()` the new file can fail with `ETXTBSY` if it's called after the writer process exited but before the async `fput()` has run. Reported-and-tested-by: "Jürg Billeter" Cc: # v6.3 Link: https://lore.kernel.org/all/4f66cded234462964899f2a661750d6798a57ec0.camel@bitron.ch/ Signed-off-by: Miklos Szeredi (cherry picked from commit 91ec6c85599b60c00caf4e9a9d6c4d6e5dd5e93c) (cherry picked from commit 2dd5d5a152bd35ffb90ca57a4d65390158ac4f46) --- fs/fuse/file.c | 89 +++++++++++++++----------------------------------- 1 file changed, 26 insertions(+), 63 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a748fee8bf9116..099ce4d5faed36 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -18,7 +18,6 @@ #include #include #include -#include static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, int opcode, @@ -478,36 +477,48 @@ static void fuse_sync_writes(struct inode *inode) fuse_release_nowrite(inode); } -struct fuse_flush_args { - struct fuse_args args; - struct fuse_flush_in inarg; - struct work_struct work; - struct file *file; -}; - -static int fuse_do_flush(struct fuse_flush_args *fa) +static int fuse_flush(struct file *file, fl_owner_t id) { - int err; - struct inode *inode = file_inode(fa->file); + struct inode *inode = file_inode(file); struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_file *ff = file->private_data; + struct fuse_flush_in inarg; + FUSE_ARGS(args); + int err; + + if (fuse_is_bad(inode)) + return -EIO; + + if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache) + return 0; err = write_inode_now(inode, 1); if (err) - goto out; + return err; inode_lock(inode); fuse_sync_writes(inode); inode_unlock(inode); - err = filemap_check_errors(fa->file->f_mapping); + err = filemap_check_errors(file->f_mapping); if (err) - goto out; + return err; err = 0; if (fm->fc->no_flush) goto inval_attr_out; - err = fuse_simple_request(fm, &fa->args); + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + inarg.lock_owner = fuse_lock_owner_id(fm->fc, id); + args.opcode = FUSE_FLUSH; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.force = true; + + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fm->fc->no_flush = 1; err = 0; @@ -520,57 +531,9 @@ static int fuse_do_flush(struct fuse_flush_args *fa) */ if (!err && fm->fc->writeback_cache) fuse_invalidate_attr_mask(inode, STATX_BLOCKS); - -out: - fput(fa->file); - kfree(fa); return err; } -static void fuse_flush_async(struct work_struct *work) -{ - struct fuse_flush_args *fa = container_of(work, typeof(*fa), work); - - fuse_do_flush(fa); -} - -static int fuse_flush(struct file *file, fl_owner_t id) -{ - struct fuse_flush_args *fa; - struct inode *inode = file_inode(file); - struct fuse_mount *fm = get_fuse_mount(inode); - struct fuse_file *ff = file->private_data; - - if (fuse_is_bad(inode)) - return -EIO; - - if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache) - return 0; - - fa = kzalloc(sizeof(*fa), GFP_KERNEL); - if (!fa) - return -ENOMEM; - - fa->inarg.fh = ff->fh; - fa->inarg.lock_owner = fuse_lock_owner_id(fm->fc, id); - fa->args.opcode = FUSE_FLUSH; - fa->args.nodeid = get_node_id(inode); - fa->args.in_numargs = 1; - fa->args.in_args[0].size = sizeof(fa->inarg); - fa->args.in_args[0].value = &fa->inarg; - fa->args.force = true; - fa->file = get_file(file); - - /* Don't wait if the task is exiting */ - if (current->flags & PF_EXITING) { - INIT_WORK(&fa->work, fuse_flush_async); - schedule_work(&fa->work); - return 0; - } - - return fuse_do_flush(fa); -} - int fuse_fsync_common(struct file *file, loff_t start, loff_t end, int datasync, int opcode) { From 5d43fdd30d54483b00bc7e58c14f9dd77430d529 Mon Sep 17 00:00:00 2001 From: ruanmeisi Date: Tue, 25 Apr 2023 19:13:54 +0800 Subject: [PATCH 059/161] fuse: nlookup missing decrement in fuse_direntplus_link During our debugging of glusterfs, we found an Assertion failed error: inode_lookup >= nlookup, which was caused by the nlookup value in the kernel being greater than that in the FUSE file system. The issue was introduced by fuse_direntplus_link, where in the function, fuse_iget increments nlookup, and if d_splice_alias returns failure, fuse_direntplus_link returns failure without decrementing nlookup https://github.com/gluster/glusterfs/pull/4081 Signed-off-by: ruanmeisi Fixes: 0b05b18381ee ("fuse: implement NFS-like readdirplus support") Cc: # v3.9 Signed-off-by: Miklos Szeredi (cherry picked from commit b8bd342d50cbf606666488488f9fea374aceb2d5) (cherry picked from commit 1e097696ab6f8cafcb483666caeb048209020222) --- fs/fuse/readdir.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index dc603479b30efa..b3d498163f973f 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -243,8 +243,16 @@ static int fuse_direntplus_link(struct file *file, dput(dentry); dentry = alias; } - if (IS_ERR(dentry)) + if (IS_ERR(dentry)) { + if (!IS_ERR(inode)) { + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + fi->nlookup--; + spin_unlock(&fi->lock); + } return PTR_ERR(dentry); + } } if (fc->readdirplus_auto) set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); From 62af8a67d362425c38f434e732d9f81be0601edf Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Tue, 1 Aug 2023 16:06:45 +0800 Subject: [PATCH 060/161] fuse: invalidate page cache pages before direct write In FOPEN_DIRECT_IO, page cache may still be there for a file since private mmap is allowed. Direct write should respect that and invalidate the corresponding pages so that page cache readers don't get stale data. Signed-off-by: Hao Xu Tested-by: Jiachen Zhang Signed-off-by: Miklos Szeredi (cherry picked from commit 80e4f25262f9f1a5f08154fafaa6ac4371043d35) (cherry picked from commit 6e4458f9dbe5b3b5286e76c4c9fa129d36ba04d1) --- fs/fuse/file.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 099ce4d5faed36..417cecdb6aa3bc 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1450,7 +1450,8 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, int write = flags & FUSE_DIO_WRITE; int cuse = flags & FUSE_DIO_CUSE; struct file *file = io->iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fm->fc; size_t nmax = write ? fc->max_write : fc->max_read; @@ -1462,6 +1463,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, int err = 0; struct fuse_io_args *ia; unsigned int max_pages; + bool fopen_direct_io = ff->open_flags & FOPEN_DIRECT_IO; max_pages = iov_iter_npages(iter, fc->max_pages); ia = fuse_io_alloc(io, max_pages); @@ -1476,6 +1478,14 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, inode_unlock(inode); } + if (fopen_direct_io && write) { + res = invalidate_inode_pages2_range(mapping, idx_from, idx_to); + if (res) { + fuse_io_free(ia); + return res; + } + } + io->should_dirty = !write && user_backed_iter(iter); while (count) { ssize_t nres; From d4a6bec43e423509d610667978c5dbcfc4956ab6 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Tue, 1 Aug 2023 16:06:46 +0800 Subject: [PATCH 061/161] fuse: add a new fuse init flag to relax restrictions in no cache mode FOPEN_DIRECT_IO is usually set by fuse daemon to indicate need of strong coherency, e.g. network filesystems. Thus shared mmap is disabled since it leverages page cache and may write to it, which may cause inconsistence. But FOPEN_DIRECT_IO can be used not for coherency but to reduce memory footprint as well, e.g. reduce guest memory usage with virtiofs. Therefore, add a new fuse init flag FUSE_DIRECT_IO_RELAX to relax restrictions in that mode, currently, it allows shared mmap. One thing to note is to make sure it doesn't break coherency in your use case. Signed-off-by: Hao Xu Signed-off-by: Miklos Szeredi (cherry picked from commit e78662e818f9478e70912cc2970ca632ec9f3635) (cherry picked from commit 7edb9e9df8fd0276e42769eaefa69dadc89bd64b) --- fs/fuse/file.c | 7 +++++-- fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 4 +++- include/uapi/linux/fuse.h | 8 +++++++- 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 417cecdb6aa3bc..975525e436664a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2473,14 +2473,17 @@ static const struct vm_operations_struct fuse_file_vm_ops = { static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fm->fc; /* DAX mmap is superior to direct_io mmap */ if (FUSE_IS_DAX(file_inode(file))) return fuse_dax_mmap(file, vma); if (ff->open_flags & FOPEN_DIRECT_IO) { - /* Can't provide the coherency needed for MAP_SHARED */ - if (vma->vm_flags & VM_MAYSHARE) + /* Can't provide the coherency needed for MAP_SHARED + * if FUSE_DIRECT_IO_RELAX isn't set. + */ + if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_relax) return -ENODEV; invalidate_inode_pages2(file->f_mapping); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 2d63adcb2af8b2..6adf510ee331cb 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -792,6 +792,9 @@ struct fuse_conn { /* Is tmpfile not implemented by fs? */ unsigned int no_tmpfile:1; + /* relax restrictions in FOPEN_DIRECT_IO mode */ + unsigned int direct_io_relax:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d3f39e4aac3471..22c1288086a126 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1201,6 +1201,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->init_security = 1; if (flags & FUSE_CREATE_SUPP_GROUP) fc->create_supp_group = 1; + if (flags & FUSE_DIRECT_IO_RELAX) + fc->direct_io_relax = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1247,7 +1249,7 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | - FUSE_HAS_EXPIRE_ONLY; + FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_RELAX; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 8255b2e4b16429..36fdd04683d5f3 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -201,6 +201,9 @@ * 7.38 * - add FUSE_EXPIRE_ONLY flag to fuse_notify_inval_entry * - add FUSE_HAS_EXPIRE_ONLY + * + * 7.39 + * - add FUSE_DIRECT_IO_RELAX */ #ifndef _LINUX_FUSE_H @@ -236,7 +239,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 37 +#define FUSE_KERNEL_MINOR_VERSION 39 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -365,6 +368,8 @@ struct fuse_file_lock { * FUSE_CREATE_SUPP_GROUP: add supplementary group info to create, mkdir, * symlink and mknod (single group that matches parent) * FUSE_HAS_EXPIRE_ONLY: kernel supports expiry-only entry invalidation + * FUSE_DIRECT_IO_RELAX: relax restrictions in FOPEN_DIRECT_IO mode, for now + * allow shared mmap */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -403,6 +408,7 @@ struct fuse_file_lock { #define FUSE_HAS_INODE_DAX (1ULL << 33) #define FUSE_CREATE_SUPP_GROUP (1ULL << 34) #define FUSE_HAS_EXPIRE_ONLY (1ULL << 35) +#define FUSE_DIRECT_IO_RELAX (1ULL << 36) /** * CUSE INIT request/reply flags From 521edf26b80878d7a61b4452210007dd470059a5 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Tue, 1 Aug 2023 16:06:47 +0800 Subject: [PATCH 062/161] fuse: write back dirty pages before direct write in direct_io_relax mode In direct_io_relax mode, there can be shared mmaped files and thus dirty pages in its page cache. Therefore those dirty pages should be written back to backend before direct io to avoid data loss. Signed-off-by: Hao Xu Reviewed-by: Jiachen Zhang Signed-off-by: Miklos Szeredi (cherry picked from commit b5a2a3a0b77668257fa72ee6bc0eac90493f13c1) (cherry picked from commit 9e1e0326017dca6add2cf8cb1af9b194dadd2b09) --- fs/fuse/file.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 975525e436664a..80537718b5f4ab 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1470,6 +1470,13 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (!ia) return -ENOMEM; + if (fopen_direct_io && fc->direct_io_relax) { + res = filemap_write_and_wait_range(mapping, pos, pos + count - 1); + if (res) { + fuse_io_free(ia); + return res; + } + } if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { if (!write) inode_lock(inode); From 55a1867928d1ceeddf8263541f8f3de6e8811583 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 10 Aug 2023 12:45:05 +0200 Subject: [PATCH 063/161] fuse: handle empty request_mask in statx If no attribute is requested, then don't send request to userspace. Signed-off-by: Miklos Szeredi (cherry picked from commit 8d8f9c4b8df6bc2bf005c91b73b23a0e60f0e413) (cherry picked from commit 3553f579c3477dedab9081423bb8616286aabace) --- fs/fuse/dir.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 0609fde3ace505..bb8d3c2ef855a6 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1209,7 +1209,12 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, u32 inval_mask = READ_ONCE(fi->inval_mask); u32 cache_mask = fuse_get_cache_mask(inode); - if (flags & AT_STATX_FORCE_SYNC) + /* FUSE only supports basic stats */ + request_mask &= STATX_BASIC_STATS; + + if (!request_mask) + sync = false; + else if (flags & AT_STATX_FORCE_SYNC) sync = true; else if (flags & AT_STATX_DONT_SYNC) sync = false; From 80c7fa909a48f467b873b7eddc270df4b5fd95fc Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 10 Aug 2023 12:45:05 +0200 Subject: [PATCH 064/161] fuse: add STATX request Use the same structure as statx. Signed-off-by: Miklos Szeredi (cherry picked from commit ba58a37c2847f21494a04240fb48955cbd5d1aca) (cherry picked from commit 996eb397e2a0dc931bdef7c9319542866664d856) --- include/uapi/linux/fuse.h | 52 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 36fdd04683d5f3..b1ec0da99588a5 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -204,6 +204,7 @@ * * 7.39 * - add FUSE_DIRECT_IO_RELAX + * - add FUSE_STATX and related structures */ #ifndef _LINUX_FUSE_H @@ -266,6 +267,40 @@ struct fuse_attr { uint32_t flags; }; +/* + * The following structures are bit-for-bit compatible with the statx(2) ABI in + * Linux. + */ +struct fuse_sx_time { + int64_t tv_sec; + uint32_t tv_nsec; + int32_t __reserved; +}; + +struct fuse_statx { + uint32_t mask; + uint32_t blksize; + uint64_t attributes; + uint32_t nlink; + uint32_t uid; + uint32_t gid; + uint16_t mode; + uint16_t __spare0[1]; + uint64_t ino; + uint64_t size; + uint64_t blocks; + uint64_t attributes_mask; + struct fuse_sx_time atime; + struct fuse_sx_time btime; + struct fuse_sx_time ctime; + struct fuse_sx_time mtime; + uint32_t rdev_major; + uint32_t rdev_minor; + uint32_t dev_major; + uint32_t dev_minor; + uint64_t __spare2[14]; +}; + struct fuse_kstatfs { uint64_t blocks; uint64_t bfree; @@ -575,6 +610,7 @@ enum fuse_opcode { FUSE_REMOVEMAPPING = 49, FUSE_SYNCFS = 50, FUSE_TMPFILE = 51, + FUSE_STATX = 52, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -639,6 +675,22 @@ struct fuse_attr_out { struct fuse_attr attr; }; +struct fuse_statx_in { + uint32_t getattr_flags; + uint32_t reserved; + uint64_t fh; + uint32_t sx_flags; + uint32_t sx_mask; +}; + +struct fuse_statx_out { + uint64_t attr_valid; /* Cache timeout for the attributes */ + uint32_t attr_valid_nsec; + uint32_t flags; + uint64_t spare[2]; + struct fuse_statx stat; +}; + #define FUSE_COMPAT_MKNOD_IN_SIZE 8 struct fuse_mknod_in { From 16a000e275906b6c6504ded27d21b68d79e5f913 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 10 Aug 2023 12:45:05 +0200 Subject: [PATCH 065/161] fuse: add ATTR_TIMEOUT macro Next patch will introduce yet another type attribute reply. Add a macro that can handle attribute timeouts for all of the structs. Signed-off-by: Miklos Szeredi (cherry picked from commit 9dc10a54abe50b733a5b561d5f8be718e79c3590) (cherry picked from commit 1fcebbf8c1926804c893a4afbae538afe0a9454b) --- fs/fuse/dir.c | 26 ++++++++------------------ fs/fuse/fuse_i.h | 5 ++++- fs/fuse/readdir.c | 4 ++-- 3 files changed, 14 insertions(+), 21 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index bb8d3c2ef855a6..c678c64bbb85cc 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -92,7 +92,7 @@ static void fuse_dentry_settime(struct dentry *dentry, u64 time) /* * Calculate the time in jiffies until a dentry/attributes are valid */ -static u64 time_to_jiffies(u64 sec, u32 nsec) +u64 fuse_time_to_jiffies(u64 sec, u32 nsec) { if (sec || nsec) { struct timespec64 ts = { @@ -112,17 +112,7 @@ static u64 time_to_jiffies(u64 sec, u32 nsec) void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o) { fuse_dentry_settime(entry, - time_to_jiffies(o->entry_valid, o->entry_valid_nsec)); -} - -static u64 attr_timeout(struct fuse_attr_out *o) -{ - return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); -} - -u64 entry_attr_timeout(struct fuse_entry_out *o) -{ - return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); + fuse_time_to_jiffies(o->entry_valid, o->entry_valid_nsec)); } void fuse_invalidate_attr_mask(struct inode *inode, u32 mask) @@ -266,7 +256,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) forget_all_cached_acls(inode); fuse_change_attributes(inode, &outarg.attr, - entry_attr_timeout(&outarg), + ATTR_TIMEOUT(&outarg), attr_version); fuse_change_entry_timeout(entry, &outarg); } else if (inode) { @@ -399,7 +389,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name goto out_put_forget; *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, - &outarg->attr, entry_attr_timeout(outarg), + &outarg->attr, ATTR_TIMEOUT(outarg), attr_version); err = -ENOMEM; if (!*inode) { @@ -686,7 +676,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, ff->nodeid = outentry.nodeid; ff->open_flags = outopen.open_flags; inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, - &outentry.attr, entry_attr_timeout(&outentry), 0); + &outentry.attr, ATTR_TIMEOUT(&outentry), 0); if (!inode) { flags &= ~(O_CREAT | O_EXCL | O_TRUNC); fuse_sync_release(NULL, ff, flags); @@ -813,7 +803,7 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, goto out_put_forget_req; inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, - &outarg.attr, entry_attr_timeout(&outarg), 0); + &outarg.attr, ATTR_TIMEOUT(&outarg), 0); if (!inode) { fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1); return -ENOMEM; @@ -1190,7 +1180,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, err = -EIO; } else { fuse_change_attributes(inode, &outarg.attr, - attr_timeout(&outarg), + ATTR_TIMEOUT(&outarg), attr_version); if (stat) fuse_fillattr(inode, &outarg.attr, stat); @@ -1867,7 +1857,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, } fuse_change_attributes_common(inode, &outarg.attr, - attr_timeout(&outarg), + ATTR_TIMEOUT(&outarg), fuse_get_cache_mask(inode)); oldsize = inode->i_size; /* see the comment in fuse_change_attributes() */ diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 6adf510ee331cb..d16ae4e5f87085 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1114,7 +1114,10 @@ void fuse_invalidate_entry_cache(struct dentry *entry); void fuse_invalidate_atime(struct inode *inode); -u64 entry_attr_timeout(struct fuse_entry_out *o); +u64 fuse_time_to_jiffies(u64 sec, u32 nsec); +#define ATTR_TIMEOUT(o) \ + fuse_time_to_jiffies((o)->attr_valid, (o)->attr_valid_nsec) + void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o); /** diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index b3d498163f973f..c58447be5e4dc8 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -224,7 +224,7 @@ static int fuse_direntplus_link(struct file *file, forget_all_cached_acls(inode); fuse_change_attributes(inode, &o->attr, - entry_attr_timeout(o), + ATTR_TIMEOUT(o), attr_version); /* * The other branch comes via fuse_iget() @@ -232,7 +232,7 @@ static int fuse_direntplus_link(struct file *file, */ } else { inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, - &o->attr, entry_attr_timeout(o), + &o->attr, ATTR_TIMEOUT(o), attr_version); if (!inode) inode = ERR_PTR(-ENOMEM); From 220b57522cb6092df9f2c9fb2f17f51b778a3bda Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 10 Aug 2023 12:45:05 +0200 Subject: [PATCH 066/161] fuse: implement statx Allow querying btime. When btime is requested in mask, then FUSE_STATX request is sent. Otherwise keep using FUSE_GETATTR. The userspace interface for statx matches that of the statx(2) API. However there are limitations on how this interface is used: - returned basic stats and btime are used, stx_attributes, etc. are ignored - always query basic stats and btime, regardless of what was requested - requested sync type is ignored, the default is passed to the server - if server returns with some attributes missing from the result_mask, then no attributes will be cached - btime is not cached yet (next patch will fix that) For new inodes initialize fi->inval_mask to "all invalid", instead of "all valid" as previously. Also only clear basic stats from inval_mask when caching attributes. This will result in the caching logic not thinking that btime is cached. Signed-off-by: Miklos Szeredi (cherry picked from commit d3045530bdd29d91033eea437d8a961f4ee598b5) (cherry picked from commit e1f9c0eacb7d01b985ace03465c5a19f4f76fbea) --- fs/fuse/dir.c | 106 ++++++++++++++++++++++++++++++++++++++++++++--- fs/fuse/fuse_i.h | 3 ++ fs/fuse/inode.c | 5 ++- 3 files changed, 107 insertions(+), 7 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index c678c64bbb85cc..f2a011ab1b311c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -350,10 +350,14 @@ int fuse_valid_type(int m) S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m); } +static bool fuse_valid_size(u64 size) +{ + return size <= LLONG_MAX; +} + bool fuse_invalid_attr(struct fuse_attr *attr) { - return !fuse_valid_type(attr->mode) || - attr->size > LLONG_MAX; + return !fuse_valid_type(attr->mode) || !fuse_valid_size(attr->size); } int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, @@ -1143,6 +1147,84 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, stat->blksize = 1 << blkbits; } +static void fuse_statx_to_attr(struct fuse_statx *sx, struct fuse_attr *attr) +{ + memset(attr, 0, sizeof(*attr)); + attr->ino = sx->ino; + attr->size = sx->size; + attr->blocks = sx->blocks; + attr->atime = sx->atime.tv_sec; + attr->mtime = sx->mtime.tv_sec; + attr->ctime = sx->ctime.tv_sec; + attr->atimensec = sx->atime.tv_nsec; + attr->mtimensec = sx->mtime.tv_nsec; + attr->ctimensec = sx->ctime.tv_nsec; + attr->mode = sx->mode; + attr->nlink = sx->nlink; + attr->uid = sx->uid; + attr->gid = sx->gid; + attr->rdev = new_encode_dev(MKDEV(sx->rdev_major, sx->rdev_minor)); + attr->blksize = sx->blksize; +} + +static int fuse_do_statx(struct inode *inode, struct file *file, + struct kstat *stat) +{ + int err; + struct fuse_attr attr; + struct fuse_statx *sx; + struct fuse_statx_in inarg; + struct fuse_statx_out outarg; + struct fuse_mount *fm = get_fuse_mount(inode); + u64 attr_version = fuse_get_attr_version(fm->fc); + FUSE_ARGS(args); + + memset(&inarg, 0, sizeof(inarg)); + memset(&outarg, 0, sizeof(outarg)); + /* Directories have separate file-handle space */ + if (file && S_ISREG(inode->i_mode)) { + struct fuse_file *ff = file->private_data; + + inarg.getattr_flags |= FUSE_GETATTR_FH; + inarg.fh = ff->fh; + } + /* For now leave sync hints as the default, request all stats. */ + inarg.sx_flags = 0; + inarg.sx_mask = STATX_BASIC_STATS | STATX_BTIME; + args.opcode = FUSE_STATX; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); + if (err) + return err; + + sx = &outarg.stat; + if (((sx->mask & STATX_SIZE) && !fuse_valid_size(sx->size)) || + ((sx->mask & STATX_TYPE) && (!fuse_valid_type(sx->mode) || + inode_wrong_type(inode, sx->mode)))) { + make_bad_inode(inode); + return -EIO; + } + + fuse_statx_to_attr(&outarg.stat, &attr); + if ((sx->mask & STATX_BASIC_STATS) == STATX_BASIC_STATS) { + fuse_change_attributes(inode, &attr, ATTR_TIMEOUT(&outarg), + attr_version); + } + stat->result_mask = sx->mask & (STATX_BASIC_STATS | STATX_BTIME); + stat->btime.tv_sec = sx->btime.tv_sec; + stat->btime.tv_nsec = min_t(u32, sx->btime.tv_nsec, NSEC_PER_SEC - 1); + fuse_fillattr(inode, &attr, stat); + stat->result_mask |= STATX_TYPE; + + return 0; +} + static int fuse_do_getattr(struct inode *inode, struct kstat *stat, struct file *file) { @@ -1194,13 +1276,18 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); int err = 0; bool sync; u32 inval_mask = READ_ONCE(fi->inval_mask); u32 cache_mask = fuse_get_cache_mask(inode); - /* FUSE only supports basic stats */ - request_mask &= STATX_BASIC_STATS; + + /* FUSE only supports basic stats and possibly btime */ + request_mask &= STATX_BASIC_STATS | STATX_BTIME; +retry: + if (fc->no_statx) + request_mask &= STATX_BASIC_STATS; if (!request_mask) sync = false; @@ -1215,7 +1302,16 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, if (sync) { forget_all_cached_acls(inode); - err = fuse_do_getattr(inode, stat, file); + /* Try statx if BTIME is requested */ + if (!fc->no_statx && (request_mask & ~STATX_BASIC_STATS)) { + err = fuse_do_statx(inode, file, stat); + if (err == -ENOSYS) { + fc->no_statx = 1; + goto retry; + } + } else { + err = fuse_do_getattr(inode, stat, file); + } } else if (stat) { generic_fillattr(&init_user_ns, inode, stat); stat->mode = fi->orig_i_mode; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d16ae4e5f87085..c528a15d84ed41 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -795,6 +795,9 @@ struct fuse_conn { /* relax restrictions in FOPEN_DIRECT_IO mode */ unsigned int direct_io_relax:1; + /* Is statx not implemented by fs? */ + unsigned int no_statx:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 22c1288086a126..7edd4ca86f59a8 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -78,7 +78,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) return NULL; fi->i_time = 0; - fi->inval_mask = 0; + fi->inval_mask = ~0; fi->nodeid = 0; fi->nlookup = 0; fi->attr_version = 0; @@ -173,7 +173,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, fi->attr_version = atomic64_inc_return(&fc->attr_version); fi->i_time = attr_valid; - WRITE_ONCE(fi->inval_mask, 0); + /* Clear basic stats from invalid mask */ + set_mask_bits(&fi->inval_mask, STATX_BASIC_STATS, 0); inode->i_ino = fuse_squash_ino(attr->ino); inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); From 47acabb2d9ee16ab228bb0602ff9cf75d15be97e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 10 Aug 2023 12:45:05 +0200 Subject: [PATCH 067/161] fuse: cache btime Not all inode attributes are supported by all filesystems, but for the basic stats (which are returned by stat(2) and friends) all of them will have some value, even if that doesn't reflect a real attribute of the file. Btime is different, in that filesystems are free to report or not report a value in statx. If the value is available, then STATX_BTIME bit is set in stx_mask. When caching the value of btime, remember the availability of the attribute as well as the value (if available). This is done by using the FUSE_I_BTIME bit in fuse_inode->state to indicate availability, while using fuse_inode->inval_mask & STATX_BTIME to indicate the state of the cache itself (i.e. set if cache is invalid, and cleared if cache is valid). Signed-off-by: Miklos Szeredi (cherry picked from commit 972f4c46d0a1bb7fde3ce0bd15775855b2d02c68) (cherry picked from commit 7292fab2fe6ea7db464e70bfd005e57ea9103e9a) --- fs/fuse/dir.c | 14 +++++++++----- fs/fuse/fuse_i.h | 7 +++++++ fs/fuse/inode.c | 25 +++++++++++++++++++++++-- fs/fuse/readdir.c | 2 +- 4 files changed, 40 insertions(+), 8 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index f2a011ab1b311c..2efd4131f0c763 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -255,7 +255,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) goto invalid; forget_all_cached_acls(inode); - fuse_change_attributes(inode, &outarg.attr, + fuse_change_attributes(inode, &outarg.attr, NULL, ATTR_TIMEOUT(&outarg), attr_version); fuse_change_entry_timeout(entry, &outarg); @@ -1213,8 +1213,8 @@ static int fuse_do_statx(struct inode *inode, struct file *file, fuse_statx_to_attr(&outarg.stat, &attr); if ((sx->mask & STATX_BASIC_STATS) == STATX_BASIC_STATS) { - fuse_change_attributes(inode, &attr, ATTR_TIMEOUT(&outarg), - attr_version); + fuse_change_attributes(inode, &attr, &outarg.stat, + ATTR_TIMEOUT(&outarg), attr_version); } stat->result_mask = sx->mask & (STATX_BASIC_STATS | STATX_BTIME); stat->btime.tv_sec = sx->btime.tv_sec; @@ -1261,7 +1261,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, fuse_make_bad(inode); err = -EIO; } else { - fuse_change_attributes(inode, &outarg.attr, + fuse_change_attributes(inode, &outarg.attr, NULL, ATTR_TIMEOUT(&outarg), attr_version); if (stat) @@ -1316,6 +1316,10 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, generic_fillattr(&init_user_ns, inode, stat); stat->mode = fi->orig_i_mode; stat->ino = fi->orig_ino; + if (test_bit(FUSE_I_BTIME, &fi->state)) { + stat->btime = fi->i_btime; + stat->result_mask |= STATX_BTIME; + } } return err; @@ -1952,7 +1956,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, /* FIXME: clear I_DIRTY_SYNC? */ } - fuse_change_attributes_common(inode, &outarg.attr, + fuse_change_attributes_common(inode, &outarg.attr, NULL, ATTR_TIMEOUT(&outarg), fuse_get_cache_mask(inode)); oldsize = inode->i_size; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index c528a15d84ed41..0fb8ffa54fe46e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -88,6 +88,9 @@ struct fuse_inode { preserve the original mode */ umode_t orig_i_mode; + /* Cache birthtime */ + struct timespec64 i_btime; + /** 64 bit inode number */ u64 orig_ino; @@ -167,6 +170,8 @@ enum { FUSE_I_SIZE_UNSTABLE, /* Bad inode */ FUSE_I_BAD, + /* Has btime */ + FUSE_I_BTIME, }; struct fuse_conn; @@ -1064,9 +1069,11 @@ void fuse_init_symlink(struct inode *inode); * Change attributes of an inode */ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + struct fuse_statx *sx, u64 attr_valid, u64 attr_version); void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, + struct fuse_statx *sx, u64 attr_valid, u32 cache_mask); u32 fuse_get_cache_mask(struct inode *inode); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 7edd4ca86f59a8..4f0c0272533ab1 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -164,6 +164,7 @@ static ino_t fuse_squash_ino(u64 ino64) } void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, + struct fuse_statx *sx, u64 attr_valid, u32 cache_mask) { struct fuse_conn *fc = get_fuse_conn(inode); @@ -199,6 +200,25 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_ctime.tv_sec = attr->ctime; inode->i_ctime.tv_nsec = attr->ctimensec; } + if (sx) { + /* Sanitize nsecs */ + sx->btime.tv_nsec = + min_t(u32, sx->btime.tv_nsec, NSEC_PER_SEC - 1); + + /* + * Btime has been queried, cache is valid (whether or not btime + * is available or not) so clear STATX_BTIME from inval_mask. + * + * Availability of the btime attribute is indicated in + * FUSE_I_BTIME + */ + set_mask_bits(&fi->inval_mask, STATX_BTIME, 0); + if (sx->mask & STATX_BTIME) { + set_bit(FUSE_I_BTIME, &fi->state); + fi->i_btime.tv_sec = sx->btime.tv_sec; + fi->i_btime.tv_nsec = sx->btime.tv_nsec; + } + } if (attr->blksize != 0) inode->i_blkbits = ilog2(attr->blksize); @@ -238,6 +258,7 @@ u32 fuse_get_cache_mask(struct inode *inode) } void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + struct fuse_statx *sx, u64 attr_valid, u64 attr_version) { struct fuse_conn *fc = get_fuse_conn(inode); @@ -272,7 +293,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, } old_mtime = inode->i_mtime; - fuse_change_attributes_common(inode, attr, attr_valid, cache_mask); + fuse_change_attributes_common(inode, attr, sx, attr_valid, cache_mask); oldsize = inode->i_size; /* @@ -403,7 +424,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, spin_lock(&fi->lock); fi->nlookup++; spin_unlock(&fi->lock); - fuse_change_attributes(inode, attr, attr_valid, attr_version); + fuse_change_attributes(inode, attr, NULL, attr_valid, attr_version); return inode; } diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index c58447be5e4dc8..9e6d587b3e6731 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -223,7 +223,7 @@ static int fuse_direntplus_link(struct file *file, spin_unlock(&fi->lock); forget_all_cached_acls(inode); - fuse_change_attributes(inode, &o->attr, + fuse_change_attributes(inode, &o->attr, NULL, ATTR_TIMEOUT(o), attr_version); /* From e697c067cce8a68bf051f8e870ddaaac4b3dfaaa Mon Sep 17 00:00:00 2001 From: Jiachen Zhang Date: Tue, 11 Jul 2023 12:34:02 +0800 Subject: [PATCH 068/161] fuse: invalidate dentry on EEXIST creates or ENOENT deletes The EEXIST errors returned from server are strong sign that a local negative dentry should be invalidated. Similarly, The ENOENT errors from server can also be a sign of revalidate failure. This commit invalidates dentries on EEXIST creates and ENOENT deletes by calling fuse_invalidate_entry(), which improves the consistency with no performance degradation. Signed-off-by: Jiachen Zhang Signed-off-by: Miklos Szeredi (cherry picked from commit 7d875e66859a4359acd29ce0d188e1aff048e7ed) (cherry picked from commit a39b02ea7a3b3c6fce43045ce56a25d5bb8c9216) --- fs/fuse/dir.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 2efd4131f0c763..143b7f2325191c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -749,7 +749,8 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, if (err == -ENOSYS) { fc->no_create = 1; goto mknod; - } + } else if (err == -EEXIST) + fuse_invalidate_entry(entry); out_dput: dput(res); return err; @@ -829,6 +830,8 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, return 0; out_put_forget_req: + if (err == -EEXIST) + fuse_invalidate_entry(entry); kfree(forget); return err; } @@ -980,7 +983,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) if (!err) { fuse_dir_changed(dir); fuse_entry_unlinked(entry); - } else if (err == -EINTR) + } else if (err == -EINTR || err == -ENOENT) fuse_invalidate_entry(entry); return err; } @@ -1003,7 +1006,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) if (!err) { fuse_dir_changed(dir); fuse_entry_unlinked(entry); - } else if (err == -EINTR) + } else if (err == -EINTR || err == -ENOENT) fuse_invalidate_entry(entry); return err; } @@ -1044,7 +1047,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, /* newent will end up negative */ if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) fuse_entry_unlinked(newent); - } else if (err == -EINTR) { + } else if (err == -EINTR || err == -ENOENT) { /* If request was interrupted, DEITY only knows if the rename actually took place. If the invalidation fails (e.g. some process has CWD under the renamed From 11e77987a1c2b4913eba6278fe500b53c78a421c Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Thu, 24 Aug 2023 00:33:45 +0200 Subject: [PATCH 069/161] fuse: conditionally fill kstat in fuse_do_statx() The code path fuse_update_attributes fuse_update_get_attr fuse_do_statx has the risk to use a NULL pointer for struct kstat *stat, although current callers of fuse_update_attributes() only set request_mask to values that will trigger the call of fuse_do_getattr(), which already handles the NULL pointer. Future updates might miss that fuse_do_statx() does not handle it it is safer to add a condition already right now. Signed-off-by: Bernd Schubert Fixes: d3045530bdd2 ("fuse: implement statx") Signed-off-by: Miklos Szeredi (cherry picked from commit f73016b63b09edec8adf7e182600c52465c56ee7) (cherry picked from commit 742ea12645926491f27565e892e984dae3e7a07a) --- fs/fuse/dir.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 143b7f2325191c..b952cded40560f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1219,11 +1219,14 @@ static int fuse_do_statx(struct inode *inode, struct file *file, fuse_change_attributes(inode, &attr, &outarg.stat, ATTR_TIMEOUT(&outarg), attr_version); } - stat->result_mask = sx->mask & (STATX_BASIC_STATS | STATX_BTIME); - stat->btime.tv_sec = sx->btime.tv_sec; - stat->btime.tv_nsec = min_t(u32, sx->btime.tv_nsec, NSEC_PER_SEC - 1); - fuse_fillattr(inode, &attr, stat); - stat->result_mask |= STATX_TYPE; + + if (stat) { + stat->result_mask = sx->mask & (STATX_BASIC_STATS | STATX_BTIME); + stat->btime.tv_sec = sx->btime.tv_sec; + stat->btime.tv_nsec = min_t(u32, sx->btime.tv_nsec, NSEC_PER_SEC - 1); + fuse_fillattr(inode, &attr, stat); + stat->result_mask |= STATX_TYPE; + } return 0; } From 10c1361116a1b41e82e7fabca0b027dc0e420173 Mon Sep 17 00:00:00 2001 From: Wedson Almeida Filho Date: Sat, 30 Sep 2023 02:00:15 -0300 Subject: [PATCH 070/161] fuse: move fuse_xattr_handlers to .rodata This makes it harder for accidental or malicious changes to fuse_xattr_handlers at runtime. Cc: Miklos Szeredi Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Wedson Almeida Filho Link: https://lore.kernel.org/r/20230930050033.41174-12-wedsonaf@gmail.com Signed-off-by: Christian Brauner (cherry picked from commit 34271edb18787d0b7d0f14c505468378de7efb4d) (cherry picked from commit dd97c3374404e2ec1f53b9106b43d619a2276686) --- fs/fuse/fuse_i.h | 2 +- fs/fuse/xattr.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 0fb8ffa54fe46e..6d7acbf6691362 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1284,7 +1284,7 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, size_t size); ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size); int fuse_removexattr(struct inode *inode, const char *name); -extern const struct xattr_handler *fuse_xattr_handlers[]; +extern const struct xattr_handler * const fuse_xattr_handlers[]; extern const struct xattr_handler *fuse_acl_xattr_handlers[]; extern const struct xattr_handler *fuse_no_acl_xattr_handlers[]; diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 0d3e7177fce0a7..c2311677f89318 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -230,7 +230,7 @@ static const struct xattr_handler fuse_xattr_handler = { .set = fuse_xattr_set, }; -const struct xattr_handler *fuse_xattr_handlers[] = { +const struct xattr_handler * const fuse_xattr_handlers[] = { &fuse_xattr_handler, NULL }; From b7da99eb36a3a149bb1aba4dd4d3d9eaf21bd6e8 Mon Sep 17 00:00:00 2001 From: Tyler Fanelli Date: Tue, 19 Sep 2023 22:40:00 -0400 Subject: [PATCH 071/161] fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP Although DIRECT_IO_RELAX's initial usage is to allow shared mmap, its description indicates a purpose of reducing memory footprint. This may imply that it could be further used to relax other DIRECT_IO operations in the future. Replace it with a flag DIRECT_IO_ALLOW_MMAP which does only one thing, allow shared mmap of DIRECT_IO files while still bypassing the cache on regular reads and writes. [Miklos] Also Keep DIRECT_IO_RELAX definition for backward compatibility. Signed-off-by: Tyler Fanelli Fixes: e78662e818f9 ("fuse: add a new fuse init flag to relax restrictions in no cache mode") Cc: # v6.6 Signed-off-by: Miklos Szeredi (cherry picked from commit c55e0a55b165202f18cbc4a20650d2e1becd5507) (cherry picked from commit 929eb008e1415317c0622b1cb30c34b2ab2eef34) --- fs/fuse/file.c | 6 +++--- fs/fuse/fuse_i.h | 4 ++-- fs/fuse/inode.c | 6 +++--- include/uapi/linux/fuse.h | 10 ++++++---- 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 80537718b5f4ab..786e54d16a3602 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1470,7 +1470,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (!ia) return -ENOMEM; - if (fopen_direct_io && fc->direct_io_relax) { + if (fopen_direct_io && fc->direct_io_allow_mmap) { res = filemap_write_and_wait_range(mapping, pos, pos + count - 1); if (res) { fuse_io_free(ia); @@ -2488,9 +2488,9 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) if (ff->open_flags & FOPEN_DIRECT_IO) { /* Can't provide the coherency needed for MAP_SHARED - * if FUSE_DIRECT_IO_RELAX isn't set. + * if FUSE_DIRECT_IO_ALLOW_MMAP isn't set. */ - if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_relax) + if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_allow_mmap) return -ENODEV; invalidate_inode_pages2(file->f_mapping); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 6d7acbf6691362..d01c235e747baf 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -797,8 +797,8 @@ struct fuse_conn { /* Is tmpfile not implemented by fs? */ unsigned int no_tmpfile:1; - /* relax restrictions in FOPEN_DIRECT_IO mode */ - unsigned int direct_io_relax:1; + /* Relax restrictions to allow shared mmap in FOPEN_DIRECT_IO mode */ + unsigned int direct_io_allow_mmap:1; /* Is statx not implemented by fs? */ unsigned int no_statx:1; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 4f0c0272533ab1..3d84622d62c993 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1223,8 +1223,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->init_security = 1; if (flags & FUSE_CREATE_SUPP_GROUP) fc->create_supp_group = 1; - if (flags & FUSE_DIRECT_IO_RELAX) - fc->direct_io_relax = 1; + if (flags & FUSE_DIRECT_IO_ALLOW_MMAP) + fc->direct_io_allow_mmap = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1271,7 +1271,7 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | - FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_RELAX; + FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index b1ec0da99588a5..ecd01865aee52b 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -203,7 +203,7 @@ * - add FUSE_HAS_EXPIRE_ONLY * * 7.39 - * - add FUSE_DIRECT_IO_RELAX + * - add FUSE_DIRECT_IO_ALLOW_MMAP * - add FUSE_STATX and related structures */ @@ -403,8 +403,7 @@ struct fuse_file_lock { * FUSE_CREATE_SUPP_GROUP: add supplementary group info to create, mkdir, * symlink and mknod (single group that matches parent) * FUSE_HAS_EXPIRE_ONLY: kernel supports expiry-only entry invalidation - * FUSE_DIRECT_IO_RELAX: relax restrictions in FOPEN_DIRECT_IO mode, for now - * allow shared mmap + * FUSE_DIRECT_IO_ALLOW_MMAP: allow shared mmap in FOPEN_DIRECT_IO mode. */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -443,7 +442,10 @@ struct fuse_file_lock { #define FUSE_HAS_INODE_DAX (1ULL << 33) #define FUSE_CREATE_SUPP_GROUP (1ULL << 34) #define FUSE_HAS_EXPIRE_ONLY (1ULL << 35) -#define FUSE_DIRECT_IO_RELAX (1ULL << 36) +#define FUSE_DIRECT_IO_ALLOW_MMAP (1ULL << 36) + +/* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ +#define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP /** * CUSE INIT request/reply flags From 6c0b1a526bb02f49391db815e789159c1e333f1e Mon Sep 17 00:00:00 2001 From: Krister Johansen Date: Fri, 3 Nov 2023 10:39:47 -0700 Subject: [PATCH 072/161] fuse: share lookup state between submount and its parent Fuse submounts do not perform a lookup for the nodeid that they inherit from their parent. Instead, the code decrements the nlookup on the submount's fuse_inode when it is instantiated, and no forget is performed when a submount root is evicted. Trouble arises when the submount's parent is evicted despite the submount itself being in use. In this author's case, the submount was in a container and deatched from the initial mount namespace via a MNT_DEATCH operation. When memory pressure triggered the shrinker, the inode from the parent was evicted, which triggered enough forgets to render the submount's nodeid invalid. Since submounts should still function, even if their parent goes away, solve this problem by sharing refcounted state between the parent and its submount. When all of the references on this shared state reach zero, it's safe to forget the final lookup of the fuse nodeid. Signed-off-by: Krister Johansen Cc: stable@vger.kernel.org Fixes: 1866d779d5d2 ("fuse: Allow fuse_fill_super_common() for submounts") Signed-off-by: Miklos Szeredi (cherry picked from commit c4d361f66ac91db8fc65061a9671682f61f4ca9d) (cherry picked from commit e84f59f507f2b6c62aebc3bb2feccdcdd3d270ab) --- fs/fuse/fuse_i.h | 15 ++++++++++ fs/fuse/inode.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 87 insertions(+), 3 deletions(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d01c235e747baf..da9db8a35d1cf0 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -63,6 +63,19 @@ struct fuse_forget_link { struct fuse_forget_link *next; }; +/* Submount lookup tracking */ +struct fuse_submount_lookup { + /** Refcount */ + refcount_t count; + + /** Unique ID, which identifies the inode between userspace + * and kernel */ + u64 nodeid; + + /** The request used for sending the FORGET message */ + struct fuse_forget_link *forget; +}; + /** FUSE inode */ struct fuse_inode { /** Inode data */ @@ -158,6 +171,8 @@ struct fuse_inode { */ struct fuse_inode_dax *dax; #endif + /** Submount specific lookup tracking */ + struct fuse_submount_lookup *submount_lookup; }; /** FUSE inode state bits */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 3d84622d62c993..a56286e529256a 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -69,6 +69,24 @@ struct fuse_forget_link *fuse_alloc_forget(void) return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL_ACCOUNT); } +static struct fuse_submount_lookup *fuse_alloc_submount_lookup(void) +{ + struct fuse_submount_lookup *sl; + + sl = kzalloc(sizeof(struct fuse_submount_lookup), GFP_KERNEL_ACCOUNT); + if (!sl) + return NULL; + sl->forget = fuse_alloc_forget(); + if (!sl->forget) + goto out_free; + + return sl; + +out_free: + kfree(sl); + return NULL; +} + static struct inode *fuse_alloc_inode(struct super_block *sb) { struct fuse_inode *fi; @@ -84,6 +102,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi->attr_version = 0; fi->orig_ino = 0; fi->state = 0; + fi->submount_lookup = NULL; mutex_init(&fi->mutex); spin_lock_init(&fi->lock); fi->forget = fuse_alloc_forget(); @@ -114,6 +133,17 @@ static void fuse_free_inode(struct inode *inode) kmem_cache_free(fuse_inode_cachep, fi); } +static void fuse_cleanup_submount_lookup(struct fuse_conn *fc, + struct fuse_submount_lookup *sl) +{ + if (!refcount_dec_and_test(&sl->count)) + return; + + fuse_queue_forget(fc, sl->forget, sl->nodeid, 1); + sl->forget = NULL; + kfree(sl); +} + static void fuse_evict_inode(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); @@ -133,6 +163,11 @@ static void fuse_evict_inode(struct inode *inode) fi->nlookup); fi->forget = NULL; } + + if (fi->submount_lookup) { + fuse_cleanup_submount_lookup(fc, fi->submount_lookup); + fi->submount_lookup = NULL; + } } if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) { WARN_ON(!list_empty(&fi->write_files)); @@ -334,6 +369,13 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, fuse_dax_dontcache(inode, attr->flags); } +static void fuse_init_submount_lookup(struct fuse_submount_lookup *sl, + u64 nodeid) +{ + sl->nodeid = nodeid; + refcount_set(&sl->count, 1); +} + static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) { inode->i_mode = attr->mode & S_IFMT; @@ -391,12 +433,22 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, */ if (fc->auto_submounts && (attr->flags & FUSE_ATTR_SUBMOUNT) && S_ISDIR(attr->mode)) { + struct fuse_inode *fi; + inode = new_inode(sb); if (!inode) return NULL; fuse_init_inode(inode, attr); - get_fuse_inode(inode)->nodeid = nodeid; + fi = get_fuse_inode(inode); + fi->nodeid = nodeid; + fi->submount_lookup = fuse_alloc_submount_lookup(); + if (!fi->submount_lookup) { + iput(inode); + return NULL; + } + /* Sets nlookup = 1 on fi->submount_lookup->nlookup */ + fuse_init_submount_lookup(fi->submount_lookup, nodeid); inode->i_flags |= S_AUTOMOUNT; goto done; } @@ -419,11 +471,11 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, iput(inode); goto retry; } -done: fi = get_fuse_inode(inode); spin_lock(&fi->lock); fi->nlookup++; spin_unlock(&fi->lock); +done: fuse_change_attributes(inode, attr, NULL, attr_valid, attr_version); return inode; @@ -1461,6 +1513,8 @@ static int fuse_fill_super_submount(struct super_block *sb, struct super_block *parent_sb = parent_fi->inode.i_sb; struct fuse_attr root_attr; struct inode *root; + struct fuse_submount_lookup *sl; + struct fuse_inode *fi; fuse_sb_defaults(sb); fm->sb = sb; @@ -1483,12 +1537,27 @@ static int fuse_fill_super_submount(struct super_block *sb, * its nlookup should not be incremented. fuse_iget() does * that, though, so undo it here. */ - get_fuse_inode(root)->nlookup--; + fi = get_fuse_inode(root); + fi->nlookup--; + sb->s_d_op = &fuse_dentry_operations; sb->s_root = d_make_root(root); if (!sb->s_root) return -ENOMEM; + /* + * Grab the parent's submount_lookup pointer and take a + * reference on the shared nlookup from the parent. This is to + * prevent the last forget for this nodeid from getting + * triggered until all users have finished with it. + */ + sl = parent_fi->submount_lookup; + WARN_ON(!sl); + if (sl) { + refcount_inc(&sl->count); + fi->submount_lookup = sl; + } + return 0; } From 65ed6942383ede03b30a52105e45ac64ff30c399 Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Thu, 16 Nov 2023 15:57:26 +0800 Subject: [PATCH 073/161] fuse: dax: set fc->dax to NULL in fuse_dax_conn_free() fuse_dax_conn_free() will be called when fuse_fill_super_common() fails after fuse_dax_conn_alloc(). Then deactivate_locked_super() in virtio_fs_get_tree() will call virtio_kill_sb() to release the discarded superblock. This will call fuse_dax_conn_free() again in fuse_conn_put(), resulting in a possible double free. Fixes: 1dd539577c42 ("virtiofs: add a mount option to enable dax") Signed-off-by: Hangyu Hua Acked-by: Vivek Goyal Reviewed-by: Jingbo Xu Cc: # v5.10 Signed-off-by: Miklos Szeredi (cherry picked from commit 7f8ed28d1401320bcb02dda81b3c23ab2dc5a6d8) (cherry picked from commit 51ec4d843c06d3e4c11d5d8c9e93a00c7cdc489b) --- fs/fuse/dax.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index e23e802a80130a..6e71904c396f18 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -1224,6 +1224,7 @@ void fuse_dax_conn_free(struct fuse_conn *fc) if (fc->dax) { fuse_free_dax_mem_ranges(&fc->dax->free_ranges); kfree(fc->dax); + fc->dax = NULL; } } From e78c1dc649b87e5ed411779e50e9b26f6c35067c Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 3 Dec 2023 09:42:33 +0200 Subject: [PATCH 074/161] fuse: disable FOPEN_PARALLEL_DIRECT_WRITES with FUSE_DIRECT_IO_ALLOW_MMAP The new fuse init flag FUSE_DIRECT_IO_ALLOW_MMAP breaks assumptions made by FOPEN_PARALLEL_DIRECT_WRITES and causes test generic/095 to hit BUG_ON(fi->writectr < 0) assertions in fuse_set_nowrite(): generic/095 5s ... kernel BUG at fs/fuse/dir.c:1756! ... ? fuse_set_nowrite+0x3d/0xdd ? do_raw_spin_unlock+0x88/0x8f ? _raw_spin_unlock+0x2d/0x43 ? fuse_range_is_writeback+0x71/0x84 fuse_sync_writes+0xf/0x19 fuse_direct_io+0x167/0x5bd fuse_direct_write_iter+0xf0/0x146 Auto disable FOPEN_PARALLEL_DIRECT_WRITES when server negotiated FUSE_DIRECT_IO_ALLOW_MMAP. Fixes: e78662e818f9 ("fuse: add a new fuse init flag to relax restrictions in no cache mode") Cc: # v6.6 Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi (cherry picked from commit 3f29f1c336c0e8a4bec52f1e5217f88835553e5b) (cherry picked from commit 1d9e985b64a19a1af20dc75f87df461ec7f6afef) --- fs/fuse/file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 786e54d16a3602..607dcecb1f86ea 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1596,6 +1596,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t res; bool exclusive_lock = !(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) || + get_fuse_conn(inode)->direct_io_allow_mmap || iocb->ki_flags & IOCB_APPEND || fuse_direct_write_extending_i_size(iocb, from); @@ -1603,6 +1604,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) * Take exclusive lock if * - Parallel direct writes are disabled - a user space decision * - Parallel direct writes are enabled and i_size is being extended. + * - Shared mmap on direct_io file is supported (FUSE_DIRECT_IO_ALLOW_MMAP). * This might not be needed at all, but needs further investigation. */ if (exclusive_lock) From 37206fc09303b3b45f71062737ef9e893c432253 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 28 Sep 2023 00:19:39 -0400 Subject: [PATCH 075/161] fuse: fix UAF in rcu pathwalks ->permission(), ->get_link() and ->inode_get_acl() might dereference ->s_fs_info (and, in case of ->permission(), ->s_fs_info->fc->user_ns as well) when called from rcu pathwalk. Freeing ->s_fs_info->fc is rcu-delayed; we need to make freeing ->s_fs_info and dropping ->user_ns rcu-delayed too. Signed-off-by: Al Viro (cherry picked from commit 053fc4f755ad43cf35210677bcba798ccdc48d0c) (cherry picked from commit dbea54e6deddd24880360d28a84059aaf8b16e49) --- fs/fuse/cuse.c | 3 +-- fs/fuse/fuse_i.h | 1 + fs/fuse/inode.c | 15 +++++++++++---- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 91e89e68177ee4..b6cad106c37e44 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -474,8 +474,7 @@ static int cuse_send_init(struct cuse_conn *cc) static void cuse_fc_release(struct fuse_conn *fc) { - struct cuse_conn *cc = fc_to_cc(fc); - kfree_rcu(cc, fc.rcu); + kfree(fc_to_cc(fc)); } /** diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index da9db8a35d1cf0..33204ae49daf66 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -888,6 +888,7 @@ struct fuse_mount { /* Entry on fc->mounts */ struct list_head fc_entry; + struct rcu_head rcu; }; static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index a56286e529256a..980815ddca31dd 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -923,6 +923,14 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, } EXPORT_SYMBOL_GPL(fuse_conn_init); +static void delayed_release(struct rcu_head *p) +{ + struct fuse_conn *fc = container_of(p, struct fuse_conn, rcu); + + put_user_ns(fc->user_ns); + fc->release(fc); +} + void fuse_conn_put(struct fuse_conn *fc) { if (refcount_dec_and_test(&fc->count)) { @@ -934,13 +942,12 @@ void fuse_conn_put(struct fuse_conn *fc) if (fiq->ops->release) fiq->ops->release(fiq); put_pid_ns(fc->pid_ns); - put_user_ns(fc->user_ns); bucket = rcu_dereference_protected(fc->curr_bucket, 1); if (bucket) { WARN_ON(atomic_read(&bucket->count) != 1); kfree(bucket); } - fc->release(fc); + call_rcu(&fc->rcu, delayed_release); } } EXPORT_SYMBOL_GPL(fuse_conn_put); @@ -1359,7 +1366,7 @@ EXPORT_SYMBOL_GPL(fuse_send_init); void fuse_free_conn(struct fuse_conn *fc) { WARN_ON(!list_empty(&fc->devices)); - kfree_rcu(fc, rcu); + kfree(fc); } EXPORT_SYMBOL_GPL(fuse_free_conn); @@ -1898,7 +1905,7 @@ static void fuse_sb_destroy(struct super_block *sb) void fuse_mount_destroy(struct fuse_mount *fm) { fuse_conn_put(fm->fc); - kfree(fm); + kfree_rcu(fm, rcu); } EXPORT_SYMBOL(fuse_mount_destroy); From 3f4b9cc86627bfc8a36081e6c7785709118697df Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Tue, 12 Dec 2023 14:33:23 +0100 Subject: [PATCH 076/161] fuse: fix VM_MAYSHARE and direct_io_allow_mmap BugLink: https://bugs.launchpad.net/bugs/2060531 [ Upstream commit 9511176bbaee0ac60ecc84e7b01cf5972a59ea17 ] There were multiple issues with direct_io_allow_mmap: - fuse_link_write_file() was missing, resulting in warnings in fuse_write_file_get() and EIO from msync() - "vma->vm_ops = &fuse_file_vm_ops" was not set, but especially fuse_page_mkwrite is needed. The semantics of invalidate_inode_pages2() is so far not clearly defined in fuse_file_mmap. It dates back to commit 3121bfe76311 ("fuse: fix "direct_io" private mmap") Though, as direct_io_allow_mmap is a new feature, that was for MAP_PRIVATE only. As invalidate_inode_pages2() is calling into fuse_launder_folio() and writes out dirty pages, it should be safe to call invalidate_inode_pages2 for MAP_PRIVATE and MAP_SHARED as well. Cc: Hao Xu Cc: stable@vger.kernel.org Fixes: e78662e818f9 ("fuse: add a new fuse init flag to relax restrictions in no cache mode") Signed-off-by: Bernd Schubert Reviewed-by: Amir Goldstein Signed-off-by: Miklos Szeredi Signed-off-by: Sasha Levin Signed-off-by: Paolo Pisati (cherry picked from commit d63daa6f44c6ba86e9c3be103eef605b98627f3d) (cherry picked from commit cd10b03c97e5e005a7bf8ee0a15fca7070e48f18) --- fs/fuse/file.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 607dcecb1f86ea..f087f1b07311f2 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2489,7 +2489,8 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) return fuse_dax_mmap(file, vma); if (ff->open_flags & FOPEN_DIRECT_IO) { - /* Can't provide the coherency needed for MAP_SHARED + /* + * Can't provide the coherency needed for MAP_SHARED * if FUSE_DIRECT_IO_ALLOW_MMAP isn't set. */ if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_allow_mmap) @@ -2497,7 +2498,10 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) invalidate_inode_pages2(file->f_mapping); - return generic_file_mmap(file, vma); + if (!(vma->vm_flags & VM_MAYSHARE)) { + /* MAP_PRIVATE */ + return generic_file_mmap(file, vma); + } } if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) From a3d6ee4abaf61d6b56bc3f80f54a360df496a9ac Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 28 Feb 2024 16:50:49 +0100 Subject: [PATCH 077/161] fuse: replace remaining make_bad_inode() with fuse_make_bad() BugLink: https://bugs.launchpad.net/bugs/2060531 [ Upstream commit 82e081aebe4d9c26e196c8260005cc4762b57a5d ] fuse_do_statx() was added with the wrong helper. Fixes: d3045530bdd2 ("fuse: implement statx") Cc: # v6.6 Signed-off-by: Miklos Szeredi Signed-off-by: Sasha Levin Signed-off-by: Paolo Pisati (cherry picked from commit 068b9a5d382dcbb686a690a8f327b7715baca0ed) (cherry picked from commit 03df28c4f41e9bf5e4dbfbd9dc8763450ddc1d22) --- fs/fuse/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b952cded40560f..9bff12e245372d 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1210,7 +1210,7 @@ static int fuse_do_statx(struct inode *inode, struct file *file, if (((sx->mask & STATX_SIZE) && !fuse_valid_size(sx->size)) || ((sx->mask & STATX_TYPE) && (!fuse_valid_type(sx->mode) || inode_wrong_type(inode, sx->mode)))) { - make_bad_inode(inode); + fuse_make_bad(inode); return -EIO; } From 0ed0654e43168002c03e9b1489f9be1a0b7cf189 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 28 Feb 2024 16:50:49 +0100 Subject: [PATCH 078/161] fuse: fix root lookup with nonzero generation BugLink: https://bugs.launchpad.net/bugs/2060531 [ Upstream commit 68ca1b49e430f6534d0774a94147a823e3b8b26e ] The root inode has a fixed nodeid and generation (1, 0). Prior to the commit 15db16837a35 ("fuse: fix illegal access to inode with reused nodeid") generation number on lookup was ignored. After this commit lookup with the wrong generation number resulted in the inode being unhashed. This is correct for non-root inodes, but replacing the root inode is wrong and results in weird behavior. Fix by reverting to the old behavior if ignoring the generation for the root inode, but issuing a warning in dmesg. Reported-by: Antonio SJ Musumeci Closes: https://lore.kernel.org/all/CAOQ4uxhek5ytdN8Yz2tNEOg5ea4NkBb4nk0FGPjPk_9nz-VG3g@mail.gmail.com/ Fixes: 15db16837a35 ("fuse: fix illegal access to inode with reused nodeid") Cc: # v5.14 Signed-off-by: Miklos Szeredi Signed-off-by: Sasha Levin Signed-off-by: Paolo Pisati (cherry picked from commit 24664d2e6bc98fd491b1e17e1fa85fbc5b406abb) (cherry picked from commit 9eb11615e0a14fceae5a382c22892d6c58ad0502) --- fs/fuse/dir.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 9bff12e245372d..d89728c326b615 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -391,6 +391,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name err = -EIO; if (fuse_invalid_attr(&outarg->attr)) goto out_put_forget; + if (outarg->nodeid == FUSE_ROOT_ID && outarg->generation != 0) { + pr_warn_once("root generation should be zero\n"); + outarg->generation = 0; + } *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, &outarg->attr, ATTR_TIMEOUT(outarg), From 7d6dea83ef231a1c4811390bdd655a41f4075f64 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 28 Feb 2024 16:50:49 +0100 Subject: [PATCH 079/161] fuse: don't unhash root BugLink: https://bugs.launchpad.net/bugs/2060531 [ Upstream commit b1fe686a765e6c0d71811d825b5a1585a202b777 ] The root inode is assumed to be always hashed. Do not unhash the root inode even if it is marked BAD. Fixes: 5d069dbe8aaf ("fuse: fix bad inode") Cc: # v5.11 Signed-off-by: Miklos Szeredi Signed-off-by: Sasha Levin Signed-off-by: Paolo Pisati (cherry picked from commit 2ae27d177bbd7ffc7df6fdce1bd281f2e857b9d4) (cherry picked from commit 82bbe5370ab89f0cafd8c0e78dbd42001a0a5c75) --- fs/fuse/fuse_i.h | 1 - fs/fuse/inode.c | 7 +++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 33204ae49daf66..37bad8b0c73981 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -940,7 +940,6 @@ static inline bool fuse_stale_inode(const struct inode *inode, int generation, static inline void fuse_make_bad(struct inode *inode) { - remove_inode_hash(inode); set_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state); } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 980815ddca31dd..19f75568dc8087 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -468,8 +468,11 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, } else if (fuse_stale_inode(inode, generation, attr)) { /* nodeid was reused, any I/O on the old inode should fail */ fuse_make_bad(inode); - iput(inode); - goto retry; + if (inode != d_inode(sb->s_root)) { + remove_inode_hash(inode); + iput(inode); + goto retry; + } } fi = get_fuse_inode(inode); spin_lock(&fi->lock); From 9131aa524977fa5b998ff1e59d90b5c4a8f374d8 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sat, 13 Apr 2024 17:34:31 -0700 Subject: [PATCH 080/161] fuse: fix leaked ENOSYS error on first statx call BugLink: https://bugs.launchpad.net/bugs/2068087 commit eb4b691b9115fae4c844f5941418335575cf667f upstream. FUSE attempts to detect server support for statx by trying it once and setting no_statx=1 if it fails with ENOSYS, but consider the following scenario: - Userspace (e.g. sh) calls stat() on a file * succeeds - Userspace (e.g. lsd) calls statx(BTIME) on the same file - request_mask = STATX_BASIC_STATS | STATX_BTIME - first pass: sync=true due to differing cache_mask - statx fails and returns ENOSYS - set no_statx and retry - retry sets mask = STATX_BASIC_STATS - now mask == cache_mask; sync=false (time_before: still valid) - so we take the "else if (stat)" path - "err" is still ENOSYS from the failed statx call Fix this by zeroing "err" before retrying the failed call. Fixes: d3045530bdd2 ("fuse: implement statx") Cc: stable@vger.kernel.org # v6.6 Signed-off-by: Danny Lin Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman Signed-off-by: Portia Stephens Signed-off-by: Stefan Bader (cherry picked from commit 31b31993483d29a425ccb5b529b44a07d242c5e9) (cherry picked from commit 1a024a805c5f333462e1b6864d5a60b38483c0f8) --- fs/fuse/dir.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index d89728c326b615..74179123f40a42 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1317,6 +1317,7 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, err = fuse_do_statx(inode, file, stat); if (err == -ENOSYS) { fc->no_statx = 1; + err = 0; goto retry; } } else { From f6ed7ee90a030d4fabedc6909f43559f2ece2d21 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 2 Jul 2024 17:22:41 -0500 Subject: [PATCH 081/161] fuse: verify {g,u}id mount options correctly BugLink: https://bugs.launchpad.net/bugs/2083196 commit 525bd65aa759ec320af1dc06e114ed69733e9e23 upstream. As was done in 0200679fc795 ("tmpfs: verify {g,u}id mount options correctly") we need to validate that the requested uid and/or gid is representable in the filesystem's idmapping. Cribbing from the above commit log, The contract for {g,u}id mount options and {g,u}id values in general set from userspace has always been that they are translated according to the caller's idmapping. In so far, fuse has been doing the correct thing. But since fuse is mountable in unprivileged contexts it is also necessary to verify that the resulting {k,g}uid is representable in the namespace of the superblock. Fixes: c30da2e981a7 ("fuse: convert to use the new mount API") Cc: stable@vger.kernel.org # 5.4+ Signed-off-by: Eric Sandeen Link: https://lore.kernel.org/r/8f07d45d-c806-484d-a2e3-7a2199df1cd2@redhat.com Reviewed-by: Christian Brauner Reviewed-by: Josef Bacik Signed-off-by: Christian Brauner Signed-off-by: Greg Kroah-Hartman Signed-off-by: Portia Stephens Signed-off-by: Roxana Nicolescu (cherry picked from commit 4bbcf6923a268014f3b4085f1b8794918dad866c) (cherry picked from commit db84bc2ba32890e9dada9ab71331665f17df91a2) --- fs/fuse/inode.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 19f75568dc8087..bd34ea550b27f0 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -741,6 +741,8 @@ static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param) struct fs_parse_result result; struct fuse_fs_context *ctx = fsc->fs_private; int opt; + kuid_t kuid; + kgid_t kgid; if (fsc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { /* @@ -785,16 +787,30 @@ static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param) break; case OPT_USER_ID: - ctx->user_id = make_kuid(fsc->user_ns, result.uint_32); - if (!uid_valid(ctx->user_id)) + kuid = make_kuid(fsc->user_ns, result.uint_32); + if (!uid_valid(kuid)) return invalfc(fsc, "Invalid user_id"); + /* + * The requested uid must be representable in the + * filesystem's idmapping. + */ + if (!kuid_has_mapping(fsc->user_ns, kuid)) + return invalfc(fsc, "Invalid user_id"); + ctx->user_id = kuid; ctx->user_id_present = true; break; case OPT_GROUP_ID: - ctx->group_id = make_kgid(fsc->user_ns, result.uint_32); - if (!gid_valid(ctx->group_id)) + kgid = make_kgid(fsc->user_ns, result.uint_32);; + if (!gid_valid(kgid)) + return invalfc(fsc, "Invalid group_id"); + /* + * The requested gid must be representable in the + * filesystem's idmapping. + */ + if (!kgid_has_mapping(fsc->user_ns, kgid)) return invalfc(fsc, "Invalid group_id"); + ctx->group_id = kgid; ctx->group_id_present = true; break; From 541cb7944e5441fabad013f164b88406bfab9f98 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 26 Aug 2024 14:19:04 -0700 Subject: [PATCH 082/161] fuse: update stats for pages in dropped aux writeback list BugLink: https://bugs.launchpad.net/bugs/2085849 commit f7790d67785302b3116bbbfda62a5a44524601a3 upstream. In the case where the aux writeback list is dropped (e.g. the pages have been truncated or the connection is broken), the stats for its pages and backing device info need to be updated as well. Fixes: e2653bd53a98 ("fuse: fix leaked aux requests") Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Cc: # v5.1 Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman Signed-off-by: Manuel Diewald Signed-off-by: Roxana Nicolescu (cherry picked from commit 38d54e5c3f52ecfe52c3108fd5605ef691380c18) (cherry picked from commit 5937aa71ec01b487f393968faacc668c73193a49) --- fs/fuse/file.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f087f1b07311f2..d8380a2948cd50 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1757,10 +1757,16 @@ __acquires(fi->lock) fuse_writepage_finish(fm, wpa); spin_unlock(&fi->lock); - /* After fuse_writepage_finish() aux request list is private */ + /* After rb_erase() aux request list is private */ for (aux = wpa->next; aux; aux = next) { + struct backing_dev_info *bdi = inode_to_bdi(aux->inode); + next = aux->next; aux->next = NULL; + + dec_wb_stat(&bdi->wb, WB_WRITEBACK); + dec_node_page_state(aux->ia.ap.pages[0], NR_WRITEBACK_TEMP); + wb_writeout_inc(&bdi->wb); fuse_writepage_free(aux); } From 29c1c734cc227d099772bb41861084e2b99f2ee0 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 19 Aug 2024 19:52:30 +0200 Subject: [PATCH 083/161] fuse: use unsigned type for getxattr/listxattr size truncation BugLink: https://bugs.launchpad.net/bugs/2085849 commit b18915248a15eae7d901262f108d6ff0ffb4ffc1 upstream. The existing code uses min_t(ssize_t, outarg.size, XATTR_LIST_MAX) when parsing the FUSE daemon's response to a zero-length getxattr/listxattr request. On 32-bit kernels, where ssize_t and outarg.size are the same size, this is wrong: The min_t() will pass through any size values that are negative when interpreted as signed. fuse_listxattr() will then return this userspace-supplied negative value, which callers will treat as an error value. This kind of bug pattern can lead to fairly bad security bugs because of how error codes are used in the Linux kernel. If a caller were to convert the numeric error into an error pointer, like so: struct foo *func(...) { int len = fuse_getxattr(..., NULL, 0); if (len < 0) return ERR_PTR(len); ... } then it would end up returning this userspace-supplied negative value cast to a pointer - but the caller of this function wouldn't recognize it as an error pointer (IS_ERR_VALUE() only detects values in the narrow range in which legitimate errno values are), and so it would just be treated as a kernel pointer. I think there is at least one theoretical codepath where this could happen, but that path would involve virtio-fs with submounts plus some weird SELinux configuration, so I think it's probably not a concern in practice. Cc: stable@vger.kernel.org # v4.9 Fixes: 63401ccdb2ca ("fuse: limit xattr returned size") Signed-off-by: Jann Horn Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman Signed-off-by: Manuel Diewald Signed-off-by: Roxana Nicolescu (cherry picked from commit 3f308f5ac29fb3340f24ec6639bb3c597a2900ee) (cherry picked from commit 78f1456d0178060b17183b9b2133750fc2fee10e) --- fs/fuse/xattr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index c2311677f89318..899eda6d546387 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -81,7 +81,7 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, } ret = fuse_simple_request(fm, &args); if (!ret && !size) - ret = min_t(ssize_t, outarg.size, XATTR_SIZE_MAX); + ret = min_t(size_t, outarg.size, XATTR_SIZE_MAX); if (ret == -ENOSYS) { fm->fc->no_getxattr = 1; ret = -EOPNOTSUPP; @@ -143,7 +143,7 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) } ret = fuse_simple_request(fm, &args); if (!ret && !size) - ret = min_t(ssize_t, outarg.size, XATTR_LIST_MAX); + ret = min_t(size_t, outarg.size, XATTR_LIST_MAX); if (ret > 0 && size) ret = fuse_verify_xattr_list(list, ret); if (ret == -ENOSYS) { From 2be22a94484504ebc787de28d4bec82775826402 Mon Sep 17 00:00:00 2001 From: yangyun Date: Fri, 23 Aug 2024 16:51:46 +0800 Subject: [PATCH 084/161] fuse: fix memory leak in fuse_create_open BugLink: https://bugs.launchpad.net/bugs/2085849 commit 3002240d16494d798add0575e8ba1f284258ab34 upstream. The memory of struct fuse_file is allocated but not freed when get_create_ext return error. Fixes: 3e2b6fdbdc9a ("fuse: send security context of inode on file") Cc: stable@vger.kernel.org # v5.17 Signed-off-by: yangyun Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman Signed-off-by: Manuel Diewald Signed-off-by: Roxana Nicolescu (cherry picked from commit 4fad36d0bec45a4a167bcff1f924c1f0d935b5bd) (cherry picked from commit d3e4a77e4bbc82583fd541e649ba8b989b05fbd6) --- fs/fuse/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 74179123f40a42..8cad9b19091243 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -668,7 +668,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, err = get_create_ext(&args, dir, entry, mode); if (err) - goto out_put_forget_req; + goto out_free_ff; err = fuse_simple_request(fm, &args); free_ext_value(&args); From 5ae9180bc98afd42abe42f65654a0e7e1bf4f472 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 28 Aug 2024 15:55:17 +0200 Subject: [PATCH 085/161] fuse: clear PG_uptodate when using a stolen page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BugLink: https://bugs.launchpad.net/bugs/2085849 commit 76a51ac00ca2a72fe3e168b7fb0e70f75ba6f512 upstream. Originally when a stolen page was inserted into fuse's page cache by fuse_try_move_page(), it would be marked uptodate. Then fuse_readpages_end() would call SetPageUptodate() again on the already uptodate page. Commit 413e8f014c8b ("fuse: Convert fuse_readpages_end() to use folio_end_read()") changed that by replacing the SetPageUptodate() + unlock_page() combination with folio_end_read(), which does mostly the same, except it sets the uptodate flag with an xor operation, which in the above scenario resulted in the uptodate flag being cleared, which in turn resulted in EIO being returned on the read. Fix by clearing PG_uptodate instead of setting it in fuse_try_move_page(), conforming to the expectation of folio_end_read(). Reported-by: Jürg Billeter Debugged-by: Matthew Wilcox Fixes: 413e8f014c8b ("fuse: Convert fuse_readpages_end() to use folio_end_read()") Cc: # v6.10 Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman Signed-off-by: Manuel Diewald Signed-off-by: Roxana Nicolescu (cherry picked from commit 0dcf2ad6c2c4e7d8cde48aa862ac6e8bb56ebefe) (cherry picked from commit 70ab16aa42759fdc30738ff5cfb21233813ca208) --- fs/fuse/dev.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8573d79ef29c80..06f0912c836a18 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -773,7 +773,6 @@ static int fuse_check_folio(struct folio *folio) (folio->flags & PAGE_FLAGS_CHECK_AT_PREP & ~(1 << PG_locked | 1 << PG_referenced | - 1 << PG_uptodate | 1 << PG_lru | 1 << PG_active | 1 << PG_workingset | @@ -818,9 +817,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) newfolio = page_folio(buf->page); - if (!folio_test_uptodate(newfolio)) - folio_mark_uptodate(newfolio); - + folio_clear_uptodate(newfolio); folio_clear_mappedtodisk(newfolio); if (fuse_check_folio(newfolio) != 0) From d0bde2b99434adf063b407ed705daa6d6d1eb892 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 11 Mar 2025 08:51:19 +0900 Subject: [PATCH 086/161] virtiofs: use pages instead of pointer for kernel direct IO BugLink: https://bugs.launchpad.net/bugs/2101915 [ Upstream commit 41748675c0bf252b3c5f600a95830f0936d366c1 ] When trying to insert a 10MB kernel module kept in a virtio-fs with cache disabled, the following warning was reported: ------------[ cut here ]------------ WARNING: CPU: 1 PID: 404 at mm/page_alloc.c:4551 ...... Modules linked in: CPU: 1 PID: 404 Comm: insmod Not tainted 6.9.0-rc5+ #123 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) ...... RIP: 0010:__alloc_pages+0x2bf/0x380 ...... Call Trace: ? __warn+0x8e/0x150 ? __alloc_pages+0x2bf/0x380 __kmalloc_large_node+0x86/0x160 __kmalloc+0x33c/0x480 virtio_fs_enqueue_req+0x240/0x6d0 virtio_fs_wake_pending_and_unlock+0x7f/0x190 queue_request_and_unlock+0x55/0x60 fuse_simple_request+0x152/0x2b0 fuse_direct_io+0x5d2/0x8c0 fuse_file_read_iter+0x121/0x160 __kernel_read+0x151/0x2d0 kernel_read+0x45/0x50 kernel_read_file+0x1a9/0x2a0 init_module_from_file+0x6a/0xe0 idempotent_init_module+0x175/0x230 __x64_sys_finit_module+0x5d/0xb0 x64_sys_call+0x1c3/0x9e0 do_syscall_64+0x3d/0xc0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 ...... ---[ end trace 0000000000000000 ]--- The warning is triggered as follows: 1) syscall finit_module() handles the module insertion and it invokes kernel_read_file() to read the content of the module first. 2) kernel_read_file() allocates a 10MB buffer by using vmalloc() and passes it to kernel_read(). kernel_read() constructs a kvec iter by using iov_iter_kvec() and passes it to fuse_file_read_iter(). 3) virtio-fs disables the cache, so fuse_file_read_iter() invokes fuse_direct_io(). As for now, the maximal read size for kvec iter is only limited by fc->max_read. For virtio-fs, max_read is UINT_MAX, so fuse_direct_io() doesn't split the 10MB buffer. It saves the address and the size of the 10MB-sized buffer in out_args[0] of a fuse request and passes the fuse request to virtio_fs_wake_pending_and_unlock(). 4) virtio_fs_wake_pending_and_unlock() uses virtio_fs_enqueue_req() to queue the request. Because virtiofs need DMA-able address, so virtio_fs_enqueue_req() uses kmalloc() to allocate a bounce buffer for all fuse args, copies these args into the bounce buffer and passed the physical address of the bounce buffer to virtiofsd. The total length of these fuse args for the passed fuse request is about 10MB, so copy_args_to_argbuf() invokes kmalloc() with a 10MB size parameter and it triggers the warning in __alloc_pages(): if (WARN_ON_ONCE_GFP(order > MAX_PAGE_ORDER, gfp)) return NULL; 5) virtio_fs_enqueue_req() will retry the memory allocation in a kworker, but it won't help, because kmalloc() will always return NULL due to the abnormal size and finit_module() will hang forever. A feasible solution is to limit the value of max_read for virtio-fs, so the length passed to kmalloc() will be limited. However it will affect the maximal read size for normal read. And for virtio-fs write initiated from kernel, it has the similar problem but now there is no way to limit fc->max_write in kernel. So instead of limiting both the values of max_read and max_write in kernel, introducing use_pages_for_kvec_io in fuse_conn and setting it as true in virtiofs. When use_pages_for_kvec_io is enabled, fuse will use pages instead of pointer to pass the KVEC_IO data. After switching to pages for KVEC_IO data, these pages will be used for DMA through virtio-fs. If these pages are backed by vmalloc(), {flush|invalidate}_kernel_vmap_range() are necessary to flush or invalidate the cache before the DMA operation. So add two new fields in fuse_args_pages to record the base address of vmalloc area and the condition indicating whether invalidation is needed. Perform the flush in fuse_get_user_pages() for write operations and the invalidation in fuse_release_user_pages() for read operations. It may seem necessary to introduce another field in fuse_conn to indicate that these KVEC_IO pages are used for DMA, However, considering that virtio-fs is currently the only user of use_pages_for_kvec_io, just reuse use_pages_for_kvec_io to indicate that these pages will be used for DMA. Fixes: a62a8ef9d97d ("virtio-fs: add virtiofs filesystem") Signed-off-by: Hou Tao Tested-by: Jingbo Xu Signed-off-by: Miklos Szeredi Signed-off-by: Sasha Levin [koichiroden: adjusted context due to missing commits: 738adade96b2 ("fuse: Fix missing FOLL_PIN for direct-io") 7dc4e97a4f9a ("fuse: introduce FUSE_PASSTHROUGH capability")] CVE-2024-53219 Signed-off-by: Koichiro Den Signed-off-by: Stefan Bader (cherry picked from commit 6996dac8834951514416624e885cb4150d67df07) (cherry picked from commit a98bbb2654b92e413e2bb1c98dc21764b7548030) --- fs/fuse/file.c | 62 +++++++++++++++++++++++++++++++-------------- fs/fuse/fuse_i.h | 6 +++++ fs/fuse/virtio_fs.c | 1 + 3 files changed, 50 insertions(+), 19 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d8380a2948cd50..d32ca529f8460c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -624,7 +624,7 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, args->out_args[0].size = count; } -static void fuse_release_user_pages(struct fuse_args_pages *ap, +static void fuse_release_user_pages(struct fuse_args_pages *ap, ssize_t nres, bool should_dirty) { unsigned int i; @@ -634,6 +634,9 @@ static void fuse_release_user_pages(struct fuse_args_pages *ap, set_page_dirty_lock(ap->pages[i]); put_page(ap->pages[i]); } + + if (nres > 0 && ap->args.invalidate_vmap) + invalidate_kernel_vmap_range(ap->args.vmap_base, nres); } static void fuse_io_release(struct kref *kref) @@ -732,25 +735,29 @@ static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args, struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); struct fuse_io_priv *io = ia->io; ssize_t pos = -1; - - fuse_release_user_pages(&ia->ap, io->should_dirty); + size_t nres; if (err) { /* Nothing */ } else if (io->write) { if (ia->write.out.size > ia->write.in.size) { err = -EIO; - } else if (ia->write.in.size != ia->write.out.size) { - pos = ia->write.in.offset - io->offset + - ia->write.out.size; + } else { + nres = ia->write.out.size; + if (ia->write.in.size != ia->write.out.size) + pos = ia->write.in.offset - io->offset + + ia->write.out.size; } } else { u32 outsize = args->out_args[0].size; + nres = outsize; if (ia->read.in.size != outsize) pos = ia->read.in.offset - io->offset + outsize; } + fuse_release_user_pages(&ia->ap, err ?: nres, io->should_dirty); + fuse_aio_complete(io, err, pos); fuse_io_free(ia); } @@ -1390,24 +1397,37 @@ static inline size_t fuse_get_frag_size(const struct iov_iter *ii, static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, size_t *nbytesp, int write, - unsigned int max_pages) + unsigned int max_pages, + bool use_pages_for_kvec_io) { + bool flush_or_invalidate = false; size_t nbytes = 0; /* # bytes already packed in req */ ssize_t ret = 0; - /* Special case for kernel I/O: can copy directly into the buffer */ + /* Special case for kernel I/O: can copy directly into the buffer. + * However if the implementation of fuse_conn requires pages instead of + * pointer (e.g., virtio-fs), use iov_iter_extract_pages() instead. + */ if (iov_iter_is_kvec(ii)) { - unsigned long user_addr = fuse_get_user_addr(ii); - size_t frag_size = fuse_get_frag_size(ii, *nbytesp); + void *user_addr = (void *)fuse_get_user_addr(ii); - if (write) - ap->args.in_args[1].value = (void *) user_addr; - else - ap->args.out_args[0].value = (void *) user_addr; + if (!use_pages_for_kvec_io) { + size_t frag_size = fuse_get_frag_size(ii, *nbytesp); - iov_iter_advance(ii, frag_size); - *nbytesp = frag_size; - return 0; + if (write) + ap->args.in_args[1].value = user_addr; + else + ap->args.out_args[0].value = user_addr; + + iov_iter_advance(ii, frag_size); + *nbytesp = frag_size; + return 0; + } + + if (is_vmalloc_addr(user_addr)) { + ap->args.vmap_base = user_addr; + flush_or_invalidate = true; + } } while (nbytes < *nbytesp && ap->num_pages < max_pages) { @@ -1433,6 +1453,10 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, (PAGE_SIZE - ret) & (PAGE_SIZE - 1); } + if (write && flush_or_invalidate) + flush_kernel_vmap_range(ap->args.vmap_base, nbytes); + + ap->args.invalidate_vmap = !write && flush_or_invalidate; ap->args.user_pages = true; if (write) ap->args.in_pages = true; @@ -1500,7 +1524,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, size_t nbytes = min(count, nmax); err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write, - max_pages); + max_pages, fc->use_pages_for_kvec_io); if (err && !nbytes) break; @@ -1514,7 +1538,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, } if (!io->async || nres < 0) { - fuse_release_user_pages(&ia->ap, io->should_dirty); + fuse_release_user_pages(&ia->ap, nres, io->should_dirty); fuse_io_free(ia); } ia = NULL; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 37bad8b0c73981..16eb130b623ead 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -283,9 +283,12 @@ struct fuse_args { bool page_replace:1; bool may_block:1; bool is_ext:1; + bool invalidate_vmap:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); + /* Used for kvec iter backed by vmalloc address */ + void *vmap_base; }; struct fuse_args_pages { @@ -818,6 +821,9 @@ struct fuse_conn { /* Is statx not implemented by fs? */ unsigned int no_statx:1; + /* Use pages instead of pointer for kernel I/O */ + unsigned int use_pages_for_kvec_io:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 4d8d4f16c727b9..f7eae090581a50 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1447,6 +1447,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc) fc->delete_stale = true; fc->auto_submounts = true; fc->sync_fs = true; + fc->use_pages_for_kvec_io = true; /* Tell FUSE to split requests that exceed the virtqueue's size */ fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, From df70a52576c40d0f806514f11842b7b02bf670b6 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:28:54 +0100 Subject: [PATCH 087/161] fuse: rename to fuse_dev_end_requests and make non-static This function is needed by fuse_uring.c to clean ring queues, so make it non static. Especially in non-static mode the function name 'end_requests' should be prefixed with fuse_ Signed-off-by: Bernd Schubert Reviewed-by: Josef Bacik Reviewed-by: Joanne Koong Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi (cherry picked from commit 92270d076115ae81b83f0605703602d2d5a866e2) (cherry picked from commit 3bcf5ee25a29776810a942f07a2c94a56f22acb3) (cherry picked from commit 8c0ad2e62cefebfccbd8fbe60ada6a1041ae73e5) --- fs/fuse/dev.c | 7 ++++--- fs/fuse/fuse_dev_i.h | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) create mode 100644 fs/fuse/fuse_dev_i.h diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 06f0912c836a18..8f1e607f9aa7da 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "fuse_dev_i.h" #include #include @@ -2076,7 +2077,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) } /* Abort all requests on the given list (pending or processing) */ -static void end_requests(struct list_head *head) +void fuse_dev_end_requests(struct list_head *head) { while (!list_empty(head)) { struct fuse_req *req; @@ -2179,7 +2180,7 @@ void fuse_abort_conn(struct fuse_conn *fc) wake_up_all(&fc->blocked_waitq); spin_unlock(&fc->lock); - end_requests(&to_end); + fuse_dev_end_requests(&to_end); } else { spin_unlock(&fc->lock); } @@ -2209,7 +2210,7 @@ int fuse_dev_release(struct inode *inode, struct file *file) list_splice_init(&fpq->processing[i], &to_end); spin_unlock(&fpq->lock); - end_requests(&to_end); + fuse_dev_end_requests(&to_end); /* Are we the last open device? */ if (atomic_dec_and_test(&fc->dev_count)) { diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h new file mode 100644 index 00000000000000..4fcff2223fa60f --- /dev/null +++ b/fs/fuse/fuse_dev_i.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2008 Miklos Szeredi + */ +#ifndef _FS_FUSE_DEV_I_H +#define _FS_FUSE_DEV_I_H + +#include + +void fuse_dev_end_requests(struct list_head *head); + +#endif + From 9516cda7036ee275cd154ae2e487fc81f9d0d293 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:28:55 +0100 Subject: [PATCH 088/161] fuse: Move fuse_get_dev to header file Another preparation patch, as this function will be needed by fuse/dev.c and fuse/dev_uring.c. Signed-off-by: Bernd Schubert Reviewed-by: Josef Bacik Reviewed-by: Joanne Koong Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi (cherry picked from commit 867d93dcdede5ea453543085d1ed0bf43cde60de) (cherry picked from commit 1666101a984386f2409e926fd81ac4839e799d64) (cherry picked from commit f0ca697bc9379f90d1643f057b46e595e5717087) --- fs/fuse/dev.c | 9 --------- fs/fuse/fuse_dev_i.h | 9 +++++++++ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8f1e607f9aa7da..6741addbcfcc6a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -32,15 +32,6 @@ MODULE_ALIAS("devname:fuse"); static struct kmem_cache *fuse_req_cachep; -static struct fuse_dev *fuse_get_dev(struct file *file) -{ - /* - * Lockless access is OK, because file->private data is set - * once during mount and is valid until the file is released. - */ - return READ_ONCE(file->private_data); -} - static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req) { INIT_LIST_HEAD(&req->list); diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 4fcff2223fa60f..e7ea1b21c18204 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -8,6 +8,15 @@ #include +static inline struct fuse_dev *fuse_get_dev(struct file *file) +{ + /* + * Lockless access is OK, because file->private data is set + * once during mount and is valid until the file is released. + */ + return READ_ONCE(file->private_data); +} + void fuse_dev_end_requests(struct list_head *head); #endif From ac61182a0fdddfdb20736138ddf569159998dff2 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:28:56 +0100 Subject: [PATCH 089/161] fuse: Move request bits These are needed by fuse-over-io-uring. Signed-off-by: Bernd Schubert Reviewed-by: Josef Bacik Reviewed-by: Joanne Koong Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi (cherry picked from commit 88be7aa98d91f5761f8764273ebd961915e4632d) (cherry picked from commit 6b6c21713c8a1402321fc526815a80cbdbe82db7) (cherry picked from commit 0b74b90f1de48dd084dd4b6c080c8ba960073645) --- fs/fuse/dev.c | 4 ---- fs/fuse/fuse_dev_i.h | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6741addbcfcc6a..a9b44de8f03ebf 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -26,10 +26,6 @@ MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); -/* Ordinary requests have even IDs, while interrupts IDs are odd */ -#define FUSE_INT_REQ_BIT (1ULL << 0) -#define FUSE_REQ_ID_STEP (1ULL << 1) - static struct kmem_cache *fuse_req_cachep; static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req) diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index e7ea1b21c18204..08a7e88e002773 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -8,6 +8,10 @@ #include +/* Ordinary requests have even IDs, while interrupts IDs are odd */ +#define FUSE_INT_REQ_BIT (1ULL << 0) +#define FUSE_REQ_ID_STEP (1ULL << 1) + static inline struct fuse_dev *fuse_get_dev(struct file *file) { /* From 3c6364bfff95807d50bff5e4b30f72c8d70442ef Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:28:58 +0100 Subject: [PATCH 090/161] fuse: make args->in_args[0] to be always the header This change sets up FUSE operations to always have headers in args.in_args[0], even for opcodes without an actual header. This step prepares for a clean separation of payload from headers, initially it is used by fuse-over-io-uring. For opcodes without a header, we use a zero-sized struct as a placeholder. This approach: - Keeps things consistent across all FUSE operations - Will help with payload alignment later - Avoids future issues when header sizes change Op codes that already have an op code specific header do not need modification. Op codes that have neither payload nor op code headers are not modified either (FUSE_READLINK and FUSE_DESTROY). FUSE_BATCH_FORGET already has the header in the right place, but is not using fuse_copy_args - as -over-uring is currently not handling forgets it does not matter for now, but header separation will later need special attention for that op code. Correct the struct fuse_args->in_args array max size. Signed-off-by: Bernd Schubert Reviewed-by: Joanne Koong Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi (cherry picked from commit 7ccd86ba3a485a8bc33478776eb7053d9adb7816) (cherry picked from commit d31f04ec17bb8deb84f1d361441e5886feb28e85) (cherry picked from commit b73e07c6018497bc4918c09e2b80e2531042fa6e) --- fs/fuse/dax.c | 11 ++++++----- fs/fuse/dev.c | 9 +++++---- fs/fuse/dir.c | 32 ++++++++++++++++++-------------- fs/fuse/fuse_i.h | 15 ++++++++++++++- fs/fuse/xattr.c | 7 ++++--- 5 files changed, 47 insertions(+), 27 deletions(-) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 6e71904c396f18..20264a6b7df3f5 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -240,11 +240,12 @@ static int fuse_send_removemapping(struct inode *inode, args.opcode = FUSE_REMOVEMAPPING; args.nodeid = fi->nodeid; - args.in_numargs = 2; - args.in_args[0].size = sizeof(*inargp); - args.in_args[0].value = inargp; - args.in_args[1].size = inargp->count * sizeof(*remove_one); - args.in_args[1].value = remove_one; + args.in_numargs = 3; + fuse_set_zero_arg0(&args); + args.in_args[1].size = sizeof(*inargp); + args.in_args[1].value = inargp; + args.in_args[2].size = inargp->count * sizeof(*remove_one); + args.in_args[2].value = remove_one; return fuse_simple_request(fm, &args); } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a9b44de8f03ebf..99afdfd63b0119 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1687,7 +1687,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, args = &ap->args; args->nodeid = outarg->nodeid; args->opcode = FUSE_NOTIFY_REPLY; - args->in_numargs = 2; + args->in_numargs = 3; args->in_pages = true; args->end = fuse_retrieve_end; @@ -1714,9 +1714,10 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, } ra->inarg.offset = outarg->offset; ra->inarg.size = total_len; - args->in_args[0].size = sizeof(ra->inarg); - args->in_args[0].value = &ra->inarg; - args->in_args[1].size = total_len; + fuse_set_zero_arg0(args); + args->in_args[1].size = sizeof(ra->inarg); + args->in_args[1].value = &ra->inarg; + args->in_args[2].size = total_len; err = fuse_simple_notify_reply(fm, args, outarg->notify_unique); if (err) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 8cad9b19091243..108da66d67c977 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -175,9 +175,10 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, memset(outarg, 0, sizeof(struct fuse_entry_out)); args->opcode = FUSE_LOOKUP; args->nodeid = nodeid; - args->in_numargs = 1; - args->in_args[0].size = name->len + 1; - args->in_args[0].value = name->name; + args->in_numargs = 2; + fuse_set_zero_arg0(args); + args->in_args[1].size = name->len + 1; + args->in_args[1].value = name->name; args->out_numargs = 1; args->out_args[0].size = sizeof(struct fuse_entry_out); args->out_args[0].value = outarg; @@ -916,11 +917,12 @@ static int fuse_symlink(struct user_namespace *mnt_userns, struct inode *dir, FUSE_ARGS(args); args.opcode = FUSE_SYMLINK; - args.in_numargs = 2; - args.in_args[0].size = entry->d_name.len + 1; - args.in_args[0].value = entry->d_name.name; - args.in_args[1].size = len; - args.in_args[1].value = link; + args.in_numargs = 3; + fuse_set_zero_arg0(&args); + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; + args.in_args[2].size = len; + args.in_args[2].value = link; return create_new_entry(fm, &args, dir, entry, S_IFLNK); } @@ -980,9 +982,10 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) args.opcode = FUSE_UNLINK; args.nodeid = get_node_id(dir); - args.in_numargs = 1; - args.in_args[0].size = entry->d_name.len + 1; - args.in_args[0].value = entry->d_name.name; + args.in_numargs = 2; + fuse_set_zero_arg0(&args); + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; err = fuse_simple_request(fm, &args); if (!err) { fuse_dir_changed(dir); @@ -1003,9 +1006,10 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) args.opcode = FUSE_RMDIR; args.nodeid = get_node_id(dir); - args.in_numargs = 1; - args.in_args[0].size = entry->d_name.len + 1; - args.in_args[0].value = entry->d_name.name; + args.in_numargs = 2; + fuse_set_zero_arg0(&args); + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; err = fuse_simple_request(fm, &args); if (!err) { fuse_dir_changed(dir); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 16eb130b623ead..bd56e3acf3ac6c 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -284,7 +284,7 @@ struct fuse_args { bool may_block:1; bool is_ext:1; bool invalidate_vmap:1; - struct fuse_in_arg in_args[3]; + struct fuse_in_arg in_args[4]; struct fuse_arg out_args[2]; void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); /* Used for kvec iter backed by vmalloc address */ @@ -897,6 +897,19 @@ struct fuse_mount { struct rcu_head rcu; }; +/* + * Empty header for FUSE opcodes without specific header needs. + * Used as a placeholder in args->in_args[0] for consistency + * across all FUSE operations, simplifying request handling. + */ +struct fuse_zero_header {}; + +static inline void fuse_set_zero_arg0(struct fuse_args *args) +{ + args->in_args[0].size = sizeof(struct fuse_zero_header); + args->in_args[0].value = NULL; +} + static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb) { return sb->s_fs_info; diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 899eda6d546387..f773373a2f7e77 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -164,9 +164,10 @@ int fuse_removexattr(struct inode *inode, const char *name) args.opcode = FUSE_REMOVEXATTR; args.nodeid = get_node_id(inode); - args.in_numargs = 1; - args.in_args[0].size = strlen(name) + 1; - args.in_args[0].value = name; + args.in_numargs = 2; + fuse_set_zero_arg0(&args); + args.in_args[1].size = strlen(name) + 1; + args.in_args[1].value = name; err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fm->fc->no_removexattr = 1; From 906dec4690d1de08f019c825b096c562817dffe7 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:28:59 +0100 Subject: [PATCH 091/161] fuse: {io-uring} Handle SQEs - register commands This adds basic support for ring SQEs (with opcode=IORING_OP_URING_CMD). For now only FUSE_IO_URING_CMD_REGISTER is handled to register queue entries. Signed-off-by: Bernd Schubert Reviewed-by: Pavel Begunkov # io_uring Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi (cherry picked from commit 24fe962c86f55347385933a1b06ca71b60854690) (cherry picked from commit 35bc6729a56776a6c3dd9286a3200232d26fbc07) (cherry picked from commit aa79616612ff77d03bc87ff18d0a71a41bedae6f) --- fs/fuse/Kconfig | 12 ++ fs/fuse/Makefile | 1 + fs/fuse/dev_uring.c | 327 ++++++++++++++++++++++++++++++++++++++ fs/fuse/dev_uring_i.h | 113 +++++++++++++ fs/fuse/fuse_i.h | 5 + fs/fuse/inode.c | 10 ++ include/uapi/linux/fuse.h | 74 +++++++++ 7 files changed, 542 insertions(+) create mode 100644 fs/fuse/dev_uring.c create mode 100644 fs/fuse/dev_uring_i.h diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 038ed0b9aaa5d6..5f29f6d4a08aa4 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -52,3 +52,15 @@ config FUSE_DAX If you want to allow mounting a Virtio Filesystem with the "dax" option, answer Y. + +config FUSE_IO_URING + bool "FUSE communication over io-uring" + default y + depends on FUSE_FS + depends on IO_URING + help + This allows sending FUSE requests over the io-uring interface and + also adds request core affinity. + + If you want to allow fuse server/client communication through io-uring, + answer Y diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 0c48b35c058d78..ad89ff0d017580 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -9,5 +9,6 @@ obj-$(CONFIG_VIRTIO_FS) += virtiofs.o fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse-$(CONFIG_FUSE_DAX) += dax.o +fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o virtiofs-y := virtio_fs.o diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c new file mode 100644 index 00000000000000..bff79fc60a3157 --- /dev/null +++ b/fs/fuse/dev_uring.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE: Filesystem in Userspace + * Copyright (c) 2023-2024 DataDirect Networks. + */ + +#include "fuse_i.h" +#include "dev_uring_i.h" +#include "fuse_dev_i.h" + +#include +#include +#include + +static bool __read_mostly enable_uring; +module_param(enable_uring, bool, 0644); +MODULE_PARM_DESC(enable_uring, + "Enable userspace communication through io-uring"); + +#define FUSE_URING_IOV_SEGS 2 /* header and payload */ + + +bool fuse_uring_enabled(void) +{ + return enable_uring; +} + +void fuse_uring_destruct(struct fuse_conn *fc) +{ + struct fuse_ring *ring = fc->ring; + int qid; + + if (!ring) + return; + + for (qid = 0; qid < ring->nr_queues; qid++) { + struct fuse_ring_queue *queue = ring->queues[qid]; + + if (!queue) + continue; + + WARN_ON(!list_empty(&queue->ent_avail_queue)); + WARN_ON(!list_empty(&queue->ent_commit_queue)); + + kfree(queue); + ring->queues[qid] = NULL; + } + + kfree(ring->queues); + kfree(ring); + fc->ring = NULL; +} + +/* + * Basic ring setup for this connection based on the provided configuration + */ +static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) +{ + struct fuse_ring *ring; + size_t nr_queues = num_possible_cpus(); + struct fuse_ring *res = NULL; + size_t max_payload_size; + + ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT); + if (!ring) + return NULL; + + ring->queues = kcalloc(nr_queues, sizeof(struct fuse_ring_queue *), + GFP_KERNEL_ACCOUNT); + if (!ring->queues) + goto out_err; + + max_payload_size = max_t(unsigned int, FUSE_MIN_READ_BUFFER, fc->max_write); + max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE); + + spin_lock(&fc->lock); + if (fc->ring) { + /* race, another thread created the ring in the meantime */ + spin_unlock(&fc->lock); + res = fc->ring; + goto out_err; + } + + fc->ring = ring; + ring->nr_queues = nr_queues; + ring->fc = fc; + ring->max_payload_sz = max_payload_size; + + spin_unlock(&fc->lock); + return ring; + +out_err: + kfree(ring->queues); + kfree(ring); + return res; +} + +static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, + int qid) +{ + struct fuse_conn *fc = ring->fc; + struct fuse_ring_queue *queue; + + queue = kzalloc(sizeof(*queue), GFP_KERNEL_ACCOUNT); + if (!queue) + return NULL; + queue->qid = qid; + queue->ring = ring; + spin_lock_init(&queue->lock); + + INIT_LIST_HEAD(&queue->ent_avail_queue); + INIT_LIST_HEAD(&queue->ent_commit_queue); + + spin_lock(&fc->lock); + if (ring->queues[qid]) { + spin_unlock(&fc->lock); + kfree(queue); + return ring->queues[qid]; + } + + /* + * write_once and lock as the caller mostly doesn't take the lock at all + */ + WRITE_ONCE(ring->queues[qid], queue); + spin_unlock(&fc->lock); + + return queue; +} + +/* + * Make a ring entry available for fuse_req assignment + */ +static void fuse_uring_ent_avail(struct fuse_ring_ent *ent, + struct fuse_ring_queue *queue) +{ + WARN_ON_ONCE(!ent->cmd); + list_move(&ent->list, &queue->ent_avail_queue); + ent->state = FRRS_AVAILABLE; +} + +/* + * fuse_uring_req_fetch command handling + */ +static void fuse_uring_do_register(struct fuse_ring_ent *ent, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct fuse_ring_queue *queue = ent->queue; + + spin_lock(&queue->lock); + ent->cmd = cmd; + fuse_uring_ent_avail(ent, queue); + spin_unlock(&queue->lock); +} + +/* + * sqe->addr is a ptr to an iovec array, iov[0] has the headers, iov[1] + * the payload + */ +static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe, + struct iovec iov[FUSE_URING_IOV_SEGS]) +{ + struct iovec __user *uiov = u64_to_user_ptr(READ_ONCE(sqe->addr)); + struct iov_iter iter; + ssize_t ret; + + if (sqe->len != FUSE_URING_IOV_SEGS) + return -EINVAL; + + /* + * Direction for buffer access will actually be READ and WRITE, + * using write for the import should include READ access as well. + */ + ret = import_iovec(WRITE, uiov, FUSE_URING_IOV_SEGS, + FUSE_URING_IOV_SEGS, &iov, &iter); + if (ret < 0) + return ret; + + return 0; +} + +static struct fuse_ring_ent * +fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, + struct fuse_ring_queue *queue) +{ + struct fuse_ring *ring = queue->ring; + struct fuse_ring_ent *ent; + size_t payload_size; + struct iovec iov[FUSE_URING_IOV_SEGS]; + int err; + + err = fuse_uring_get_iovec_from_sqe(cmd->sqe, iov); + if (err) { + pr_info_ratelimited("Failed to get iovec from sqe, err=%d\n", + err); + return ERR_PTR(err); + } + + err = -EINVAL; + if (iov[0].iov_len < sizeof(struct fuse_uring_req_header)) { + pr_info_ratelimited("Invalid header len %zu\n", iov[0].iov_len); + return ERR_PTR(err); + } + + payload_size = iov[1].iov_len; + if (payload_size < ring->max_payload_sz) { + pr_info_ratelimited("Invalid req payload len %zu\n", + payload_size); + return ERR_PTR(err); + } + + err = -ENOMEM; + ent = kzalloc(sizeof(*ent), GFP_KERNEL_ACCOUNT); + if (!ent) + return ERR_PTR(err); + + INIT_LIST_HEAD(&ent->list); + + ent->queue = queue; + ent->headers = iov[0].iov_base; + ent->payload = iov[1].iov_base; + + return ent; +} + +/* + * Register header and payload buffer with the kernel and puts the + * entry as "ready to get fuse requests" on the queue + */ +static int fuse_uring_register(struct io_uring_cmd *cmd, + unsigned int issue_flags, struct fuse_conn *fc) +{ + const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); + struct fuse_ring *ring = fc->ring; + struct fuse_ring_queue *queue; + struct fuse_ring_ent *ent; + int err; + unsigned int qid = READ_ONCE(cmd_req->qid); + + err = -ENOMEM; + if (!ring) { + ring = fuse_uring_create(fc); + if (!ring) + return err; + } + + if (qid >= ring->nr_queues) { + pr_info_ratelimited("fuse: Invalid ring qid %u\n", qid); + return -EINVAL; + } + + queue = ring->queues[qid]; + if (!queue) { + queue = fuse_uring_create_queue(ring, qid); + if (!queue) + return err; + } + + /* + * The created queue above does not need to be destructed in + * case of entry errors below, will be done at ring destruction time. + */ + + ent = fuse_uring_create_ring_ent(cmd, queue); + if (IS_ERR(ent)) + return PTR_ERR(ent); + + fuse_uring_do_register(ent, cmd, issue_flags); + + return 0; +} + +/* + * Entry function from io_uring to handle the given passthrough command + * (op code IORING_OP_URING_CMD) + */ +int __maybe_unused fuse_uring_cmd(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct fuse_dev *fud; + struct fuse_conn *fc; + u32 cmd_op = cmd->cmd_op; + int err; + + if (!enable_uring) { + pr_info_ratelimited("fuse-io-uring is disabled\n"); + return -EOPNOTSUPP; + } + + /* This extra SQE size holds struct fuse_uring_cmd_req */ + if (!(issue_flags & IO_URING_F_SQE128)) + return -EINVAL; + + fud = fuse_get_dev(cmd->file); + if (!fud) { + pr_info_ratelimited("No fuse device found\n"); + return -ENOTCONN; + } + fc = fud->fc; + + if (fc->aborted) + return -ECONNABORTED; + if (!fc->connected) + return -ENOTCONN; + + /* + * fuse_uring_register() needs the ring to be initialized, + * we need to know the max payload size + */ + if (!fc->initialized) + return -EAGAIN; + + switch (cmd_op) { + case FUSE_IO_URING_CMD_REGISTER: + err = fuse_uring_register(cmd, issue_flags, fc); + if (err) { + pr_info_once("FUSE_IO_URING_CMD_REGISTER failed err=%d\n", + err); + return err; + } + break; + default: + return -EINVAL; + } + + return -EIOCBQUEUED; +} diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h new file mode 100644 index 00000000000000..ae1536355b3685 --- /dev/null +++ b/fs/fuse/dev_uring_i.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * FUSE: Filesystem in Userspace + * Copyright (c) 2023-2024 DataDirect Networks. + */ + +#ifndef _FS_FUSE_DEV_URING_I_H +#define _FS_FUSE_DEV_URING_I_H + +#include "fuse_i.h" + +#ifdef CONFIG_FUSE_IO_URING + +enum fuse_ring_req_state { + FRRS_INVALID = 0, + + /* The ring entry received from userspace and it is being processed */ + FRRS_COMMIT, + + /* The ring entry is waiting for new fuse requests */ + FRRS_AVAILABLE, + + /* The ring entry is in or on the way to user space */ + FRRS_USERSPACE, +}; + +/** A fuse ring entry, part of the ring queue */ +struct fuse_ring_ent { + /* userspace buffer */ + struct fuse_uring_req_header __user *headers; + void __user *payload; + + /* the ring queue that owns the request */ + struct fuse_ring_queue *queue; + + /* fields below are protected by queue->lock */ + + struct io_uring_cmd *cmd; + + struct list_head list; + + enum fuse_ring_req_state state; + + struct fuse_req *fuse_req; +}; + +struct fuse_ring_queue { + /* + * back pointer to the main fuse uring structure that holds this + * queue + */ + struct fuse_ring *ring; + + /* queue id, corresponds to the cpu core */ + unsigned int qid; + + /* + * queue lock, taken when any value in the queue changes _and_ also + * a ring entry state changes. + */ + spinlock_t lock; + + /* available ring entries (struct fuse_ring_ent) */ + struct list_head ent_avail_queue; + + /* + * entries in the process of being committed or in the process + * to be sent to userspace + */ + struct list_head ent_commit_queue; +}; + +/** + * Describes if uring is for communication and holds alls the data needed + * for uring communication + */ +struct fuse_ring { + /* back pointer */ + struct fuse_conn *fc; + + /* number of ring queues */ + size_t nr_queues; + + /* maximum payload/arg size */ + size_t max_payload_sz; + + struct fuse_ring_queue **queues; +}; + +bool fuse_uring_enabled(void); +void fuse_uring_destruct(struct fuse_conn *fc); +int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); + +#else /* CONFIG_FUSE_IO_URING */ + +struct fuse_ring; + +static inline void fuse_uring_create(struct fuse_conn *fc) +{ +} + +static inline void fuse_uring_destruct(struct fuse_conn *fc) +{ +} + +static inline bool fuse_uring_enabled(void) +{ + return false; +} + +#endif /* CONFIG_FUSE_IO_URING */ + +#endif /* _FS_FUSE_DEV_URING_I_H */ diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index bd56e3acf3ac6c..b96d078d29fc0e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -873,6 +873,11 @@ struct fuse_conn { /* New writepages go into this bucket */ struct fuse_sync_bucket __rcu *curr_bucket; + +#ifdef CONFIG_FUSE_IO_URING + /** uring connection information*/ + struct fuse_ring *ring; +#endif }; /* diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index bd34ea550b27f0..800a36adba0ab4 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "dev_uring_i.h" #include #include @@ -946,6 +947,8 @@ static void delayed_release(struct rcu_head *p) { struct fuse_conn *fc = container_of(p, struct fuse_conn, rcu); + fuse_uring_destruct(fc); + put_user_ns(fc->user_ns); fc->release(fc); } @@ -1359,6 +1362,13 @@ void fuse_send_init(struct fuse_mount *fm) if (fm->fc->auto_submounts) flags |= FUSE_SUBMOUNTS; + /* + * This is just an information flag for fuse server. No need to check + * the reply - server is either sending IORING_OP_URING_CMD or not. + */ + if (fuse_uring_enabled()) + flags |= FUSE_OVER_IO_URING; + ia->in.flags = flags; ia->in.flags2 = flags >> 32; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index ecd01865aee52b..c656bc1b603eae 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -205,6 +205,15 @@ * 7.39 * - add FUSE_DIRECT_IO_ALLOW_MMAP * - add FUSE_STATX and related structures + * 7.42 + * - Add FUSE_OVER_IO_URING and all other io-uring related flags and data + * structures: + * - struct fuse_uring_ent_in_out + * - struct fuse_uring_req_header + * - struct fuse_uring_cmd_req + * - FUSE_URING_IN_OUT_HEADER_SZ + * - FUSE_URING_OP_IN_OUT_SZ + * - enum fuse_uring_cmd */ #ifndef _LINUX_FUSE_H @@ -404,6 +413,7 @@ struct fuse_file_lock { * symlink and mknod (single group that matches parent) * FUSE_HAS_EXPIRE_ONLY: kernel supports expiry-only entry invalidation * FUSE_DIRECT_IO_ALLOW_MMAP: allow shared mmap in FOPEN_DIRECT_IO mode. + * FUSE_OVER_IO_URING: Indicate that client supports io-uring */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -446,6 +456,7 @@ struct fuse_file_lock { /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ #define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP +#define FUSE_OVER_IO_URING (1ULL << 41) /** * CUSE INIT request/reply flags @@ -1147,4 +1158,67 @@ struct fuse_supp_groups { uint32_t groups[]; }; +/** + * Size of the ring buffer header + */ +#define FUSE_URING_IN_OUT_HEADER_SZ 128 +#define FUSE_URING_OP_IN_OUT_SZ 128 + +/* Used as part of the fuse_uring_req_header */ +struct fuse_uring_ent_in_out { + uint64_t flags; + + /* + * commit ID to be used in a reply to a ring request (see also + * struct fuse_uring_cmd_req) + */ + uint64_t commit_id; + + /* size of user payload buffer */ + uint32_t payload_sz; + uint32_t padding; + + uint64_t reserved; +}; + +/** + * Header for all fuse-io-uring requests + */ +struct fuse_uring_req_header { + /* struct fuse_in_header / struct fuse_out_header */ + char in_out[FUSE_URING_IN_OUT_HEADER_SZ]; + + /* per op code header */ + char op_in[FUSE_URING_OP_IN_OUT_SZ]; + + struct fuse_uring_ent_in_out ring_ent_in_out; +}; + +/** + * sqe commands to the kernel + */ +enum fuse_uring_cmd { + FUSE_IO_URING_CMD_INVALID = 0, + + /* register the request buffer and fetch a fuse request */ + FUSE_IO_URING_CMD_REGISTER = 1, + + /* commit fuse request result and fetch next request */ + FUSE_IO_URING_CMD_COMMIT_AND_FETCH = 2, +}; + +/** + * In the 80B command area of the SQE. + */ +struct fuse_uring_cmd_req { + uint64_t flags; + + /* entry identifier for commits */ + uint64_t commit_id; + + /* queue the command is for (queue index) */ + uint16_t qid; + uint8_t padding[6]; +}; + #endif /* _LINUX_FUSE_H */ From cedc5c88a6f0d75ca47c3e9d90dcf3f85599cf66 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:00 +0100 Subject: [PATCH 092/161] fuse: Make fuse_copy non static Move 'struct fuse_copy_state' and fuse_copy_* functions to fuse_dev_i.h to make it available for fuse-io-uring. 'copy_out_args()' is renamed to 'fuse_copy_out_args'. Signed-off-by: Bernd Schubert Reviewed-by: Joanne Koong Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi (cherry picked from commit d0f9c62aaf7a98412b46a91fe7daad76b316b3b7) (cherry picked from commit c6641bd82b91faa165116c4414b3606e8807f690) (cherry picked from commit ceebe59a8c9e84bd346e9376d0e537ffb00bd602) --- fs/fuse/dev.c | 30 ++++++++---------------------- fs/fuse/fuse_dev_i.h | 25 +++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 22 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 99afdfd63b0119..f1a495df3c173a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -629,22 +629,8 @@ static int unlock_request(struct fuse_req *req) return err; } -struct fuse_copy_state { - int write; - struct fuse_req *req; - struct iov_iter *iter; - struct pipe_buffer *pipebufs; - struct pipe_buffer *currbuf; - struct pipe_inode_info *pipe; - unsigned long nr_segs; - struct page *pg; - unsigned len; - unsigned offset; - unsigned move_pages:1; -}; - -static void fuse_copy_init(struct fuse_copy_state *cs, int write, - struct iov_iter *iter) +void fuse_copy_init(struct fuse_copy_state *cs, int write, + struct iov_iter *iter) { memset(cs, 0, sizeof(*cs)); cs->write = write; @@ -995,9 +981,9 @@ static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) } /* Copy request arguments to/from userspace buffer */ -static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, - unsigned argpages, struct fuse_arg *args, - int zeroing) +int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, + unsigned argpages, struct fuse_arg *args, + int zeroing) { int err = 0; unsigned i; @@ -1807,8 +1793,8 @@ static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) return NULL; } -static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, - unsigned nbytes) +int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, + unsigned nbytes) { unsigned reqsize = sizeof(struct fuse_out_header); @@ -1910,7 +1896,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, if (oh.error) err = nbytes != sizeof(oh) ? -EINVAL : 0; else - err = copy_out_args(cs, req->args, nbytes); + err = fuse_copy_out_args(cs, req->args, nbytes); fuse_copy_finish(cs); spin_lock(&fpq->lock); diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 08a7e88e002773..21eb1bdb492d04 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -12,6 +12,23 @@ #define FUSE_INT_REQ_BIT (1ULL << 0) #define FUSE_REQ_ID_STEP (1ULL << 1) +struct fuse_arg; +struct fuse_args; + +struct fuse_copy_state { + int write; + struct fuse_req *req; + struct iov_iter *iter; + struct pipe_buffer *pipebufs; + struct pipe_buffer *currbuf; + struct pipe_inode_info *pipe; + unsigned long nr_segs; + struct page *pg; + unsigned int len; + unsigned int offset; + unsigned int move_pages:1; +}; + static inline struct fuse_dev *fuse_get_dev(struct file *file) { /* @@ -23,5 +40,13 @@ static inline struct fuse_dev *fuse_get_dev(struct file *file) void fuse_dev_end_requests(struct list_head *head); +void fuse_copy_init(struct fuse_copy_state *cs, int write, + struct iov_iter *iter); +int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs, + unsigned int argpages, struct fuse_arg *args, + int zeroing); +int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, + unsigned int nbytes); + #endif From 59953a7b884bd2b9e153a3e2cc80a5168d8fac3b Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:01 +0100 Subject: [PATCH 093/161] fuse: Add fuse-io-uring handling into fuse_copy Add special fuse-io-uring into the fuse argument copy handler. Signed-off-by: Bernd Schubert Reviewed-by: Joanne Koong Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi (cherry picked from commit f773a7c2c3d934d00542c6471170066d150d152f) (cherry picked from commit 27743ea615350d3866486a71184f4dc5caa26049) (cherry picked from commit f8e09704758e36b80de7cb27ad9399241d3e05db) --- fs/fuse/dev.c | 12 +++++++++++- fs/fuse/fuse_dev_i.h | 4 ++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index f1a495df3c173a..784d4a61a521f0 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -737,6 +737,9 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) *size -= ncpy; cs->len -= ncpy; cs->offset += ncpy; + if (cs->is_uring) + cs->ring.copied_sz += ncpy; + return ncpy; } @@ -1796,7 +1799,14 @@ static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, unsigned nbytes) { - unsigned reqsize = sizeof(struct fuse_out_header); + + unsigned int reqsize = 0; + + /* + * Uring has all headers separated from args - args is payload only + */ + if (!cs->is_uring) + reqsize = sizeof(struct fuse_out_header); reqsize += fuse_len_args(args->out_numargs, args->out_args); diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 21eb1bdb492d04..4a8a4feb2df53f 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -27,6 +27,10 @@ struct fuse_copy_state { unsigned int len; unsigned int offset; unsigned int move_pages:1; + unsigned int is_uring:1; + struct { + unsigned int copied_sz; /* copied size into the user buffer */ + } ring; }; static inline struct fuse_dev *fuse_get_dev(struct file *file) From 7f3c9b91b8973b9c8aa2e807c0b7ca92d16f2f57 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:02 +0100 Subject: [PATCH 094/161] fuse: {io-uring} Make hash-list req unique finding functions non-static fuse-over-io-uring uses existing functions to find requests based on their unique id - make these functions non-static. Signed-off-by: Bernd Schubert Reviewed-by: Joanne Koong Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi (cherry picked from commit 3821336530616776414aa7a879640052b89def28) (cherry picked from commit efcaee33b682f9d11fa45ae693f32e35640876d5) (cherry picked from commit 09163779af8cb6eb77ea8af9cad5d262ddc1bc94) --- fs/fuse/dev.c | 6 +++--- fs/fuse/fuse_dev_i.h | 5 +++++ fs/fuse/fuse_i.h | 5 +++++ fs/fuse/inode.c | 2 +- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 784d4a61a521f0..6558fb5a7aaa80 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -187,7 +187,7 @@ u64 fuse_get_unique(struct fuse_iqueue *fiq) } EXPORT_SYMBOL_GPL(fuse_get_unique); -static unsigned int fuse_req_hash(u64 unique) +unsigned int fuse_req_hash(u64 unique) { return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS); } @@ -1784,7 +1784,7 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, } /* Look up request on processing list by unique ID */ -static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) +struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique) { unsigned int hash = fuse_req_hash(unique); struct fuse_req *req; @@ -1868,7 +1868,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, spin_lock(&fpq->lock); req = NULL; if (fpq->connected) - req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT); + req = fuse_request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT); err = -ENOENT; if (!req) { diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 4a8a4feb2df53f..b64ab84cbc0d51 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -14,6 +14,8 @@ struct fuse_arg; struct fuse_args; +struct fuse_pqueue; +struct fuse_req; struct fuse_copy_state { int write; @@ -42,6 +44,9 @@ static inline struct fuse_dev *fuse_get_dev(struct file *file) return READ_ONCE(file->private_data); } +unsigned int fuse_req_hash(u64 unique); +struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique); + void fuse_dev_end_requests(struct list_head *head); void fuse_copy_init(struct fuse_copy_state *cs, int write, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index b96d078d29fc0e..3322639bd86b89 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1174,6 +1174,11 @@ void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o); */ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); +/** + * Initialize the fuse processing queue + */ +void fuse_pqueue_init(struct fuse_pqueue *fpq); + /** * Initialize fuse_conn */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 800a36adba0ab4..34adf038b492f1 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -896,7 +896,7 @@ static void fuse_iqueue_init(struct fuse_iqueue *fiq, fiq->priv = priv; } -static void fuse_pqueue_init(struct fuse_pqueue *fpq) +void fuse_pqueue_init(struct fuse_pqueue *fpq) { unsigned int i; From 5444cffcdf2ab4d6ff2bc65573f392f3461589c3 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:03 +0100 Subject: [PATCH 095/161] fuse: Add io-uring sqe commit and fetch support This adds support for fuse request completion through ring SQEs (FUSE_URING_CMD_COMMIT_AND_FETCH handling). After committing the ring entry it becomes available for new fuse requests. Handling of requests through the ring (SQE/CQE handling) is complete now. Fuse request data are copied through the mmaped ring buffer, there is no support for any zero copy yet. Signed-off-by: Bernd Schubert Reviewed-by: Pavel Begunkov # io_uring Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi (cherry picked from commit c090c8abae4b6b77a1bee116aa6c385456ebef96) (cherry picked from commit 6d7c5e73ebdb68441729d6ec1416531789403a4b) (cherry picked from commit aff1bea413312212d721c8f3bad34419d8e7cac0) --- fs/fuse/dev_uring.c | 452 ++++++++++++++++++++++++++++++++++++++++++ fs/fuse/dev_uring_i.h | 12 ++ fs/fuse/fuse_i.h | 4 + 3 files changed, 468 insertions(+) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index bff79fc60a3157..6c1ef50978ee04 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -25,6 +25,17 @@ bool fuse_uring_enabled(void) return enable_uring; } +static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, + int error) +{ + ent->fuse_req = NULL; + if (error) + req->out.h.error = error; + + clear_bit(FR_SENT, &req->flags); + fuse_request_end(req); +} + void fuse_uring_destruct(struct fuse_conn *fc) { struct fuse_ring *ring = fc->ring; @@ -40,8 +51,11 @@ void fuse_uring_destruct(struct fuse_conn *fc) continue; WARN_ON(!list_empty(&queue->ent_avail_queue)); + WARN_ON(!list_empty(&queue->ent_w_req_queue)); WARN_ON(!list_empty(&queue->ent_commit_queue)); + WARN_ON(!list_empty(&queue->ent_in_userspace)); + kfree(queue->fpq.processing); kfree(queue); ring->queues[qid] = NULL; } @@ -100,20 +114,34 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, { struct fuse_conn *fc = ring->fc; struct fuse_ring_queue *queue; + struct list_head *pq; queue = kzalloc(sizeof(*queue), GFP_KERNEL_ACCOUNT); if (!queue) return NULL; + pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); + if (!pq) { + kfree(queue); + return NULL; + } + queue->qid = qid; queue->ring = ring; spin_lock_init(&queue->lock); INIT_LIST_HEAD(&queue->ent_avail_queue); INIT_LIST_HEAD(&queue->ent_commit_queue); + INIT_LIST_HEAD(&queue->ent_w_req_queue); + INIT_LIST_HEAD(&queue->ent_in_userspace); + INIT_LIST_HEAD(&queue->fuse_req_queue); + + queue->fpq.processing = pq; + fuse_pqueue_init(&queue->fpq); spin_lock(&fc->lock); if (ring->queues[qid]) { spin_unlock(&fc->lock); + kfree(queue->fpq.processing); kfree(queue); return ring->queues[qid]; } @@ -127,6 +155,225 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, return queue; } +/* + * Checks for errors and stores it into the request + */ +static int fuse_uring_out_header_has_err(struct fuse_out_header *oh, + struct fuse_req *req, + struct fuse_conn *fc) +{ + int err; + + err = -EINVAL; + if (oh->unique == 0) { + /* Not supported through io-uring yet */ + pr_warn_once("notify through fuse-io-uring not supported\n"); + goto err; + } + + if (oh->error <= -ERESTARTSYS || oh->error > 0) + goto err; + + if (oh->error) { + err = oh->error; + goto err; + } + + err = -ENOENT; + if ((oh->unique & ~FUSE_INT_REQ_BIT) != req->in.h.unique) { + pr_warn_ratelimited("unique mismatch, expected: %llu got %llu\n", + req->in.h.unique, + oh->unique & ~FUSE_INT_REQ_BIT); + goto err; + } + + /* + * Is it an interrupt reply ID? + * XXX: Not supported through fuse-io-uring yet, it should not even + * find the request - should not happen. + */ + WARN_ON_ONCE(oh->unique & FUSE_INT_REQ_BIT); + + err = 0; +err: + return err; +} + +static int fuse_import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i) +{ + if (len > MAX_RW_COUNT) + len = MAX_RW_COUNT; + if (unlikely(!access_ok(buf, len))) + return -EFAULT; + + iov_iter_ubuf(i, rw, buf, len); + return 0; +} + +static int fuse_uring_copy_from_ring(struct fuse_ring *ring, + struct fuse_req *req, + struct fuse_ring_ent *ent) +{ + struct fuse_copy_state cs; + struct fuse_args *args = req->args; + struct iov_iter iter; + int err; + struct fuse_uring_ent_in_out ring_in_out; + + err = copy_from_user(&ring_in_out, &ent->headers->ring_ent_in_out, + sizeof(ring_in_out)); + if (err) + return -EFAULT; + + err = fuse_import_ubuf(ITER_SOURCE, ent->payload, ring->max_payload_sz, + &iter); + if (err) + return err; + + fuse_copy_init(&cs, 0, &iter); + cs.is_uring = 1; + cs.req = req; + + return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); +} + + /* + * Copy data from the req to the ring buffer + */ +static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, + struct fuse_ring_ent *ent) +{ + struct fuse_copy_state cs; + struct fuse_args *args = req->args; + struct fuse_in_arg *in_args = args->in_args; + int num_args = args->in_numargs; + int err; + struct iov_iter iter; + struct fuse_uring_ent_in_out ent_in_out = { + .flags = 0, + .commit_id = req->in.h.unique, + }; + + err = fuse_import_ubuf(ITER_DEST, ent->payload, ring->max_payload_sz, &iter); + if (err) { + pr_info_ratelimited("fuse: Import of user buffer failed\n"); + return err; + } + + fuse_copy_init(&cs, 1, &iter); + cs.is_uring = 1; + cs.req = req; + + if (num_args > 0) { + /* + * Expectation is that the first argument is the per op header. + * Some op code have that as zero size. + */ + if (args->in_args[0].size > 0) { + err = copy_to_user(&ent->headers->op_in, in_args->value, + in_args->size); + if (err) { + pr_info_ratelimited( + "Copying the header failed.\n"); + return -EFAULT; + } + } + in_args++; + num_args--; + } + + /* copy the payload */ + err = fuse_copy_args(&cs, num_args, args->in_pages, + (struct fuse_arg *)in_args, 0); + if (err) { + pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); + return err; + } + + ent_in_out.payload_sz = cs.ring.copied_sz; + err = copy_to_user(&ent->headers->ring_ent_in_out, &ent_in_out, + sizeof(ent_in_out)); + return err ? -EFAULT : 0; +} + +static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent, + struct fuse_req *req) +{ + struct fuse_ring_queue *queue = ent->queue; + struct fuse_ring *ring = queue->ring; + int err; + + err = -EIO; + if (WARN_ON(ent->state != FRRS_FUSE_REQ)) { + pr_err("qid=%d ring-req=%p invalid state %d on send\n", + queue->qid, ent, ent->state); + return err; + } + + err = -EINVAL; + if (WARN_ON(req->in.h.unique == 0)) + return err; + + /* copy the request */ + err = fuse_uring_args_to_ring(ring, req, ent); + if (unlikely(err)) { + pr_info_ratelimited("Copy to ring failed: %d\n", err); + return err; + } + + /* copy fuse_in_header */ + err = copy_to_user(&ent->headers->in_out, &req->in.h, + sizeof(req->in.h)); + if (err) { + err = -EFAULT; + return err; + } + + return 0; +} + +static int fuse_uring_prepare_send(struct fuse_ring_ent *ent, + struct fuse_req *req) +{ + int err; + + err = fuse_uring_copy_to_ring(ent, req); + if (!err) + set_bit(FR_SENT, &req->flags); + else + fuse_uring_req_end(ent, req, err); + + return err; +} + +/* + * Write data to the ring buffer and send the request to userspace, + * userspace will read it + * This is comparable with classical read(/dev/fuse) + */ +static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent, + struct fuse_req *req, + unsigned int issue_flags) +{ + struct fuse_ring_queue *queue = ent->queue; + int err; + struct io_uring_cmd *cmd; + + err = fuse_uring_prepare_send(ent, req); + if (err) + return err; + + spin_lock(&queue->lock); + cmd = ent->cmd; + ent->cmd = NULL; + ent->state = FRRS_USERSPACE; + list_move(&ent->list, &queue->ent_in_userspace); + spin_unlock(&queue->lock); + + io_uring_cmd_done(cmd, 0, 0, issue_flags); + return 0; +} + /* * Make a ring entry available for fuse_req assignment */ @@ -138,6 +385,203 @@ static void fuse_uring_ent_avail(struct fuse_ring_ent *ent, ent->state = FRRS_AVAILABLE; } +/* Used to find the request on SQE commit */ +static void fuse_uring_add_to_pq(struct fuse_ring_ent *ent, + struct fuse_req *req) +{ + struct fuse_ring_queue *queue = ent->queue; + struct fuse_pqueue *fpq = &queue->fpq; + unsigned int hash; + + req->ring_entry = ent; + hash = fuse_req_hash(req->in.h.unique); + list_move_tail(&req->list, &fpq->processing[hash]); +} + +/* + * Assign a fuse queue entry to the given entry + */ +static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent, + struct fuse_req *req) +{ + struct fuse_ring_queue *queue = ent->queue; + struct fuse_conn *fc = req->fm->fc; + struct fuse_iqueue *fiq = &fc->iq; + + lockdep_assert_held(&queue->lock); + + if (WARN_ON_ONCE(ent->state != FRRS_AVAILABLE && + ent->state != FRRS_COMMIT)) { + pr_warn("%s qid=%d state=%d\n", __func__, ent->queue->qid, + ent->state); + } + + spin_lock(&fiq->lock); + clear_bit(FR_PENDING, &req->flags); + spin_unlock(&fiq->lock); + ent->fuse_req = req; + ent->state = FRRS_FUSE_REQ; + list_move(&ent->list, &queue->ent_w_req_queue); + fuse_uring_add_to_pq(ent, req); +} + +/* Fetch the next fuse request if available */ +static struct fuse_req *fuse_uring_ent_assign_req(struct fuse_ring_ent *ent) + __must_hold(&queue->lock) +{ + struct fuse_req *req; + struct fuse_ring_queue *queue = ent->queue; + struct list_head *req_queue = &queue->fuse_req_queue; + + lockdep_assert_held(&queue->lock); + + /* get and assign the next entry while it is still holding the lock */ + req = list_first_entry_or_null(req_queue, struct fuse_req, list); + if (req) + fuse_uring_add_req_to_ring_ent(ent, req); + + return req; +} + +/* + * Read data from the ring buffer, which user space has written to + * This is comparible with handling of classical write(/dev/fuse). + * Also make the ring request available again for new fuse requests. + */ +static void fuse_uring_commit(struct fuse_ring_ent *ent, struct fuse_req *req, + unsigned int issue_flags) +{ + struct fuse_ring *ring = ent->queue->ring; + struct fuse_conn *fc = ring->fc; + ssize_t err = 0; + + err = copy_from_user(&req->out.h, &ent->headers->in_out, + sizeof(req->out.h)); + if (err) { + req->out.h.error = -EFAULT; + goto out; + } + + err = fuse_uring_out_header_has_err(&req->out.h, req, fc); + if (err) { + /* req->out.h.error already set */ + goto out; + } + + err = fuse_uring_copy_from_ring(ring, req, ent); +out: + fuse_uring_req_end(ent, req, err); +} + +/* + * Get the next fuse req and send it + */ +static void fuse_uring_next_fuse_req(struct fuse_ring_ent *ent, + struct fuse_ring_queue *queue, + unsigned int issue_flags) +{ + int err; + struct fuse_req *req; + +retry: + spin_lock(&queue->lock); + fuse_uring_ent_avail(ent, queue); + req = fuse_uring_ent_assign_req(ent); + spin_unlock(&queue->lock); + + if (req) { + err = fuse_uring_send_next_to_ring(ent, req, issue_flags); + if (err) + goto retry; + } +} + +static int fuse_ring_ent_set_commit(struct fuse_ring_ent *ent) +{ + struct fuse_ring_queue *queue = ent->queue; + + lockdep_assert_held(&queue->lock); + + if (WARN_ON_ONCE(ent->state != FRRS_USERSPACE)) + return -EIO; + + ent->state = FRRS_COMMIT; + list_move(&ent->list, &queue->ent_commit_queue); + + return 0; +} + +/* FUSE_URING_CMD_COMMIT_AND_FETCH handler */ +static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, + struct fuse_conn *fc) +{ + const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); + struct fuse_ring_ent *ent; + int err; + struct fuse_ring *ring = fc->ring; + struct fuse_ring_queue *queue; + uint64_t commit_id = READ_ONCE(cmd_req->commit_id); + unsigned int qid = READ_ONCE(cmd_req->qid); + struct fuse_pqueue *fpq; + struct fuse_req *req; + + err = -ENOTCONN; + if (!ring) + return err; + + if (qid >= ring->nr_queues) + return -EINVAL; + + queue = ring->queues[qid]; + if (!queue) + return err; + fpq = &queue->fpq; + + spin_lock(&queue->lock); + /* Find a request based on the unique ID of the fuse request + * This should get revised, as it needs a hash calculation and list + * search. And full struct fuse_pqueue is needed (memory overhead). + * As well as the link from req to ring_ent. + */ + req = fuse_request_find(fpq, commit_id); + err = -ENOENT; + if (!req) { + pr_info("qid=%d commit_id %llu not found\n", queue->qid, + commit_id); + spin_unlock(&queue->lock); + return err; + } + list_del_init(&req->list); + ent = req->ring_entry; + req->ring_entry = NULL; + + err = fuse_ring_ent_set_commit(ent); + if (err != 0) { + pr_info_ratelimited("qid=%d commit_id %llu state %d", + queue->qid, commit_id, ent->state); + spin_unlock(&queue->lock); + req->out.h.error = err; + clear_bit(FR_SENT, &req->flags); + fuse_request_end(req); + return err; + } + + ent->cmd = cmd; + spin_unlock(&queue->lock); + + /* without the queue lock, as other locks are taken */ + fuse_uring_commit(ent, req, issue_flags); + + /* + * Fetching the next request is absolutely required as queued + * fuse requests would otherwise not get processed - committing + * and fetching is done in one step vs legacy fuse, which has separated + * read (fetch request) and write (commit result). + */ + fuse_uring_next_fuse_req(ent, queue, issue_flags); + return 0; +} + /* * fuse_uring_req_fetch command handling */ @@ -319,6 +763,14 @@ int __maybe_unused fuse_uring_cmd(struct io_uring_cmd *cmd, return err; } break; + case FUSE_IO_URING_CMD_COMMIT_AND_FETCH: + err = fuse_uring_commit_fetch(cmd, issue_flags, fc); + if (err) { + pr_info_once("FUSE_IO_URING_COMMIT_AND_FETCH failed err=%d\n", + err); + return err; + } + break; default: return -EINVAL; } diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index ae1536355b3685..44bf237f0d5abc 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -20,6 +20,9 @@ enum fuse_ring_req_state { /* The ring entry is waiting for new fuse requests */ FRRS_AVAILABLE, + /* The ring entry got assigned a fuse req */ + FRRS_FUSE_REQ, + /* The ring entry is in or on the way to user space */ FRRS_USERSPACE, }; @@ -67,7 +70,16 @@ struct fuse_ring_queue { * entries in the process of being committed or in the process * to be sent to userspace */ + struct list_head ent_w_req_queue; struct list_head ent_commit_queue; + + /* entries in userspace */ + struct list_head ent_in_userspace; + + /* fuse requests waiting for an entry slot */ + struct list_head fuse_req_queue; + + struct fuse_pqueue fpq; }; /** diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 3322639bd86b89..2f7df313fc42f9 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -399,6 +399,10 @@ struct fuse_req { /** fuse_mount this request belongs to */ struct fuse_mount *fm; + +#ifdef CONFIG_FUSE_IO_URING + void *ring_entry; +#endif }; struct fuse_iqueue; From b950b46fe78d45b2a94a1efdcaa8bcf086fa6bf5 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:04 +0100 Subject: [PATCH 096/161] fuse: {io-uring} Handle teardown of ring entries On teardown struct file_operations::uring_cmd requests need to be completed by calling io_uring_cmd_done(). Not completing all ring entries would result in busy io-uring tasks giving warning messages in intervals and unreleased struct file. Additionally the fuse connection and with that the ring can only get released when all io-uring commands are completed. Completion is done with ring entries that are a) in waiting state for new fuse requests - io_uring_cmd_done is needed b) already in userspace - io_uring_cmd_done through teardown is not needed, the request can just get released. If fuse server is still active and commits such a ring entry, fuse_uring_cmd() already checks if the connection is active and then complete the io-uring itself with -ENOTCONN. I.e. special handling is not needed. This scheme is basically represented by the ring entry state FRRS_WAIT and FRRS_USERSPACE. Entries in state: - FRRS_INIT: No action needed, do not contribute to ring->queue_refs yet - All other states: Are currently processed by other tasks, async teardown is needed and it has to wait for the two states above. It could be also solved without an async teardown task, but would require additional if conditions in hot code paths. Also in my personal opinion the code looks cleaner with async teardown. Signed-off-by: Bernd Schubert Reviewed-by: Pavel Begunkov # io_uring Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi (cherry picked from commit 4a9bfb9b6850fec0685447aed280533cf980de70) (cherry picked from commit f8f69d2f6dd20a9d74ee1531c66ac519079a1a25) (cherry picked from commit a2fca2a7c872ce35553cbbac5ad1af743c735151) --- fs/fuse/dev.c | 9 ++ fs/fuse/dev_uring.c | 207 ++++++++++++++++++++++++++++++++++++++++++ fs/fuse/dev_uring_i.h | 51 +++++++++++ 3 files changed, 267 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6558fb5a7aaa80..0a0a152d7c8ccb 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -6,6 +6,7 @@ See the file COPYING. */ +#include "dev_uring_i.h" #include "fuse_i.h" #include "fuse_dev_i.h" @@ -2165,6 +2166,12 @@ void fuse_abort_conn(struct fuse_conn *fc) spin_unlock(&fc->lock); fuse_dev_end_requests(&to_end); + + /* + * fc->lock must not be taken to avoid conflicts with io-uring + * locks + */ + fuse_uring_abort(fc); } else { spin_unlock(&fc->lock); } @@ -2176,6 +2183,8 @@ void fuse_wait_aborted(struct fuse_conn *fc) /* matches implicit memory barrier in fuse_drop_waiting() */ smp_mb(); wait_event(fc->blocked_waitq, atomic_read(&fc->num_waiting) == 0); + + fuse_uring_wait_stopped_queues(fc); } int fuse_dev_release(struct inode *inode, struct file *file) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 6c1ef50978ee04..9ba1c132ebdc1e 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -36,6 +36,37 @@ static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, fuse_request_end(req); } +/* Abort all list queued request on the given ring queue */ +static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue) +{ + struct fuse_req *req; + LIST_HEAD(req_list); + + spin_lock(&queue->lock); + list_for_each_entry(req, &queue->fuse_req_queue, list) + clear_bit(FR_PENDING, &req->flags); + list_splice_init(&queue->fuse_req_queue, &req_list); + spin_unlock(&queue->lock); + + /* must not hold queue lock to avoid order issues with fi->lock */ + fuse_dev_end_requests(&req_list); +} + +void fuse_uring_abort_end_requests(struct fuse_ring *ring) +{ + int qid; + struct fuse_ring_queue *queue; + + for (qid = 0; qid < ring->nr_queues; qid++) { + queue = READ_ONCE(ring->queues[qid]); + if (!queue) + continue; + + queue->stopped = true; + fuse_uring_abort_end_queue_requests(queue); + } +} + void fuse_uring_destruct(struct fuse_conn *fc) { struct fuse_ring *ring = fc->ring; @@ -95,10 +126,13 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) goto out_err; } + init_waitqueue_head(&ring->stop_waitq); + fc->ring = ring; ring->nr_queues = nr_queues; ring->fc = fc; ring->max_payload_sz = max_payload_size; + atomic_set(&ring->queue_refs, 0); spin_unlock(&fc->lock); return ring; @@ -155,6 +189,175 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, return queue; } +static void fuse_uring_stop_fuse_req_end(struct fuse_req *req) +{ + clear_bit(FR_SENT, &req->flags); + req->out.h.error = -ECONNABORTED; + fuse_request_end(req); +} + +/* + * Release a request/entry on connection tear down + */ +static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) +{ + struct fuse_req *req; + struct io_uring_cmd *cmd; + + struct fuse_ring_queue *queue = ent->queue; + + spin_lock(&queue->lock); + cmd = ent->cmd; + ent->cmd = NULL; + req = ent->fuse_req; + ent->fuse_req = NULL; + if (req) { + /* remove entry from queue->fpq->processing */ + list_del_init(&req->list); + } + spin_unlock(&queue->lock); + + if (cmd) + io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED); + + if (req) + fuse_uring_stop_fuse_req_end(req); + + list_del_init(&ent->list); + kfree(ent); +} + +static void fuse_uring_stop_list_entries(struct list_head *head, + struct fuse_ring_queue *queue, + enum fuse_ring_req_state exp_state) +{ + struct fuse_ring *ring = queue->ring; + struct fuse_ring_ent *ent, *next; + ssize_t queue_refs = SSIZE_MAX; + LIST_HEAD(to_teardown); + + spin_lock(&queue->lock); + list_for_each_entry_safe(ent, next, head, list) { + if (ent->state != exp_state) { + pr_warn("entry teardown qid=%d state=%d expected=%d", + queue->qid, ent->state, exp_state); + continue; + } + + list_move(&ent->list, &to_teardown); + } + spin_unlock(&queue->lock); + + /* no queue lock to avoid lock order issues */ + list_for_each_entry_safe(ent, next, &to_teardown, list) { + fuse_uring_entry_teardown(ent); + queue_refs = atomic_dec_return(&ring->queue_refs); + WARN_ON_ONCE(queue_refs < 0); + } +} + +static void fuse_uring_teardown_entries(struct fuse_ring_queue *queue) +{ + fuse_uring_stop_list_entries(&queue->ent_in_userspace, queue, + FRRS_USERSPACE); + fuse_uring_stop_list_entries(&queue->ent_avail_queue, queue, + FRRS_AVAILABLE); +} + +/* + * Log state debug info + */ +static void fuse_uring_log_ent_state(struct fuse_ring *ring) +{ + int qid; + struct fuse_ring_ent *ent; + + for (qid = 0; qid < ring->nr_queues; qid++) { + struct fuse_ring_queue *queue = ring->queues[qid]; + + if (!queue) + continue; + + spin_lock(&queue->lock); + /* + * Log entries from the intermediate queue, the other queues + * should be empty + */ + list_for_each_entry(ent, &queue->ent_w_req_queue, list) { + pr_info(" ent-req-queue ring=%p qid=%d ent=%p state=%d\n", + ring, qid, ent, ent->state); + } + list_for_each_entry(ent, &queue->ent_commit_queue, list) { + pr_info(" ent-commit-queue ring=%p qid=%d ent=%p state=%d\n", + ring, qid, ent, ent->state); + } + spin_unlock(&queue->lock); + } + ring->stop_debug_log = 1; +} + +static void fuse_uring_async_stop_queues(struct work_struct *work) +{ + int qid; + struct fuse_ring *ring = + container_of(work, struct fuse_ring, async_teardown_work.work); + + /* XXX code dup */ + for (qid = 0; qid < ring->nr_queues; qid++) { + struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); + + if (!queue) + continue; + + fuse_uring_teardown_entries(queue); + } + + /* + * Some ring entries might be in the middle of IO operations, + * i.e. in process to get handled by file_operations::uring_cmd + * or on the way to userspace - we could handle that with conditions in + * run time code, but easier/cleaner to have an async tear down handler + * If there are still queue references left + */ + if (atomic_read(&ring->queue_refs) > 0) { + if (time_after(jiffies, + ring->teardown_time + FUSE_URING_TEARDOWN_TIMEOUT)) + fuse_uring_log_ent_state(ring); + + schedule_delayed_work(&ring->async_teardown_work, + FUSE_URING_TEARDOWN_INTERVAL); + } else { + wake_up_all(&ring->stop_waitq); + } +} + +/* + * Stop the ring queues + */ +void fuse_uring_stop_queues(struct fuse_ring *ring) +{ + int qid; + + for (qid = 0; qid < ring->nr_queues; qid++) { + struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); + + if (!queue) + continue; + + fuse_uring_teardown_entries(queue); + } + + if (atomic_read(&ring->queue_refs) > 0) { + ring->teardown_time = jiffies; + INIT_DELAYED_WORK(&ring->async_teardown_work, + fuse_uring_async_stop_queues); + schedule_delayed_work(&ring->async_teardown_work, + FUSE_URING_TEARDOWN_INTERVAL); + } else { + wake_up_all(&ring->stop_waitq); + } +} + /* * Checks for errors and stores it into the request */ @@ -537,6 +740,9 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, return err; fpq = &queue->fpq; + if (!READ_ONCE(fc->connected) || READ_ONCE(queue->stopped)) + return err; + spin_lock(&queue->lock); /* Find a request based on the unique ID of the fuse request * This should get revised, as it needs a hash calculation and list @@ -664,6 +870,7 @@ fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, ent->headers = iov[0].iov_base; ent->payload = iov[1].iov_base; + atomic_inc(&ring->queue_refs); return ent; } diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 44bf237f0d5abc..a4316e118cbd80 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -11,6 +11,9 @@ #ifdef CONFIG_FUSE_IO_URING +#define FUSE_URING_TEARDOWN_TIMEOUT (5 * HZ) +#define FUSE_URING_TEARDOWN_INTERVAL (HZ/20) + enum fuse_ring_req_state { FRRS_INVALID = 0, @@ -80,6 +83,8 @@ struct fuse_ring_queue { struct list_head fuse_req_queue; struct fuse_pqueue fpq; + + bool stopped; }; /** @@ -97,12 +102,51 @@ struct fuse_ring { size_t max_payload_sz; struct fuse_ring_queue **queues; + + /* + * Log ring entry states on stop when entries cannot be released + */ + unsigned int stop_debug_log : 1; + + wait_queue_head_t stop_waitq; + + /* async tear down */ + struct delayed_work async_teardown_work; + + /* log */ + unsigned long teardown_time; + + atomic_t queue_refs; }; bool fuse_uring_enabled(void); void fuse_uring_destruct(struct fuse_conn *fc); +void fuse_uring_stop_queues(struct fuse_ring *ring); +void fuse_uring_abort_end_requests(struct fuse_ring *ring); int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); +static inline void fuse_uring_abort(struct fuse_conn *fc) +{ + struct fuse_ring *ring = fc->ring; + + if (ring == NULL) + return; + + if (atomic_read(&ring->queue_refs) > 0) { + fuse_uring_abort_end_requests(ring); + fuse_uring_stop_queues(ring); + } +} + +static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc) +{ + struct fuse_ring *ring = fc->ring; + + if (ring) + wait_event(ring->stop_waitq, + atomic_read(&ring->queue_refs) == 0); +} + #else /* CONFIG_FUSE_IO_URING */ struct fuse_ring; @@ -120,6 +164,13 @@ static inline bool fuse_uring_enabled(void) return false; } +static inline void fuse_uring_abort(struct fuse_conn *fc) +{ +} + +static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc) +{ +} #endif /* CONFIG_FUSE_IO_URING */ #endif /* _FS_FUSE_DEV_URING_I_H */ From d92ae76c86e3c87d17280da5261caba126c8d52f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 29 May 2024 17:09:07 +0200 Subject: [PATCH 097/161] fuse: cleanup request queuing towards virtiofs Virtiofs has its own queuing mechanism, but still requests are first queued on fiq->pending to be immediately dequeued and queued onto the virtio queue. The queuing on fiq->pending is unnecessary and might even have some performance impact due to being a contention point. Forget requests are handled similarly. Move the queuing of requests and forgets into the fiq->ops->*. fuse_iqueue_ops are renamed to reflect the new semantics. Reviewed-by: Stefan Hajnoczi Fixed-by: Jingbo Xu Reviewed-by: Jingbo Xu Tested-by: Peter-Jan Gootzen Reviewed-by: Peter-Jan Gootzen Signed-off-by: Miklos Szeredi (cherry picked from commit 5de8acb41c86f1d335d165e0a350441ea3a1f480) (cherry picked from commit aaf0f3b94852178a88090bd838133b34dc9290f9) (cherry picked from commit c71fdf7c84c7394d6aeb7852a883227c52cbf73e) --- fs/fuse/dev.c | 157 ++++++++++++++++++++++++-------------------- fs/fuse/fuse_i.h | 19 ++---- fs/fuse/virtio_fs.c | 41 ++++-------- 3 files changed, 105 insertions(+), 112 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 0a0a152d7c8ccb..4ea871b89afb62 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -181,11 +181,22 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args) } EXPORT_SYMBOL_GPL(fuse_len_args); -u64 fuse_get_unique(struct fuse_iqueue *fiq) +static u64 fuse_get_unique_locked(struct fuse_iqueue *fiq) { fiq->reqctr += FUSE_REQ_ID_STEP; return fiq->reqctr; } + +u64 fuse_get_unique(struct fuse_iqueue *fiq) +{ + u64 ret; + + spin_lock(&fiq->lock); + ret = fuse_get_unique_locked(fiq); + spin_unlock(&fiq->lock); + + return ret; +} EXPORT_SYMBOL_GPL(fuse_get_unique); unsigned int fuse_req_hash(u64 unique) @@ -204,22 +215,68 @@ __releases(fiq->lock) spin_unlock(&fiq->lock); } +static void fuse_dev_queue_forget(struct fuse_iqueue *fiq, struct fuse_forget_link *forget) +{ + spin_lock(&fiq->lock); + if (fiq->connected) { + fiq->forget_list_tail->next = forget; + fiq->forget_list_tail = forget; + fuse_dev_wake_and_unlock(fiq); + } else { + kfree(forget); + spin_unlock(&fiq->lock); + } +} + +static void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) +{ + spin_lock(&fiq->lock); + if (list_empty(&req->intr_entry)) { + list_add_tail(&req->intr_entry, &fiq->interrupts); + /* + * Pairs with smp_mb() implied by test_and_set_bit() + * from fuse_request_end(). + */ + smp_mb(); + if (test_bit(FR_FINISHED, &req->flags)) { + list_del_init(&req->intr_entry); + spin_unlock(&fiq->lock); + } else { + fuse_dev_wake_and_unlock(fiq); + } + } else { + spin_unlock(&fiq->lock); + } +} + +static void fuse_dev_queue_req(struct fuse_iqueue *fiq, struct fuse_req *req) +{ + spin_lock(&fiq->lock); + if (fiq->connected) { + if (req->in.h.opcode != FUSE_NOTIFY_REPLY) + req->in.h.unique = fuse_get_unique_locked(fiq); + list_add_tail(&req->list, &fiq->pending); + fuse_dev_wake_and_unlock(fiq); + } else { + spin_unlock(&fiq->lock); + req->out.h.error = -ENOTCONN; + fuse_request_end(req); + } +} + const struct fuse_iqueue_ops fuse_dev_fiq_ops = { - .wake_forget_and_unlock = fuse_dev_wake_and_unlock, - .wake_interrupt_and_unlock = fuse_dev_wake_and_unlock, - .wake_pending_and_unlock = fuse_dev_wake_and_unlock, + .send_forget = fuse_dev_queue_forget, + .send_interrupt = fuse_dev_queue_interrupt, + .send_req = fuse_dev_queue_req, }; EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops); -static void queue_request_and_unlock(struct fuse_iqueue *fiq, - struct fuse_req *req) -__releases(fiq->lock) +static void fuse_send_one(struct fuse_iqueue *fiq, struct fuse_req *req) { req->in.h.len = sizeof(struct fuse_in_header) + fuse_len_args(req->args->in_numargs, (struct fuse_arg *) req->args->in_args); - list_add_tail(&req->list, &fiq->pending); - fiq->ops->wake_pending_and_unlock(fiq); + fiq->ops->send_req(fiq, req); } void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, @@ -230,15 +287,7 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, forget->forget_one.nodeid = nodeid; forget->forget_one.nlookup = nlookup; - spin_lock(&fiq->lock); - if (fiq->connected) { - fiq->forget_list_tail->next = forget; - fiq->forget_list_tail = forget; - fiq->ops->wake_forget_and_unlock(fiq); - } else { - kfree(forget); - spin_unlock(&fiq->lock); - } + fiq->ops->send_forget(fiq, forget); } static void flush_bg_queue(struct fuse_conn *fc) @@ -252,9 +301,7 @@ static void flush_bg_queue(struct fuse_conn *fc) req = list_first_entry(&fc->bg_queue, struct fuse_req, list); list_del(&req->list); fc->active_background++; - spin_lock(&fiq->lock); - req->in.h.unique = fuse_get_unique(fiq); - queue_request_and_unlock(fiq, req); + fuse_send_one(fiq, req); } } @@ -324,29 +371,12 @@ static int queue_interrupt(struct fuse_req *req) { struct fuse_iqueue *fiq = &req->fm->fc->iq; - spin_lock(&fiq->lock); /* Check for we've sent request to interrupt this req */ - if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) { - spin_unlock(&fiq->lock); + if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) return -EINVAL; - } - if (list_empty(&req->intr_entry)) { - list_add_tail(&req->intr_entry, &fiq->interrupts); - /* - * Pairs with smp_mb() implied by test_and_set_bit() - * from fuse_request_end(). - */ - smp_mb(); - if (test_bit(FR_FINISHED, &req->flags)) { - list_del_init(&req->intr_entry); - spin_unlock(&fiq->lock); - return 0; - } - fiq->ops->wake_interrupt_and_unlock(fiq); - } else { - spin_unlock(&fiq->lock); - } + fiq->ops->send_interrupt(fiq, req); + return 0; } @@ -401,21 +431,15 @@ static void __fuse_request_send(struct fuse_req *req) struct fuse_iqueue *fiq = &req->fm->fc->iq; BUG_ON(test_bit(FR_BACKGROUND, &req->flags)); - spin_lock(&fiq->lock); - if (!fiq->connected) { - spin_unlock(&fiq->lock); - req->out.h.error = -ENOTCONN; - } else { - req->in.h.unique = fuse_get_unique(fiq); - /* acquire extra reference, since request is still needed - after fuse_request_end() */ - __fuse_get_request(req); - queue_request_and_unlock(fiq, req); - request_wait_answer(req); - /* Pairs with smp_wmb() in fuse_request_end() */ - smp_rmb(); - } + /* acquire extra reference, since request is still needed after + fuse_request_end() */ + __fuse_get_request(req); + fuse_send_one(fiq, req); + + request_wait_answer(req); + /* Pairs with smp_wmb() in fuse_request_end() */ + smp_rmb(); } static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args) @@ -570,7 +594,6 @@ static int fuse_simple_notify_reply(struct fuse_mount *fm, { struct fuse_req *req; struct fuse_iqueue *fiq = &fm->fc->iq; - int err = 0; req = fuse_get_req(fm, false); if (IS_ERR(req)) @@ -581,16 +604,9 @@ static int fuse_simple_notify_reply(struct fuse_mount *fm, fuse_args_to_req(req, args); - spin_lock(&fiq->lock); - if (fiq->connected) { - queue_request_and_unlock(fiq, req); - } else { - err = -ENODEV; - spin_unlock(&fiq->lock); - fuse_put_request(req); - } + fuse_send_one(fiq, req); - return err; + return 0; } /* @@ -1051,9 +1067,9 @@ __releases(fiq->lock) return err ? err : reqsize; } -struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, - unsigned int max, - unsigned int *countp) +static struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, + unsigned int max, + unsigned int *countp) { struct fuse_forget_link *head = fiq->forget_list_head.next; struct fuse_forget_link **newhead = &head; @@ -1072,7 +1088,6 @@ struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, return head; } -EXPORT_SYMBOL(fuse_dequeue_forget); static int fuse_read_single_forget(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, @@ -1087,7 +1102,7 @@ __releases(fiq->lock) struct fuse_in_header ih = { .opcode = FUSE_FORGET, .nodeid = forget->forget_one.nodeid, - .unique = fuse_get_unique(fiq), + .unique = fuse_get_unique_locked(fiq), .len = sizeof(ih) + sizeof(arg), }; @@ -1118,7 +1133,7 @@ __releases(fiq->lock) struct fuse_batch_forget_in arg = { .count = 0 }; struct fuse_in_header ih = { .opcode = FUSE_BATCH_FORGET, - .unique = fuse_get_unique(fiq), + .unique = fuse_get_unique_locked(fiq), .len = sizeof(ih) + sizeof(arg), }; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 2f7df313fc42f9..b949779a06783e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -417,22 +417,19 @@ struct fuse_iqueue; */ struct fuse_iqueue_ops { /** - * Signal that a forget has been queued + * Send one forget */ - void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq) - __releases(fiq->lock); + void (*send_forget)(struct fuse_iqueue *fiq, struct fuse_forget_link *link); /** - * Signal that an INTERRUPT request has been queued + * Send interrupt for request */ - void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq) - __releases(fiq->lock); + void (*send_interrupt)(struct fuse_iqueue *fiq, struct fuse_req *req); /** - * Signal that a request has been queued + * Send one request */ - void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq) - __releases(fiq->lock); + void (*send_req)(struct fuse_iqueue *fiq, struct fuse_req *req); /** * Clean up when fuse_iqueue is destroyed @@ -1031,10 +1028,6 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, struct fuse_forget_link *fuse_alloc_forget(void); -struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, - unsigned int max, - unsigned int *countp); - /* * Initialize READ or READDIR request */ diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index f7eae090581a50..27e366e67e29ab 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -970,22 +970,13 @@ static struct virtio_driver virtio_fs_driver = { #endif }; -static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq) -__releases(fiq->lock) +static void virtio_fs_send_forget(struct fuse_iqueue *fiq, struct fuse_forget_link *link) { - struct fuse_forget_link *link; struct virtio_fs_forget *forget; struct virtio_fs_forget_req *req; - struct virtio_fs *fs; - struct virtio_fs_vq *fsvq; - u64 unique; - - link = fuse_dequeue_forget(fiq, 1, NULL); - unique = fuse_get_unique(fiq); - - fs = fiq->priv; - fsvq = &fs->vqs[VQ_HIPRIO]; - spin_unlock(&fiq->lock); + struct virtio_fs *fs = fiq->priv; + struct virtio_fs_vq *fsvq = &fs->vqs[VQ_HIPRIO]; + u64 unique = fuse_get_unique(fiq); /* Allocate a buffer for the request */ forget = kmalloc(sizeof(*forget), GFP_NOFS | __GFP_NOFAIL); @@ -1005,8 +996,7 @@ __releases(fiq->lock) kfree(link); } -static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq) -__releases(fiq->lock) +static void virtio_fs_send_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { /* * TODO interrupts. @@ -1015,7 +1005,6 @@ __releases(fiq->lock) * Exceptions are blocking lock operations; for example fcntl(F_SETLKW) * with shared lock between host and guest. */ - spin_unlock(&fiq->lock); } /* Count number of scatter-gather elements required */ @@ -1220,21 +1209,17 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, return ret; } -static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq) -__releases(fiq->lock) +static void virtio_fs_send_req(struct fuse_iqueue *fiq, struct fuse_req *req) { unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */ struct virtio_fs *fs; - struct fuse_req *req; struct virtio_fs_vq *fsvq; int ret; - WARN_ON(list_empty(&fiq->pending)); - req = list_last_entry(&fiq->pending, struct fuse_req, list); + if (req->in.h.opcode != FUSE_NOTIFY_REPLY) + req->in.h.unique = fuse_get_unique(fiq); + clear_bit(FR_PENDING, &req->flags); - list_del_init(&req->list); - WARN_ON(!list_empty(&fiq->pending)); - spin_unlock(&fiq->lock); fs = fiq->priv; @@ -1272,10 +1257,10 @@ __releases(fiq->lock) } static const struct fuse_iqueue_ops virtio_fs_fiq_ops = { - .wake_forget_and_unlock = virtio_fs_wake_forget_and_unlock, - .wake_interrupt_and_unlock = virtio_fs_wake_interrupt_and_unlock, - .wake_pending_and_unlock = virtio_fs_wake_pending_and_unlock, - .release = virtio_fs_fiq_release, + .send_forget = virtio_fs_send_forget, + .send_interrupt = virtio_fs_send_interrupt, + .send_req = virtio_fs_send_req, + .release = virtio_fs_fiq_release, }; static inline void virtio_fs_ctx_set_defaults(struct fuse_fs_context *ctx) From cc89bf3216f9946e58878f53494e4dff619901f0 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:05 +0100 Subject: [PATCH 098/161] fuse: {io-uring} Make fuse_dev_queue_{interrupt,forget} non-static These functions are also needed by fuse-over-io-uring. Signed-off-by: Bernd Schubert Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi (cherry picked from commit ba74ba571189668697a8d8da906ad6d44762ebc6) (cherry picked from commit fa04c7d7382cbb6dfe1346c6dec75c0e129df170) (cherry picked from commit de941b94c0c8972aef339c13eeaa98c750b4913e) --- fs/fuse/dev.c | 5 +++-- fs/fuse/fuse_dev_i.h | 5 +++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 4ea871b89afb62..8e5cefd43b8c3a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -215,7 +215,8 @@ __releases(fiq->lock) spin_unlock(&fiq->lock); } -static void fuse_dev_queue_forget(struct fuse_iqueue *fiq, struct fuse_forget_link *forget) +void fuse_dev_queue_forget(struct fuse_iqueue *fiq, + struct fuse_forget_link *forget) { spin_lock(&fiq->lock); if (fiq->connected) { @@ -228,7 +229,7 @@ static void fuse_dev_queue_forget(struct fuse_iqueue *fiq, struct fuse_forget_li } } -static void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) +void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { spin_lock(&fiq->lock); if (list_empty(&req->intr_entry)) { diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index b64ab84cbc0d51..3b2bfe1248d357 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -16,6 +16,8 @@ struct fuse_arg; struct fuse_args; struct fuse_pqueue; struct fuse_req; +struct fuse_iqueue; +struct fuse_forget_link; struct fuse_copy_state { int write; @@ -56,6 +58,9 @@ int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs, int zeroing); int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, unsigned int nbytes); +void fuse_dev_queue_forget(struct fuse_iqueue *fiq, + struct fuse_forget_link *forget); +void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req); #endif From ef142212882f3fa02adfbea374890eb3bd61e3e7 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:06 +0100 Subject: [PATCH 099/161] fuse: Allow to queue fg requests through io-uring This prepares queueing and sending foreground requests through io-uring. Signed-off-by: Bernd Schubert Reviewed-by: Pavel Begunkov # io_uring Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi (cherry picked from commit c2c9af9a0b13261c36909036057a116f2edb5e1a) (cherry picked from commit d9c10de4e60863962ddfbd65ac1e669836ee53e2) (cherry picked from commit 16ae04675e0528e31a016ce267137cc964d23e21) --- fs/fuse/dev_uring.c | 180 ++++++++++++++++++++++++++++++++++++++++++ fs/fuse/dev_uring_i.h | 8 ++ 2 files changed, 188 insertions(+) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 9ba1c132ebdc1e..86d5dfdac9ace3 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -25,6 +25,29 @@ bool fuse_uring_enabled(void) return enable_uring; } +struct fuse_uring_pdu { + struct fuse_ring_ent *ent; +}; + +static const struct fuse_iqueue_ops fuse_io_uring_ops; + +static void uring_cmd_set_ring_ent(struct io_uring_cmd *cmd, + struct fuse_ring_ent *ring_ent) +{ + struct fuse_uring_pdu *pdu = + io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu); + + pdu->ent = ring_ent; +} + +static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd) +{ + struct fuse_uring_pdu *pdu = + io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu); + + return pdu->ent; +} + static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, int error) { @@ -788,6 +811,31 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, return 0; } +static bool is_ring_ready(struct fuse_ring *ring, int current_qid) +{ + int qid; + struct fuse_ring_queue *queue; + bool ready = true; + + for (qid = 0; qid < ring->nr_queues && ready; qid++) { + if (current_qid == qid) + continue; + + queue = ring->queues[qid]; + if (!queue) { + ready = false; + break; + } + + spin_lock(&queue->lock); + if (list_empty(&queue->ent_avail_queue)) + ready = false; + spin_unlock(&queue->lock); + } + + return ready; +} + /* * fuse_uring_req_fetch command handling */ @@ -796,11 +844,23 @@ static void fuse_uring_do_register(struct fuse_ring_ent *ent, unsigned int issue_flags) { struct fuse_ring_queue *queue = ent->queue; + struct fuse_ring *ring = queue->ring; + struct fuse_conn *fc = ring->fc; + struct fuse_iqueue *fiq = &fc->iq; spin_lock(&queue->lock); ent->cmd = cmd; fuse_uring_ent_avail(ent, queue); spin_unlock(&queue->lock); + + if (!ring->ready) { + bool ready = is_ring_ready(ring, queue->qid); + + if (ready) { + WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); + WRITE_ONCE(ring->ready, true); + } + } } /* @@ -984,3 +1044,123 @@ int __maybe_unused fuse_uring_cmd(struct io_uring_cmd *cmd, return -EIOCBQUEUED; } + +static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, + ssize_t ret, unsigned int issue_flags) +{ + struct fuse_ring_queue *queue = ent->queue; + + spin_lock(&queue->lock); + ent->state = FRRS_USERSPACE; + list_move(&ent->list, &queue->ent_in_userspace); + ent->cmd = NULL; + spin_unlock(&queue->lock); + + io_uring_cmd_done(cmd, ret, 0, issue_flags); +} + +/* + * This prepares and sends the ring request in fuse-uring task context. + * User buffers are not mapped yet - the application does not have permission + * to write to it - this has to be executed in ring task context. + */ +static void fuse_uring_send_in_task(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); + struct fuse_ring_queue *queue = ent->queue; + int err; + + if (!(issue_flags & IO_URING_F_TASK_DEAD)) { + err = fuse_uring_prepare_send(ent, ent->fuse_req); + if (err) { + fuse_uring_next_fuse_req(ent, queue, issue_flags); + return; + } + } else { + err = -ECANCELED; + } + + fuse_uring_send(ent, cmd, err, issue_flags); +} + +static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring) +{ + unsigned int qid; + struct fuse_ring_queue *queue; + + qid = task_cpu(current); + + if (WARN_ONCE(qid >= ring->nr_queues, + "Core number (%u) exceeds nr queues (%zu)\n", qid, + ring->nr_queues)) + qid = 0; + + queue = ring->queues[qid]; + WARN_ONCE(!queue, "Missing queue for qid %d\n", qid); + + return queue; +} + +static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent) +{ + struct io_uring_cmd *cmd = ent->cmd; + + uring_cmd_set_ring_ent(cmd, ent); + io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task); +} + +/* queue a fuse request and send it if a ring entry is available */ +void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) +{ + struct fuse_conn *fc = req->fm->fc; + struct fuse_ring *ring = fc->ring; + struct fuse_ring_queue *queue; + struct fuse_ring_ent *ent = NULL; + int err; + + err = -EINVAL; + queue = fuse_uring_task_to_queue(ring); + if (!queue) + goto err; + + if (req->in.h.opcode != FUSE_NOTIFY_REPLY) + req->in.h.unique = fuse_get_unique(fiq); + + spin_lock(&queue->lock); + err = -ENOTCONN; + if (unlikely(queue->stopped)) + goto err_unlock; + + ent = list_first_entry_or_null(&queue->ent_avail_queue, + struct fuse_ring_ent, list); + if (ent) + fuse_uring_add_req_to_ring_ent(ent, req); + else + list_add_tail(&req->list, &queue->fuse_req_queue); + spin_unlock(&queue->lock); + + if (ent) + fuse_uring_dispatch_ent(ent); + + return; + +err_unlock: + spin_unlock(&queue->lock); +err: + req->out.h.error = err; + clear_bit(FR_PENDING, &req->flags); + fuse_request_end(req); +} + +static const struct fuse_iqueue_ops fuse_io_uring_ops = { + /* should be send over io-uring as enhancement */ + .send_forget = fuse_dev_queue_forget, + + /* + * could be send over io-uring, but interrupts should be rare, + * no need to make the code complex + */ + .send_interrupt = fuse_dev_queue_interrupt, + .send_req = fuse_uring_queue_fuse_req, +}; diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index a4316e118cbd80..0517a6eafc9173 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -117,6 +117,8 @@ struct fuse_ring { unsigned long teardown_time; atomic_t queue_refs; + + bool ready; }; bool fuse_uring_enabled(void); @@ -124,6 +126,7 @@ void fuse_uring_destruct(struct fuse_conn *fc); void fuse_uring_stop_queues(struct fuse_ring *ring); void fuse_uring_abort_end_requests(struct fuse_ring *ring); int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); +void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req); static inline void fuse_uring_abort(struct fuse_conn *fc) { @@ -147,6 +150,11 @@ static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc) atomic_read(&ring->queue_refs) == 0); } +static inline bool fuse_uring_ready(struct fuse_conn *fc) +{ + return fc->ring && fc->ring->ready; +} + #else /* CONFIG_FUSE_IO_URING */ struct fuse_ring; From e5738db2db9ea04f41bf4817aa3803424e72ff90 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Thu, 17 Apr 2025 23:32:04 +0200 Subject: [PATCH 100/161] RED-ONLY: fuse: {io-uring red} define IO_URING_F_TASK_DEAD We need to define this ourself, as Ubuntu noble misses the backport. Also define io_uring_cmd_to_pdu(), which is also missing. (cherry picked from commit 018641abafa846b01c46e990aa25bdbd9ee97bc3) (cherry picked from commit 93d1848a1fa74aa885c6d065806f4a1e8461663f) --- fs/fuse/dev_uring.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 86d5dfdac9ace3..5b6442b486ebdb 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -19,6 +19,20 @@ MODULE_PARM_DESC(enable_uring, #define FUSE_URING_IOV_SEGS 2 /* header and payload */ +/* redfs only to allow patch backports */ +#define IO_URING_F_TASK_DEAD (1 << 13) + +#ifndef io_uring_cmd_to_pdu +static inline void io_uring_cmd_private_sz_check(size_t cmd_sz) +{ + BUILD_BUG_ON(cmd_sz > sizeof_field(struct io_uring_cmd, pdu)); +} +/* red specific backport */ +#define io_uring_cmd_to_pdu(cmd, pdu_type) ( \ + io_uring_cmd_private_sz_check(sizeof(pdu_type)), \ + ((pdu_type *)&(cmd)->pdu) \ +) +#endif bool fuse_uring_enabled(void) { From 27031d4f4200703ffb549dbf64fec5dece91e499 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:07 +0100 Subject: [PATCH 101/161] fuse: Allow to queue bg requests through io-uring This prepares queueing and sending background requests through io-uring. Signed-off-by: Bernd Schubert Reviewed-by: Pavel Begunkov # io_uring Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi (cherry picked from commit 857b0263f30eebe13ab4b6a65156a0d6c8fc2210) (cherry picked from commit 6da52b72e4ef3fb29087b577d9765cc1f4acbb4b) (cherry picked from commit 5ef3f0c8cae4c4f9bd4cba6996612df1926e988e) --- fs/fuse/dev.c | 26 +++++++++++- fs/fuse/dev_uring.c | 99 +++++++++++++++++++++++++++++++++++++++++++ fs/fuse/dev_uring_i.h | 12 ++++++ 3 files changed, 136 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8e5cefd43b8c3a..6488fae709c85b 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -535,7 +535,25 @@ ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args) return ret; } -static bool fuse_request_queue_background(struct fuse_req *req) +#ifdef CONFIG_FUSE_IO_URING +static bool fuse_request_queue_background_uring(struct fuse_conn *fc, + struct fuse_req *req) +{ + struct fuse_iqueue *fiq = &fc->iq; + + req->in.h.unique = fuse_get_unique(fiq); + req->in.h.len = sizeof(struct fuse_in_header) + + fuse_len_args(req->args->in_numargs, + (struct fuse_arg *) req->args->in_args); + + return fuse_uring_queue_bq_req(req); +} +#endif + +/* + * @return true if queued + */ +static int fuse_request_queue_background(struct fuse_req *req) { struct fuse_mount *fm = req->fm; struct fuse_conn *fc = fm->fc; @@ -547,6 +565,12 @@ static bool fuse_request_queue_background(struct fuse_req *req) atomic_inc(&fc->num_waiting); } __set_bit(FR_ISREPLY, &req->flags); + +#ifdef CONFIG_FUSE_IO_URING + if (fuse_uring_ready(fc)) + return fuse_request_queue_background_uring(fc, req); +#endif + spin_lock(&fc->bg_lock); if (likely(fc->connected)) { fc->num_background++; diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 5b6442b486ebdb..04e0b8d2da4cf3 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -62,10 +62,53 @@ static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd) return pdu->ent; } +static void fuse_uring_flush_bg(struct fuse_ring_queue *queue) +{ + struct fuse_ring *ring = queue->ring; + struct fuse_conn *fc = ring->fc; + + lockdep_assert_held(&queue->lock); + lockdep_assert_held(&fc->bg_lock); + + /* + * Allow one bg request per queue, ignoring global fc limits. + * This prevents a single queue from consuming all resources and + * eliminates the need for remote queue wake-ups when global + * limits are met but this queue has no more waiting requests. + */ + while ((fc->active_background < fc->max_background || + !queue->active_background) && + (!list_empty(&queue->fuse_req_bg_queue))) { + struct fuse_req *req; + + req = list_first_entry(&queue->fuse_req_bg_queue, + struct fuse_req, list); + fc->active_background++; + queue->active_background++; + + list_move_tail(&req->list, &queue->fuse_req_queue); + } +} + static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, int error) { + struct fuse_ring_queue *queue = ent->queue; + struct fuse_ring *ring = queue->ring; + struct fuse_conn *fc = ring->fc; + + lockdep_assert_not_held(&queue->lock); + spin_lock(&queue->lock); ent->fuse_req = NULL; + if (test_bit(FR_BACKGROUND, &req->flags)) { + queue->active_background--; + spin_lock(&fc->bg_lock); + fuse_uring_flush_bg(queue); + spin_unlock(&fc->bg_lock); + } + + spin_unlock(&queue->lock); + if (error) req->out.h.error = error; @@ -93,6 +136,7 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring) { int qid; struct fuse_ring_queue *queue; + struct fuse_conn *fc = ring->fc; for (qid = 0; qid < ring->nr_queues; qid++) { queue = READ_ONCE(ring->queues[qid]); @@ -100,6 +144,13 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring) continue; queue->stopped = true; + + WARN_ON_ONCE(ring->fc->max_background != UINT_MAX); + spin_lock(&queue->lock); + spin_lock(&fc->bg_lock); + fuse_uring_flush_bg(queue); + spin_unlock(&fc->bg_lock); + spin_unlock(&queue->lock); fuse_uring_abort_end_queue_requests(queue); } } @@ -205,6 +256,7 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, INIT_LIST_HEAD(&queue->ent_w_req_queue); INIT_LIST_HEAD(&queue->ent_in_userspace); INIT_LIST_HEAD(&queue->fuse_req_queue); + INIT_LIST_HEAD(&queue->fuse_req_bg_queue); queue->fpq.processing = pq; fuse_pqueue_init(&queue->fpq); @@ -1167,6 +1219,53 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) fuse_request_end(req); } +bool fuse_uring_queue_bq_req(struct fuse_req *req) +{ + struct fuse_conn *fc = req->fm->fc; + struct fuse_ring *ring = fc->ring; + struct fuse_ring_queue *queue; + struct fuse_ring_ent *ent = NULL; + + queue = fuse_uring_task_to_queue(ring); + if (!queue) + return false; + + spin_lock(&queue->lock); + if (unlikely(queue->stopped)) { + spin_unlock(&queue->lock); + return false; + } + + list_add_tail(&req->list, &queue->fuse_req_bg_queue); + + ent = list_first_entry_or_null(&queue->ent_avail_queue, + struct fuse_ring_ent, list); + spin_lock(&fc->bg_lock); + fc->num_background++; + if (fc->num_background == fc->max_background) + fc->blocked = 1; + fuse_uring_flush_bg(queue); + spin_unlock(&fc->bg_lock); + + /* + * Due to bg_queue flush limits there might be other bg requests + * in the queue that need to be handled first. Or no further req + * might be available. + */ + req = list_first_entry_or_null(&queue->fuse_req_queue, struct fuse_req, + list); + if (ent && req) { + fuse_uring_add_req_to_ring_ent(ent, req); + spin_unlock(&queue->lock); + + fuse_uring_dispatch_ent(ent); + } else { + spin_unlock(&queue->lock); + } + + return true; +} + static const struct fuse_iqueue_ops fuse_io_uring_ops = { /* should be send over io-uring as enhancement */ .send_forget = fuse_dev_queue_forget, diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 0517a6eafc9173..0182be61778b26 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -82,8 +82,13 @@ struct fuse_ring_queue { /* fuse requests waiting for an entry slot */ struct list_head fuse_req_queue; + /* background fuse requests */ + struct list_head fuse_req_bg_queue; + struct fuse_pqueue fpq; + unsigned int active_background; + bool stopped; }; @@ -127,6 +132,7 @@ void fuse_uring_stop_queues(struct fuse_ring *ring); void fuse_uring_abort_end_requests(struct fuse_ring *ring); int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req); +bool fuse_uring_queue_bq_req(struct fuse_req *req); static inline void fuse_uring_abort(struct fuse_conn *fc) { @@ -179,6 +185,12 @@ static inline void fuse_uring_abort(struct fuse_conn *fc) static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc) { } + +static inline bool fuse_uring_ready(struct fuse_conn *fc) +{ + return false; +} + #endif /* CONFIG_FUSE_IO_URING */ #endif /* _FS_FUSE_DEV_URING_I_H */ From c4ca6860a292eabac6a6b66d256f1342c5c65af5 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:08 +0100 Subject: [PATCH 102/161] fuse: {io-uring} Prevent mount point hang on fuse-server termination When the fuse-server terminates while the fuse-client or kernel still has queued URING_CMDs, these commands retain references to the struct file used by the fuse connection. This prevents fuse_dev_release() from being invoked, resulting in a hung mount point. This patch addresses the issue by making queued URING_CMDs cancelable, allowing fuse_dev_release() to proceed as expected and preventing the mount point from hanging. Signed-off-by: Bernd Schubert Reviewed-by: Pavel Begunkov # io_uring Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi (cherry picked from commit b6236c8407cba5d7a108facb1bcfab24994d3814) (cherry picked from commit 144266f4e2a6e483c5ed0b3575c2151d1c945519) (cherry picked from commit 8f3441f36b8fb0115e5312c1e9bd7a73b57be9f4) --- fs/fuse/dev_uring.c | 81 ++++++++++++++++++++++++++++++++++++++++--- fs/fuse/dev_uring_i.h | 9 +++++ 2 files changed, 86 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 04e0b8d2da4cf3..6880f0f7f1ade4 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -14,7 +14,7 @@ static bool __read_mostly enable_uring; module_param(enable_uring, bool, 0644); -MODULE_PARM_DESC(enable_uring, +MODULE_PARM_DESC(enable_uring_dangerous, "Enable userspace communication through io-uring"); #define FUSE_URING_IOV_SEGS 2 /* header and payload */ @@ -165,6 +165,7 @@ void fuse_uring_destruct(struct fuse_conn *fc) for (qid = 0; qid < ring->nr_queues; qid++) { struct fuse_ring_queue *queue = ring->queues[qid]; + struct fuse_ring_ent *ent, *next; if (!queue) continue; @@ -174,6 +175,12 @@ void fuse_uring_destruct(struct fuse_conn *fc) WARN_ON(!list_empty(&queue->ent_commit_queue)); WARN_ON(!list_empty(&queue->ent_in_userspace)); + list_for_each_entry_safe(ent, next, &queue->ent_released, + list) { + list_del_init(&ent->list); + kfree(ent); + } + kfree(queue->fpq.processing); kfree(queue); ring->queues[qid] = NULL; @@ -257,6 +264,7 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, INIT_LIST_HEAD(&queue->ent_in_userspace); INIT_LIST_HEAD(&queue->fuse_req_queue); INIT_LIST_HEAD(&queue->fuse_req_bg_queue); + INIT_LIST_HEAD(&queue->ent_released); queue->fpq.processing = pq; fuse_pqueue_init(&queue->fpq); @@ -304,6 +312,15 @@ static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) /* remove entry from queue->fpq->processing */ list_del_init(&req->list); } + + /* + * The entry must not be freed immediately, due to access of direct + * pointer access of entries through IO_URING_F_CANCEL - there is a risk + * of race between daemon termination (which triggers IO_URING_F_CANCEL + * and accesses entries without checking the list state first + */ + list_move(&ent->list, &queue->ent_released); + ent->state = FRRS_RELEASED; spin_unlock(&queue->lock); if (cmd) @@ -311,9 +328,6 @@ static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) if (req) fuse_uring_stop_fuse_req_end(req); - - list_del_init(&ent->list); - kfree(ent); } static void fuse_uring_stop_list_entries(struct list_head *head, @@ -333,6 +347,7 @@ static void fuse_uring_stop_list_entries(struct list_head *head, continue; } + ent->state = FRRS_TEARDOWN; list_move(&ent->list, &to_teardown); } spin_unlock(&queue->lock); @@ -447,6 +462,50 @@ void fuse_uring_stop_queues(struct fuse_ring *ring) } } +#ifdef IO_URING_F_CANCEL +/* + * Handle IO_URING_F_CANCEL, typically should come on daemon termination. + * + * Releasing the last entry should trigger fuse_dev_release() if + * the daemon was terminated + */ +static void fuse_uring_cancel(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); + struct fuse_ring_queue *queue; + bool need_cmd_done = false; + + /* + * direct access on ent - it must not be destructed as long as + * IO_URING_F_CANCEL might come up + */ + queue = ent->queue; + spin_lock(&queue->lock); + if (ent->state == FRRS_AVAILABLE) { + ent->state = FRRS_USERSPACE; + list_move(&ent->list, &queue->ent_in_userspace); + need_cmd_done = true; + ent->cmd = NULL; + } + spin_unlock(&queue->lock); + + if (need_cmd_done) { + /* no queue lock to avoid lock order issues */ + io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags); + } +} +#endif + +#ifdef IO_URING_F_CANCEL +static void fuse_uring_prepare_cancel(struct io_uring_cmd *cmd, int issue_flags, + struct fuse_ring_ent *ring_ent) +{ + uring_cmd_set_ring_ent(cmd, ring_ent); + io_uring_cmd_mark_cancelable(cmd, issue_flags); +} +#endif + /* * Checks for errors and stores it into the request */ @@ -865,6 +924,9 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, spin_unlock(&queue->lock); /* without the queue lock, as other locks are taken */ +#ifdef IO_URING_F_CANCEL + fuse_uring_prepare_cancel(cmd, issue_flags, ent); +#endif fuse_uring_commit(ent, req, issue_flags); /* @@ -914,6 +976,10 @@ static void fuse_uring_do_register(struct fuse_ring_ent *ent, struct fuse_conn *fc = ring->fc; struct fuse_iqueue *fiq = &fc->iq; +#ifdef IO_URING_F_CANCEL + fuse_uring_prepare_cancel(cmd, issue_flags, ent); +#endif + spin_lock(&queue->lock); ent->cmd = cmd; fuse_uring_ent_avail(ent, queue); @@ -1064,6 +1130,13 @@ int __maybe_unused fuse_uring_cmd(struct io_uring_cmd *cmd, return -EOPNOTSUPP; } +#ifdef IO_URING_F_CANCEL + if ((unlikely(issue_flags & IO_URING_F_CANCEL))) { + fuse_uring_cancel(cmd, issue_flags); + return 0; + } +#endif + /* This extra SQE size holds struct fuse_uring_cmd_req */ if (!(issue_flags & IO_URING_F_SQE128)) return -EINVAL; diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 0182be61778b26..2102b3d0c1aed1 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -28,6 +28,12 @@ enum fuse_ring_req_state { /* The ring entry is in or on the way to user space */ FRRS_USERSPACE, + + /* The ring entry is in teardown */ + FRRS_TEARDOWN, + + /* The ring entry is released, but not freed yet */ + FRRS_RELEASED, }; /** A fuse ring entry, part of the ring queue */ @@ -79,6 +85,9 @@ struct fuse_ring_queue { /* entries in userspace */ struct list_head ent_in_userspace; + /* entries that are released */ + struct list_head ent_released; + /* fuse requests waiting for an entry slot */ struct list_head fuse_req_queue; From 14ba960dd4136196a37693c114fde5ec9ccb40ee Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Sat, 8 Nov 2025 14:06:55 +0100 Subject: [PATCH 103/161] fuse: {io-uring} exit task work workaround for missing IO_URING_F_CANCEL in RHEL 9.4 --- fs/fuse/dev_uring.c | 74 ++++++++++++++++++++++++++++++++++++++++++- fs/fuse/dev_uring_i.h | 3 ++ 2 files changed, 76 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 6880f0f7f1ade4..cc5e2c6687794e 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -14,7 +14,7 @@ static bool __read_mostly enable_uring; module_param(enable_uring, bool, 0644); -MODULE_PARM_DESC(enable_uring_dangerous, +MODULE_PARM_DESC(enable_uring, "Enable userspace communication through io-uring"); #define FUSE_URING_IOV_SEGS 2 /* header and payload */ @@ -163,6 +163,23 @@ void fuse_uring_destruct(struct fuse_conn *fc) if (!ring) return; + /* Cleanup daemon monitoring */ + if (ring->daemon_pid) { + struct pid *pid; + + /* + * Atomically clear daemon_pid. If callback runs concurrently, + * only one of us (callback or destruct) will see non-NULL. + */ + pid = xchg(&ring->daemon_pid, NULL); + if (pid) { + remove_wait_queue(&pid->wait_pidfd, + &ring->daemon_exit_wait); + put_pid(pid); + fuse_conn_put(ring->fc); + } + } + for (qid = 0; qid < ring->nr_queues; qid++) { struct fuse_ring_queue *queue = ring->queues[qid]; struct fuse_ring_ent *ent, *next; @@ -191,6 +208,59 @@ void fuse_uring_destruct(struct fuse_conn *fc) fc->ring = NULL; } +static int fuse_daemon_exit_callback(wait_queue_entry_t *wait, unsigned mode, + int sync, void *key) +{ + struct fuse_ring *ring = + container_of(wait, struct fuse_ring, daemon_exit_wait); + struct pid *pid; + + /* + * Atomically clear daemon_pid. If already NULL, fuse_uring_destruct() + * won the race and we must not access ring->fc (might be freed). + */ + pid = xchg(&ring->daemon_pid, NULL); + if (!pid) + return 0; + + /* Daemon is exiting - abort connection and drop reference */ + fuse_abort_conn(ring->fc); + fuse_conn_put(ring->fc); + put_pid(pid); + + return 0; +} + +/* During daemon registration */ +static int fuse_register_daemon(struct fuse_ring *ring) +{ + struct pid *pid; + + pid = get_task_pid(current, PIDTYPE_TGID); + if (!pid) + return -ESRCH; + + /* + * Take reference on fc. Dropped by either: + * - Callback when daemon exits, or + * - fuse_uring_destruct() if callback hasn't run + */ + fuse_conn_get(ring->fc); + + ring->daemon_pid = pid; + + /* Setup wait queue entry */ + ring->daemon_exit_wait.flags = 0; + ring->daemon_exit_wait.private = NULL; + ring->daemon_exit_wait.func = fuse_daemon_exit_callback; + INIT_LIST_HEAD(&ring->daemon_exit_wait.entry); + + /* Add to pid's wait queue - woken when process exits */ + add_wait_queue(&pid->wait_pidfd, &ring->daemon_exit_wait); + + return 0; +} + /* * Basic ring setup for this connection based on the provided configuration */ @@ -229,6 +299,8 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) ring->max_payload_sz = max_payload_size; atomic_set(&ring->queue_refs, 0); + fuse_register_daemon(ring); + spin_unlock(&fc->lock); return ring; diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 2102b3d0c1aed1..39ddedb459e6cf 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -132,6 +132,9 @@ struct fuse_ring { atomic_t queue_refs; + struct pid *daemon_pid; + wait_queue_entry_t daemon_exit_wait; + bool ready; }; From 11400458411c4e8e0685ad27f2eb68a091dd7d77 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:09 +0100 Subject: [PATCH 104/161] fuse: block request allocation until io-uring init is complete Avoid races and block request allocation until io-uring queues are ready. This is a especially important for background requests, as bg request completion might cause lock order inversion of the typical queue->lock and then fc->bg_lock fuse_request_end spin_lock(&fc->bg_lock); flush_bg_queue fuse_send_one fuse_uring_queue_fuse_req spin_lock(&queue->lock); Signed-off-by: Bernd Schubert Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi (cherry picked from commit 3393ff964e0fa5def66570c54a4612bf9df06b76) (cherry picked from commit dc40b25872f3cafddc03063da458a1ce72f5dfb5) (cherry picked from commit 0341c158eafd096f94ad9803aeab3aabcbb6ff4a) --- fs/fuse/dev.c | 3 ++- fs/fuse/dev_uring.c | 3 +++ fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 2 ++ 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6488fae709c85b..3344a70ef60b57 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -73,7 +73,8 @@ void fuse_set_initialized(struct fuse_conn *fc) static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background) { - return !fc->initialized || (for_background && fc->blocked); + return !fc->initialized || (for_background && fc->blocked) || + (fc->io_uring && !fuse_uring_ready(fc)); } static void fuse_drop_waiting(struct fuse_conn *fc) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index cc5e2c6687794e..43302347496102 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1063,6 +1063,7 @@ static void fuse_uring_do_register(struct fuse_ring_ent *ent, if (ready) { WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); WRITE_ONCE(ring->ready, true); + wake_up_all(&fc->blocked_waitq); } } } @@ -1238,6 +1239,8 @@ int __maybe_unused fuse_uring_cmd(struct io_uring_cmd *cmd, if (err) { pr_info_once("FUSE_IO_URING_CMD_REGISTER failed err=%d\n", err); + fc->io_uring = 0; + wake_up_all(&fc->blocked_waitq); return err; } break; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index b949779a06783e..97472a9ca0bdb8 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -825,6 +825,9 @@ struct fuse_conn { /* Use pages instead of pointer for kernel I/O */ unsigned int use_pages_for_kvec_io:1; + /* Use io_uring for communication */ + unsigned int io_uring; + /** The number of requests waiting for completion */ atomic_t num_waiting; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 34adf038b492f1..5123d51fa65267 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1306,6 +1306,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->create_supp_group = 1; if (flags & FUSE_DIRECT_IO_ALLOW_MMAP) fc->direct_io_allow_mmap = 1; + if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled()) + fc->io_uring = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; From aa8240136d183664a0b2cebc468210beea9b5ba8 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:10 +0100 Subject: [PATCH 105/161] fuse: enable fuse-over-io-uring All required parts are handled now, fuse-io-uring can be enabled. Signed-off-by: Bernd Schubert Reviewed-by: Pavel Begunkov # io_uring Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi (cherry picked from commit 786412a73e7ee5b00ef3437bbf2f3a250759b2ae) (cherry picked from commit 5174768caa12501e9b41cabe842ba0d953605587) (cherry picked from commit 05da1ea7ec7b045d53ce231194b9ed3b9c3768f0) --- fs/fuse/dev.c | 3 +++ fs/fuse/dev_uring.c | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 3344a70ef60b57..4400eba82491e6 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2337,6 +2337,9 @@ const struct file_operations fuse_dev_operations = { .fasync = fuse_dev_fasync, .unlocked_ioctl = fuse_dev_ioctl, .compat_ioctl = compat_ptr_ioctl, +#ifdef CONFIG_FUSE_IO_URING + .uring_cmd = fuse_uring_cmd, +#endif }; EXPORT_SYMBOL_GPL(fuse_dev_operations); diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 43302347496102..eeff2271ea0ec8 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1190,8 +1190,7 @@ static int fuse_uring_register(struct io_uring_cmd *cmd, * Entry function from io_uring to handle the given passthrough command * (op code IORING_OP_URING_CMD) */ -int __maybe_unused fuse_uring_cmd(struct io_uring_cmd *cmd, - unsigned int issue_flags) +int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct fuse_dev *fud; struct fuse_conn *fc; From 5fa993778a596c877d7052cc2d6a1fc4b9879ec1 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Thu, 23 Jan 2025 17:55:32 +0100 Subject: [PATCH 106/161] fuse: prevent disabling io-uring on active connections The enable_uring module parameter allows administrators to enable/disable io-uring support for FUSE at runtime. However, disabling io-uring while connections already have it enabled can lead to an inconsistent state. Fix this by keeping io-uring enabled on connections that were already using it, even if the module parameter is later disabled. This ensures active FUSE mounts continue to function correctly. Signed-off-by: Bernd Schubert Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi (cherry picked from commit 2d4fde59fd502a65c1698b61ad4d0f10a9ab665a) (cherry picked from commit ac738c15aad070446c3739b979113848803a173f) (cherry picked from commit a1bb4cefd2c061f0e46529f99d54f6361e9a688f) --- fs/fuse/dev_uring.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index eeff2271ea0ec8..65b573824c1efd 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1197,11 +1197,6 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) u32 cmd_op = cmd->cmd_op; int err; - if (!enable_uring) { - pr_info_ratelimited("fuse-io-uring is disabled\n"); - return -EOPNOTSUPP; - } - #ifdef IO_URING_F_CANCEL if ((unlikely(issue_flags & IO_URING_F_CANCEL))) { fuse_uring_cancel(cmd, issue_flags); @@ -1220,6 +1215,12 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) } fc = fud->fc; + /* Once a connection has io-uring enabled on it, it can't be disabled */ + if (!enable_uring && !fc->io_uring) { + pr_info_ratelimited("fuse-io-uring is disabled\n"); + return -EOPNOTSUPP; + } + if (fc->aborted) return -ECONNABORTED; if (!fc->connected) From 91c033eb62bd66d89408c07132d5a5b84ef39931 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 17 Mar 2025 17:30:28 -0700 Subject: [PATCH 107/161] fuse: fix uring race condition for null dereference of fc There is a race condition leading to a kernel crash from a null dereference when attemping to access fc->lock in fuse_uring_create_queue(). fc may be NULL in the case where another thread is creating the uring in fuse_uring_create() and has set fc->ring but has not yet set ring->fc when fuse_uring_create_queue() reads ring->fc. There is another race condition as well where in fuse_uring_register(), ring->nr_queues may still be 0 and not yet set to the new value when we compare qid against it. This fix sets fc->ring only after ring->fc and ring->nr_queues have been set, which guarantees now that ring->fc is a proper pointer when any queues are created and ring->nr_queues reflects the right number of queues if ring is not NULL. We must use smp_store_release() and smp_load_acquire() semantics to ensure the ordering will remain correct where fc->ring is assigned only after ring->fc and ring->nr_queues have been assigned. Signed-off-by: Joanne Koong Link: https://lore.kernel.org/r/20250318003028.3330599-1-joannelkoong@gmail.com Fixes: 24fe962c86f5 ("fuse: {io-uring} Handle SQEs - register commands") Acked-by: Miklos Szeredi Reviewed-by: Bernd Schubert Signed-off-by: Christian Brauner (cherry picked from commit d9ecc77193cad25402ff5517fb26fb22b4db0e10) (cherry picked from commit 7cf1540462760e7582425246cac6715d07597a3c) (cherry picked from commit 0a4cffae7713fec7b52214e34972b833fbd6d1a6) --- fs/fuse/dev_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 65b573824c1efd..bf32e40a19e634 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -293,11 +293,11 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) init_waitqueue_head(&ring->stop_waitq); - fc->ring = ring; ring->nr_queues = nr_queues; ring->fc = fc; ring->max_payload_sz = max_payload_size; atomic_set(&ring->queue_refs, 0); + smp_store_release(&fc->ring, ring); fuse_register_daemon(ring); @@ -1147,7 +1147,7 @@ static int fuse_uring_register(struct io_uring_cmd *cmd, unsigned int issue_flags, struct fuse_conn *fc) { const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); - struct fuse_ring *ring = fc->ring; + struct fuse_ring *ring = smp_load_acquire(&fc->ring); struct fuse_ring_queue *queue; struct fuse_ring_ent *ent; int err; From c3b15632dee0c51111fd8e46eca0479748dc6ae1 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Thu, 6 Mar 2025 11:12:18 +0000 Subject: [PATCH 108/161] fuse: fix possible deadlock if rings are never initialized When mounting a user-space filesystem using io_uring, the initialization of the rings is done separately in the server side. If for some reason (e.g. a server bug) this step is not performed it will be impossible to unmount the filesystem if there are already requests waiting. This issue is easily reproduced with the libfuse passthrough_ll example, if the queue depth is set to '0' and a request is queued before trying to unmount the filesystem. When trying to force the unmount, fuse_abort_conn() will try to wake up all tasks waiting in fc->blocked_waitq, but because the rings were never initialized, fuse_uring_ready() will never return 'true'. Fixes: 3393ff964e0f ("fuse: block request allocation until io-uring init is complete") Signed-off-by: Luis Henriques Link: https://lore.kernel.org/r/20250306111218.13734-1-luis@igalia.com Acked-by: Miklos Szeredi Reviewed-by: Bernd Schubert Signed-off-by: Christian Brauner (cherry picked from commit d55011469b41d9da6c06cb1c4a4da7a87fe155bc) (cherry picked from commit 3a7a4e73e7077c20c943e30dd3fc6c24e736a227) (cherry picked from commit eb41568a7005c82937951b72425f74406557812e) --- fs/fuse/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 4400eba82491e6..8ce8dc10283c5d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -74,7 +74,7 @@ void fuse_set_initialized(struct fuse_conn *fc) static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background) { return !fc->initialized || (for_background && fc->blocked) || - (fc->io_uring && !fuse_uring_ready(fc)); + (fc->io_uring && fc->connected && !fuse_uring_ready(fc)); } static void fuse_drop_waiting(struct fuse_conn *fc) From 8ba1bea22926e8b5e1745c1dc313f4c9e0735bd5 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Tue, 25 Mar 2025 18:29:31 +0100 Subject: [PATCH 109/161] fuse: {io-uring} Fix a possible req cancellation race task-A (application) might be in request_wait_answer and try to remove the request when it has FR_PENDING set. task-B (a fuse-server io-uring task) might handle this request with FUSE_IO_URING_CMD_COMMIT_AND_FETCH, when fetching the next request and accessed the req from the pending list in fuse_uring_ent_assign_req(). That code path was not protected by fiq->lock and so might race with task-A. For scaling reasons we better don't use fiq->lock, but add a handler to remove canceled requests from the queue. This also removes usage of fiq->lock from fuse_uring_add_req_to_ring_ent() altogether, as it was there just to protect against this race and incomplete. Also added is a comment why FR_PENDING is not cleared. Fixes: c090c8abae4b ("fuse: Add io-uring sqe commit and fetch support") Cc: # v6.14 Reported-by: Joanne Koong Closes: https://lore.kernel.org/all/CAJnrk1ZgHNb78dz-yfNTpxmW7wtT88A=m-zF0ZoLXKLUHRjNTw@mail.gmail.com/ Signed-off-by: Bernd Schubert Reviewed-by: Joanne Koong Signed-off-by: Miklos Szeredi (cherry picked from commit 09098e62e4be8f0755e58d6078aaf27cbd9a3a8d) (cherry picked from commit bbe104140373b1b2a53a11c9e7facf115fc26ac9) (cherry picked from commit e82d77d9373b8608d11eeed5caa012010634ec72) --- fs/fuse/dev.c | 34 +++++++++++++++++++++++++--------- fs/fuse/dev_uring.c | 15 +++++++++++---- fs/fuse/dev_uring_i.h | 6 ++++++ fs/fuse/fuse_dev_i.h | 1 + fs/fuse/fuse_i.h | 3 +++ 5 files changed, 46 insertions(+), 13 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8ce8dc10283c5d..561c94e2aaa442 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -382,6 +382,24 @@ static int queue_interrupt(struct fuse_req *req) return 0; } +bool fuse_remove_pending_req(struct fuse_req *req, spinlock_t *lock) +{ + spin_lock(lock); + if (test_bit(FR_PENDING, &req->flags)) { + /* + * FR_PENDING does not get cleared as the request will end + * up in destruction anyway. + */ + list_del(&req->list); + spin_unlock(lock); + __fuse_put_request(req); + req->out.h.error = -EINTR; + return true; + } + spin_unlock(lock); + return false; +} + static void request_wait_answer(struct fuse_req *req) { struct fuse_conn *fc = req->fm->fc; @@ -403,22 +421,20 @@ static void request_wait_answer(struct fuse_req *req) } if (!test_bit(FR_FORCE, &req->flags)) { + bool removed; + /* Only fatal signals may interrupt this */ err = wait_event_killable(req->waitq, test_bit(FR_FINISHED, &req->flags)); if (!err) return; - spin_lock(&fiq->lock); - /* Request is not yet in userspace, bail out */ - if (test_bit(FR_PENDING, &req->flags)) { - list_del(&req->list); - spin_unlock(&fiq->lock); - __fuse_put_request(req); - req->out.h.error = -EINTR; + if (test_bit(FR_URING, &req->flags)) + removed = fuse_uring_remove_pending_req(req); + else + removed = fuse_remove_pending_req(req, &fiq->lock); + if (removed) return; - } - spin_unlock(&fiq->lock); } /* diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index bf32e40a19e634..edd5143b683feb 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -828,8 +828,6 @@ static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent, struct fuse_req *req) { struct fuse_ring_queue *queue = ent->queue; - struct fuse_conn *fc = req->fm->fc; - struct fuse_iqueue *fiq = &fc->iq; lockdep_assert_held(&queue->lock); @@ -839,9 +837,7 @@ static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent, ent->state); } - spin_lock(&fiq->lock); clear_bit(FR_PENDING, &req->flags); - spin_unlock(&fiq->lock); ent->fuse_req = req; ent->state = FRRS_FUSE_REQ; list_move(&ent->list, &queue->ent_w_req_queue); @@ -1346,6 +1342,8 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) if (unlikely(queue->stopped)) goto err_unlock; + set_bit(FR_URING, &req->flags); + req->ring_queue = queue; ent = list_first_entry_or_null(&queue->ent_avail_queue, struct fuse_ring_ent, list); if (ent) @@ -1384,6 +1382,8 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) return false; } + set_bit(FR_URING, &req->flags); + req->ring_queue = queue; list_add_tail(&req->list, &queue->fuse_req_bg_queue); ent = list_first_entry_or_null(&queue->ent_avail_queue, @@ -1414,6 +1414,13 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) return true; } +bool fuse_uring_remove_pending_req(struct fuse_req *req) +{ + struct fuse_ring_queue *queue = req->ring_queue; + + return fuse_remove_pending_req(req, &queue->lock); +} + static const struct fuse_iqueue_ops fuse_io_uring_ops = { /* should be send over io-uring as enhancement */ .send_forget = fuse_dev_queue_forget, diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 39ddedb459e6cf..9eb60d91e264bb 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -145,6 +145,7 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring); int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req); bool fuse_uring_queue_bq_req(struct fuse_req *req); +bool fuse_uring_remove_pending_req(struct fuse_req *req); static inline void fuse_uring_abort(struct fuse_conn *fc) { @@ -203,6 +204,11 @@ static inline bool fuse_uring_ready(struct fuse_conn *fc) return false; } +static inline bool fuse_uring_remove_pending_req(struct fuse_req *req) +{ + return false; +} + #endif /* CONFIG_FUSE_IO_URING */ #endif /* _FS_FUSE_DEV_URING_I_H */ diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 3b2bfe1248d357..2481da3388c5fe 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -61,6 +61,7 @@ int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, void fuse_dev_queue_forget(struct fuse_iqueue *fiq, struct fuse_forget_link *forget); void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req); +bool fuse_remove_pending_req(struct fuse_req *req, spinlock_t *lock); #endif diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 97472a9ca0bdb8..8b2a23f94b5f82 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -339,6 +339,7 @@ struct fuse_io_priv { * FR_FINISHED: request is finished * FR_PRIVATE: request is on private list * FR_ASYNC: request is asynchronous + * FR_URING: request is handled through fuse-io-uring */ enum fuse_req_flag { FR_ISREPLY, @@ -353,6 +354,7 @@ enum fuse_req_flag { FR_FINISHED, FR_PRIVATE, FR_ASYNC, + FR_URING, }; /** @@ -402,6 +404,7 @@ struct fuse_req { #ifdef CONFIG_FUSE_IO_URING void *ring_entry; + void *ring_queue; #endif }; From dbd51c5c777bc55ddc1e651a4e0127b2e8038b1b Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Fri, 7 Feb 2025 13:35:02 +0000 Subject: [PATCH 110/161] fuse: removed unused function fuse_uring_create() from header Function fuse_uring_create() is used only from dev_uring.c and does not need to be exposed in the header file. Furthermore, it has the wrong signature. While there, also remove the 'struct fuse_ring' forward declaration. Signed-off-by: Luis Henriques Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi (cherry picked from commit 841c7b812c038661e4f659d1b9c9a366c6d24b71) (cherry picked from commit 54d9a86e217e3bcec53a2dce423a930f2ff83eac) (cherry picked from commit a7050b7596f15cca3c62a75e427a590c51a99cb0) --- fs/fuse/dev_uring_i.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 9eb60d91e264bb..d33f8f11258143 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -176,12 +176,6 @@ static inline bool fuse_uring_ready(struct fuse_conn *fc) #else /* CONFIG_FUSE_IO_URING */ -struct fuse_ring; - -static inline void fuse_uring_create(struct fuse_conn *fc) -{ -} - static inline void fuse_uring_destruct(struct fuse_conn *fc) { } From f63c2fe33018c8c7d80b4915527fc2dc56af422f Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 16 Dec 2024 22:14:06 +0100 Subject: [PATCH 111/161] fuse: Allocate only namelen buf memory in fuse_notify_ fuse_notify_inval_entry and fuse_notify_delete were using fixed allocations of FUSE_NAME_MAX to hold the file name. Often that large buffers are not needed as file names might be smaller, so this uses the actual file name size to do the allocation. Signed-off-by: Bernd Schubert Reviewed-by: Jingbo Xu Signed-off-by: Miklos Szeredi (cherry picked from commit 2412085da370836945c2daa61c5cee38dd979e0d) (cherry picked from commit 65e7afaa84b90456db4567660485885cc40de667) (cherry picked from commit b80de601a854ef6f7dfea58a5df9f28a0d137225) --- fs/fuse/dev.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 561c94e2aaa442..fe51971c1bb080 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1500,14 +1500,10 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_inval_entry_out outarg; - int err = -ENOMEM; - char *buf; + int err; + char *buf = NULL; struct qstr name; - buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); - if (!buf) - goto err; - err = -EINVAL; if (size < sizeof(outarg)) goto err; @@ -1524,6 +1520,11 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, if (size != sizeof(outarg) + outarg.namelen + 1) goto err; + err = -ENOMEM; + buf = kzalloc(outarg.namelen + 1, GFP_KERNEL); + if (!buf) + goto err; + name.name = buf; name.len = outarg.namelen; err = fuse_copy_one(cs, buf, outarg.namelen + 1); @@ -1548,14 +1549,10 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_delete_out outarg; - int err = -ENOMEM; - char *buf; + int err; + char *buf = NULL; struct qstr name; - buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); - if (!buf) - goto err; - err = -EINVAL; if (size < sizeof(outarg)) goto err; @@ -1572,6 +1569,11 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, if (size != sizeof(outarg) + outarg.namelen + 1) goto err; + err = -ENOMEM; + buf = kzalloc(outarg.namelen + 1, GFP_KERNEL); + if (!buf) + goto err; + name.name = buf; name.len = outarg.namelen; err = fuse_copy_one(cs, buf, outarg.namelen + 1); From fcad7f120d54879ee53027a145b74fcd111af865 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 28 Feb 2024 16:50:49 +0100 Subject: [PATCH 112/161] fuse: use FUSE_ROOT_ID in fuse_get_root_inode() ...when calling fuse_iget(). Signed-off-by: Miklos Szeredi (cherry picked from commit 253e52437119719eb87293ef6852e13bb5ad0960) (cherry picked from commit 4f1ba11fbb5949a0fe94d20861b1ef391730d8c1) (cherry picked from commit 55f680452722e31ea79a39d6f31641fcb956f586) --- fs/fuse/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 5123d51fa65267..576023184eae62 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -989,7 +989,7 @@ static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) attr.mode = mode; attr.ino = FUSE_ROOT_ID; attr.nlink = 1; - return fuse_iget(sb, 1, 0, &attr, 0, 0); + return fuse_iget(sb, FUSE_ROOT_ID, 0, &attr, 0, 0); } struct fuse_inode_handle { From b1d2e49a392bf26f1e760b5b6b9d8a0909681e7c Mon Sep 17 00:00:00 2001 From: Zhang Tianci Date: Mon, 18 Nov 2024 18:16:00 +0800 Subject: [PATCH 113/161] fuse: check attributes staleness on fuse_iget() Function fuse_direntplus_link() might call fuse_iget() to initialize a new fuse_inode and change its attributes. If fi->attr_version is always initialized with 0, even if the attributes returned by the FUSE_READDIR request is staled, as the new fi->attr_version is 0, fuse_change_attributes will still set the staled attributes to inode. This wrong behaviour may cause file size inconsistency even when there is no changes from server-side. To reproduce the issue, consider the following 2 programs (A and B) are running concurrently, A B ---------------------------------- -------------------------------- { /fusemnt/dir/f is a file path in a fuse mount, the size of f is 0. } readdir(/fusemnt/dir) start //Daemon set size 0 to f direntry fallocate(f, 1024) stat(f) // B see size 1024 echo 2 > /proc/sys/vm/drop_caches readdir(/fusemnt/dir) reply to kernel Kernel set 0 to the I_NEW inode stat(f) // B see size 0 In the above case, only program B is modifying the file size, however, B observes file size changing between the 2 'readonly' stat() calls. To fix this issue, we should make sure readdirplus still follows the rule of attr_version staleness checking even if the fi->attr_version is lost due to inode eviction. To identify this situation, the new fc->evict_ctr is used to record whether the eviction of inodes occurs during the readdirplus request processing. If it does, the result of readdirplus may be inaccurate; otherwise, the result of readdirplus can be trusted. Although this may still lead to incorrect invalidation, considering the relatively low frequency of evict occurrences, it should be acceptable. Link: https://lore.kernel.org/lkml/20230711043405.66256-2-zhangjiachen.jaycee@bytedance.com/ Link: https://lore.kernel.org/lkml/20241114070905.48901-1-zhangtianci.1997@bytedance.com/ Reported-by: Jiachen Zhang Suggested-by: Miklos Szeredi Signed-off-by: Zhang Tianci Signed-off-by: Miklos Szeredi (cherry picked from commit fabde099e332399c6d7d315f989dd0ecf5a2b384) (cherry picked from commit cbbf3a777a9cf62049309b4101caeb48b47a987e) --- fs/fuse/dir.c | 11 +++++----- fs/fuse/fuse_i.h | 14 ++++++++++-- fs/fuse/inode.c | 56 +++++++++++++++++++++++++++++++++++++---------- fs/fuse/readdir.c | 15 ++++++++----- 4 files changed, 71 insertions(+), 25 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 108da66d67c977..973c65af07d913 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -367,7 +367,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name struct fuse_mount *fm = get_fuse_mount_super(sb); FUSE_ARGS(args); struct fuse_forget_link *forget; - u64 attr_version; + u64 attr_version, evict_ctr; int err; *inode = NULL; @@ -382,6 +382,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name goto out; attr_version = fuse_get_attr_version(fm->fc); + evict_ctr = fuse_get_evict_ctr(fm->fc); fuse_lookup_init(fm->fc, &args, nodeid, name, outarg); err = fuse_simple_request(fm, &args); @@ -399,7 +400,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, &outarg->attr, ATTR_TIMEOUT(outarg), - attr_version); + attr_version, evict_ctr); err = -ENOMEM; if (!*inode) { fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1); @@ -685,7 +686,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, ff->nodeid = outentry.nodeid; ff->open_flags = outopen.open_flags; inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, - &outentry.attr, ATTR_TIMEOUT(&outentry), 0); + &outentry.attr, ATTR_TIMEOUT(&outentry), 0, 0); if (!inode) { flags &= ~(O_CREAT | O_EXCL | O_TRUNC); fuse_sync_release(NULL, ff, flags); @@ -813,7 +814,7 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, goto out_put_forget_req; inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, - &outarg.attr, ATTR_TIMEOUT(&outarg), 0); + &outarg.attr, ATTR_TIMEOUT(&outarg), 0, 0); if (!inode) { fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1); return -ENOMEM; @@ -1973,7 +1974,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, fuse_change_attributes_common(inode, &outarg.attr, NULL, ATTR_TIMEOUT(&outarg), - fuse_get_cache_mask(inode)); + fuse_get_cache_mask(inode), 0); oldsize = inode->i_size; /* see the comment in fuse_change_attributes() */ if (!is_wb || is_truncate) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 8b2a23f94b5f82..4a00b0c4aed86e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -855,6 +855,9 @@ struct fuse_conn { /** Version counter for attribute changes */ atomic64_t attr_version; + /** Version counter for evict inode */ + atomic64_t evict_ctr; + /** Called on final put */ void (*release)(struct fuse_conn *); @@ -962,6 +965,11 @@ static inline u64 fuse_get_attr_version(struct fuse_conn *fc) return atomic64_read(&fc->attr_version); } +static inline u64 fuse_get_evict_ctr(struct fuse_conn *fc) +{ + return atomic64_read(&fc->evict_ctr); +} + static inline bool fuse_stale_inode(const struct inode *inode, int generation, struct fuse_attr *attr) { @@ -1021,7 +1029,8 @@ extern const struct dentry_operations fuse_root_dentry_operations; */ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int generation, struct fuse_attr *attr, - u64 attr_valid, u64 attr_version); + u64 attr_valid, u64 attr_version, + u64 evict_ctr); int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, struct fuse_entry_out *outarg, struct inode **inode); @@ -1116,7 +1125,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, struct fuse_statx *sx, - u64 attr_valid, u32 cache_mask); + u64 attr_valid, u32 cache_mask, + u64 evict_ctr); u32 fuse_get_cache_mask(struct inode *inode); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 576023184eae62..8233431c531388 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -169,6 +169,14 @@ static void fuse_evict_inode(struct inode *inode) fuse_cleanup_submount_lookup(fc, fi->submount_lookup); fi->submount_lookup = NULL; } + /* + * Evict of non-deleted inode may race with outstanding + * LOOKUP/READDIRPLUS requests and result in inconsistency when + * the request finishes. Deal with that here by bumping a + * counter that can be compared to the starting value. + */ + if (inode->i_nlink > 0) + atomic64_inc(&fc->evict_ctr); } if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) { WARN_ON(!list_empty(&fi->write_files)); @@ -201,17 +209,30 @@ static ino_t fuse_squash_ino(u64 ino64) void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, struct fuse_statx *sx, - u64 attr_valid, u32 cache_mask) + u64 attr_valid, u32 cache_mask, + u64 evict_ctr) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); lockdep_assert_held(&fi->lock); + /* + * Clear basic stats from invalid mask. + * + * Don't do this if this is coming from a fuse_iget() call and there + * might have been a racing evict which would've invalidated the result + * if the attr_version would've been preserved. + * + * !evict_ctr -> this is create + * fi->attr_version != 0 -> this is not a new inode + * evict_ctr == fuse_get_evict_ctr() -> no evicts while during request + */ + if (!evict_ctr || fi->attr_version || evict_ctr == fuse_get_evict_ctr(fc)) + set_mask_bits(&fi->inval_mask, STATX_BASIC_STATS, 0); + fi->attr_version = atomic64_inc_return(&fc->attr_version); fi->i_time = attr_valid; - /* Clear basic stats from invalid mask */ - set_mask_bits(&fi->inval_mask, STATX_BASIC_STATS, 0); inode->i_ino = fuse_squash_ino(attr->ino); inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); @@ -293,9 +314,9 @@ u32 fuse_get_cache_mask(struct inode *inode) return STATX_MTIME | STATX_CTIME | STATX_SIZE; } -void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, - struct fuse_statx *sx, - u64 attr_valid, u64 attr_version) +static void fuse_change_attributes_i(struct inode *inode, struct fuse_attr *attr, + struct fuse_statx *sx, u64 attr_valid, + u64 attr_version, u64 evict_ctr) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -329,7 +350,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, } old_mtime = inode->i_mtime; - fuse_change_attributes_common(inode, attr, sx, attr_valid, cache_mask); + fuse_change_attributes_common(inode, attr, sx, attr_valid, cache_mask, + evict_ctr); oldsize = inode->i_size; /* @@ -370,6 +392,13 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, fuse_dax_dontcache(inode, attr->flags); } +void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + struct fuse_statx *sx, u64 attr_valid, + u64 attr_version) +{ + fuse_change_attributes_i(inode, attr, sx, attr_valid, attr_version, 0); +} + static void fuse_init_submount_lookup(struct fuse_submount_lookup *sl, u64 nodeid) { @@ -419,7 +448,8 @@ static int fuse_inode_set(struct inode *inode, void *_nodeidp) struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int generation, struct fuse_attr *attr, - u64 attr_valid, u64 attr_version) + u64 attr_valid, u64 attr_version, + u64 evict_ctr) { struct inode *inode; struct fuse_inode *fi; @@ -480,8 +510,8 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, fi->nlookup++; spin_unlock(&fi->lock); done: - fuse_change_attributes(inode, attr, NULL, attr_valid, attr_version); - + fuse_change_attributes_i(inode, attr, NULL, attr_valid, attr_version, + evict_ctr); return inode; } @@ -931,6 +961,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->initialized = 0; fc->connected = 1; atomic64_set(&fc->attr_version, 1); + atomic64_set(&fc->evict_ctr, 1); get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->user_ns = get_user_ns(user_ns); @@ -989,7 +1020,7 @@ static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) attr.mode = mode; attr.ino = FUSE_ROOT_ID; attr.nlink = 1; - return fuse_iget(sb, FUSE_ROOT_ID, 0, &attr, 0, 0); + return fuse_iget(sb, FUSE_ROOT_ID, 0, &attr, 0, 0, 0); } struct fuse_inode_handle { @@ -1569,7 +1600,8 @@ static int fuse_fill_super_submount(struct super_block *sb, return -ENOMEM; fuse_fill_attr_from_inode(&root_attr, parent_fi); - root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0); + root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0, + fuse_get_evict_ctr(fm->fc)); /* * This inode is just a duplicate, so it is not looked up and * its nlookup should not be incremented. fuse_iget() does diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 9e6d587b3e6731..e77417d6125202 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -149,7 +149,7 @@ static int parse_dirfile(char *buf, size_t nbytes, struct file *file, static int fuse_direntplus_link(struct file *file, struct fuse_direntplus *direntplus, - u64 attr_version) + u64 attr_version, u64 evict_ctr) { struct fuse_entry_out *o = &direntplus->entry_out; struct fuse_dirent *dirent = &direntplus->dirent; @@ -233,7 +233,7 @@ static int fuse_direntplus_link(struct file *file, } else { inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, &o->attr, ATTR_TIMEOUT(o), - attr_version); + attr_version, evict_ctr); if (!inode) inode = ERR_PTR(-ENOMEM); @@ -284,7 +284,8 @@ static void fuse_force_forget(struct file *file, u64 nodeid) } static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, - struct dir_context *ctx, u64 attr_version) + struct dir_context *ctx, u64 attr_version, + u64 evict_ctr) { struct fuse_direntplus *direntplus; struct fuse_dirent *dirent; @@ -319,7 +320,7 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, buf += reclen; nbytes -= reclen; - ret = fuse_direntplus_link(file, direntplus, attr_version); + ret = fuse_direntplus_link(file, direntplus, attr_version, evict_ctr); if (ret) fuse_force_forget(file, direntplus->entry_out.nodeid); } @@ -337,7 +338,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) struct fuse_io_args ia = {}; struct fuse_args_pages *ap = &ia.ap; struct fuse_page_desc desc = { .length = PAGE_SIZE }; - u64 attr_version = 0; + u64 attr_version = 0, evict_ctr = 0; bool locked; page = alloc_page(GFP_KERNEL); @@ -351,6 +352,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) ap->descs = &desc; if (plus) { attr_version = fuse_get_attr_version(fm->fc); + evict_ctr = fuse_get_evict_ctr(fm->fc); fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, FUSE_READDIRPLUS); } else { @@ -368,7 +370,8 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) fuse_readdir_cache_end(file, ctx->pos); } else if (plus) { res = parse_dirplusfile(page_address(page), res, - file, ctx, attr_version); + file, ctx, attr_version, + evict_ctr); } else { res = parse_dirfile(page_address(page), res, file, ctx); From 704c7775012b28520ea03e5589542fdac0278ea9 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 8 Apr 2025 15:14:20 +0200 Subject: [PATCH 114/161] fuse: fuse: Use readdir buffer of max_pages size A readdir buffer of 4K might be just enough to read a single file name at a time - increase the buffer size to the max_pages. Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi Signed-off-by: Bernd Schubert (cherry picked from commit d77db6e6e294cb3d44e6d90ca6683b40f42e6431) (cherry picked from commit bb07468f246c37782d17cf902858c966c035c432) --- fs/fuse/readdir.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index e77417d6125202..49f8f47e39bd2f 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -332,35 +332,32 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) { int plus; ssize_t res; - struct page *page; struct inode *inode = file_inode(file); struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_conn *fc = fm->fc; struct fuse_io_args ia = {}; - struct fuse_args_pages *ap = &ia.ap; - struct fuse_page_desc desc = { .length = PAGE_SIZE }; + struct fuse_args *args = &ia.ap.args; + void *buf; + size_t bufsize = fc->max_pages << PAGE_SHIFT; u64 attr_version = 0, evict_ctr = 0; bool locked; - page = alloc_page(GFP_KERNEL); - if (!page) + buf = kvmalloc(bufsize, GFP_KERNEL); + if (!buf) return -ENOMEM; + args->out_args[0].value = buf; + plus = fuse_use_readdirplus(inode, ctx); - ap->args.out_pages = true; - ap->num_pages = 1; - ap->pages = &page; - ap->descs = &desc; if (plus) { attr_version = fuse_get_attr_version(fm->fc); evict_ctr = fuse_get_evict_ctr(fm->fc); - fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, - FUSE_READDIRPLUS); + fuse_read_args_fill(&ia, file, ctx->pos, bufsize, FUSE_READDIRPLUS); } else { - fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, - FUSE_READDIR); + fuse_read_args_fill(&ia, file, ctx->pos, bufsize, FUSE_READDIR); } locked = fuse_lock_inode(inode); - res = fuse_simple_request(fm, &ap->args); + res = fuse_simple_request(fm, args); fuse_unlock_inode(inode, locked); if (res >= 0) { if (!res) { @@ -369,16 +366,14 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) if (ff->open_flags & FOPEN_CACHE_DIR) fuse_readdir_cache_end(file, ctx->pos); } else if (plus) { - res = parse_dirplusfile(page_address(page), res, - file, ctx, attr_version, + res = parse_dirplusfile(buf, res, file, ctx, attr_version, evict_ctr); } else { - res = parse_dirfile(page_address(page), res, file, - ctx); + res = parse_dirfile(buf, res, file, ctx); } } - __free_page(page); + kvfree(buf); fuse_invalidate_atime(inode); return res; } From 3dc9300c26875d84bf95d0cf28bcb1660fd0a4c4 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Thu, 17 Apr 2025 15:58:03 +0200 Subject: [PATCH 115/161] Revert "virtiofs: use pages instead of pointer for kernel direct IO" This reverts commit 6996dac8834951514416624e885cb4150d67df07. (cherry picked from commit 1b4cae7776959f70209850aa231b91284da8f01d) (cherry picked from commit 722573a1877c3fc75d89868d09fb1d5f45067331) --- fs/fuse/file.c | 62 ++++++++++++++------------------------------- fs/fuse/fuse_i.h | 6 ----- fs/fuse/virtio_fs.c | 1 - 3 files changed, 19 insertions(+), 50 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d32ca529f8460c..d8380a2948cd50 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -624,7 +624,7 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, args->out_args[0].size = count; } -static void fuse_release_user_pages(struct fuse_args_pages *ap, ssize_t nres, +static void fuse_release_user_pages(struct fuse_args_pages *ap, bool should_dirty) { unsigned int i; @@ -634,9 +634,6 @@ static void fuse_release_user_pages(struct fuse_args_pages *ap, ssize_t nres, set_page_dirty_lock(ap->pages[i]); put_page(ap->pages[i]); } - - if (nres > 0 && ap->args.invalidate_vmap) - invalidate_kernel_vmap_range(ap->args.vmap_base, nres); } static void fuse_io_release(struct kref *kref) @@ -735,29 +732,25 @@ static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args, struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); struct fuse_io_priv *io = ia->io; ssize_t pos = -1; - size_t nres; + + fuse_release_user_pages(&ia->ap, io->should_dirty); if (err) { /* Nothing */ } else if (io->write) { if (ia->write.out.size > ia->write.in.size) { err = -EIO; - } else { - nres = ia->write.out.size; - if (ia->write.in.size != ia->write.out.size) - pos = ia->write.in.offset - io->offset + - ia->write.out.size; + } else if (ia->write.in.size != ia->write.out.size) { + pos = ia->write.in.offset - io->offset + + ia->write.out.size; } } else { u32 outsize = args->out_args[0].size; - nres = outsize; if (ia->read.in.size != outsize) pos = ia->read.in.offset - io->offset + outsize; } - fuse_release_user_pages(&ia->ap, err ?: nres, io->should_dirty); - fuse_aio_complete(io, err, pos); fuse_io_free(ia); } @@ -1397,37 +1390,24 @@ static inline size_t fuse_get_frag_size(const struct iov_iter *ii, static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, size_t *nbytesp, int write, - unsigned int max_pages, - bool use_pages_for_kvec_io) + unsigned int max_pages) { - bool flush_or_invalidate = false; size_t nbytes = 0; /* # bytes already packed in req */ ssize_t ret = 0; - /* Special case for kernel I/O: can copy directly into the buffer. - * However if the implementation of fuse_conn requires pages instead of - * pointer (e.g., virtio-fs), use iov_iter_extract_pages() instead. - */ + /* Special case for kernel I/O: can copy directly into the buffer */ if (iov_iter_is_kvec(ii)) { - void *user_addr = (void *)fuse_get_user_addr(ii); - - if (!use_pages_for_kvec_io) { - size_t frag_size = fuse_get_frag_size(ii, *nbytesp); + unsigned long user_addr = fuse_get_user_addr(ii); + size_t frag_size = fuse_get_frag_size(ii, *nbytesp); - if (write) - ap->args.in_args[1].value = user_addr; - else - ap->args.out_args[0].value = user_addr; - - iov_iter_advance(ii, frag_size); - *nbytesp = frag_size; - return 0; - } + if (write) + ap->args.in_args[1].value = (void *) user_addr; + else + ap->args.out_args[0].value = (void *) user_addr; - if (is_vmalloc_addr(user_addr)) { - ap->args.vmap_base = user_addr; - flush_or_invalidate = true; - } + iov_iter_advance(ii, frag_size); + *nbytesp = frag_size; + return 0; } while (nbytes < *nbytesp && ap->num_pages < max_pages) { @@ -1453,10 +1433,6 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, (PAGE_SIZE - ret) & (PAGE_SIZE - 1); } - if (write && flush_or_invalidate) - flush_kernel_vmap_range(ap->args.vmap_base, nbytes); - - ap->args.invalidate_vmap = !write && flush_or_invalidate; ap->args.user_pages = true; if (write) ap->args.in_pages = true; @@ -1524,7 +1500,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, size_t nbytes = min(count, nmax); err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write, - max_pages, fc->use_pages_for_kvec_io); + max_pages); if (err && !nbytes) break; @@ -1538,7 +1514,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, } if (!io->async || nres < 0) { - fuse_release_user_pages(&ia->ap, nres, io->should_dirty); + fuse_release_user_pages(&ia->ap, io->should_dirty); fuse_io_free(ia); } ia = NULL; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 4a00b0c4aed86e..874ca110e41e47 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -283,12 +283,9 @@ struct fuse_args { bool page_replace:1; bool may_block:1; bool is_ext:1; - bool invalidate_vmap:1; struct fuse_in_arg in_args[4]; struct fuse_arg out_args[2]; void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); - /* Used for kvec iter backed by vmalloc address */ - void *vmap_base; }; struct fuse_args_pages { @@ -825,9 +822,6 @@ struct fuse_conn { /* Is statx not implemented by fs? */ unsigned int no_statx:1; - /* Use pages instead of pointer for kernel I/O */ - unsigned int use_pages_for_kvec_io:1; - /* Use io_uring for communication */ unsigned int io_uring; diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 27e366e67e29ab..a69fcc0f41d59f 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1432,7 +1432,6 @@ static int virtio_fs_get_tree(struct fs_context *fsc) fc->delete_stale = true; fc->auto_submounts = true; fc->sync_fs = true; - fc->use_pages_for_kvec_io = true; /* Tell FUSE to split requests that exceed the virtqueue's size */ fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, From a78a16612c419bb8d076962308d7c6fe244dd7aa Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 23 Sep 2024 10:13:11 -0700 Subject: [PATCH 116/161] fuse: enable dynamic configuration of fuse max pages limit (FUSE_MAX_MAX_PAGES) Introduce the capability to dynamically configure the max pages limit (FUSE_MAX_MAX_PAGES) through a sysctl. This allows system administrators to dynamically set the maximum number of pages that can be used for servicing requests in fuse. Previously, this is gated by FUSE_MAX_MAX_PAGES which is statically set to 256 pages. One result of this is that the buffer size for a write request is limited to 1 MiB on a 4k-page system. The default value for this sysctl is the original limit (256 pages). $ sysctl -a | grep max_pages_limit fs.fuse.max_pages_limit = 256 $ sysctl -n fs.fuse.max_pages_limit 256 $ echo 1024 | sudo tee /proc/sys/fs/fuse/max_pages_limit 1024 $ sysctl -n fs.fuse.max_pages_limit 1024 $ echo 65536 | sudo tee /proc/sys/fs/fuse/max_pages_limit tee: /proc/sys/fs/fuse/max_pages_limit: Invalid argument $ echo 0 | sudo tee /proc/sys/fs/fuse/max_pages_limit tee: /proc/sys/fs/fuse/max_pages_limit: Invalid argument $ echo 65535 | sudo tee /proc/sys/fs/fuse/max_pages_limit 65535 $ sysctl -n fs.fuse.max_pages_limit 65535 Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Reviewed-by: Sweet Tea Dorminy Signed-off-by: Miklos Szeredi (cherry picked from commit 2b3933b1e0a0a4b758fbc164bb31db0c113a7e2c) (cherry picked from commit b02395870f056067f2834dfb7562a6199101627c) (cherry picked from commit e9cf092d9324b618fb3e33b9f6c15e6c7beadfa3) --- Documentation/admin-guide/sysctl/fs.rst | 10 ++++++ fs/fuse/Makefile | 1 + fs/fuse/fuse_i.h | 14 +++++++-- fs/fuse/inode.c | 11 ++++++- fs/fuse/ioctl.c | 2 ++ fs/fuse/sysctl.c | 42 +++++++++++++++++++++++++ 6 files changed, 76 insertions(+), 4 deletions(-) create mode 100644 fs/fuse/sysctl.c diff --git a/Documentation/admin-guide/sysctl/fs.rst b/Documentation/admin-guide/sysctl/fs.rst index 2a501c9ddc556e..64eaa44face05a 100644 --- a/Documentation/admin-guide/sysctl/fs.rst +++ b/Documentation/admin-guide/sysctl/fs.rst @@ -382,3 +382,13 @@ Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes on a 64bit one. The current default value for max_user_watches is the 1/25 (4%) of the available low memory, divided for the "watch" cost in bytes. + +5. /proc/sys/fs/fuse - Configuration options for FUSE filesystems +===================================================================== + +This directory contains the following configuration options for FUSE +filesystems: + +``/proc/sys/fs/fuse/max_pages_limit`` is a read/write file for +setting/getting the maximum number of pages that can be used for servicing +requests in FUSE. diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index ad89ff0d017580..6326a5f9b25c3f 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -10,5 +10,6 @@ obj-$(CONFIG_VIRTIO_FS) += virtiofs.o fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse-$(CONFIG_FUSE_DAX) += dax.o fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o +fuse-$(CONFIG_SYSCTL) += sysctl.o virtiofs-y := virtio_fs.o diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 874ca110e41e47..16b38fa9b757e4 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -35,9 +35,6 @@ /** Default max number of pages that can be used in a single read request */ #define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 -/** Maximum of max_pages received in init_out */ -#define FUSE_MAX_MAX_PAGES 256 - /** Bias for fi->writectr, meaning new writepages must not be sent */ #define FUSE_NOWRITE INT_MIN @@ -47,6 +44,9 @@ /** Number of dentries for each connection in the control filesystem */ #define FUSE_CTL_NUM_DENTRIES 5 +/** Maximum of max_pages received in init_out */ +extern unsigned int fuse_max_pages_limit; + /** List of active connections */ extern struct list_head fuse_conn_list; @@ -1391,4 +1391,12 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, void fuse_file_release(struct inode *inode, struct fuse_file *ff, unsigned int open_flags, fl_owner_t id, bool isdir); +#ifdef CONFIG_SYSCTL +extern int fuse_sysctl_register(void); +extern void fuse_sysctl_unregister(void); +#else +#define fuse_sysctl_register() (0) +#define fuse_sysctl_unregister() do { } while (0) +#endif /* CONFIG_SYSCTL */ + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 8233431c531388..5b1e56ded80582 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -35,6 +35,8 @@ DEFINE_MUTEX(fuse_mutex); static int set_global_limit(const char *val, const struct kernel_param *kp); +unsigned int fuse_max_pages_limit = 256; + unsigned max_user_bgreq; module_param_call(max_user_bgreq, set_global_limit, param_get_uint, &max_user_bgreq, 0644); @@ -966,7 +968,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->user_ns = get_user_ns(user_ns); fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; - fc->max_pages_limit = FUSE_MAX_MAX_PAGES; + fc->max_pages_limit = fuse_max_pages_limit; INIT_LIST_HEAD(&fc->mounts); list_add(&fm->fc_entry, &fc->mounts); @@ -2054,8 +2056,14 @@ static int __init fuse_fs_init(void) if (err) goto out3; + err = fuse_sysctl_register(); + if (err) + goto out4; + return 0; + out4: + unregister_filesystem(&fuse_fs_type); out3: unregister_fuseblk(); out2: @@ -2066,6 +2074,7 @@ static int __init fuse_fs_init(void) static void fuse_fs_cleanup(void) { + fuse_sysctl_unregister(); unregister_filesystem(&fuse_fs_type); unregister_fuseblk(); diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 43dba5f81487d6..91a4bf45ddf40e 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -9,6 +9,8 @@ #include #include +#define FUSE_VERITY_ENABLE_ARG_MAX_PAGES 256 + static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args, struct fuse_ioctl_out *outarg) { diff --git a/fs/fuse/sysctl.c b/fs/fuse/sysctl.c new file mode 100644 index 00000000000000..ef2ce2cd464253 --- /dev/null +++ b/fs/fuse/sysctl.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/fuse/fuse_sysctl.c + * + * Sysctl interface to fuse parameters + */ +#include + +#include "fuse_i.h" + +static struct ctl_table_header *fuse_table_header; + +/* Bound by fuse_init_out max_pages, which is a u16 */ +static unsigned int sysctl_fuse_max_pages_limit = 65535; + +static struct ctl_table fuse_sysctl_table[] = { + { + .procname = "max_pages_limit", + .data = &fuse_max_pages_limit, + .maxlen = sizeof(fuse_max_pages_limit), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = &sysctl_fuse_max_pages_limit, + }, + /* Fix for RHEL backport */ + {} +}; + +int fuse_sysctl_register(void) +{ + fuse_table_header = register_sysctl("fs/fuse", fuse_sysctl_table); + if (!fuse_table_header) + return -ENOMEM; + return 0; +} + +void fuse_sysctl_unregister(void) +{ + unregister_sysctl_table(fuse_table_header); + fuse_table_header = NULL; +} From 54f5ef46aec705ff7c09508b26f4404cc80a23e9 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 2 Apr 2025 14:48:40 +0200 Subject: [PATCH 117/161] fuse: Make the fuse_send_one request counter atomic No need to take lock, we can have that in atomic way. fuse-io-uring and virtiofs especially benefit from it as they don't need the fiq lock at all. Signed-off-by: Bernd Schubert (cherry picked from commit 47b26945baaa3a7467de58b573d15ee8283ad81d) (cherry picked from commit c85c801b7ee972805aac4d2666271830bdc6893b) --- fs/fuse/dev.c | 24 +++--------------------- fs/fuse/fuse_dev_i.h | 4 ---- fs/fuse/fuse_i.h | 18 +++++++++++++----- 3 files changed, 16 insertions(+), 30 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index fe51971c1bb080..ed31f51a603c78 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -182,24 +182,6 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args) } EXPORT_SYMBOL_GPL(fuse_len_args); -static u64 fuse_get_unique_locked(struct fuse_iqueue *fiq) -{ - fiq->reqctr += FUSE_REQ_ID_STEP; - return fiq->reqctr; -} - -u64 fuse_get_unique(struct fuse_iqueue *fiq) -{ - u64 ret; - - spin_lock(&fiq->lock); - ret = fuse_get_unique_locked(fiq); - spin_unlock(&fiq->lock); - - return ret; -} -EXPORT_SYMBOL_GPL(fuse_get_unique); - unsigned int fuse_req_hash(u64 unique) { return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS); @@ -256,7 +238,7 @@ static void fuse_dev_queue_req(struct fuse_iqueue *fiq, struct fuse_req *req) spin_lock(&fiq->lock); if (fiq->connected) { if (req->in.h.opcode != FUSE_NOTIFY_REPLY) - req->in.h.unique = fuse_get_unique_locked(fiq); + req->in.h.unique = fuse_get_unique(fiq); list_add_tail(&req->list, &fiq->pending); fuse_dev_wake_and_unlock(fiq); } else { @@ -1144,7 +1126,7 @@ __releases(fiq->lock) struct fuse_in_header ih = { .opcode = FUSE_FORGET, .nodeid = forget->forget_one.nodeid, - .unique = fuse_get_unique_locked(fiq), + .unique = fuse_get_unique(fiq), .len = sizeof(ih) + sizeof(arg), }; @@ -1175,7 +1157,7 @@ __releases(fiq->lock) struct fuse_batch_forget_in arg = { .count = 0 }; struct fuse_in_header ih = { .opcode = FUSE_BATCH_FORGET, - .unique = fuse_get_unique_locked(fiq), + .unique = fuse_get_unique(fiq), .len = sizeof(ih) + sizeof(arg), }; diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 2481da3388c5fe..10836e155e8785 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -8,10 +8,6 @@ #include -/* Ordinary requests have even IDs, while interrupts IDs are odd */ -#define FUSE_INT_REQ_BIT (1ULL << 0) -#define FUSE_REQ_ID_STEP (1ULL << 1) - struct fuse_arg; struct fuse_args; struct fuse_pqueue; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 16b38fa9b757e4..0368994a98746f 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -47,6 +47,10 @@ /** Maximum of max_pages received in init_out */ extern unsigned int fuse_max_pages_limit; +/* Ordinary requests have even IDs, while interrupts IDs are odd */ +#define FUSE_INT_REQ_BIT (1ULL << 0) +#define FUSE_REQ_ID_STEP (1ULL << 1) + /** List of active connections */ extern struct list_head fuse_conn_list; @@ -451,7 +455,7 @@ struct fuse_iqueue { wait_queue_head_t waitq; /** The next unique request id */ - u64 reqctr; + atomic64_t reqctr; /** The list of pending requests */ struct list_head pending; @@ -1012,6 +1016,14 @@ static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket) rcu_read_unlock(); } +/** + * Get the next unique ID for a request + */ +static inline u64 fuse_get_unique(struct fuse_iqueue *fiq) +{ + return atomic64_add_return(FUSE_REQ_ID_STEP, &fiq->reqctr); +} + /** Device operations */ extern const struct file_operations fuse_dev_operations; @@ -1352,10 +1364,6 @@ int fuse_readdir(struct file *file, struct dir_context *ctx); */ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args); -/** - * Get the next unique ID for a request - */ -u64 fuse_get_unique(struct fuse_iqueue *fiq); void fuse_free_conn(struct fuse_conn *fc); /* dax.c */ From e9fc9d253f8cdd458306ad1fd70c9f99a450f1d8 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 2 Apr 2025 15:04:40 +0200 Subject: [PATCH 118/161] [RFC] fuse: Set request unique on allocation This is especially needed for better ftrace analysis, for example to build histograms. So far the request unique was missing, because it was added after the first trace message. IDs/req-unique now might not come up perfectly sequentially anymore, but especially with cloned device or io-uring this did not work perfectly anyway. Signed-off-by: Bernd Schubert (cherry picked from commit 44158921dd11a6654a1935dfb2ad8af953f7ada1) (cherry picked from commit 5f44f5174038df1afb7e4d3e01f9c340ecabe85c) --- fs/fuse/dev.c | 8 +++----- fs/fuse/dev_uring.c | 3 --- fs/fuse/virtio_fs.c | 3 --- 3 files changed, 3 insertions(+), 11 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ed31f51a603c78..506c92e0fe5442 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -237,8 +237,6 @@ static void fuse_dev_queue_req(struct fuse_iqueue *fiq, struct fuse_req *req) { spin_lock(&fiq->lock); if (fiq->connected) { - if (req->in.h.opcode != FUSE_NOTIFY_REPLY) - req->in.h.unique = fuse_get_unique(fiq); list_add_tail(&req->list, &fiq->pending); fuse_dev_wake_and_unlock(fiq); } else { @@ -493,6 +491,9 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) req->in.h.total_extlen = args->in_args[args->ext_idx].size / 8; if (args->end) __set_bit(FR_ASYNC, &req->flags); + + if (req->in.h.opcode != FUSE_NOTIFY_REPLY) + req->in.h.unique = fuse_get_unique(&req->fm->fc->iq); } ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args) @@ -538,9 +539,6 @@ ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args) static bool fuse_request_queue_background_uring(struct fuse_conn *fc, struct fuse_req *req) { - struct fuse_iqueue *fiq = &fc->iq; - - req->in.h.unique = fuse_get_unique(fiq); req->in.h.len = sizeof(struct fuse_in_header) + fuse_len_args(req->args->in_numargs, (struct fuse_arg *) req->args->in_args); diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index edd5143b683feb..133c5a284203e8 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1334,9 +1334,6 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) if (!queue) goto err; - if (req->in.h.opcode != FUSE_NOTIFY_REPLY) - req->in.h.unique = fuse_get_unique(fiq); - spin_lock(&queue->lock); err = -ENOTCONN; if (unlikely(queue->stopped)) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index a69fcc0f41d59f..12f3be731eb78a 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1216,9 +1216,6 @@ static void virtio_fs_send_req(struct fuse_iqueue *fiq, struct fuse_req *req) struct virtio_fs_vq *fsvq; int ret; - if (req->in.h.opcode != FUSE_NOTIFY_REPLY) - req->in.h.unique = fuse_get_unique(fiq); - clear_bit(FR_PENDING, &req->flags); fs = fiq->priv; From 59504f7d71cd2d47e54128eedb592d58afa5cc7b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 3 Jul 2024 10:38:42 -0400 Subject: [PATCH 119/161] fuse: add simple request tracepoints I've been timing various fuse operations and it's quite annoying to do with kprobes. Add two tracepoints for sending and ending fuse requests to make it easier to debug and time various operations. Signed-off-by: Josef Bacik Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi (cherry picked from commit 396b209e405a571ce8e06d3760ffc3e389a944f1) (cherry picked from commit 6e77e0eb47168868ef06760db82ba16ccb3bff1b) (cherry picked from commit bf12f5e6f12b855dff6d22e4138d9177c6bd1f4a) --- fs/fuse/Makefile | 3 + fs/fuse/dev.c | 5 ++ fs/fuse/fuse_trace.h | 132 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 140 insertions(+) create mode 100644 fs/fuse/fuse_trace.h diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 6326a5f9b25c3f..6cc00eb7761f60 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -3,6 +3,9 @@ # Makefile for the FUSE filesystem. # +# Needed for trace events +ccflags-y = -I$(src) + obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 506c92e0fe5442..ef7e7ca8791c30 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -24,6 +24,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include "fuse_trace.h" + MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); @@ -258,6 +261,7 @@ static void fuse_send_one(struct fuse_iqueue *fiq, struct fuse_req *req) req->in.h.len = sizeof(struct fuse_in_header) + fuse_len_args(req->args->in_numargs, (struct fuse_arg *) req->args->in_args); + trace_fuse_request_send(req); fiq->ops->send_req(fiq, req); } @@ -304,6 +308,7 @@ void fuse_request_end(struct fuse_req *req) if (test_and_set_bit(FR_FINISHED, &req->flags)) goto put_request; + trace_fuse_request_end(req); /* * test_and_set_bit() implies smp_mb() between bit * changing and below FR_INTERRUPTED check. Pairs with diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h new file mode 100644 index 00000000000000..bbe9ddd8c71696 --- /dev/null +++ b/fs/fuse/fuse_trace.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM fuse + +#if !defined(_TRACE_FUSE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_FUSE_H + +#include + +#define OPCODES \ + EM( FUSE_LOOKUP, "FUSE_LOOKUP") \ + EM( FUSE_FORGET, "FUSE_FORGET") \ + EM( FUSE_GETATTR, "FUSE_GETATTR") \ + EM( FUSE_SETATTR, "FUSE_SETATTR") \ + EM( FUSE_READLINK, "FUSE_READLINK") \ + EM( FUSE_SYMLINK, "FUSE_SYMLINK") \ + EM( FUSE_MKNOD, "FUSE_MKNOD") \ + EM( FUSE_MKDIR, "FUSE_MKDIR") \ + EM( FUSE_UNLINK, "FUSE_UNLINK") \ + EM( FUSE_RMDIR, "FUSE_RMDIR") \ + EM( FUSE_RENAME, "FUSE_RENAME") \ + EM( FUSE_LINK, "FUSE_LINK") \ + EM( FUSE_OPEN, "FUSE_OPEN") \ + EM( FUSE_READ, "FUSE_READ") \ + EM( FUSE_WRITE, "FUSE_WRITE") \ + EM( FUSE_STATFS, "FUSE_STATFS") \ + EM( FUSE_RELEASE, "FUSE_RELEASE") \ + EM( FUSE_FSYNC, "FUSE_FSYNC") \ + EM( FUSE_SETXATTR, "FUSE_SETXATTR") \ + EM( FUSE_GETXATTR, "FUSE_GETXATTR") \ + EM( FUSE_LISTXATTR, "FUSE_LISTXATTR") \ + EM( FUSE_REMOVEXATTR, "FUSE_REMOVEXATTR") \ + EM( FUSE_FLUSH, "FUSE_FLUSH") \ + EM( FUSE_INIT, "FUSE_INIT") \ + EM( FUSE_OPENDIR, "FUSE_OPENDIR") \ + EM( FUSE_READDIR, "FUSE_READDIR") \ + EM( FUSE_RELEASEDIR, "FUSE_RELEASEDIR") \ + EM( FUSE_FSYNCDIR, "FUSE_FSYNCDIR") \ + EM( FUSE_GETLK, "FUSE_GETLK") \ + EM( FUSE_SETLK, "FUSE_SETLK") \ + EM( FUSE_SETLKW, "FUSE_SETLKW") \ + EM( FUSE_ACCESS, "FUSE_ACCESS") \ + EM( FUSE_CREATE, "FUSE_CREATE") \ + EM( FUSE_INTERRUPT, "FUSE_INTERRUPT") \ + EM( FUSE_BMAP, "FUSE_BMAP") \ + EM( FUSE_DESTROY, "FUSE_DESTROY") \ + EM( FUSE_IOCTL, "FUSE_IOCTL") \ + EM( FUSE_POLL, "FUSE_POLL") \ + EM( FUSE_NOTIFY_REPLY, "FUSE_NOTIFY_REPLY") \ + EM( FUSE_BATCH_FORGET, "FUSE_BATCH_FORGET") \ + EM( FUSE_FALLOCATE, "FUSE_FALLOCATE") \ + EM( FUSE_READDIRPLUS, "FUSE_READDIRPLUS") \ + EM( FUSE_RENAME2, "FUSE_RENAME2") \ + EM( FUSE_LSEEK, "FUSE_LSEEK") \ + EM( FUSE_COPY_FILE_RANGE, "FUSE_COPY_FILE_RANGE") \ + EM( FUSE_SETUPMAPPING, "FUSE_SETUPMAPPING") \ + EM( FUSE_REMOVEMAPPING, "FUSE_REMOVEMAPPING") \ + EM( FUSE_SYNCFS, "FUSE_SYNCFS") \ + EM( FUSE_TMPFILE, "FUSE_TMPFILE") \ + EM( FUSE_STATX, "FUSE_STATX") \ + EMe(CUSE_INIT, "CUSE_INIT") + +/* + * This will turn the above table into TRACE_DEFINE_ENUM() for each of the + * entries. + */ +#undef EM +#undef EMe +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define EMe(a, b) TRACE_DEFINE_ENUM(a); + +OPCODES + +/* Now we redfine it with the table that __print_symbolic needs. */ +#undef EM +#undef EMe +#define EM(a, b) {a, b}, +#define EMe(a, b) {a, b} + +TRACE_EVENT(fuse_request_send, + TP_PROTO(const struct fuse_req *req), + + TP_ARGS(req), + + TP_STRUCT__entry( + __field(dev_t, connection) + __field(uint64_t, unique) + __field(enum fuse_opcode, opcode) + __field(uint32_t, len) + ), + + TP_fast_assign( + __entry->connection = req->fm->fc->dev; + __entry->unique = req->in.h.unique; + __entry->opcode = req->in.h.opcode; + __entry->len = req->in.h.len; + ), + + TP_printk("connection %u req %llu opcode %u (%s) len %u ", + __entry->connection, __entry->unique, __entry->opcode, + __print_symbolic(__entry->opcode, OPCODES), __entry->len) +); + +TRACE_EVENT(fuse_request_end, + TP_PROTO(const struct fuse_req *req), + + TP_ARGS(req), + + TP_STRUCT__entry( + __field(dev_t, connection) + __field(uint64_t, unique) + __field(uint32_t, len) + __field(int32_t, error) + ), + + TP_fast_assign( + __entry->connection = req->fm->fc->dev; + __entry->unique = req->in.h.unique; + __entry->len = req->out.h.len; + __entry->error = req->out.h.error; + ), + + TP_printk("connection %u req %llu len %u error %d", __entry->connection, + __entry->unique, __entry->len, __entry->error) +); + +#endif /* _TRACE_FUSE_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE fuse_trace +#include From 5d980fd544f82c50005c19429bc0c1b14eac36f9 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 2 Apr 2025 16:38:44 +0200 Subject: [PATCH 120/161] fuse: {io-uring} Avoid _send code dup fuse_uring_send_next_to_ring() can just call into fuse_uring_send and avoid code dup. Signed-off-by: Bernd Schubert (cherry picked from commit 9efaa8dfc77901a8b0cf24f65de5963ef23f170e) (cherry picked from commit 620e3e80dfa96cb722d450cea39b3c45e4683f3d) --- fs/fuse/dev_uring.c | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 133c5a284203e8..9629352e886a60 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -769,6 +769,20 @@ static int fuse_uring_prepare_send(struct fuse_ring_ent *ent, return err; } +static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, + ssize_t ret, unsigned int issue_flags) +{ + struct fuse_ring_queue *queue = ent->queue; + + spin_lock(&queue->lock); + ent->state = FRRS_USERSPACE; + list_move(&ent->list, &queue->ent_in_userspace); + ent->cmd = NULL; + spin_unlock(&queue->lock); + + io_uring_cmd_done(cmd, ret, 0, issue_flags); +} + /* * Write data to the ring buffer and send the request to userspace, * userspace will read it @@ -778,22 +792,13 @@ static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent, struct fuse_req *req, unsigned int issue_flags) { - struct fuse_ring_queue *queue = ent->queue; int err; - struct io_uring_cmd *cmd; err = fuse_uring_prepare_send(ent, req); if (err) return err; - spin_lock(&queue->lock); - cmd = ent->cmd; - ent->cmd = NULL; - ent->state = FRRS_USERSPACE; - list_move(&ent->list, &queue->ent_in_userspace); - spin_unlock(&queue->lock); - - io_uring_cmd_done(cmd, 0, 0, issue_flags); + fuse_uring_send(ent, ent->cmd, 0, issue_flags); return 0; } @@ -1255,20 +1260,6 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return -EIOCBQUEUED; } -static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, - ssize_t ret, unsigned int issue_flags) -{ - struct fuse_ring_queue *queue = ent->queue; - - spin_lock(&queue->lock); - ent->state = FRRS_USERSPACE; - list_move(&ent->list, &queue->ent_in_userspace); - ent->cmd = NULL; - spin_unlock(&queue->lock); - - io_uring_cmd_done(cmd, ret, 0, issue_flags); -} - /* * This prepares and sends the ring request in fuse-uring task context. * User buffers are not mapped yet - the application does not have permission From 13fa892b1ef879104a0d130710e4234e47acabee Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 2 Apr 2025 16:17:23 +0200 Subject: [PATCH 121/161] fuse: fine-grained request ftraces Rename trace_fuse_request_send to trace_fuse_request_enqueue Add trace_fuse_request_send Add trace_fuse_request_bg_enqueue Add trace_fuse_request_enqueue This helps to track entire request time and time in different queues. Signed-off-by: Bernd Schubert (cherry picked from commit 4a7f14274fc223e50b36f428e1b6acd661b73f53) (cherry picked from commit 7b516258f3dd8707cc777c0e2476cee8c8878708) --- fs/fuse/dev.c | 7 +++++- fs/fuse/dev_uring.c | 2 ++ fs/fuse/fuse_trace.h | 51 +++++++++++++++++++++++++++++++++----------- 3 files changed, 46 insertions(+), 14 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ef7e7ca8791c30..6b18de25501f9b 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -261,7 +261,9 @@ static void fuse_send_one(struct fuse_iqueue *fiq, struct fuse_req *req) req->in.h.len = sizeof(struct fuse_in_header) + fuse_len_args(req->args->in_numargs, (struct fuse_arg *) req->args->in_args); - trace_fuse_request_send(req); + + /* enqueue, as it is send to "fiq->ops queue" */ + trace_fuse_request_enqueue(req); fiq->ops->send_req(fiq, req); } @@ -568,6 +570,8 @@ static int fuse_request_queue_background(struct fuse_req *req) } __set_bit(FR_ISREPLY, &req->flags); + trace_fuse_request_bg_enqueue(req); + #ifdef CONFIG_FUSE_IO_URING if (fuse_uring_ready(fc)) return fuse_request_queue_background_uring(fc, req); @@ -1286,6 +1290,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, clear_bit(FR_PENDING, &req->flags); list_del_init(&req->list); spin_unlock(&fiq->lock); + trace_fuse_request_send(req); args = req->args; reqsize = req->in.h.len; diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 9629352e886a60..8efc58056ee999 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -7,6 +7,7 @@ #include "fuse_i.h" #include "dev_uring_i.h" #include "fuse_dev_i.h" +#include "fuse_trace.h" #include #include @@ -780,6 +781,7 @@ static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, ent->cmd = NULL; spin_unlock(&queue->lock); + trace_fuse_request_send(ent->fuse_req); io_uring_cmd_done(cmd, ret, 0, issue_flags); } diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h index bbe9ddd8c71696..393c630e772635 100644 --- a/fs/fuse/fuse_trace.h +++ b/fs/fuse/fuse_trace.h @@ -77,30 +77,55 @@ OPCODES #define EM(a, b) {a, b}, #define EMe(a, b) {a, b} -TRACE_EVENT(fuse_request_send, +#define FUSE_REQ_TRACE_FIELDS \ + __field(dev_t, connection) \ + __field(uint64_t, unique) \ + __field(enum fuse_opcode, opcode) \ + __field(uint32_t, len) \ + +#define FUSE_REQ_TRACE_ASSIGN(req) \ + do { \ + __entry->connection = req->fm->fc->dev; \ + __entry->unique = req->in.h.unique; \ + __entry->opcode = req->in.h.opcode; \ + __entry->len = req->in.h.len; \ + } while (0) + + +TRACE_EVENT(fuse_request_enqueue, TP_PROTO(const struct fuse_req *req), + TP_ARGS(req), + TP_STRUCT__entry(FUSE_REQ_TRACE_FIELDS), + TP_fast_assign(FUSE_REQ_TRACE_ASSIGN(req)), + TP_printk("connection %u req %llu opcode %u (%s) len %u ", + __entry->connection, __entry->unique, __entry->opcode, + __print_symbolic(__entry->opcode, OPCODES), __entry->len) +); + +TRACE_EVENT(fuse_request_bg_enqueue, + TP_PROTO(const struct fuse_req *req), TP_ARGS(req), + TP_STRUCT__entry(FUSE_REQ_TRACE_FIELDS), + TP_fast_assign(FUSE_REQ_TRACE_ASSIGN(req)), - TP_STRUCT__entry( - __field(dev_t, connection) - __field(uint64_t, unique) - __field(enum fuse_opcode, opcode) - __field(uint32_t, len) - ), + TP_printk("connection %u req %llu opcode %u (%s) len %u ", + __entry->connection, __entry->unique, __entry->opcode, + __print_symbolic(__entry->opcode, OPCODES), __entry->len) +); - TP_fast_assign( - __entry->connection = req->fm->fc->dev; - __entry->unique = req->in.h.unique; - __entry->opcode = req->in.h.opcode; - __entry->len = req->in.h.len; - ), +TRACE_EVENT(fuse_request_send, + TP_PROTO(const struct fuse_req *req), + TP_ARGS(req), + TP_STRUCT__entry(FUSE_REQ_TRACE_FIELDS), + TP_fast_assign(FUSE_REQ_TRACE_ASSIGN(req)), TP_printk("connection %u req %llu opcode %u (%s) len %u ", __entry->connection, __entry->unique, __entry->opcode, __print_symbolic(__entry->opcode, OPCODES), __entry->len) ); + TRACE_EVENT(fuse_request_end, TP_PROTO(const struct fuse_req *req), From 4c09f82d15b19b8e65dea48fc22d840a56a59a34 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 16 Dec 2024 22:14:07 +0100 Subject: [PATCH 122/161] fuse: Increase FUSE_NAME_MAX to PATH_MAX Our file system has a translation capability for S3-to-posix. The current value of 1kiB is enough to cover S3 keys, but does not allow encoding of %xx escape characters. The limit is increased to (PATH_MAX - 1), as we need 3 x 1024 and that is close to PATH_MAX (4kB) already. -1 is used as the terminating null is not included in the length calculation. Testing large file names was hard with libfuse/example file systems, so I created a new memfs that does not have a 255 file name length limitation. https://github.com/libfuse/libfuse/pull/1077 The connection is initialized with FUSE_NAME_LOW_MAX, which is set to the previous value of FUSE_NAME_MAX of 1024. With FUSE_MIN_READ_BUFFER of 8192 that is enough for two file names + fuse headers. When FUSE_INIT reply sets max_pages to a value > 1 we know that fuse daemon supports request buffers of at least 2 pages (+ header) and can therefore hold 2 x PATH_MAX file names - operations like rename or link that need two file names are no issue then. Signed-off-by: Bernd Schubert Signed-off-by: Miklos Szeredi (cherry picked from commit 27992ef80770d61a57f6c3a551735b08cefdffa3) (cherry picked from commit 573e7ab7a14208c287393c0b4b6d7c2b1914b62e) (cherry picked from commit 01902df48c2af67a7dfee94bfd9797691aafd253) --- fs/fuse/dev.c | 4 ++-- fs/fuse/dir.c | 2 +- fs/fuse/fuse_i.h | 11 +++++++++-- fs/fuse/inode.c | 8 ++++++++ 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6b18de25501f9b..fa84ba91873f68 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1503,7 +1503,7 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, goto err; err = -ENAMETOOLONG; - if (outarg.namelen > FUSE_NAME_MAX) + if (outarg.namelen > fc->name_max) goto err; err = -EINVAL; @@ -1552,7 +1552,7 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, goto err; err = -ENAMETOOLONG; - if (outarg.namelen > FUSE_NAME_MAX) + if (outarg.namelen > fc->name_max) goto err; err = -EINVAL; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 973c65af07d913..dd017b5acc54d9 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -372,7 +372,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name *inode = NULL; err = -ENAMETOOLONG; - if (name->len > FUSE_NAME_MAX) + if (name->len > fm->fc->name_max) goto out; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 0368994a98746f..ab6a15b9a52056 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -38,8 +38,12 @@ /** Bias for fi->writectr, meaning new writepages must not be sent */ #define FUSE_NOWRITE INT_MIN -/** It could be as large as PATH_MAX, but would that have any uses? */ -#define FUSE_NAME_MAX 1024 +/** Maximum length of a filename, not including terminating null */ + +/* maximum, small enough for FUSE_MIN_READ_BUFFER*/ +#define FUSE_NAME_LOW_MAX 1024 +/* maximum, but needs a request buffer > FUSE_MIN_READ_BUFFER */ +#define FUSE_NAME_MAX (PATH_MAX - 1) /** Number of dentries for each connection in the control filesystem */ #define FUSE_CTL_NUM_DENTRIES 5 @@ -856,6 +860,9 @@ struct fuse_conn { /** Version counter for evict inode */ atomic64_t evict_ctr; + /* maximum file name length */ + u32 name_max; + /** Called on final put */ void (*release)(struct fuse_conn *); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 5b1e56ded80582..cc2e87a8c10a90 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -969,6 +969,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->user_ns = get_user_ns(user_ns); fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; fc->max_pages_limit = fuse_max_pages_limit; + fc->name_max = FUSE_NAME_LOW_MAX; INIT_LIST_HEAD(&fc->mounts); list_add(&fm->fc_entry, &fc->mounts); @@ -1318,6 +1319,13 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->max_pages = min_t(unsigned int, fc->max_pages_limit, max_t(unsigned int, arg->max_pages, 1)); + + /* + * PATH_MAX file names might need two pages for + * ops like rename + */ + if (fc->max_pages > 1) + fc->name_max = FUSE_NAME_MAX; } if (IS_ENABLED(CONFIG_FUSE_DAX)) { if (flags & FUSE_MAP_ALIGNMENT && From 808809a241d47caebdb0cd5f262f317d640e5556 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Mon, 26 Feb 2024 11:54:35 +0800 Subject: [PATCH 123/161] fuse: add support for explicit export disabling open_by_handle_at(2) can fail with -ESTALE with a valid handle returned by a previous name_to_handle_at(2) for evicted fuse inodes, which is especially common when entry_valid_timeout is 0, e.g. when the fuse daemon is in "cache=none" mode. The time sequence is like: name_to_handle_at(2) # succeed evict fuse inode open_by_handle_at(2) # fail The root cause is that, with 0 entry_valid_timeout, the dput() called in name_to_handle_at(2) will trigger iput -> evict(), which will send FUSE_FORGET to the daemon. The following open_by_handle_at(2) will send a new FUSE_LOOKUP request upon inode cache miss since the previous inode eviction. Then the fuse daemon may fail the FUSE_LOOKUP request with -ENOENT as the cached metadata of the requested inode has already been cleaned up during the previous FUSE_FORGET. The returned -ENOENT is treated as -ESTALE when open_by_handle_at(2) returns. This confuses the application somehow, as open_by_handle_at(2) fails when the previous name_to_handle_at(2) succeeds. The returned errno is also confusing as the requested file is not deleted and already there. It is reasonable to fail name_to_handle_at(2) early in this case, after which the application can fallback to open(2) to access files. Since this issue typically appears when entry_valid_timeout is 0 which is configured by the fuse daemon, the fuse daemon is the right person to explicitly disable the export when required. Also considering FUSE_EXPORT_SUPPORT actually indicates the support for lookups of "." and "..", and there are existing fuse daemons supporting export without FUSE_EXPORT_SUPPORT set, for compatibility, we add a new INIT flag for such purpose. Reviewed-by: Amir Goldstein Signed-off-by: Jingbo Xu Signed-off-by: Miklos Szeredi (cherry picked from commit e022f6a1c711ab6d76e9e59dce77e2b25df75076) (cherry picked from commit 52da885067f8e85cd263f991f1e79ef84e6d7504) (cherry picked from commit 32e2e9a29a0518ee7ffc484f757b298ecbc10654) --- fs/fuse/inode.c | 11 ++++++++++- include/uapi/linux/fuse.h | 8 ++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index cc2e87a8c10a90..045d9f6c47b499 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1165,6 +1165,11 @@ static struct dentry *fuse_get_parent(struct dentry *child) return parent; } +/* only for fid encoding; no support for file handle */ +static const struct export_operations fuse_export_fid_operations = { + .encode_fh = fuse_encode_fh, +}; + static const struct export_operations fuse_export_operations = { .fh_to_dentry = fuse_fh_to_dentry, .fh_to_parent = fuse_fh_to_parent, @@ -1349,6 +1354,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->direct_io_allow_mmap = 1; if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled()) fc->io_uring = 1; + if (flags & FUSE_NO_EXPORT_SUPPORT) + fm->sb->s_export_op = &fuse_export_fid_operations; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1395,7 +1402,8 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | - FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP; + FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP | + FUSE_NO_EXPORT_SUPPORT; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; @@ -1602,6 +1610,7 @@ static int fuse_fill_super_submount(struct super_block *sb, sb->s_bdi = bdi_get(parent_sb->s_bdi); sb->s_xattr = parent_sb->s_xattr; + sb->s_export_op = parent_sb->s_export_op; sb->s_time_gran = parent_sb->s_time_gran; sb->s_blocksize = parent_sb->s_blocksize; sb->s_blocksize_bits = parent_sb->s_blocksize_bits; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index c656bc1b603eae..0a000baedb4a2c 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -205,6 +205,12 @@ * 7.39 * - add FUSE_DIRECT_IO_ALLOW_MMAP * - add FUSE_STATX and related structures +* + * 7.40 + * - add max_stack_depth to fuse_init_out, add FUSE_PASSTHROUGH init flag + * - add backing_id to fuse_open_out, add FOPEN_PASSTHROUGH open flag + * - add FUSE_NO_EXPORT_SUPPORT init flag + * * 7.42 * - Add FUSE_OVER_IO_URING and all other io-uring related flags and data * structures: @@ -413,6 +419,7 @@ struct fuse_file_lock { * symlink and mknod (single group that matches parent) * FUSE_HAS_EXPIRE_ONLY: kernel supports expiry-only entry invalidation * FUSE_DIRECT_IO_ALLOW_MMAP: allow shared mmap in FOPEN_DIRECT_IO mode. + * FUSE_NO_EXPORT_SUPPORT: explicitly disable export support * FUSE_OVER_IO_URING: Indicate that client supports io-uring */ #define FUSE_ASYNC_READ (1 << 0) @@ -453,6 +460,7 @@ struct fuse_file_lock { #define FUSE_CREATE_SUPP_GROUP (1ULL << 34) #define FUSE_HAS_EXPIRE_ONLY (1ULL << 35) #define FUSE_DIRECT_IO_ALLOW_MMAP (1ULL << 36) +#define FUSE_NO_EXPORT_SUPPORT (1ULL << 38) /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ #define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP From 591998eab7f065dcd460b6ae2f6b0d2b5cc3cdd9 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Tue, 22 Aug 2023 21:48:18 +0200 Subject: [PATCH 124/161] fuse: create helper function if DIO write needs exclusive lock This makes the code a bit easier to read and allows to more easily add more conditions when an exclusive lock is needed. Signed-off-by: Bernd Schubert Signed-off-by: Miklos Szeredi (cherry picked from commit 699cf8246ee4c2c524f18c2e395909d16e7fda1b) (cherry picked from commit f400249f41021e020e2d17339b3b0e8070ee3bd7) (cherry picked from commit 89e42adde0b87cefd7f83b46c0207b96ebeeac06) --- fs/fuse/file.c | 63 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d8380a2948cd50..1006ccb7bdcccb 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1297,6 +1297,47 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) return res; } +static bool fuse_io_past_eof(struct kiocb *iocb, struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode); +} + +/* + * @return true if an exclusive lock for direct IO writes is needed + */ +static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(iocb->ki_filp); + + /* Server side has to advise that it supports parallel dio writes. */ + if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES)) + return true; + + /* + * Append will need to know the eventual EOF - always needs an + * exclusive lock. + */ + if (iocb->ki_flags & IOCB_APPEND) + return true; + + /* + * Combination of page access and direct-io is difficult, shared locks + * actually introduce a conflict. + */ + if (get_fuse_conn(inode)->direct_io_allow_mmap) + return true; + + /* Parallel dio beyond EOF is not supported, at least for now. */ + if (fuse_io_past_eof(iocb, from)) + return true; + + return false; +} + static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; @@ -1579,26 +1620,12 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) return res; } -static bool fuse_direct_write_extending_i_size(struct kiocb *iocb, - struct iov_iter *iter) -{ - struct inode *inode = file_inode(iocb->ki_filp); - - return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode); -} - static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); - struct file *file = iocb->ki_filp; - struct fuse_file *ff = file->private_data; struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; - bool exclusive_lock = - !(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) || - get_fuse_conn(inode)->direct_io_allow_mmap || - iocb->ki_flags & IOCB_APPEND || - fuse_direct_write_extending_i_size(iocb, from); + bool exclusive_lock = fuse_dio_wr_exclusive_lock(iocb, from); /* * Take exclusive lock if @@ -1612,10 +1639,10 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) else { inode_lock_shared(inode); - /* A race with truncate might have come up as the decision for - * the lock type was done without holding the lock, check again. + /* + * Previous check was without any lock and might have raced. */ - if (fuse_direct_write_extending_i_size(iocb, from)) { + if (fuse_io_past_eof(iocb, from)) { inode_unlock_shared(inode); inode_lock(inode); exclusive_lock = true; From e44445220242c7c0db72536e1cef29b47a9cecd1 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 1 Feb 2024 13:48:59 +0200 Subject: [PATCH 125/161] fuse: factor out helper fuse_truncate_update_attr() fuse_finish_open() is called from fuse_open_common() and from fuse_create_open(). In the latter case, the O_TRUNC flag is always cleared in finish_open()m before calling into fuse_finish_open(). Move the bits that update attribute cache post O_TRUNC open into a helper and call this helper from fuse_open_common() directly. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi (cherry picked from commit 0c9d708953d02f74cea05a01cf3e2c8f5a9fbaf4) (cherry picked from commit 110fb1303c7e36d3f85b2e1b2dba59a736a44a73) (cherry picked from commit 9240e3148064d2e63d072bc6bc72350ba9e6eb16) --- fs/fuse/file.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 1006ccb7bdcccb..ec232325d6a47a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -203,30 +203,31 @@ void fuse_finish_open(struct inode *inode, struct file *file) else if (ff->open_flags & FOPEN_NONSEEKABLE) nonseekable_open(inode, file); - if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { - struct fuse_inode *fi = get_fuse_inode(inode); - - spin_lock(&fi->lock); - fi->attr_version = atomic64_inc_return(&fc->attr_version); - i_size_write(inode, 0); - spin_unlock(&fi->lock); - file_update_time(file); - fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); - } if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); } +static void fuse_truncate_update_attr(struct inode *inode, struct file *file) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + i_size_write(inode, 0); + spin_unlock(&fi->lock); + file_update_time(file); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); +} + int fuse_open_common(struct inode *inode, struct file *file, bool isdir) { struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_conn *fc = fm->fc; int err; - bool is_wb_truncate = (file->f_flags & O_TRUNC) && - fc->atomic_o_trunc && - fc->writeback_cache; - bool dax_truncate = (file->f_flags & O_TRUNC) && - fc->atomic_o_trunc && FUSE_IS_DAX(inode); + bool is_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc; + bool is_wb_truncate = is_truncate && fc->writeback_cache; + bool dax_truncate = is_truncate && FUSE_IS_DAX(inode); if (fuse_is_bad(inode)) return -EIO; @@ -249,15 +250,18 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) fuse_set_nowrite(inode); err = fuse_do_open(fm, get_node_id(inode), file, isdir); - if (!err) + if (!err) { fuse_finish_open(inode, file); + if (is_truncate) + fuse_truncate_update_attr(inode, file); + } if (is_wb_truncate || dax_truncate) fuse_release_nowrite(inode); if (!err) { struct fuse_file *ff = file->private_data; - if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) + if (is_truncate) truncate_pagecache(inode, 0); else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) invalidate_inode_pages2(inode->i_mapping); From 6404722aadee7ed9e6d7fcaf339abc36bd200a85 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 1 Feb 2024 15:30:05 +0200 Subject: [PATCH 126/161] fuse: allocate ff->release_args only if release is needed This removed the need to pass isdir argument to fuse_put_file(). Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi (cherry picked from commit e26ee4efbc79610b20e7abe9d96c87f33dacc1ff) (cherry picked from commit cbabbdd5330938e2cb3349a68714a62b4d72ce8f) (cherry picked from commit 894fa3e2a589fe39dc807987d009c9ee3ed927b8) --- fs/fuse/dir.c | 2 +- fs/fuse/file.c | 69 +++++++++++++++++++++++++++--------------------- fs/fuse/fuse_i.h | 2 +- 3 files changed, 41 insertions(+), 32 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index dd017b5acc54d9..1fb5f9f3a0298a 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -636,7 +636,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, goto out_err; err = -ENOMEM; - ff = fuse_file_alloc(fm); + ff = fuse_file_alloc(fm, true); if (!ff) goto out_put_forget_req; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ec232325d6a47a..93df117acd62cf 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -54,7 +54,7 @@ struct fuse_release_args { struct inode *inode; }; -struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) +struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release) { struct fuse_file *ff; @@ -63,11 +63,13 @@ struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) return NULL; ff->fm = fm; - ff->release_args = kzalloc(sizeof(*ff->release_args), - GFP_KERNEL_ACCOUNT); - if (!ff->release_args) { - kfree(ff); - return NULL; + if (release) { + ff->release_args = kzalloc(sizeof(*ff->release_args), + GFP_KERNEL_ACCOUNT); + if (!ff->release_args) { + kfree(ff); + return NULL; + } } INIT_LIST_HEAD(&ff->write_entry); @@ -103,14 +105,14 @@ static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args, kfree(ra); } -static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) +static void fuse_file_put(struct fuse_file *ff, bool sync) { if (refcount_dec_and_test(&ff->count)) { - struct fuse_args *args = &ff->release_args->args; + struct fuse_release_args *ra = ff->release_args; + struct fuse_args *args = (ra ? &ra->args : NULL); - if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) { - /* Do nothing when client does not implement 'open' */ - fuse_release_end(ff->fm, args, 0); + if (!args) { + /* Do nothing when server does not implement 'open' */ } else if (sync) { fuse_simple_request(ff->fm, args); fuse_release_end(ff->fm, args, 0); @@ -130,15 +132,16 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, struct fuse_conn *fc = fm->fc; struct fuse_file *ff; int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; + bool open = isdir ? !fc->no_opendir : !fc->no_open; - ff = fuse_file_alloc(fm); + ff = fuse_file_alloc(fm, open); if (!ff) return ERR_PTR(-ENOMEM); ff->fh = 0; /* Default for no-open */ ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0); - if (isdir ? !fc->no_opendir : !fc->no_open) { + if (open) { struct fuse_open_out outarg; int err; @@ -146,11 +149,13 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, if (!err) { ff->fh = outarg.fh; ff->open_flags = outarg.open_flags; - } else if (err != -ENOSYS) { fuse_file_free(ff); return ERR_PTR(err); } else { + /* No release needed */ + kfree(ff->release_args); + ff->release_args = NULL; if (isdir) fc->no_opendir = 1; else @@ -276,7 +281,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) } static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, - unsigned int flags, int opcode) + unsigned int flags, int opcode, bool sync) { struct fuse_conn *fc = ff->fm->fc; struct fuse_release_args *ra = ff->release_args; @@ -294,6 +299,9 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, wake_up_interruptible_all(&ff->poll_wait); + if (!ra) + return; + ra->inarg.fh = ff->fh; ra->inarg.flags = flags; ra->args.in_numargs = 1; @@ -303,6 +311,13 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, ra->args.nodeid = ff->nodeid; ra->args.force = true; ra->args.nocreds = true; + + /* + * Hold inode until release is finished. + * From fuse_sync_release() the refcount is 1 and everything's + * synchronous, so we are fine with not doing igrab() here. + */ + ra->inode = sync ? NULL : igrab(&fi->inode); } void fuse_file_release(struct inode *inode, struct fuse_file *ff, @@ -312,14 +327,12 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff, struct fuse_release_args *ra = ff->release_args; int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; - fuse_prepare_release(fi, ff, open_flags, opcode); + fuse_prepare_release(fi, ff, open_flags, opcode, false); - if (ff->flock) { + if (ra && ff->flock) { ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id); } - /* Hold inode until release is finished */ - ra->inode = igrab(inode); /* * Normally this will send the RELEASE request, however if @@ -330,7 +343,7 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff, * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. */ - fuse_file_put(ff, ff->fm->fc->destroy, isdir); + fuse_file_put(ff, ff->fm->fc->destroy); } void fuse_release_common(struct file *file, bool isdir) @@ -365,12 +378,8 @@ void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, unsigned int flags) { WARN_ON(refcount_read(&ff->count) > 1); - fuse_prepare_release(fi, ff, flags, FUSE_RELEASE); - /* - * iput(NULL) is a no-op and since the refcount is 1 and everything's - * synchronous, we are fine with not doing igrab() here" - */ - fuse_file_put(ff, true, false); + fuse_prepare_release(fi, ff, flags, FUSE_RELEASE, true); + fuse_file_put(ff, true); } EXPORT_SYMBOL_GPL(fuse_sync_release); @@ -927,7 +936,7 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, put_page(page); } if (ia->ff) - fuse_file_put(ia->ff, false, false); + fuse_file_put(ia->ff, false); fuse_io_free(ia); } @@ -1719,7 +1728,7 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa) __free_page(ap->pages[i]); if (wpa->ia.ff) - fuse_file_put(wpa->ia.ff, false, false); + fuse_file_put(wpa->ia.ff, false); kfree(ap->pages); kfree(wpa); @@ -1967,7 +1976,7 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) ff = __fuse_write_file_get(fi); err = fuse_flush_times(inode, ff); if (ff) - fuse_file_put(ff, false, false); + fuse_file_put(ff, false); return err; } @@ -2365,7 +2374,7 @@ static int fuse_writepages(struct address_space *mapping, fuse_writepages_send(&data); } if (data.ff) - fuse_file_put(data.ff, false, false); + fuse_file_put(data.ff, false); kfree(data.orig_pages); out: diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ab6a15b9a52056..3cb7887c9e90a9 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1085,7 +1085,7 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, */ int fuse_open_common(struct inode *inode, struct file *file, bool isdir); -struct fuse_file *fuse_file_alloc(struct fuse_mount *fm); +struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release); void fuse_file_free(struct fuse_file *ff); void fuse_finish_open(struct inode *inode, struct file *file); From 8b0b6225a2328e4813260c7defd7d56aefaf10ff Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 2 Feb 2024 13:30:30 +0200 Subject: [PATCH 127/161] fuse: break up fuse_open_common() fuse_open_common() has a lot of code relevant only for regular files and O_TRUNC in particular. Copy the little bit of remaining code into fuse_dir_open() and stop using this common helper for directory open. Also split out fuse_dir_finish_open() from fuse_finish_open() before we add inode io modes to fuse_finish_open(). Suggested-by: Miklos Szeredi Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi (cherry picked from commit 7de64d521bf92396b7da8ae0600188ea5d75a4c9) (cherry picked from commit 8dcafbe8ca0ee36f6f8f9d2a6290017aea1acc4f) (cherry picked from commit e8ba8b080814ab47fde4113aaa23e588eb073a73) --- fs/fuse/dir.c | 25 ++++++++++++++++++++++++- fs/fuse/file.c | 9 ++------- fs/fuse/fuse_i.h | 5 ----- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 1fb5f9f3a0298a..fd4de4ec6015d0 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1640,7 +1640,30 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, static int fuse_dir_open(struct inode *inode, struct file *file) { - return fuse_open_common(inode, file, true); + struct fuse_mount *fm = get_fuse_mount(inode); + int err; + + if (fuse_is_bad(inode)) + return -EIO; + + err = generic_file_open(inode, file); + if (err) + return err; + + err = fuse_do_open(fm, get_node_id(inode), file, true); + if (!err) { + struct fuse_file *ff = file->private_data; + + /* + * Keep handling FOPEN_STREAM and FOPEN_NONSEEKABLE for + * directories for backward compatibility, though it's unlikely + * to be useful. + */ + if (ff->open_flags & (FOPEN_STREAM | FOPEN_NONSEEKABLE)) + nonseekable_open(inode, file); + } + + return err; } static int fuse_dir_release(struct inode *inode, struct file *file) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 93df117acd62cf..8509a06b525a00 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -225,7 +225,7 @@ static void fuse_truncate_update_attr(struct inode *inode, struct file *file) fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); } -int fuse_open_common(struct inode *inode, struct file *file, bool isdir) +static int fuse_open(struct inode *inode, struct file *file) { struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_conn *fc = fm->fc; @@ -254,7 +254,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) if (is_wb_truncate || dax_truncate) fuse_set_nowrite(inode); - err = fuse_do_open(fm, get_node_id(inode), file, isdir); + err = fuse_do_open(fm, get_node_id(inode), file, false); if (!err) { fuse_finish_open(inode, file); if (is_truncate) @@ -352,11 +352,6 @@ void fuse_release_common(struct file *file, bool isdir) (fl_owner_t) file, isdir); } -static int fuse_open(struct inode *inode, struct file *file) -{ - return fuse_open_common(inode, file, false); -} - static int fuse_release(struct inode *inode, struct file *file) { struct fuse_conn *fc = get_fuse_conn(inode); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 3cb7887c9e90a9..c477a1da559651 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1080,11 +1080,6 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, size_t count, int opcode); -/** - * Send OPEN or OPENDIR request - */ -int fuse_open_common(struct inode *inode, struct file *file, bool isdir); - struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release); void fuse_file_free(struct fuse_file *ff); void fuse_finish_open(struct inode *inode, struct file *file); From 3602d043878d998a6e61960e8ecf933c7f3f9956 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 1 Feb 2024 15:38:06 +0200 Subject: [PATCH 128/161] fuse: prepare for failing open response In preparation for inode io modes, a server open response could fail due to conflicting inode io modes. Allow returning an error from fuse_finish_open() and handle the error in the callers. fuse_finish_open() is used as the callback of finish_open(), so that FMODE_OPENED will not be set if fuse_finish_open() fails. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi (cherry picked from commit d2c487f150ae00e3cb9faf57aceacc584e0a130c) (cherry picked from commit c1531c7169fc36a913020f7605255377066221e1) (cherry picked from commit 4062a7afef646f6ac9dcfb55c8ed83f6dee12a43) --- fs/fuse/dir.c | 8 +++++--- fs/fuse/file.c | 15 ++++++++++----- fs/fuse/fuse_i.h | 2 +- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index fd4de4ec6015d0..f6c3a7a730185b 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -698,13 +698,15 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, d_instantiate(entry, inode); fuse_change_entry_timeout(entry, &outentry); fuse_dir_changed(dir); - err = finish_open(file, entry, generic_file_open); + err = generic_file_open(inode, file); + if (!err) { + file->private_data = ff; + err = finish_open(file, entry, fuse_finish_open); + } if (err) { fi = get_fuse_inode(inode); fuse_sync_release(fi, ff, flags); } else { - file->private_data = ff; - fuse_finish_open(inode, file); if (fm->fc->atomic_o_trunc && trunc) truncate_pagecache(inode, 0); else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8509a06b525a00..7c395e4fc4f58c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -198,7 +198,7 @@ static void fuse_link_write_file(struct file *file) spin_unlock(&fi->lock); } -void fuse_finish_open(struct inode *inode, struct file *file) +int fuse_finish_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = get_fuse_conn(inode); @@ -210,6 +210,8 @@ void fuse_finish_open(struct inode *inode, struct file *file) if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); + + return 0; } static void fuse_truncate_update_attr(struct inode *inode, struct file *file) @@ -228,7 +230,9 @@ static void fuse_truncate_update_attr(struct inode *inode, struct file *file) static int fuse_open(struct inode *inode, struct file *file) { struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = fm->fc; + struct fuse_file *ff; int err; bool is_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc; bool is_wb_truncate = is_truncate && fc->writeback_cache; @@ -256,16 +260,17 @@ static int fuse_open(struct inode *inode, struct file *file) err = fuse_do_open(fm, get_node_id(inode), file, false); if (!err) { - fuse_finish_open(inode, file); - if (is_truncate) + ff = file->private_data; + err = fuse_finish_open(inode, file); + if (err) + fuse_sync_release(fi, ff, file->f_flags); + else if (is_truncate) fuse_truncate_update_attr(inode, file); } if (is_wb_truncate || dax_truncate) fuse_release_nowrite(inode); if (!err) { - struct fuse_file *ff = file->private_data; - if (is_truncate) truncate_pagecache(inode, 0); else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index c477a1da559651..23d71fa263ff2a 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1082,7 +1082,7 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release); void fuse_file_free(struct fuse_file *ff); -void fuse_finish_open(struct inode *inode, struct file *file); +int fuse_finish_open(struct inode *inode, struct file *file); void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, unsigned int flags); From 9000b3da7c934563904b6ab78bf5b6928205648a Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 1 Feb 2024 16:26:15 +0200 Subject: [PATCH 129/161] fuse: introduce inode io modes The fuse inode io mode is determined by the mode of its open files/mmaps and parallel dio opens and expressed in the value of fi->iocachectr: > 0 - caching io: files open in caching mode or mmap on direct_io file < 0 - parallel dio: direct io mode with parallel dio writes enabled == 0 - direct io: no files open in caching mode and no files mmaped Note that iocachectr value of 0 might become positive or negative, while non-parallel dio is getting processed. direct_io mmap uses page cache, so first mmap will mark the file as ff->io_opened and increment fi->iocachectr to enter the caching io mode. If the server opens the file in caching mode while it is already open for parallel dio or vice versa the open fails. This allows executing parallel dio when inode is not in caching mode and no mmaps have been performed on the inode in question. Signed-off-by: Bernd Schubert Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi (cherry picked from commit cb098dd24bab8a315aa00bab1ccddb6be872156d) (cherry picked from commit 0a64c3c6f057b60f9e43770604169d6fa6ca2d06) (cherry picked from commit 0d393d21c892f82adbd3e789282b9b17fed7d2e9) --- fs/fuse/Makefile | 1 + fs/fuse/file.c | 15 +++++ fs/fuse/fuse_i.h | 17 ++++- fs/fuse/iomode.c | 158 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 189 insertions(+), 2 deletions(-) create mode 100644 fs/fuse/iomode.c diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 6cc00eb7761f60..47e649f17f25ae 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o +fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o fuse-$(CONFIG_SYSCTL) += sysctl.o diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 7c395e4fc4f58c..41133db71d79fa 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -111,6 +111,9 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) struct fuse_release_args *ra = ff->release_args; struct fuse_args *args = (ra ? &ra->args : NULL); + if (ra && ra->inode) + fuse_file_io_release(ff, ra->inode); + if (!args) { /* Do nothing when server does not implement 'open' */ } else if (sync) { @@ -202,6 +205,11 @@ int fuse_finish_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + err = fuse_file_io_open(file, inode); + if (err) + return err; if (ff->open_flags & FOPEN_STREAM) stream_open(inode, file); @@ -2529,6 +2537,7 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fm->fc; + int rc; /* DAX mmap is superior to direct_io mmap */ if (FUSE_IS_DAX(file_inode(file))) @@ -2548,6 +2557,11 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) /* MAP_PRIVATE */ return generic_file_mmap(file, vma); } + + /* First mmap of direct_io file enters caching inode io mode. */ + rc = fuse_file_cached_io_start(file_inode(file), ff); + if (rc) + return rc; } if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) @@ -3316,6 +3330,7 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); fi->writectr = 0; + fi->iocachectr = 0; init_waitqueue_head(&fi->page_waitq); fi->writepages = RB_ROOT; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 23d71fa263ff2a..566f13fe051409 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -119,7 +119,7 @@ struct fuse_inode { u64 attr_version; union { - /* Write related fields (regular file only) */ + /* read/write io cache (regular file only) */ struct { /* Files usable in writepage. Protected by fi->lock */ struct list_head write_files; @@ -131,6 +131,9 @@ struct fuse_inode { * (FUSE_NOWRITE) means more writes are blocked */ int writectr; + /** Number of files/maps using page cache */ + int iocachectr; + /* Waitq for writepage completion */ wait_queue_head_t page_waitq; @@ -195,6 +198,8 @@ enum { FUSE_I_BAD, /* Has btime */ FUSE_I_BTIME, + /* Wants or already has page cache IO */ + FUSE_I_CACHE_IO_MODE, }; struct fuse_conn; @@ -252,6 +257,9 @@ struct fuse_file { /** Wait queue head for poll */ wait_queue_head_t poll_wait; + /** Does file hold a fi->iocachectr refcount? */ + enum { IOM_NONE, IOM_CACHED, IOM_UNCACHED } iomode; + /** Has flock been performed on this file? */ bool flock:1; }; @@ -1394,8 +1402,13 @@ int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa); int fuse_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, struct fileattr *fa); -/* file.c */ +/* iomode.c */ +int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff); +int fuse_file_io_open(struct file *file, struct inode *inode); +void fuse_file_io_release(struct fuse_file *ff, struct inode *inode); + +/* file.c */ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, bool isdir); void fuse_file_release(struct inode *inode, struct fuse_file *ff, diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c new file mode 100644 index 00000000000000..a1a836b2aacc91 --- /dev/null +++ b/fs/fuse/iomode.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE inode io modes. + * + * Copyright (c) 2024 CTERA Networks. + */ + +#include "fuse_i.h" + +#include +#include +#include +#include + +/* + * Start cached io mode, where parallel dio writes are not allowed. + */ +int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + int err = 0; + + /* There are no io modes if server does not implement open */ + if (!ff->release_args) + return 0; + + spin_lock(&fi->lock); + if (fi->iocachectr < 0) { + err = -ETXTBSY; + goto unlock; + } + WARN_ON(ff->iomode == IOM_UNCACHED); + if (ff->iomode == IOM_NONE) { + ff->iomode = IOM_CACHED; + if (fi->iocachectr == 0) + set_bit(FUSE_I_CACHE_IO_MODE, &fi->state); + fi->iocachectr++; + } +unlock: + spin_unlock(&fi->lock); + return err; +} + +static void fuse_file_cached_io_end(struct inode *inode, struct fuse_file *ff) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + WARN_ON(fi->iocachectr <= 0); + WARN_ON(ff->iomode != IOM_CACHED); + ff->iomode = IOM_NONE; + fi->iocachectr--; + if (fi->iocachectr == 0) + clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state); + spin_unlock(&fi->lock); +} + +/* Start strictly uncached io mode where cache access is not allowed */ +static int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + int err = 0; + + spin_lock(&fi->lock); + if (fi->iocachectr > 0) { + err = -ETXTBSY; + goto unlock; + } + WARN_ON(ff->iomode != IOM_NONE); + fi->iocachectr--; + ff->iomode = IOM_UNCACHED; +unlock: + spin_unlock(&fi->lock); + return err; +} + +static void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + WARN_ON(fi->iocachectr >= 0); + WARN_ON(ff->iomode != IOM_UNCACHED); + ff->iomode = IOM_NONE; + fi->iocachectr++; + spin_unlock(&fi->lock); +} + +/* Request access to submit new io to inode via open file */ +int fuse_file_io_open(struct file *file, struct inode *inode) +{ + struct fuse_file *ff = file->private_data; + int err; + + /* + * io modes are not relevant with DAX and with server that does not + * implement open. + */ + if (FUSE_IS_DAX(inode) || !ff->release_args) + return 0; + + /* + * FOPEN_PARALLEL_DIRECT_WRITES requires FOPEN_DIRECT_IO. + */ + if (!(ff->open_flags & FOPEN_DIRECT_IO)) + ff->open_flags &= ~FOPEN_PARALLEL_DIRECT_WRITES; + + /* + * First parallel dio open denies caching inode io mode. + * First caching file open enters caching inode io mode. + * + * Note that if user opens a file open with O_DIRECT, but server did + * not specify FOPEN_DIRECT_IO, a later fcntl() could remove O_DIRECT, + * so we put the inode in caching mode to prevent parallel dio. + */ + if (ff->open_flags & FOPEN_DIRECT_IO) { + if (ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) + err = fuse_file_uncached_io_start(inode, ff); + else + return 0; + } else { + err = fuse_file_cached_io_start(inode, ff); + } + if (err) + goto fail; + + return 0; + +fail: + pr_debug("failed to open file in requested io mode (open_flags=0x%x, err=%i).\n", + ff->open_flags, err); + /* + * The file open mode determines the inode io mode. + * Using incorrect open mode is a server mistake, which results in + * user visible failure of open() with EIO error. + */ + return -EIO; +} + +/* No more pending io and no new io possible to inode via open/mmapped file */ +void fuse_file_io_release(struct fuse_file *ff, struct inode *inode) +{ + /* + * Last parallel dio close allows caching inode io mode. + * Last caching file close exits caching inode io mode. + */ + switch (ff->iomode) { + case IOM_NONE: + /* Nothing to do */ + break; + case IOM_UNCACHED: + fuse_file_uncached_io_end(inode, ff); + break; + case IOM_CACHED: + fuse_file_cached_io_end(inode, ff); + break; + } +} From 2fd474eda715e894560897a06fde5df658b8db48 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Sun, 24 Dec 2023 00:05:53 +0100 Subject: [PATCH 130/161] fuse: add fuse_dio_lock/unlock helper functions So far this is just a helper to remove complex locking logic out of fuse_direct_write_iter. Especially needed by the next patch in the series to that adds the fuse inode cache IO mode and adds in even more locking complexity. Signed-off-by: Bernd Schubert Signed-off-by: Miklos Szeredi (cherry picked from commit 9bbb6717dfd286a2861ca33273f4d7c3e65423b0) (cherry picked from commit a709f7b020f1bc96c141ea9ceffeb8c8312a035c) (cherry picked from commit 8ac07a725c6dd841718bf3282a7aa93988375df7) --- fs/fuse/file.c | 61 ++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 41133db71d79fa..c6b2bed55bbe2e 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1359,6 +1359,37 @@ static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from return false; } +static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, + bool *exclusive) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + *exclusive = fuse_dio_wr_exclusive_lock(iocb, from); + if (*exclusive) { + inode_lock(inode); + } else { + inode_lock_shared(inode); + /* + * Previous check was without inode lock and might have raced, + * check again. + */ + if (fuse_io_past_eof(iocb, from)) { + inode_unlock_shared(inode); + inode_lock(inode); + *exclusive = true; + } + } +} + +static void fuse_dio_unlock(struct inode *inode, bool exclusive) +{ + if (exclusive) { + inode_unlock(inode); + } else { + inode_unlock_shared(inode); + } +} + static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; @@ -1646,30 +1677,9 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; - bool exclusive_lock = fuse_dio_wr_exclusive_lock(iocb, from); - - /* - * Take exclusive lock if - * - Parallel direct writes are disabled - a user space decision - * - Parallel direct writes are enabled and i_size is being extended. - * - Shared mmap on direct_io file is supported (FUSE_DIRECT_IO_ALLOW_MMAP). - * This might not be needed at all, but needs further investigation. - */ - if (exclusive_lock) - inode_lock(inode); - else { - inode_lock_shared(inode); - - /* - * Previous check was without any lock and might have raced. - */ - if (fuse_io_past_eof(iocb, from)) { - inode_unlock_shared(inode); - inode_lock(inode); - exclusive_lock = true; - } - } + bool exclusive; + fuse_dio_lock(iocb, from, &exclusive); res = generic_write_checks(iocb, from); if (res > 0) { if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { @@ -1680,10 +1690,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) fuse_write_update_attr(inode, iocb->ki_pos, res); } } - if (exclusive_lock) - inode_unlock(inode); - else - inode_unlock_shared(inode); + fuse_dio_unlock(inode, exclusive); return res; } From bd10cabccd615f7fd4d93a2e529d5bd66651425c Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 9 Feb 2024 16:54:37 +0200 Subject: [PATCH 131/161] fuse: allow parallel dio writes with FUSE_DIRECT_IO_ALLOW_MMAP Instead of denying caching mode on parallel dio open, deny caching open only while parallel dio are in-progress and wait for in-progress parallel dio writes before entering inode caching io mode. This allows executing parallel dio when inode is not in caching mode even if shared mmap is allowed, but no mmaps have been performed on the inode in question. An mmap on direct_io file now waits for all in-progress parallel dio writes to complete, so parallel dio writes together with FUSE_DIRECT_IO_ALLOW_MMAP is enabled by this commit. Signed-off-by: Bernd Schubert Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi (cherry picked from commit 205c1d8026835746d8597e1aa70c370e014e83fa) (cherry picked from commit afe3358f1798425a86357f6771629d002d3904e7) (cherry picked from commit 500858bc350e9cd0a7128b4fd852727a70a8f4ee) --- fs/fuse/file.c | 41 +++++++++++++++++++++++++++++------------ fs/fuse/fuse_i.h | 5 +++++ fs/fuse/iomode.c | 48 ++++++++++++++++++++++++++++++------------------ 3 files changed, 64 insertions(+), 30 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c6b2bed55bbe2e..157b1a1422cc6b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1333,6 +1333,7 @@ static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_inode *fi = get_fuse_inode(inode); /* Server side has to advise that it supports parallel dio writes. */ if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES)) @@ -1345,12 +1346,9 @@ static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from if (iocb->ki_flags & IOCB_APPEND) return true; - /* - * Combination of page access and direct-io is difficult, shared locks - * actually introduce a conflict. - */ - if (get_fuse_conn(inode)->direct_io_allow_mmap) - return true; + /* shared locks are not allowed with parallel page cache IO */ + if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state)) + return false; /* Parallel dio beyond EOF is not supported, at least for now. */ if (fuse_io_past_eof(iocb, from)) @@ -1363,6 +1361,7 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, bool *exclusive) { struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_file *ff = iocb->ki_filp->private_data; *exclusive = fuse_dio_wr_exclusive_lock(iocb, from); if (*exclusive) { @@ -1370,10 +1369,14 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, } else { inode_lock_shared(inode); /* - * Previous check was without inode lock and might have raced, - * check again. + * New parallal dio allowed only if inode is not in caching + * mode and denies new opens in caching mode. This check + * should be performed only after taking shared inode lock. + * Previous past eof check was without inode lock and might + * have raced, so check it again. */ - if (fuse_io_past_eof(iocb, from)) { + if (fuse_io_past_eof(iocb, from) || + fuse_file_uncached_io_start(inode, ff) != 0) { inode_unlock_shared(inode); inode_lock(inode); *exclusive = true; @@ -1381,11 +1384,16 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, } } -static void fuse_dio_unlock(struct inode *inode, bool exclusive) +static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive) { + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_file *ff = iocb->ki_filp->private_data; + if (exclusive) { inode_unlock(inode); } else { + /* Allow opens in caching mode after last parallel dio end */ + fuse_file_uncached_io_end(inode, ff); inode_unlock_shared(inode); } } @@ -1690,7 +1698,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) fuse_write_update_attr(inode, iocb->ki_pos, res); } } - fuse_dio_unlock(inode, exclusive); + fuse_dio_unlock(iocb, exclusive); return res; } @@ -2550,6 +2558,10 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) if (FUSE_IS_DAX(file_inode(file))) return fuse_dax_mmap(file, vma); + /* + * FOPEN_DIRECT_IO handling is special compared to O_DIRECT, + * as does not allow MAP_SHARED mmap without FUSE_DIRECT_IO_ALLOW_MMAP. + */ if (ff->open_flags & FOPEN_DIRECT_IO) { /* * Can't provide the coherency needed for MAP_SHARED @@ -2565,7 +2577,11 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) return generic_file_mmap(file, vma); } - /* First mmap of direct_io file enters caching inode io mode. */ + /* + * First mmap of direct_io file enters caching inode io mode. + * Also waits for parallel dio writers to go into serial mode + * (exclusive instead of shared lock). + */ rc = fuse_file_cached_io_start(file_inode(file), ff); if (rc) return rc; @@ -3339,6 +3355,7 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) fi->writectr = 0; fi->iocachectr = 0; init_waitqueue_head(&fi->page_waitq); + init_waitqueue_head(&fi->direct_io_waitq); fi->writepages = RB_ROOT; if (IS_ENABLED(CONFIG_FUSE_DAX)) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 566f13fe051409..0d1fd38405a603 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -137,6 +137,9 @@ struct fuse_inode { /* Waitq for writepage completion */ wait_queue_head_t page_waitq; + /* waitq for direct-io completion */ + wait_queue_head_t direct_io_waitq; + /* List of writepage requestst (pending or sent) */ struct rb_root writepages; }; @@ -1404,6 +1407,8 @@ int fuse_fileattr_set(struct user_namespace *mnt_userns, /* iomode.c */ int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff); +int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff); +void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff); int fuse_file_io_open(struct file *file, struct inode *inode); void fuse_file_io_release(struct fuse_file *ff, struct inode *inode); diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c index a1a836b2aacc91..ea47c76b9df114 100644 --- a/fs/fuse/iomode.c +++ b/fs/fuse/iomode.c @@ -13,21 +13,37 @@ #include /* - * Start cached io mode, where parallel dio writes are not allowed. + * Return true if need to wait for new opens in caching mode. + */ +static inline bool fuse_is_io_cache_wait(struct fuse_inode *fi) +{ + return READ_ONCE(fi->iocachectr) < 0; +} + +/* + * Start cached io mode. + * + * Blocks new parallel dio writes and waits for the in-progress parallel dio + * writes to complete. */ int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff) { struct fuse_inode *fi = get_fuse_inode(inode); - int err = 0; /* There are no io modes if server does not implement open */ if (!ff->release_args) return 0; spin_lock(&fi->lock); - if (fi->iocachectr < 0) { - err = -ETXTBSY; - goto unlock; + /* + * Setting the bit advises new direct-io writes to use an exclusive + * lock - without it the wait below might be forever. + */ + while (fuse_is_io_cache_wait(fi)) { + set_bit(FUSE_I_CACHE_IO_MODE, &fi->state); + spin_unlock(&fi->lock); + wait_event(fi->direct_io_waitq, !fuse_is_io_cache_wait(fi)); + spin_lock(&fi->lock); } WARN_ON(ff->iomode == IOM_UNCACHED); if (ff->iomode == IOM_NONE) { @@ -36,9 +52,8 @@ int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff) set_bit(FUSE_I_CACHE_IO_MODE, &fi->state); fi->iocachectr++; } -unlock: spin_unlock(&fi->lock); - return err; + return 0; } static void fuse_file_cached_io_end(struct inode *inode, struct fuse_file *ff) @@ -56,7 +71,7 @@ static void fuse_file_cached_io_end(struct inode *inode, struct fuse_file *ff) } /* Start strictly uncached io mode where cache access is not allowed */ -static int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff) +int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff) { struct fuse_inode *fi = get_fuse_inode(inode); int err = 0; @@ -74,7 +89,7 @@ static int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff return err; } -static void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff) +void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff) { struct fuse_inode *fi = get_fuse_inode(inode); @@ -83,6 +98,8 @@ static void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff) WARN_ON(ff->iomode != IOM_UNCACHED); ff->iomode = IOM_NONE; fi->iocachectr++; + if (!fi->iocachectr) + wake_up(&fi->direct_io_waitq); spin_unlock(&fi->lock); } @@ -106,21 +123,16 @@ int fuse_file_io_open(struct file *file, struct inode *inode) ff->open_flags &= ~FOPEN_PARALLEL_DIRECT_WRITES; /* - * First parallel dio open denies caching inode io mode. * First caching file open enters caching inode io mode. * * Note that if user opens a file open with O_DIRECT, but server did * not specify FOPEN_DIRECT_IO, a later fcntl() could remove O_DIRECT, * so we put the inode in caching mode to prevent parallel dio. */ - if (ff->open_flags & FOPEN_DIRECT_IO) { - if (ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) - err = fuse_file_uncached_io_start(inode, ff); - else - return 0; - } else { - err = fuse_file_cached_io_start(inode, ff); - } + if (ff->open_flags & FOPEN_DIRECT_IO) + return 0; + + err = fuse_file_cached_io_start(inode, ff); if (err) goto fail; From f83b1cd20d7b458f6f98323ddcc99a2fd3066767 Mon Sep 17 00:00:00 2001 From: yangyun Date: Wed, 14 Aug 2024 17:36:00 +0800 Subject: [PATCH 132/161] fuse: add fast path for fuse_range_is_writeback In some cases, the fi->writepages may be empty. And there is no need to check fi->writepages with spin_lock, which may have an impact on performance due to lock contention. For example, in scenarios where multiple readers read the same file without any writers, or where the page cache is not enabled. Also remove the outdated comment since commit 6b2fb79963fb ("fuse: optimize writepages search") has optimize the situation by replacing list with rb-tree. Signed-off-by: yangyun Signed-off-by: Miklos Szeredi (cherry picked from commit ac5cffec53be0b0231b89470a357bd3a5814f599) (cherry picked from commit ba1236c0fd70ac0ba37d9602a1152a901a8e6b8a) (cherry picked from commit 0781cd2bd63831d28c93aed5cd7009a4175c96dd) --- fs/fuse/file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 157b1a1422cc6b..18927b2395b154 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -448,9 +448,6 @@ static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, /* * Check if any page in a range is under writeback - * - * This is currently done by walking the list of writepage requests - * for the inode, which can be pretty inefficient. */ static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, pgoff_t idx_to) @@ -458,6 +455,9 @@ static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, struct fuse_inode *fi = get_fuse_inode(inode); bool found; + if (RB_EMPTY_ROOT(&fi->writepages)) + return false; + spin_lock(&fi->lock); found = fuse_find_writeback(fi, idx_from, idx_to); spin_unlock(&fi->lock); From ca0aa9f0c53059441e5f9fbfd48123a4c81fa0a4 Mon Sep 17 00:00:00 2001 From: yangyun Date: Sat, 14 Sep 2024 16:51:31 +0800 Subject: [PATCH 133/161] fuse: use exclusive lock when FUSE_I_CACHE_IO_MODE is set This may be a typo. The comment has said shared locks are not allowed when this bit is set. If using shared lock, the wait in `fuse_file_cached_io_open` may be forever. Fixes: 205c1d802683 ("fuse: allow parallel dio writes with FUSE_DIRECT_IO_ALLOW_MMAP") CC: stable@vger.kernel.org # v6.9 Signed-off-by: yangyun Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi (cherry picked from commit 2f3d8ff457982f4055fe8f7bf19d3821ba22c376) (cherry picked from commit bbddfd7faae92f6a98068175dc3a9becf416f070) (cherry picked from commit 182243701c9cfc2e2c9b808f5fdf9875baa1a992) --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 18927b2395b154..d36de5de0d55db 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1348,7 +1348,7 @@ static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from /* shared locks are not allowed with parallel page cache IO */ if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state)) - return false; + return true; /* Parallel dio beyond EOF is not supported, at least for now. */ if (fuse_io_past_eof(iocb, from)) From 25d0b0f3c301d48b38285bec9d913105c79bc799 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Tue, 8 Apr 2025 16:44:55 +0200 Subject: [PATCH 134/161] fuse: Increase the default max pages limit to 8182 Due to user buffer misalignent we actually need one page more, i.e. 1025 instead of 1024, will be handled differently. For now we just bump up the max. (cherry picked from commit 3f71501c9c4702ba976145ff15c4a053ecd1a3ee) (cherry picked from commit e30b59c1cbc3454cb283dc98d0bec195df82cad4) --- fs/fuse/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 045d9f6c47b499..ee8cb4854668f1 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -35,7 +35,7 @@ DEFINE_MUTEX(fuse_mutex); static int set_global_limit(const char *val, const struct kernel_param *kp); -unsigned int fuse_max_pages_limit = 256; +unsigned int fuse_max_pages_limit = 4097; unsigned max_user_bgreq; module_param_call(max_user_bgreq, set_global_limit, param_get_uint, From 72d220d8f5c4e6442046e92add81c7c00190e59d Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 8 Jan 2025 16:10:27 +0100 Subject: [PATCH 135/161] fuse: {uring} Pin the user buffer This is to allow copying into the buffer from the application without the need to copy in ring context (and with that, the need that the ring task is active in kernel space). Signed-off-by: Bernd Schubert (cherry picked from commit 43d1a63dec17d928609fb9725ac4ab9d6e09803f) (cherry picked from commit ea01f94a55f91fa48cb3a0304b1e41a92707539a) (cherry picked from commit f640a7781a055974d26254a78080c9c3140e14f7) --- fs/fuse/dev.c | 9 ++ fs/fuse/dev_uring.c | 215 ++++++++++++++++++++++++++++++++++++++---- fs/fuse/dev_uring_i.h | 4 + fs/fuse/fuse_dev_i.h | 2 + 4 files changed, 214 insertions(+), 16 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index fa84ba91873f68..f4180d3ecbf924 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -752,6 +752,15 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->pipebufs++; cs->nr_segs++; } + } else if (cs->ring.pages) { + cs->pg = cs->ring.pages[cs->ring.page_idx++]; + /* + * non stricly needed, just to avoid a uring exception in + * fuse_copy_finish + */ + get_page(cs->pg); + cs->len = PAGE_SIZE; + cs->offset = 0; } else { size_t off; err = iov_iter_get_pages2(cs->iter, &page, PAGE_SIZE, 1, &off); diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 8efc58056ee999..f1d1ee2250bbec 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -12,6 +12,7 @@ #include #include #include +#include static bool __read_mostly enable_uring; module_param(enable_uring, bool, 0644); @@ -19,6 +20,8 @@ MODULE_PARM_DESC(enable_uring, "Enable userspace communication through io-uring"); #define FUSE_URING_IOV_SEGS 2 /* header and payload */ +#define FUSE_RING_HEADER_PG 0 +#define FUSE_RING_PAYLOAD_PG 1 /* redfs only to allow patch backports */ #define IO_URING_F_TASK_DEAD (1 << 13) @@ -156,6 +159,21 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring) } } +/* + * Copy from memmap.c, should be exported + */ +static void io_pages_free(struct page ***pages, int npages) +{ + struct page **page_array = *pages; + + if (!page_array) + return; + + unpin_user_pages(page_array, npages); + kvfree(page_array); + *pages = NULL; +} + void fuse_uring_destruct(struct fuse_conn *fc) { struct fuse_ring *ring = fc->ring; @@ -196,6 +214,9 @@ void fuse_uring_destruct(struct fuse_conn *fc) list_for_each_entry_safe(ent, next, &queue->ent_released, list) { list_del_init(&ent->list); + io_pages_free(&ent->header_pages, ent->nr_header_pages); + io_pages_free(&ent->payload_pages, + ent->nr_payload_pages); kfree(ent); } @@ -657,13 +678,67 @@ static int fuse_uring_copy_from_ring(struct fuse_ring *ring, fuse_copy_init(&cs, 0, &iter); cs.is_uring = 1; cs.req = req; + if (ent->payload_pages) + cs.ring.pages = ent->payload_pages; return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); } - /* - * Copy data from the req to the ring buffer - */ +/* + * Copy data from the req to the ring buffer + * In order to be able to write into the ring buffer from the application, + * i.e. to avoid io_uring_cmd_complete_in_task(), the header needs to be + * pinned as well. + */ +static int fuse_uring_args_to_ring_pages(struct fuse_ring *ring, + struct fuse_req *req, + struct fuse_ring_ent *ent, + struct fuse_uring_req_header *headers) +{ + struct fuse_copy_state cs; + struct fuse_args *args = req->args; + struct fuse_in_arg *in_args = args->in_args; + int num_args = args->in_numargs; + int err; + + struct fuse_uring_ent_in_out ent_in_out = { + .flags = 0, + .commit_id = req->in.h.unique, + }; + + fuse_copy_init(&cs, 1, NULL); + cs.is_uring = 1; + cs.req = req; + cs.ring.pages = ent->payload_pages; + + if (num_args > 0) { + /* + * Expectation is that the first argument is the per op header. + * Some op code have that as zero size. + */ + if (args->in_args[0].size > 0) { + memcpy(&headers->op_in, in_args->value, in_args->size); + } + in_args++; + num_args--; + } + + /* copy the payload */ + err = fuse_copy_args(&cs, num_args, args->in_pages, + (struct fuse_arg *)in_args, 0); + if (err) { + pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); + return err; + } + + ent_in_out.payload_sz = cs.ring.copied_sz; + memcpy(&headers->ring_ent_in_out, &ent_in_out, sizeof(ent_in_out)); + return err; +} + +/* + * Copy data from the req to the ring buffer + */ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, struct fuse_ring_ent *ent) { @@ -687,6 +762,8 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, fuse_copy_init(&cs, 1, &iter); cs.is_uring = 1; cs.req = req; + if (ent->payload_pages) + cs.ring.pages = ent->payload_pages; if (num_args > 0) { /* @@ -726,6 +803,7 @@ static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent, struct fuse_ring_queue *queue = ent->queue; struct fuse_ring *ring = queue->ring; int err; + struct fuse_uring_req_header *headers = NULL; err = -EIO; if (WARN_ON(ent->state != FRRS_FUSE_REQ)) { @@ -738,22 +816,29 @@ static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent, if (WARN_ON(req->in.h.unique == 0)) return err; - /* copy the request */ - err = fuse_uring_args_to_ring(ring, req, ent); - if (unlikely(err)) { - pr_info_ratelimited("Copy to ring failed: %d\n", err); - return err; - } - /* copy fuse_in_header */ - err = copy_to_user(&ent->headers->in_out, &req->in.h, - sizeof(req->in.h)); - if (err) { - err = -EFAULT; - return err; + if (ent->header_pages) { + headers = kmap_local_page( + ent->header_pages[FUSE_RING_HEADER_PG]); + + memcpy(&headers->in_out, &req->in.h, sizeof(req->in.h)); + + err = fuse_uring_args_to_ring_pages(ring, req, ent, headers); + kunmap_local(headers); + } else { + /* copy the request */ + err = fuse_uring_args_to_ring(ring, req, ent); + if (unlikely(err)) { + pr_info_ratelimited("Copy to ring failed: %d\n", err); + return err; + } + err = copy_to_user(&ent->headers->in_out, &req->in.h, + sizeof(req->in.h)); + if (err) + err = -EFAULT; } - return 0; + return err; } static int fuse_uring_prepare_send(struct fuse_ring_ent *ent, @@ -1071,6 +1156,45 @@ static void fuse_uring_do_register(struct fuse_ring_ent *ent, } } +/* + * Copy from memmap.c, should be exported there + */ +static struct page **io_pin_pages(unsigned long uaddr, unsigned long len, + int *npages) +{ + unsigned long start, end, nr_pages; + struct page **pages; + int ret; + + end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = uaddr >> PAGE_SHIFT; + nr_pages = end - start; + if (WARN_ON_ONCE(!nr_pages)) + return ERR_PTR(-EINVAL); + + pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return ERR_PTR(-ENOMEM); + + ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, + pages); + /* success, mapped all pages */ + if (ret == nr_pages) { + *npages = nr_pages; + return pages; + } + + /* partial map, or didn't map anything */ + if (ret >= 0) { + /* if we did partial map, release any pages we did get */ + if (ret) + unpin_user_pages(pages, ret); + ret = -EFAULT; + } + kvfree(pages); + return ERR_PTR(ret); +} + /* * sqe->addr is a ptr to an iovec array, iov[0] has the headers, iov[1] * the payload @@ -1097,6 +1221,59 @@ static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe, return 0; } +static int fuse_uring_pin_pages(struct fuse_ring_ent *ent) +{ + struct fuse_ring *ring = ent->queue->ring; + int err; + + /* + * This needs to do locked memory accounting, for now privileged servers + * only. + */ + if (!capable(CAP_SYS_ADMIN)) + return 0; + + /* Pin header pages */ + if (!PAGE_ALIGNED(ent->headers)) { + pr_info_ratelimited("ent->headers is not page-aligned: %p\n", + ent->headers); + return -EINVAL; + } + + ent->header_pages = io_pin_pages((unsigned long)ent->headers, + sizeof(struct fuse_uring_req_header), + &ent->nr_header_pages); + if (IS_ERR(ent->header_pages)) { + err = PTR_ERR(ent->header_pages); + pr_info_ratelimited("Failed to pin header pages, err=%d\n", + err); + ent->header_pages = NULL; + return err; + } + + if (ent->nr_header_pages != 1) { + pr_info_ratelimited("Header pages not pinned as one page\n"); + io_pages_free(&ent->header_pages, ent->nr_header_pages); + ent->header_pages = NULL; + return -EINVAL; + } + + /* Pin payload pages */ + ent->payload_pages = io_pin_pages((unsigned long)ent->payload, + ring->max_payload_sz, + &ent->nr_payload_pages); + if (IS_ERR(ent->payload_pages)) { + err = PTR_ERR(ent->payload_pages); + pr_info_ratelimited("Failed to pin payload pages, err=%d\n", + err); + io_pages_free(&ent->header_pages, ent->nr_header_pages); + ent->payload_pages = NULL; + return err; + } + + return 0; +} + static struct fuse_ring_ent * fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, struct fuse_ring_queue *queue) @@ -1138,6 +1315,12 @@ fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, ent->headers = iov[0].iov_base; ent->payload = iov[1].iov_base; + err = fuse_uring_pin_pages(ent); + if (err) { + kfree(ent); + return ERR_PTR(err); + } + atomic_inc(&ring->queue_refs); return ent; } diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index d33f8f11258143..bc89bbb5c6c38a 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -40,7 +40,11 @@ enum fuse_ring_req_state { struct fuse_ring_ent { /* userspace buffer */ struct fuse_uring_req_header __user *headers; + struct page **header_pages; + int nr_header_pages; void __user *payload; + struct page **payload_pages; + int nr_payload_pages; /* the ring queue that owns the request */ struct fuse_ring_queue *queue; diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 10836e155e8785..83fb7d44d1b686 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -30,6 +30,8 @@ struct fuse_copy_state { unsigned int is_uring:1; struct { unsigned int copied_sz; /* copied size into the user buffer */ + struct page **pages; + int page_idx; } ring; }; From 7b667cb6282c97e514a51c2e657d8940c1c1c172 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 17 Jan 2025 22:06:30 +0100 Subject: [PATCH 136/161] fuse: {io-uring] Avoid complete-in-task if pinned pages are used If pinned pages are used the application can write into these pages and io_uring_cmd_complete_in_task() is not needed. Signed-off-by: Bernd Schubert (cherry picked from commit 5f0264c1dc0100e274f3db37511bba0d8043de1c) (cherry picked from commit 6594394d06450adc8d839788efa7489816033187) --- fs/fuse/dev_uring.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index f1d1ee2250bbec..6b442ba004c80a 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1488,12 +1488,31 @@ static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring) return queue; } -static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent) +static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent, bool bg) { struct io_uring_cmd *cmd = ent->cmd; - uring_cmd_set_ring_ent(cmd, ent); - io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task); + /* + * Task needed when pages are not pinned as the application doing IO + * is not allowed to write into fuse-server pages. + * Additionally for IO through io-uring as issue flags are unknown then. + * backgrounds requests might hold spin-locks, that conflict with + * io_uring_cmd_done() mutex lock. + */ + if (!ent->header_pages || current->io_uring || bg) { + uring_cmd_set_ring_ent(cmd, ent); + io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task); + } else { + int err = fuse_uring_prepare_send(ent, ent->fuse_req); + struct fuse_ring_queue *queue = ent->queue; + + if (err) { + fuse_uring_next_fuse_req(ent, queue, + IO_URING_F_UNLOCKED); + return; + } + fuse_uring_send(ent, cmd, 0, IO_URING_F_UNLOCKED); + } } /* queue a fuse request and send it if a ring entry is available */ @@ -1526,7 +1545,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) spin_unlock(&queue->lock); if (ent) - fuse_uring_dispatch_ent(ent); + fuse_uring_dispatch_ent(ent, false); return; @@ -1579,7 +1598,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) fuse_uring_add_req_to_ring_ent(ent, req); spin_unlock(&queue->lock); - fuse_uring_dispatch_ent(ent); + fuse_uring_dispatch_ent(ent, true); } else { spin_unlock(&queue->lock); } From 609c849d2d2cfebde7e02dde271b73dffae0214b Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Tue, 22 Apr 2025 22:42:38 +0200 Subject: [PATCH 137/161] fuse: Add memory barrier in fuse_uring_destruct Add an smp_rmb() before checking list states in fuse_uring_destruct() to ensure proper ordering between list modifications and emptiness checks. During connection teardown lists are checked without holding a lock, and ithout this barrier, the CPU executing fuse_uring_destruct() might see inconsistent list states, leading to false WARN_ON triggers even though the lists have been properly emptied. The smp_rmb() ensures we see the final consistent state of all lists after teardown operations complete on other CPUs. This fixes occasional false WARN_ON triggers during connection teardown. Signed-off-by: Bernd Schubert (cherry picked from commit 2a889c7f6036b324c9fbf98b7dc9748d3a355021) (cherry picked from commit eec78182ab0a84a60a397f229c4df1328294ce00) --- fs/fuse/dev_uring.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 6b442ba004c80a..d0c7e62c28e477 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -206,6 +206,9 @@ void fuse_uring_destruct(struct fuse_conn *fc) if (!queue) continue; + /* memory barrier to ensure we see the latest list state */ + smp_rmb(); + WARN_ON(!list_empty(&queue->ent_avail_queue)); WARN_ON(!list_empty(&queue->ent_w_req_queue)); WARN_ON(!list_empty(&queue->ent_commit_queue)); From a5f47205cc0ffea4b9c122a25507de2bedab5988 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 7 May 2025 23:39:00 +0200 Subject: [PATCH 138/161] fuse: Use fuser-server provided read-ahead for CAP_SYS_ADMIN readhead is currently limited to bdi->ra_pages. One can change that after the mount with something like minor=$(stat -c "%d" /path/to/fuse) echo 1024 > /sys/class/bdi/0:$(minor)/read_ahead_kb Issue is that fuse-server cannot do that from its ->init method, as it has to know about device minor, which blocks before init is complete. Fuse already sets the bdi value, but upper limit is the current bdi value. For CAP_SYS_ADMIN we can allow higher values. Signed-off-by: Bernd Schubert (cherry picked from commit 763c96da4bd6d1bb95d8e6bb7fd352389f3a17b9) (cherry picked from commit 99bc3f2c1aa7bbdbd3f46f80bed53a55877f7909) --- fs/fuse/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index ee8cb4854668f1..118d36e897f0e5 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1362,7 +1362,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->no_flock = 1; } - fm->sb->s_bdi->ra_pages = + if (CAP_SYS_ADMIN) + fm->sb->s_bdi->ra_pages = ra_pages; + else + fm->sb->s_bdi->ra_pages = min(fm->sb->s_bdi->ra_pages, ra_pages); fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; From 24cb2de4158482616f17f0df647c79e934036cf8 Mon Sep 17 00:00:00 2001 From: Guang Yuan Wu Date: Fri, 2 May 2025 04:04:21 +0000 Subject: [PATCH 139/161] fuse: fix race between concurrent setattrs from multiple nodes When mounting a user-space filesystem on multiple clients, after concurrent ->setattr() calls from different node, stale inode attributes may be cached in some node. This is caused by fuse_setattr() racing with fuse_reverse_inval_inode(). When filesystem server receives setattr request, the client node with valid iattr cached will be required to update the fuse_inode's attr_version and invalidate the cache by fuse_reverse_inval_inode(), and at the next call to ->getattr() they will be fetched from user space. The race scenario is: 1. client-1 sends setattr (iattr-1) request to server 2. client-1 receives the reply from server 3. before client-1 updates iattr-1 to the cached attributes by fuse_change_attributes_common(), server receives another setattr (iattr-2) request from client-2 4. server requests client-1 to update the inode attr_version and invalidate the cached iattr, and iattr-1 becomes staled 5. client-2 receives the reply from server, and caches iattr-2 6. continue with step 2, client-1 invokes fuse_change_attributes_common(), and caches iattr-1 The issue has been observed from concurrent of chmod, chown, or truncate, which all invoke ->setattr() call. The solution is to use fuse_inode's attr_version to check whether the attributes have been modified during the setattr request's lifetime. If so, mark the attributes as invalid in the function fuse_change_attributes_common(). Signed-off-by: Guang Yuan Wu Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi (cherry picked from commit 78dec114ee732f493693f45b9435b777baafe69a) (cherry picked from commit 1e71f3f995105cf98bf15da589e3fcb0359f6aa3) --- fs/fuse/dir.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index f6c3a7a730185b..7a78deb5afb54d 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1888,6 +1888,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, int err; bool trust_local_cmtime = is_wb; bool fault_blocked = false; + u64 attr_version; if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; @@ -1972,6 +1973,8 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if (fc->handle_killpriv_v2 && !capable(CAP_FSETID)) inarg.valid |= FATTR_KILL_SUIDGID; } + + attr_version = fuse_get_attr_version(fm->fc); fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); err = fuse_simple_request(fm, &args); if (err) { @@ -1997,6 +2000,14 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, /* FIXME: clear I_DIRTY_SYNC? */ } + if (fi->attr_version > attr_version) { + /* + * Apply attributes, for example for fsnotify_change(), but set + * attribute timeout to zero. + */ + outarg.attr_valid = outarg.attr_valid_nsec = 0; + } + fuse_change_attributes_common(inode, &outarg.attr, NULL, ATTR_TIMEOUT(&outarg), fuse_get_cache_mask(inode), 0); From 2d6cfc032ad15ba27cb49d1ed4ee8af1b7877871 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 20 Jun 2025 17:34:53 +0200 Subject: [PATCH 140/161] fuse: add DLM_LOCK opcode When having writeback cache enabled it is beneficial for data consistency to communicate to the FUSE server when the kernel prepares a page for caching. This lets the FUSE server react and lock the page. Additionally the kernel lets the FUSE server decide how much data it locks by the same call and keeps the given information in the dlm lock management. If the feature is not supported it will be disabled after first unsuccessful use. - Add DLM_LOCK fuse opcode - Add cache page lock caching for writeback cache functionality. This means sending out a FUSE call whenever the kernel prepares a page for writeback cache. The kernel will manage the cache so that it will keep track of already acquired locks. (except for the case that is documented in the code) - Use rb-trees for the management of the already 'locked' page ranges - Use rw_semaphore for synchronization in fuse_dlm_cache (cherry picked from commit 287c8840b60d5cdcf806b16e8cc5722f2dbf0738) (cherry picked from commit 3b0ae3bcb6c2edf35b7d6993e1df52a33e21a41d) --- fs/fuse/Makefile | 2 +- fs/fuse/dir.c | 6 + fs/fuse/file.c | 13 + fs/fuse/fuse_dlm_cache.c | 551 ++++++++++++++++++++++++++++++++++++++ fs/fuse/fuse_dlm_cache.h | 50 ++++ fs/fuse/fuse_i.h | 18 ++ fs/fuse/fuse_trace.h | 1 + fs/fuse/inode.c | 11 + include/uapi/linux/fuse.h | 36 +++ 9 files changed, 687 insertions(+), 1 deletion(-) create mode 100644 fs/fuse/fuse_dlm_cache.c create mode 100644 fs/fuse/fuse_dlm_cache.h diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 47e649f17f25ae..f6952d826bcb33 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -10,7 +10,7 @@ obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o -fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o +fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse_dlm_cache.o fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 7a78deb5afb54d..3121887e2e6686 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -6,6 +6,7 @@ See the file COPYING. */ +#include "fuse_dlm_cache.h" #include "fuse_i.h" #include @@ -1923,6 +1924,8 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, * truncation has already been done by OPEN. But still * need to truncate page cache. */ + if (fc->dlm && fc->writeback_cache) + fuse_dlm_cache_release_locks(fi); i_size_write(inode, 0); truncate_pagecache(inode, 0); goto out; @@ -2028,6 +2031,9 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, */ if ((is_truncate || !is_wb) && S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { + if (fc->dlm && fc->writeback_cache) + fuse_dlm_unlock_range(fi, outarg.attr.size & PAGE_MASK, -1); + truncate_pagecache(inode, outarg.attr.size); invalidate_inode_pages2(mapping); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d36de5de0d55db..0fa98adc817fcb 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "fuse_dlm_cache.h" #include #include @@ -1420,6 +1421,17 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) goto writethrough; } + /* if we have dlm support acquire the lock for the area + * we are writing into */ + if (fc->dlm) { + /* note that a file opened with O_APPEND will have relative values + * in ki_pos. This code is here for convenience and for libfuse overlay test. + * Filesystems should handle O_APPEND with 'direct io' to additionally + * get the performance benefits of 'parallel direct writes'. */ + loff_t pos = file->f_flags & O_APPEND ? i_size_read(inode) + iocb->ki_pos : iocb->ki_pos; + size_t length = iov_iter_count(from); + fuse_get_dlm_write_lock(file, pos, length); + } return generic_file_write_iter(iocb, from); } @@ -3352,6 +3364,7 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); + fuse_dlm_cache_init(fi); fi->writectr = 0; fi->iocachectr = 0; init_waitqueue_head(&fi->page_waitq); diff --git a/fs/fuse/fuse_dlm_cache.c b/fs/fuse/fuse_dlm_cache.c new file mode 100644 index 00000000000000..ea947f34a9f70a --- /dev/null +++ b/fs/fuse/fuse_dlm_cache.c @@ -0,0 +1,551 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * FUSE page lock cache implementation + */ +#include "fuse_i.h" +#include "fuse_dlm_cache.h" + +#include +#include +#include +#include + + +/* A range of pages with a lock */ +struct fuse_dlm_range { + /* Interval tree node */ + struct rb_node rb; + /* Start page offset (inclusive) */ + pgoff_t start; + /* End page offset (inclusive) */ + pgoff_t end; + /* Subtree end value for interval tree */ + pgoff_t __subtree_end; + /* Lock mode */ + enum fuse_page_lock_mode mode; + /* Temporary list entry for operations */ + struct list_head list; +}; + +/* Lock modes for FUSE page cache */ +#define FUSE_PCACHE_LK_READ 1 /* Shared read lock */ +#define FUSE_PCACHE_LK_WRITE 2 /* Exclusive write lock */ + +/* Interval tree definitions for page ranges */ +static inline pgoff_t fuse_dlm_range_start(struct fuse_dlm_range *range) +{ + return range->start; +} + +static inline pgoff_t fuse_dlm_range_last(struct fuse_dlm_range *range) +{ + return range->end; +} + +INTERVAL_TREE_DEFINE(struct fuse_dlm_range, rb, pgoff_t, __subtree_end, + fuse_dlm_range_start, fuse_dlm_range_last, static, + fuse_page_it); + +/** + * fuse_page_cache_init - Initialize a page cache lock manager + * @cache: The cache to initialize + * + * Initialize a page cache lock manager for a FUSE inode. + * + * Return: 0 on success, negative error code on failure + */ +int fuse_dlm_cache_init(struct fuse_inode *inode) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + + if (!cache) + return -EINVAL; + + init_rwsem(&cache->lock); + cache->ranges = RB_ROOT_CACHED; + + return 0; +} + +/** + * fuse_page_cache_destroy - Clean up a page cache lock manager + * @cache: The cache to clean up + * + * Release all locks and free all resources associated with the cache. + */ +void fuse_dlm_cache_release_locks(struct fuse_inode *inode) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + struct fuse_dlm_range *range; + struct rb_node *node; + + if (!cache) + return; + + /* Release all locks */ + down_write(&cache->lock); + while ((node = rb_first_cached(&cache->ranges)) != NULL) { + range = rb_entry(node, struct fuse_dlm_range, rb); + fuse_page_it_remove(range, &cache->ranges); + kfree(range); + } + up_write(&cache->lock); +} + +/** + * fuse_dlm_find_overlapping - Find a range that overlaps with [start, end] + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * + * Return: Pointer to the first overlapping range, or NULL if none found + */ +static struct fuse_dlm_range * +fuse_dlm_find_overlapping(struct fuse_dlm_cache *cache, pgoff_t start, + pgoff_t end) +{ + return fuse_page_it_iter_first(&cache->ranges, start, end); +} + +/** + * fuse_page_try_merge - Try to merge ranges within a specific region + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * + * Attempt to merge ranges within and adjacent to the specified region + * that have the same lock mode. + */ +static void fuse_dlm_try_merge(struct fuse_dlm_cache *cache, pgoff_t start, + pgoff_t end) +{ + struct fuse_dlm_range *range, *next; + struct rb_node *node; + + if (!cache) + return; + + /* Find the first range that might need merging */ + range = NULL; + node = rb_first_cached(&cache->ranges); + while (node) { + range = rb_entry(node, struct fuse_dlm_range, rb); + if (range->end >= start - 1) + break; + node = rb_next(node); + } + + if (!range || range->start > end + 1) + return; + + /* Try to merge ranges in and around the specified region */ + while (range && range->start <= end + 1) { + /* Get next range before we potentially modify the tree */ + next = NULL; + if (rb_next(&range->rb)) { + next = rb_entry(rb_next(&range->rb), + struct fuse_dlm_range, rb); + } + + /* Try to merge with next range if adjacent and same mode */ + if (next && range->mode == next->mode && + range->end + 1 == next->start) { + /* Merge ranges */ + range->end = next->end; + + /* Remove next from tree */ + fuse_page_it_remove(next, &cache->ranges); + kfree(next); + + /* Continue with the same range */ + continue; + } + + /* Move to next range */ + range = next; + } +} + +/** + * fuse_dlm_lock_range - Lock a range of pages + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * @mode: Lock mode (read or write) + * + * Add a locked range on the specified range of pages. + * If parts of the range are already locked, only add the remaining parts. + * For overlapping ranges, handle lock compatibility: + * - READ locks are compatible with existing READ locks + * - READ locks are compatible with existing WRITE locks (downgrade not needed) + * - WRITE locks need to upgrade existing READ locks + * + * Return: 0 on success, negative error code on failure + */ +int fuse_dlm_lock_range(struct fuse_inode *inode, pgoff_t start, + pgoff_t end, enum fuse_page_lock_mode mode) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + struct fuse_dlm_range *range, *new_range, *next; + int lock_mode; + int ret = 0; + LIST_HEAD(to_lock); + LIST_HEAD(to_upgrade); + pgoff_t current_start = start; + + if (!cache || start > end) + return -EINVAL; + + /* Convert to lock mode */ + lock_mode = (mode == FUSE_PAGE_LOCK_READ) ? FUSE_PCACHE_LK_READ : + FUSE_PCACHE_LK_WRITE; + + down_write(&cache->lock); + + /* Find all ranges that overlap with [start, end] */ + range = fuse_page_it_iter_first(&cache->ranges, start, end); + while (range) { + /* Get next overlapping range before we potentially modify the tree */ + next = fuse_page_it_iter_next(range, start, end); + + /* Check lock compatibility */ + if (lock_mode == FUSE_PCACHE_LK_WRITE && + lock_mode != range->mode) { + /* we own the lock but have to update it. */ + list_add_tail(&range->list, &to_upgrade); + } + /* If WRITE lock already exists - nothing to do */ + + /* If there's a gap before this range, we need to add the missing range */ + if (current_start < range->start) { + new_range = kmalloc(sizeof(*new_range), GFP_KERNEL); + if (!new_range) { + ret = -ENOMEM; + goto out_free; + } + + new_range->start = current_start; + new_range->end = range->start - 1; + new_range->mode = lock_mode; + INIT_LIST_HEAD(&new_range->list); + + list_add_tail(&new_range->list, &to_lock); + } + + /* Move current_start past this range */ + current_start = max(current_start, range->end + 1); + + /* Move to next range */ + range = next; + } + + /* If there's a gap after the last range to the end, extend the range */ + if (current_start <= end) { + new_range = kmalloc(sizeof(*new_range), GFP_KERNEL); + if (!new_range) { + ret = -ENOMEM; + goto out_free; + } + + new_range->start = current_start; + new_range->end = end; + new_range->mode = lock_mode; + INIT_LIST_HEAD(&new_range->list); + + list_add_tail(&new_range->list, &to_lock); + } + + /* update locks, if any lock is in this list it has the wrong mode */ + list_for_each_entry(range, &to_upgrade, list) { + /* Update the lock mode */ + range->mode = lock_mode; + } + + /* Add all new ranges to the tree */ + list_for_each_entry(new_range, &to_lock, list) { + /* Add to interval tree */ + fuse_page_it_insert(new_range, &cache->ranges); + } + + /* Try to merge adjacent ranges with the same mode */ + fuse_dlm_try_merge(cache, start, end); + + up_write(&cache->lock); + return 0; + +out_free: + /* Free any ranges we allocated but didn't insert */ + while (!list_empty(&to_lock)) { + new_range = + list_first_entry(&to_lock, struct fuse_dlm_range, list); + list_del(&new_range->list); + kfree(new_range); + } + + /* Restore original lock modes for any partially upgraded locks */ + list_for_each_entry(range, &to_upgrade, list) { + if (lock_mode == FUSE_PCACHE_LK_WRITE) { + /* We upgraded this lock but failed later, downgrade it back */ + range->mode = FUSE_PCACHE_LK_READ; + } + } + + up_write(&cache->lock); + return ret; +} + +/** + * fuse_dlm_punch_hole - Punch a hole in a locked range + * @cache: The page cache + * @start: Start page offset of the hole + * @end: End page offset of the hole + * + * Create a hole in a locked range by splitting it into two ranges. + * + * Return: 0 on success, negative error code on failure + */ +static int fuse_dlm_punch_hole(struct fuse_dlm_cache *cache, pgoff_t start, + pgoff_t end) +{ + struct fuse_dlm_range *range, *new_range; + int ret = 0; + + if (!cache || start > end) + return -EINVAL; + + /* Find a range that contains [start, end] */ + range = fuse_dlm_find_overlapping(cache, start, end); + if (!range) { + ret = -EINVAL; + goto out; + } + + /* If the hole is at the beginning of the range */ + if (start == range->start) { + range->start = end + 1; + goto out; + } + + /* If the hole is at the end of the range */ + if (end == range->end) { + range->end = start - 1; + goto out; + } + + /* The hole is in the middle, need to split */ + new_range = kmalloc(sizeof(*new_range), GFP_KERNEL); + if (!new_range) { + ret = -ENOMEM; + goto out; + } + + /* Copy properties from original range */ + *new_range = *range; + INIT_LIST_HEAD(&new_range->list); + + /* Adjust ranges */ + new_range->start = end + 1; + range->end = start - 1; + + /* Update interval tree */ + fuse_page_it_remove(range, &cache->ranges); + fuse_page_it_insert(range, &cache->ranges); + fuse_page_it_insert(new_range, &cache->ranges); + +out: + return ret; +} + +/** + * fuse_dlm_unlock_range - Unlock a range of pages + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * + * Release locks on the specified range of pages. + * + * Return: 0 on success, negative error code on failure + */ +int fuse_dlm_unlock_range(struct fuse_inode *inode, + pgoff_t start, pgoff_t end) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + struct fuse_dlm_range *range, *next; + int ret = 0; + + if (!cache) + return -EINVAL; + + down_write(&cache->lock); + + /* Find all ranges that overlap with [start, end] */ + range = fuse_page_it_iter_first(&cache->ranges, start, end); + while (range) { + /* Get next overlapping range before we potentially modify the tree */ + next = fuse_page_it_iter_next(range, start, end); + + /* Check if we need to punch a hole */ + if (start > range->start && end < range->end) { + /* Punch a hole in the middle */ + ret = fuse_dlm_punch_hole(cache, start, end); + if (ret) + goto out; + /* After punching a hole, we're done */ + break; + } else if (start > range->start) { + /* Adjust the end of the range */ + range->end = start - 1; + } else if (end < range->end) { + /* Adjust the start of the range */ + range->start = end + 1; + } else { + /* Complete overlap, remove the range */ + fuse_page_it_remove(range, &cache->ranges); + kfree(range); + } + + range = next; + } + +out: + up_write(&cache->lock); + return ret; +} + +/** + * fuse_dlm_range_is_locked - Check if a page range is already locked + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * @mode: Lock mode to check for (or NULL to check for any lock) + * + * Check if the specified range of pages is already locked. + * The entire range must be locked for this to return true. + * + * Return: true if the entire range is locked, false otherwise + */ +bool fuse_dlm_range_is_locked(struct fuse_inode *inode, pgoff_t start, + pgoff_t end, enum fuse_page_lock_mode mode) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + struct fuse_dlm_range *range; + int lock_mode = 0; + pgoff_t current_start = start; + + if (!cache || start > end) + return false; + + /* Convert to lock mode if specified */ + if (mode == FUSE_PAGE_LOCK_READ) + lock_mode = FUSE_PCACHE_LK_READ; + else if (mode == FUSE_PAGE_LOCK_WRITE) + lock_mode = FUSE_PCACHE_LK_WRITE; + + down_read(&cache->lock); + + /* Find the first range that overlaps with [start, end] */ + range = fuse_dlm_find_overlapping(cache, start, end); + + /* Check if the entire range is covered */ + while (range && current_start <= end) { + /* If we're checking for a specific mode, verify it matches */ + if (lock_mode && range->mode != lock_mode) { + /* Wrong lock mode */ + up_read(&cache->lock); + return false; + } + + /* Check if there's a gap before this range */ + if (current_start < range->start) { + /* Found a gap */ + up_read(&cache->lock); + return false; + } + + /* Move current_start past this range */ + current_start = range->end + 1; + + /* Get next overlapping range */ + range = fuse_page_it_iter_next(range, start, end); + } + + /* Check if we covered the entire range */ + if (current_start <= end) { + /* There's a gap at the end */ + up_read(&cache->lock); + return false; + } + + up_read(&cache->lock); + return true; +} + +/** + * request a dlm lock from the fuse server + */ +void fuse_get_dlm_write_lock(struct file *file, loff_t offset, + size_t length) +{ + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_mount *fm = ff->fm; + loff_t end = (offset + length - 1) | (PAGE_SIZE - 1); + + /* note that the offset and length don't have to be page aligned here + * but since we only get here on writeback caching we will send out + * page aligned requests */ + offset &= PAGE_MASK; + + FUSE_ARGS(args); + struct fuse_dlm_lock_in inarg; + struct fuse_dlm_lock_out outarg; + int err; + + /* note that this can be run from different processes + * at the same time. It is intentionally not protected + * since a DLM implementation in the FUSE server should take care + * of any races in lock requests */ + if (fuse_dlm_range_is_locked(fi, offset, + end, FUSE_PAGE_LOCK_WRITE)) + return; /* we already have this area locked */ + + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + + inarg.offset = offset; + inarg.size = end - offset + 1; + inarg.type = FUSE_DLM_LOCK_WRITE; + + args.opcode = FUSE_DLM_WB_LOCK; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); + if (err == -ENOSYS) { + /* fuse server does not support dlm, save the info */ + fc->dlm = 0; + return; + } + + if (outarg.locksize < end - offset + 1) { + /* fuse server is seriously broken */ + pr_warn("fuse: dlm lock request for %llu bytes returned %u bytes\n", + end - offset + 1, outarg.locksize); + fuse_abort_conn(fc); + return; + } + + if (err) + return; + else + /* ignore any errors here, there is no way we can react appropriately */ + fuse_dlm_lock_range(fi, offset, + offset + outarg.locksize - 1, + FUSE_PAGE_LOCK_WRITE); +} diff --git a/fs/fuse/fuse_dlm_cache.h b/fs/fuse/fuse_dlm_cache.h new file mode 100644 index 00000000000000..98b27a2c15d8ba --- /dev/null +++ b/fs/fuse/fuse_dlm_cache.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * FUSE page cache lock implementation + */ + +#ifndef _FS_FUSE_DLM_CACHE_H +#define _FS_FUSE_DLM_CACHE_H + +#include +#include +#include +#include + + +struct fuse_inode; + +/* Lock modes for page ranges */ +enum fuse_page_lock_mode { FUSE_PAGE_LOCK_READ, FUSE_PAGE_LOCK_WRITE }; + +/* Page cache lock manager */ +struct fuse_dlm_cache { + /* Lock protecting the tree */ + struct rw_semaphore lock; + /* Interval tree of locked ranges */ + struct rb_root_cached ranges; +}; + +/* Initialize a page cache lock manager */ +int fuse_dlm_cache_init(struct fuse_inode *inode); + +/* Clean up a page cache lock manager */ +void fuse_dlm_cache_release_locks(struct fuse_inode *inode); + +/* Lock a range of pages */ +int fuse_dlm_lock_range(struct fuse_inode *inode, pgoff_t start, + pgoff_t end, enum fuse_page_lock_mode mode); + +/* Unlock a range of pages */ +int fuse_dlm_unlock_range(struct fuse_inode *inode, pgoff_t start, + pgoff_t end); + +/* Check if a page range is already locked */ +bool fuse_dlm_range_is_locked(struct fuse_inode *inode, pgoff_t start, + pgoff_t end, enum fuse_page_lock_mode mode); + +/* this is the interface to the filesystem */ +void fuse_get_dlm_write_lock(struct file *file, loff_t offset, + size_t length); + +#endif /* _FS_FUSE_DLM_CACHE_H */ diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 0d1fd38405a603..afdeb18b944c41 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -31,6 +31,7 @@ #include #include #include +#include "fuse_dlm_cache.h" /** Default max number of pages that can be used in a single read request */ #define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 @@ -84,6 +85,17 @@ struct fuse_submount_lookup { struct fuse_forget_link *forget; }; +/** + * data structure to save the information that we have + * requested dlm locks for the given area from the fuse server +*/ +struct dlm_locked_area +{ + struct list_head list; + loff_t offset; + size_t size; +}; + /** FUSE inode */ struct fuse_inode { /** Inode data */ @@ -142,6 +154,9 @@ struct fuse_inode { /* List of writepage requestst (pending or sent) */ struct rb_root writepages; + + /* dlm locked areas we have sent lock requests for */ + struct fuse_dlm_cache dlm_locked_areas; }; /* readdir cache (directory only) */ @@ -841,6 +856,9 @@ struct fuse_conn { /* Is statx not implemented by fs? */ unsigned int no_statx:1; + /* do we have support for dlm in the fs? */ + unsigned int dlm:1; + /* Use io_uring for communication */ unsigned int io_uring; diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h index 393c630e772635..9976e31a51a9c9 100644 --- a/fs/fuse/fuse_trace.h +++ b/fs/fuse/fuse_trace.h @@ -58,6 +58,7 @@ EM( FUSE_SYNCFS, "FUSE_SYNCFS") \ EM( FUSE_TMPFILE, "FUSE_TMPFILE") \ EM( FUSE_STATX, "FUSE_STATX") \ + EM( FUSE_DLM_WB_LOCK, "FUSE_DLM_WB_LOCK") \ EMe(CUSE_INIT, "CUSE_INIT") /* diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 118d36e897f0e5..0e58ab24d93e9a 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "fuse_dlm_cache.h" #include "dev_uring_i.h" #include @@ -183,6 +184,7 @@ static void fuse_evict_inode(struct inode *inode) if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) { WARN_ON(!list_empty(&fi->write_files)); WARN_ON(!list_empty(&fi->queued_writes)); + fuse_dlm_cache_release_locks(fi); } } @@ -564,6 +566,14 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, pg_end = -1; else pg_end = (offset + len - 1) >> PAGE_SHIFT; + + if (fc->dlm && fc->writeback_cache) + /* invalidate the range from the beginning of the first page + * in the given range to the last byte of the last page */ + fuse_dlm_unlock_range(fi, + pg_start << PAGE_SHIFT, + (pg_end << PAGE_SHIFT) | (PAGE_SIZE - 1)); + invalidate_inode_pages2_range(inode->i_mapping, pg_start, pg_end); } @@ -962,6 +972,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->blocked = 0; fc->initialized = 0; fc->connected = 1; + fc->dlm = 1; atomic64_set(&fc->attr_version, 1); atomic64_set(&fc->evict_ctr, 1); get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 0a000baedb4a2c..cccbdae76f8337 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -632,6 +632,7 @@ enum fuse_opcode { FUSE_SYNCFS = 50, FUSE_TMPFILE = 51, FUSE_STATX = 52, + FUSE_DLM_WB_LOCK = 53, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -1166,6 +1167,41 @@ struct fuse_supp_groups { uint32_t groups[]; }; +/** + * Type of the dlm lock requested + */ +enum fuse_dlm_lock_type { + FUSE_DLM_LOCK_NONE = 0, + FUSE_DLM_LOCK_READ = 1, + FUSE_DLM_LOCK_WRITE = 2 +}; + +/** + * struct fuse_dlm_lock_in - Lock request + * @fh: file handle + * @offset: offset into the file + * @size: size of the locked region + * @type: type of lock + */ +struct fuse_dlm_lock_in { + uint64_t fh; + uint64_t offset; + uint32_t size; + uint32_t type; + uint64_t reserved; +}; + +/** + * struct fuse_dlm_lock_out - Lock response + * @locksize: how many bytes where locked by the call + * (most of the time we want to lock more than is requested + * to reduce number of calls) + */ +struct fuse_dlm_lock_out { + uint32_t locksize; + uint32_t padding; +}; + /** * Size of the ring buffer header */ From 246acd252079937f7a7f49b1a9116edbd1c2388c Mon Sep 17 00:00:00 2001 From: Yong Ze Chen Date: Tue, 8 Jul 2025 06:41:45 +0000 Subject: [PATCH 141/161] fuse: invalidate inode aliases when doing inode invalidation Add support to invalidate inode aliases when doing inode invalidation. This is useful for distributed file systems, which use DLM for cache coherency. So, when a client losts its inode lock, it should invalidate its inode cache and dentry cache since the other client may delete this file after getting inode lock. Signed-off-by: Yong Ze Chen (cherry picked from commit 49720b5c84ada61feeb09da9ad4b9a0a40694792) (cherry picked from commit d126533d27bd5165fc325e30059a4f6e581c820b) --- fs/fuse/fuse_i.h | 6 +++++ fs/fuse/inode.c | 50 ++++++++++++++++++++++++++++++++++++++- include/uapi/linux/fuse.h | 4 ++++ 3 files changed, 59 insertions(+), 1 deletion(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index afdeb18b944c41..e0e5e4bbb92a6e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -722,6 +722,12 @@ struct fuse_conn { */ unsigned handle_killpriv_v2:1; + /* invalidate inode entries when doing inode invalidation */ + unsigned inval_inode_entries:1; + + /* expire inode entries when doing inode invalidation */ + unsigned expire_inode_entries:1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 0e58ab24d93e9a..e5b7e9d791a15e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -541,6 +541,45 @@ struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid, return NULL; } +static void fuse_prune_aliases(struct inode *inode) +{ + struct dentry *dentry; + + spin_lock(&inode->i_lock); + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { + fuse_invalidate_entry_cache(dentry); + } + spin_unlock(&inode->i_lock); + + d_prune_aliases(inode); +} + +static void fuse_invalidate_inode_entry(struct inode *inode) +{ + struct dentry *dentry; + + if (S_ISDIR(inode->i_mode)) { + /* For directories, use d_invalidate to handle children and submounts */ + dentry = d_find_alias(inode); + if (dentry) { + d_invalidate(dentry); + fuse_invalidate_entry_cache(dentry); + dput(dentry); + } + } else { + /* For regular files, just unhash the dentry */ + spin_lock(&inode->i_lock); + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { + spin_lock(&dentry->d_lock); + if (!d_unhashed(dentry)) + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + fuse_invalidate_entry_cache(dentry); + } + spin_unlock(&inode->i_lock); + } +} + int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, loff_t offset, loff_t len) { @@ -558,6 +597,11 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, fi->attr_version = atomic64_inc_return(&fc->attr_version); spin_unlock(&fi->lock); + if (fc->inval_inode_entries) + fuse_invalidate_inode_entry(inode); + else if (fc->expire_inode_entries) + fuse_prune_aliases(inode); + fuse_invalidate_attr(inode); forget_all_cached_acls(inode); if (offset >= 0) { @@ -1367,6 +1411,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->io_uring = 1; if (flags & FUSE_NO_EXPORT_SUPPORT) fm->sb->s_export_op = &fuse_export_fid_operations; + if (flags & FUSE_INVAL_INODE_ENTRY) + fc->inval_inode_entries = 1; + if (flags & FUSE_EXPIRE_INODE_ENTRY) + fc->expire_inode_entries = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1417,7 +1465,7 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP | - FUSE_NO_EXPORT_SUPPORT; + FUSE_NO_EXPORT_SUPPORT | FUSE_INVAL_INODE_ENTRY | FUSE_EXPIRE_INODE_ENTRY; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index cccbdae76f8337..682abac3c9e942 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -421,6 +421,8 @@ struct fuse_file_lock { * FUSE_DIRECT_IO_ALLOW_MMAP: allow shared mmap in FOPEN_DIRECT_IO mode. * FUSE_NO_EXPORT_SUPPORT: explicitly disable export support * FUSE_OVER_IO_URING: Indicate that client supports io-uring + * FUSE_INVAL_INODE_ENTRY: invalidate inode aliases when doing inode invalidation + * FUSE_EXPIRE_INODE_ENTRY: expire inode aliases when doing inode invalidation */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -465,6 +467,8 @@ struct fuse_file_lock { /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ #define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP #define FUSE_OVER_IO_URING (1ULL << 41) +#define FUSE_INVAL_INODE_ENTRY (1ULL << 60) +#define FUSE_EXPIRE_INODE_ENTRY (1ULL << 61) /** * CUSE INIT request/reply flags From 84d8ba65513e795c0ff86ab5d3981059038d8520 Mon Sep 17 00:00:00 2001 From: Cheng Ding Date: Thu, 17 Jul 2025 17:04:16 +0000 Subject: [PATCH 142/161] fuse: Renumber FUSE_DLM_WB_LOCK to 100 Renumber the operation code to a high value to avoid conflicts with upstream. (cherry picked from commit 27a0e9ea714f7fcf3ee40f977be6a17c10766509) (cherry picked from commit 9cd8b25e2eea6a8d549a1a4e70d4b43618c0fbdb) --- fs/fuse/fuse_trace.h | 2 +- include/uapi/linux/fuse.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h index 9976e31a51a9c9..e81c93b9614627 100644 --- a/fs/fuse/fuse_trace.h +++ b/fs/fuse/fuse_trace.h @@ -58,7 +58,7 @@ EM( FUSE_SYNCFS, "FUSE_SYNCFS") \ EM( FUSE_TMPFILE, "FUSE_TMPFILE") \ EM( FUSE_STATX, "FUSE_STATX") \ - EM( FUSE_DLM_WB_LOCK, "FUSE_DLM_WB_LOCK") \ + EM( FUSE_DLM_WB_LOCK, "FUSE_DLM_WB_LOCK") \ EMe(CUSE_INIT, "CUSE_INIT") /* diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 682abac3c9e942..9b43b7ed829c45 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -636,7 +636,9 @@ enum fuse_opcode { FUSE_SYNCFS = 50, FUSE_TMPFILE = 51, FUSE_STATX = 52, - FUSE_DLM_WB_LOCK = 53, + + /* Operations which have not been merged into upstream */ + FUSE_DLM_WB_LOCK = 100, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -1177,7 +1179,7 @@ struct fuse_supp_groups { enum fuse_dlm_lock_type { FUSE_DLM_LOCK_NONE = 0, FUSE_DLM_LOCK_READ = 1, - FUSE_DLM_LOCK_WRITE = 2 + FUSE_DLM_LOCK_WRITE = 2, }; /** From 6300f58ccca21edc5f179aa960ec1251a7596629 Mon Sep 17 00:00:00 2001 From: Cheng Ding Date: Wed, 16 Jul 2025 03:18:06 +0000 Subject: [PATCH 143/161] fuse: Send DLM_WB_LOCK request in page_mkwrite handler Send a DLM_WB_LOCK request in the page_mkwrite handler to enable FUSE filesystems to acquire a distributed lock manager (DLM) lock for protecting upcoming dirty pages when a previously read-only mapped page is about to be written. Signed-off-by: Cheng Ding (cherry picked from commit ec36c455214837e9ce0d3f3385a0bb50dcfb51db) (cherry picked from commit 360f6527567f3de94e59276b59904437fee397f3) --- fs/fuse/file.c | 64 ++++++++++++++++++++++++++++++++++++++- include/uapi/linux/fuse.h | 1 + 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 0fa98adc817fcb..8ed33867a4ab54 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2522,6 +2522,57 @@ static void fuse_vma_close(struct vm_area_struct *vma) mapping_set_error(vma->vm_file->f_mapping, err); } +/** + * Request a DLM lock from the FUSE server. + * + * This routine is similar to fuse_get_dlm_write_lock(), but it + * does not cache the DLM lock in the kernel. + */ +static int fuse_get_page_mkwrite_lock(struct file *file, loff_t offset, size_t length) +{ + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = ff->fm; + + FUSE_ARGS(args); + struct fuse_dlm_lock_in inarg; + struct fuse_dlm_lock_out outarg; + int err; + + if (WARN_ON_ONCE((offset & ~PAGE_MASK) || (length & ~PAGE_MASK))) + return -EIO; + + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + + inarg.offset = offset; + inarg.size = length; + inarg.type = FUSE_DLM_PAGE_MKWRITE; + + args.opcode = FUSE_DLM_WB_LOCK; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); + if (err == -ENOSYS) { + fc->dlm = 0; + err = 0; + } + + if (!err && outarg.locksize < length) { + /* fuse server is seriously broken */ + pr_warn("fuse: dlm lock request for %lu bytes returned %u bytes\n", + length, outarg.locksize); + fuse_abort_conn(fc); + err = -EINVAL; + } + return err; +} /* * Wait for writeback against this page to complete before allowing it * to be marked dirty again, and hence written back again, possibly @@ -2540,7 +2591,18 @@ static void fuse_vma_close(struct vm_area_struct *vma) static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vmf->vma->vm_file); + struct file *file = vmf->vma->vm_file; + struct inode *inode = file_inode(file); + struct fuse_mount *fm = get_fuse_mount(inode); + + if (fm->fc->dlm) { + loff_t pos = vmf->pgoff << PAGE_SHIFT; + size_t length = PAGE_SIZE; + int err = fuse_get_page_mkwrite_lock(file, pos, length); + if (err < 0) { + return vmf_error(err); + } + } file_update_time(vmf->vma->vm_file); lock_page(page); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 9b43b7ed829c45..8c8c13c0e0543a 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -1180,6 +1180,7 @@ enum fuse_dlm_lock_type { FUSE_DLM_LOCK_NONE = 0, FUSE_DLM_LOCK_READ = 1, FUSE_DLM_LOCK_WRITE = 2, + FUSE_DLM_PAGE_MKWRITE = 3, }; /** From 296dd1004d087f0fa6146ca32151fb8f71b691c4 Mon Sep 17 00:00:00 2001 From: Cheng Ding Date: Wed, 16 Jul 2025 03:20:08 +0000 Subject: [PATCH 144/161] fuse: Allow read_folio to retry page fault and read operations Allow read_folio to return EAGAIN error and translate it to AOP_TRUNCATE_PAGE to retry page fault and read operations. This is used to prevent deadlock of folio lock/DLM lock order reversal: - Fault or read operations acquire folio lock first, then DLM lock. - FUSE daemon blocks new DLM lock acquisition while it invalidating page cache. invalidate_inode_pages2_range() acquires folio lock To prevent deadlock, the FUSE daemon will fail its DLM lock acquisition with EAGAIN if it detects an in-flight page cache invalidating operation. Signed-off-by: Cheng Ding (cherry picked from commit 8ecf1182053891c6458b10be1272d2d562492fbd) (cherry picked from commit 02eecf926a8e520597d5396e7829e0a7fabb6bcf) --- fs/fuse/file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8ed33867a4ab54..77ef4324188cb4 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -879,8 +879,11 @@ static int fuse_do_readpage(struct file *file, struct page *page) fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ); res = fuse_simple_request(fm, &ia.ap.args); - if (res < 0) + if (res < 0) { + if (res == -EAGAIN) + res = AOP_TRUNCATED_PAGE; return res; + } /* * Short read means EOF. If file size is larger, truncate it */ From 36e496ef7db0c83e771acd0306d576c796f096a4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 17 Jul 2025 16:26:51 -0700 Subject: [PATCH 145/161] fuse: flush pending fuse events before aborting the connection generic/488 fails with fuse2fs in the following fashion: generic/488 _check_generic_filesystem: filesystem on /dev/sdf is inconsistent (see /var/tmp/fstests/generic/488.full for details) This test opens a large number of files, unlinks them (which really just renames them to fuse hidden files), closes the program, unmounts the filesystem, and runs fsck to check that there aren't any inconsistencies in the filesystem. Unfortunately, the 488.full file shows that there are a lot of hidden files left over in the filesystem, with incorrect link counts. Tracing fuse_request_* shows that there are a large number of FUSE_RELEASE commands that are queued up on behalf of the unlinked files at the time that fuse_conn_destroy calls fuse_abort_conn. Had the connection not aborted, the fuse server would have responded to the RELEASE commands by removing the hidden files; instead they stick around. Create a function to push all the background requests to the queue and then wait for the number of pending events to hit zero, and call this before fuse_abort_conn. That way, all the pending events are processed by the fuse server and we don't end up with a corrupt filesystem. Signed-off-by: Darrick J. Wong (cherry picked from commit d4262f9cf5232394d518207863d1ad79f52b179e) (cherry picked from commit 14e0495599b713bb053b6b0a8bb24b11d57e46f5) --- fs/fuse/dev.c | 38 ++++++++++++++++++++++++++++++++++++++ fs/fuse/fuse_i.h | 6 ++++++ fs/fuse/inode.c | 1 + 3 files changed, 45 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index f4180d3ecbf924..7ddb08b96d2a25 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -23,6 +23,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include "fuse_trace.h" @@ -2147,6 +2148,43 @@ static void end_polls(struct fuse_conn *fc) } } +/* + * Flush all pending requests and wait for them. Only call this function when + * it is no longer possible for other threads to add requests. + */ +void fuse_flush_requests(struct fuse_conn *fc, unsigned long timeout) +{ + unsigned long deadline; + + spin_lock(&fc->lock); + if (!fc->connected) { + spin_unlock(&fc->lock); + return; + } + + /* Push all the background requests to the queue. */ + spin_lock(&fc->bg_lock); + fc->blocked = 0; + fc->max_background = UINT_MAX; + flush_bg_queue(fc); + spin_unlock(&fc->bg_lock); + spin_unlock(&fc->lock); + + /* + * Wait 30s for all the events to complete or abort. Touch the + * watchdog once per second so that we don't trip the hangcheck timer + * while waiting for the fuse server. + */ + deadline = jiffies + timeout; + smp_mb(); + while (fc->connected && + (!timeout || time_before(jiffies, deadline)) && + wait_event_timeout(fc->blocked_waitq, + !fc->connected || atomic_read(&fc->num_waiting) == 0, + HZ) == 0) + touch_softlockup_watchdog(); +} + /* * Abort all requests. * diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e0e5e4bbb92a6e..df6200d6e160da 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1202,6 +1202,12 @@ void fuse_request_end(struct fuse_req *req); void fuse_abort_conn(struct fuse_conn *fc); void fuse_wait_aborted(struct fuse_conn *fc); +/** + * Flush all pending requests and wait for them. Takes an optional timeout + * in jiffies. + */ +void fuse_flush_requests(struct fuse_conn *fc, unsigned long timeout); + /** * Invalidate inode attributes */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index e5b7e9d791a15e..9824fd99abc14b 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -2019,6 +2019,7 @@ void fuse_conn_destroy(struct fuse_mount *fm) { struct fuse_conn *fc = fm->fc; + fuse_flush_requests(fc, 30 * HZ); if (fc->destroy) fuse_send_destroy(fm); From 672f588ad0d53f054d8e23be3cbdb8e3e3dc3d8d Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 18 Jul 2025 17:24:42 +0200 Subject: [PATCH 146/161] fuse: Refactor io-uring bg queue flush and queue abort This is a preparation to allow fuse-io-uring bg queue flush from flush_bg_queue() This does two function renames: fuse_uring_flush_bg -> fuse_uring_flush_queue_bg fuse_uring_abort_end_requests -> fuse_uring_flush_bg And fuse_uring_abort_end_queue_requests() is moved to fuse_uring_stop_queues(). Signed-off-by: Bernd Schubert (cherry picked from commit e70ef24251116bc7f591a9a856c371549cd5ae77) (cherry picked from commit cd90b4d97ac77eac189a6a787486fa77d4081c56) --- fs/fuse/dev_uring.c | 14 +++++++------- fs/fuse/dev_uring_i.h | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index d0c7e62c28e477..1decfa69ca0876 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -66,7 +66,7 @@ static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd) return pdu->ent; } -static void fuse_uring_flush_bg(struct fuse_ring_queue *queue) +static void fuse_uring_flush_queue_bg(struct fuse_ring_queue *queue) { struct fuse_ring *ring = queue->ring; struct fuse_conn *fc = ring->fc; @@ -107,7 +107,7 @@ static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, if (test_bit(FR_BACKGROUND, &req->flags)) { queue->active_background--; spin_lock(&fc->bg_lock); - fuse_uring_flush_bg(queue); + fuse_uring_flush_queue_bg(queue); spin_unlock(&fc->bg_lock); } @@ -136,11 +136,11 @@ static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue) fuse_dev_end_requests(&req_list); } -void fuse_uring_abort_end_requests(struct fuse_ring *ring) +void fuse_uring_flush_bg(struct fuse_conn *fc) { int qid; struct fuse_ring_queue *queue; - struct fuse_conn *fc = ring->fc; + struct fuse_ring *ring = fc->ring; for (qid = 0; qid < ring->nr_queues; qid++) { queue = READ_ONCE(ring->queues[qid]); @@ -152,10 +152,9 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring) WARN_ON_ONCE(ring->fc->max_background != UINT_MAX); spin_lock(&queue->lock); spin_lock(&fc->bg_lock); - fuse_uring_flush_bg(queue); + fuse_uring_flush_queue_bg(queue); spin_unlock(&fc->bg_lock); spin_unlock(&queue->lock); - fuse_uring_abort_end_queue_requests(queue); } } @@ -545,6 +544,7 @@ void fuse_uring_stop_queues(struct fuse_ring *ring) if (!queue) continue; + fuse_uring_abort_end_queue_requests(queue); fuse_uring_teardown_entries(queue); } @@ -1587,7 +1587,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; - fuse_uring_flush_bg(queue); + fuse_uring_flush_queue_bg(queue); spin_unlock(&fc->bg_lock); /* diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index bc89bbb5c6c38a..d7d07b4332b9da 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -145,7 +145,7 @@ struct fuse_ring { bool fuse_uring_enabled(void); void fuse_uring_destruct(struct fuse_conn *fc); void fuse_uring_stop_queues(struct fuse_ring *ring); -void fuse_uring_abort_end_requests(struct fuse_ring *ring); +void fuse_uring_flush_bg(struct fuse_conn *fc); int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req); bool fuse_uring_queue_bq_req(struct fuse_req *req); @@ -159,7 +159,7 @@ static inline void fuse_uring_abort(struct fuse_conn *fc) return; if (atomic_read(&ring->queue_refs) > 0) { - fuse_uring_abort_end_requests(ring); + fuse_uring_flush_bg(fc); fuse_uring_stop_queues(ring); } } From d5de605dda52ec589625aa8765ee88078cc9f567 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 18 Jul 2025 18:24:41 +0200 Subject: [PATCH 147/161] fuse: Flush the io-uring bg queue from fuse_uring_flush_bg This is useful to have a unique API to flush background requests. For example when the bg queue gets flushed before the remaining of fuse_conn_destroy(). Signed-off-by: Bernd Schubert (cherry picked from commit fc4120cc58e7fbcb541bf2e9a72781b569561912) (cherry picked from commit 487773b3286e429956e05dbce66dc18c367eb7a0) --- fs/fuse/dev.c | 2 ++ fs/fuse/dev_uring.c | 3 +++ fs/fuse/dev_uring_i.h | 9 +++++++++ 3 files changed, 14 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 7ddb08b96d2a25..9add15a87dd2c1 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2170,6 +2170,8 @@ void fuse_flush_requests(struct fuse_conn *fc, unsigned long timeout) spin_unlock(&fc->bg_lock); spin_unlock(&fc->lock); + fuse_uring_flush_bg(fc); + /* * Wait 30s for all the events to complete or abort. Touch the * watchdog once per second so that we don't trip the hangcheck timer diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 1decfa69ca0876..44bd501439d666 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -142,6 +142,9 @@ void fuse_uring_flush_bg(struct fuse_conn *fc) struct fuse_ring_queue *queue; struct fuse_ring *ring = fc->ring; + if (!ring) + return; + for (qid = 0; qid < ring->nr_queues; qid++) { queue = READ_ONCE(ring->queues[qid]); if (!queue) diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index d7d07b4332b9da..9c1c9a7506e76f 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -207,6 +207,15 @@ static inline bool fuse_uring_remove_pending_req(struct fuse_req *req) return false; } +static inline bool fuse_uring_request_expired(struct fuse_conn *fc) +{ + return false; +} + +static inline void fuse_uring_flush_bg(struct fuse_conn *fc) +{ +} + #endif /* CONFIG_FUSE_IO_URING */ #endif /* _FS_FUSE_DEV_URING_I_H */ From 9c0d41edffef163fb1f896f4a525173f16dab235 Mon Sep 17 00:00:00 2001 From: Horst Birthelmer Date: Mon, 21 Jul 2025 15:54:09 +0200 Subject: [PATCH 148/161] fuse: fix unnecessary connection abort in dlm lock acquiring When calling the fuse server with a dlm request and the fuse server responds with some other error than ENOSYS most likely the lock size will be set to zero. In that case the kernel will abort the fuse connection. This is completely unnecessary. Signed-off-by: Horst Birthelmer (cherry picked from commit 0bc2f9c39c52ad11a1753e5be376c424b06f43db) (cherry picked from commit bccb16970ec3f204a54a272950e4f6d40e302ec7) --- fs/fuse/fuse_dlm_cache.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/fuse/fuse_dlm_cache.c b/fs/fuse/fuse_dlm_cache.c index ea947f34a9f70a..a9cad2c1bd2174 100644 --- a/fs/fuse/fuse_dlm_cache.c +++ b/fs/fuse/fuse_dlm_cache.c @@ -533,19 +533,19 @@ void fuse_get_dlm_write_lock(struct file *file, loff_t offset, return; } - if (outarg.locksize < end - offset + 1) { - /* fuse server is seriously broken */ - pr_warn("fuse: dlm lock request for %llu bytes returned %u bytes\n", - end - offset + 1, outarg.locksize); - fuse_abort_conn(fc); - return; - } - if (err) return; else - /* ignore any errors here, there is no way we can react appropriately */ - fuse_dlm_lock_range(fi, offset, + if (outarg.locksize < end - offset + 1) { + /* fuse server is seriously broken */ + pr_warn("fuse: dlm lock request for %llu bytes returned %u bytes\n", + end - offset + 1, outarg.locksize); + fuse_abort_conn(fc); + return; + } else { + /* ignore any errors here, there is no way we can react appropriately */ + fuse_dlm_lock_range(fi, offset, offset + outarg.locksize - 1, FUSE_PAGE_LOCK_WRITE); + } } From 4e6fc61e8afa352c88ae05e11c76aef893a2a06f Mon Sep 17 00:00:00 2001 From: Horst Birthelmer Date: Mon, 21 Jul 2025 18:15:55 +0200 Subject: [PATCH 149/161] fuse: fix connection abort on mmap when fuse server returns ENOSYS Check whether dlm is still enabled when interpreting the returned error from fuse server. Signed-off-by: Horst Birthelmer (cherry picked from commit f6fbf7c7bfb976ae2a30b4d699770a13e699ff04) (cherry picked from commit 3898d27d06b3f67960d3afc5f02a43407beca9b3) --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 77ef4324188cb4..9aab482d524b63 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2567,7 +2567,7 @@ static int fuse_get_page_mkwrite_lock(struct file *file, loff_t offset, size_t l err = 0; } - if (!err && outarg.locksize < length) { + if (!err && fc->dlm && outarg.locksize < length) { /* fuse server is seriously broken */ pr_warn("fuse: dlm lock request for %lu bytes returned %u bytes\n", length, outarg.locksize); From 9f1af20abe3e3a0dee06d65b86f222175c1b538d Mon Sep 17 00:00:00 2001 From: Cheng Ding Date: Wed, 24 Sep 2025 08:12:17 +0000 Subject: [PATCH 150/161] fuse: fix memory leak in fuse-over-io-uring argument copies Fix reference count leak of payload pages during fuse argument copies. Signed-off-by: Cheng Ding (cherry picked from commit 2436409c37b108a8fb11d1568040217169871213) --- fs/fuse/dev.c | 2 +- fs/fuse/dev_uring.c | 13 ++++++++++--- fs/fuse/fuse_dev_i.h | 1 + 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 9add15a87dd2c1..8c27a8827b1d16 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -687,7 +687,7 @@ void fuse_copy_init(struct fuse_copy_state *cs, int write, } /* Unmap and put previous page of userspace buffer */ -static void fuse_copy_finish(struct fuse_copy_state *cs) +void fuse_copy_finish(struct fuse_copy_state *cs) { if (cs->currbuf) { struct pipe_buffer *buf = cs->currbuf; diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 44bd501439d666..e3215de2af3184 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -687,7 +687,9 @@ static int fuse_uring_copy_from_ring(struct fuse_ring *ring, if (ent->payload_pages) cs.ring.pages = ent->payload_pages; - return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); + err = fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); + fuse_copy_finish(&cs); + return err; } /* @@ -734,11 +736,14 @@ static int fuse_uring_args_to_ring_pages(struct fuse_ring *ring, (struct fuse_arg *)in_args, 0); if (err) { pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); - return err; + goto copy_finish; } ent_in_out.payload_sz = cs.ring.copied_sz; memcpy(&headers->ring_ent_in_out, &ent_in_out, sizeof(ent_in_out)); + +copy_finish: + fuse_copy_finish(&cs); return err; } @@ -794,12 +799,14 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, (struct fuse_arg *)in_args, 0); if (err) { pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); - return err; + goto copy_finish; } ent_in_out.payload_sz = cs.ring.copied_sz; err = copy_to_user(&ent->headers->ring_ent_in_out, &ent_in_out, sizeof(ent_in_out)); +copy_finish: + fuse_copy_finish(&cs); return err ? -EFAULT : 0; } diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 83fb7d44d1b686..c95bea4bec4715 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -51,6 +51,7 @@ void fuse_dev_end_requests(struct list_head *head); void fuse_copy_init(struct fuse_copy_state *cs, int write, struct iov_iter *iter); +void fuse_copy_finish(struct fuse_copy_state *cs); int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs, unsigned int argpages, struct fuse_arg *args, int zeroing); From f239b75a1d41247f182c0ac534699de05e4309b8 Mon Sep 17 00:00:00 2001 From: Horst Birthelmer Date: Wed, 20 Aug 2025 16:56:43 +0200 Subject: [PATCH 151/161] fuse: change FUSE DLM_LOCK to request start and end of area - Increase the possible lock size to 64 bit. - change semantics of DLM locks to request start and end - change semantics of DLM request return to mark start and end of the locked area - better prepare dlm lock range cache rb-tree for unaligned byte range locks which could return any value as long as it is larger than the range requested - add the case where start and end are zero to destroy the cache Signed-off-by: Horst Birthelmer (cherry picked from commit 3782c4108b83a8bf4693dbe67edcf3dbe95dcc4b) --- fs/fuse/file.c | 13 +++++--- fs/fuse/fuse_dlm_cache.c | 67 +++++++++++++++++++++------------------ fs/fuse/fuse_dlm_cache.h | 12 +++---- fs/fuse/inode.c | 11 ++++--- include/uapi/linux/fuse.h | 11 ++++--- 5 files changed, 64 insertions(+), 50 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 9aab482d524b63..1ae35605e629d0 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2549,8 +2549,8 @@ static int fuse_get_page_mkwrite_lock(struct file *file, loff_t offset, size_t l memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; - inarg.offset = offset; - inarg.size = length; + inarg.start = offset; + inarg.end = offset + length - 1; inarg.type = FUSE_DLM_PAGE_MKWRITE; args.opcode = FUSE_DLM_WB_LOCK; @@ -2567,10 +2567,13 @@ static int fuse_get_page_mkwrite_lock(struct file *file, loff_t offset, size_t l err = 0; } - if (!err && fc->dlm && outarg.locksize < length) { + if (!err && + fc->dlm && + (outarg.start > inarg.start || + outarg.end < inarg.end)) { /* fuse server is seriously broken */ - pr_warn("fuse: dlm lock request for %lu bytes returned %u bytes\n", - length, outarg.locksize); + pr_warn("fuse: dlm lock request for %llu:%llu bytes returned %llu:%llu bytes\n", + inarg.start, inarg.end, outarg.start, outarg.end); fuse_abort_conn(fc); err = -EINVAL; } diff --git a/fs/fuse/fuse_dlm_cache.c b/fs/fuse/fuse_dlm_cache.c index a9cad2c1bd2174..d765dd8018cc6a 100644 --- a/fs/fuse/fuse_dlm_cache.c +++ b/fs/fuse/fuse_dlm_cache.c @@ -16,11 +16,11 @@ struct fuse_dlm_range { /* Interval tree node */ struct rb_node rb; /* Start page offset (inclusive) */ - pgoff_t start; + uint64_t start; /* End page offset (inclusive) */ - pgoff_t end; + uint64_t end; /* Subtree end value for interval tree */ - pgoff_t __subtree_end; + uint64_t __subtree_end; /* Lock mode */ enum fuse_page_lock_mode mode; /* Temporary list entry for operations */ @@ -32,19 +32,19 @@ struct fuse_dlm_range { #define FUSE_PCACHE_LK_WRITE 2 /* Exclusive write lock */ /* Interval tree definitions for page ranges */ -static inline pgoff_t fuse_dlm_range_start(struct fuse_dlm_range *range) +static inline uint64_t fuse_dlm_range_start(struct fuse_dlm_range *range) { return range->start; } -static inline pgoff_t fuse_dlm_range_last(struct fuse_dlm_range *range) +static inline uint64_t fuse_dlm_range_last(struct fuse_dlm_range *range) { return range->end; } -INTERVAL_TREE_DEFINE(struct fuse_dlm_range, rb, pgoff_t, __subtree_end, - fuse_dlm_range_start, fuse_dlm_range_last, static, - fuse_page_it); +INTERVAL_TREE_DEFINE(struct fuse_dlm_range, rb, uint64_t, __subtree_end, + fuse_dlm_range_start, fuse_dlm_range_last, static, + fuse_page_it); /** * fuse_page_cache_init - Initialize a page cache lock manager @@ -101,8 +101,8 @@ void fuse_dlm_cache_release_locks(struct fuse_inode *inode) * Return: Pointer to the first overlapping range, or NULL if none found */ static struct fuse_dlm_range * -fuse_dlm_find_overlapping(struct fuse_dlm_cache *cache, pgoff_t start, - pgoff_t end) +fuse_dlm_find_overlapping(struct fuse_dlm_cache *cache, uint64_t start, + uint64_t end) { return fuse_page_it_iter_first(&cache->ranges, start, end); } @@ -116,8 +116,8 @@ fuse_dlm_find_overlapping(struct fuse_dlm_cache *cache, pgoff_t start, * Attempt to merge ranges within and adjacent to the specified region * that have the same lock mode. */ -static void fuse_dlm_try_merge(struct fuse_dlm_cache *cache, pgoff_t start, - pgoff_t end) +static void fuse_dlm_try_merge(struct fuse_dlm_cache *cache, uint64_t start, + uint64_t end) { struct fuse_dlm_range *range, *next; struct rb_node *node; @@ -182,8 +182,8 @@ static void fuse_dlm_try_merge(struct fuse_dlm_cache *cache, pgoff_t start, * * Return: 0 on success, negative error code on failure */ -int fuse_dlm_lock_range(struct fuse_inode *inode, pgoff_t start, - pgoff_t end, enum fuse_page_lock_mode mode) +int fuse_dlm_lock_range(struct fuse_inode *inode, uint64_t start, + uint64_t end, enum fuse_page_lock_mode mode) { struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; struct fuse_dlm_range *range, *new_range, *next; @@ -191,7 +191,7 @@ int fuse_dlm_lock_range(struct fuse_inode *inode, pgoff_t start, int ret = 0; LIST_HEAD(to_lock); LIST_HEAD(to_upgrade); - pgoff_t current_start = start; + uint64_t current_start = start; if (!cache || start > end) return -EINVAL; @@ -304,8 +304,8 @@ int fuse_dlm_lock_range(struct fuse_inode *inode, pgoff_t start, * * Return: 0 on success, negative error code on failure */ -static int fuse_dlm_punch_hole(struct fuse_dlm_cache *cache, pgoff_t start, - pgoff_t end) +static int fuse_dlm_punch_hole(struct fuse_dlm_cache *cache, uint64_t start, + uint64_t end) { struct fuse_dlm_range *range, *new_range; int ret = 0; @@ -363,11 +363,12 @@ static int fuse_dlm_punch_hole(struct fuse_dlm_cache *cache, pgoff_t start, * @end: End page offset * * Release locks on the specified range of pages. + * Note that if start and end are set to zero the cache is destroyed. * * Return: 0 on success, negative error code on failure */ int fuse_dlm_unlock_range(struct fuse_inode *inode, - pgoff_t start, pgoff_t end) + uint64_t start, uint64_t end) { struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; struct fuse_dlm_range *range, *next; @@ -376,6 +377,11 @@ int fuse_dlm_unlock_range(struct fuse_inode *inode, if (!cache) return -EINVAL; + if (start == 0 && end == 0) { + fuse_dlm_cache_release_locks(inode); + return 0; + } + down_write(&cache->lock); /* Find all ranges that overlap with [start, end] */ @@ -424,13 +430,13 @@ int fuse_dlm_unlock_range(struct fuse_inode *inode, * * Return: true if the entire range is locked, false otherwise */ -bool fuse_dlm_range_is_locked(struct fuse_inode *inode, pgoff_t start, - pgoff_t end, enum fuse_page_lock_mode mode) +bool fuse_dlm_range_is_locked(struct fuse_inode *inode, uint64_t start, + uint64_t end, enum fuse_page_lock_mode mode) { struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; struct fuse_dlm_range *range; int lock_mode = 0; - pgoff_t current_start = start; + uint64_t current_start = start; if (!cache || start > end) return false; @@ -491,7 +497,7 @@ void fuse_get_dlm_write_lock(struct file *file, loff_t offset, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_mount *fm = ff->fm; - loff_t end = (offset + length - 1) | (PAGE_SIZE - 1); + uint64_t end = (offset + length - 1) | (PAGE_SIZE - 1); /* note that the offset and length don't have to be page aligned here * but since we only get here on writeback caching we will send out @@ -514,8 +520,8 @@ void fuse_get_dlm_write_lock(struct file *file, loff_t offset, memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; - inarg.offset = offset; - inarg.size = end - offset + 1; + inarg.start = offset; + inarg.end = end; inarg.type = FUSE_DLM_LOCK_WRITE; args.opcode = FUSE_DLM_WB_LOCK; @@ -536,16 +542,17 @@ void fuse_get_dlm_write_lock(struct file *file, loff_t offset, if (err) return; else - if (outarg.locksize < end - offset + 1) { + if (inarg.start < outarg.start || + inarg.end > outarg.end) { /* fuse server is seriously broken */ - pr_warn("fuse: dlm lock request for %llu bytes returned %u bytes\n", - end - offset + 1, outarg.locksize); + pr_warn("fuse: dlm lock request for %llu:%llu returned %llu:%llu bytes\n", + inarg.start, inarg.end, outarg.start, outarg.end); fuse_abort_conn(fc); return; } else { /* ignore any errors here, there is no way we can react appropriately */ - fuse_dlm_lock_range(fi, offset, - offset + outarg.locksize - 1, - FUSE_PAGE_LOCK_WRITE); + fuse_dlm_lock_range(fi, outarg.start, + outarg.end, + FUSE_PAGE_LOCK_WRITE); } } diff --git a/fs/fuse/fuse_dlm_cache.h b/fs/fuse/fuse_dlm_cache.h index 98b27a2c15d8ba..438d31d28b666e 100644 --- a/fs/fuse/fuse_dlm_cache.h +++ b/fs/fuse/fuse_dlm_cache.h @@ -32,16 +32,16 @@ int fuse_dlm_cache_init(struct fuse_inode *inode); void fuse_dlm_cache_release_locks(struct fuse_inode *inode); /* Lock a range of pages */ -int fuse_dlm_lock_range(struct fuse_inode *inode, pgoff_t start, - pgoff_t end, enum fuse_page_lock_mode mode); +int fuse_dlm_lock_range(struct fuse_inode *inode, uint64_t start, + uint64_t end, enum fuse_page_lock_mode mode); /* Unlock a range of pages */ -int fuse_dlm_unlock_range(struct fuse_inode *inode, pgoff_t start, - pgoff_t end); +int fuse_dlm_unlock_range(struct fuse_inode *inode, uint64_t start, + uint64_t end); /* Check if a page range is already locked */ -bool fuse_dlm_range_is_locked(struct fuse_inode *inode, pgoff_t start, - pgoff_t end, enum fuse_page_lock_mode mode); +bool fuse_dlm_range_is_locked(struct fuse_inode *inode, uint64_t start, + uint64_t end, enum fuse_page_lock_mode mode); /* this is the interface to the filesystem */ void fuse_get_dlm_write_lock(struct file *file, loff_t offset, diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 9824fd99abc14b..a64882b8342ba1 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -612,11 +612,14 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, pg_end = (offset + len - 1) >> PAGE_SHIFT; if (fc->dlm && fc->writeback_cache) - /* invalidate the range from the beginning of the first page - * in the given range to the last byte of the last page */ + /* Invalidate the range exactly as the fuse server requested + * except for the case where it sends -1. + * Note that this can lead to some inconsistencies if + * the fuse server sends unaligned data */ fuse_dlm_unlock_range(fi, - pg_start << PAGE_SHIFT, - (pg_end << PAGE_SHIFT) | (PAGE_SIZE - 1)); + offset, + pg_end == -1 ? 0 : + (offset + len - 1)); invalidate_inode_pages2_range(inode->i_mapping, pg_start, pg_end); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 8c8c13c0e0543a..0497aa3c5e7c69 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -1192,10 +1192,10 @@ enum fuse_dlm_lock_type { */ struct fuse_dlm_lock_in { uint64_t fh; - uint64_t offset; - uint32_t size; + uint64_t start; + uint64_t end; uint32_t type; - uint64_t reserved; + uint32_t reserved; }; /** @@ -1205,8 +1205,9 @@ struct fuse_dlm_lock_in { * to reduce number of calls) */ struct fuse_dlm_lock_out { - uint32_t locksize; - uint32_t padding; + uint64_t start; + uint64_t end; + uint64_t reserved; }; /** From 6db902736eaa3362516fa0c3ff61490bee8bcf88 Mon Sep 17 00:00:00 2001 From: Feng Shuo Date: Tue, 30 Sep 2025 03:00:46 +0800 Subject: [PATCH 152/161] Create workflow the create pr for redfs in each branch Take actions on the PR merged event of this repo. Run copy-from-linux-branch.sh and create a PR for redfs. (cherry picked from commit f54872e99c6ebccc92c202e15c22eb68c26b10f6) --- .github/workflows/create-redfs-pr.yml | 92 +++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 .github/workflows/create-redfs-pr.yml diff --git a/.github/workflows/create-redfs-pr.yml b/.github/workflows/create-redfs-pr.yml new file mode 100644 index 00000000000000..cd7e9717440c4e --- /dev/null +++ b/.github/workflows/create-redfs-pr.yml @@ -0,0 +1,92 @@ +# Automatially run copy-from-linux-branch.sh on branches and create PR for redfs. +name: Sync to redfs repo +on: + # Triggers the workflow on pull request merged. + pull_request: + branches: [ "*" ] + types: [ "closed" ] + +jobs: + create-redfs-pr: + if: github.event.pull_request.merged == true + runs-on: ubuntu-latest + steps: + # Checks-out to a different directory to avoid following checkout removing it. + - uses: actions/checkout@v4 + with: + path: linux + + - name: Try to checkout sync-${{ github.ref_name }} if it exists + uses: actions/checkout@v4 + id: try-checkout + continue-on-error: true + with: + repository: DDNStorage/redfs + ref: sync-${{ github.ref_name }} + fetch-depth: 0 + path: redfs + token: ${{ secrets.REDFS_TOKEN }} + + - name: Fallback to checkout main + if: steps.try-checkout.outcome == 'failure' + uses: actions/checkout@v4 + with: + repository: DDNStorage/redfs + ref: main + fetch-depth: 0 + path: redfs + token: ${{ secrets.REDFS_TOKEN }} + + - name: Initialize git + run: | + git config --global user.name "DDNStorage RED Workflow" + git config --global user.email "red@ddn.com" + + - name: Create tracking branch based on main + if: steps.try-checkout.outcome == 'failure' + run: | + pushd redfs + git checkout -b sync-${{ github.ref_name }} + popd + + - name: Generate PR for redfs + run: | + declare -A MAP + MAP["redfs-rhel9_5-503.40.1"]="5.14.0-503.40.1.el9_5" + MAP["redfs-rhel9_6-570.12.1"]="5.14.0-570.26.1.el9_6" + MAP["redfs-ubuntu-noble-6.8.0-58.60"]="6.8.0-58.60.ubuntu" + kerver=${MAP["${{ github.ref_name }}"]} + if [ -z ${kerver} ]; then + echo "Cannot find target kernel version" + exit 1 + fi + pushd redfs + ./copy-from-linux-branch.sh $GITHUB_WORKSPACE/linux ${kerver} + git add src/$kerver + echo -e "Sync with ${{ github.repository }} branch ${{ github.ref_name }} \n" > ../commit.msg + echo -e "Sync with ${{ github.repository }} branch ${{ github.ref_name }} by commit" >> ../commit.msg + echo -e "${{ github.sha }}" >> ../commit.msg + RET=0 + git commit -F ../commit.msg 2> ../commit.log || RET=$?; + if [ -s ../commit.log ]; then + echo "Error detcted in commit:" + cat ../commit.log + exit 1 + elif [ $RET -eq 0 ]; then + echo "Done. Push the code to remote:" + git push origin sync-${{ github.ref_name }} 2> ../push.log ||: + else + echo "No changes to existed codes. Still try with PR." + fi + if [ -s ../push.log ]; then + echo "Error detected in push:" + cat ../push.log + fi + gh pr create --base main --fill || RET=$? + if [ $RET -eq 1 ]; then + echo "No pending changes for PR, returning $RET." + fi + popd + env: + GH_TOKEN: ${{ secrets.OPENUNIXPAT }} + From 0c92477b273d151076da27c1a90778550aa2b46b Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 22 Oct 2025 22:58:13 +0200 Subject: [PATCH 153/161] fuse: Invalidate the page cache after FOPEN_DIRECT_IO write generic_file_direct_write() also does this and has a large comment about. Reproducer here is xfstest's generic/209, which is exactly to have competing DIO write and cached IO read. Signed-off-by: Bernd Schubert (cherry picked from commit 04869c85e562d05feb4adefb85aa79a1e1b64aa7) (cherry picked from commit 45d035ea09119ad0c59431b4bba7aae334835bef) --- fs/fuse/file.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 1ae35605e629d0..e6680b53614df7 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1660,6 +1660,15 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (res > 0) *ppos = pos; + if (res > 0 && write && fopen_direct_io) { + /* + * As in generic_file_direct_write(), invalidate after the + * write, to invalidate read-ahead cache that may have competed + * with the write. + */ + invalidate_inode_pages2_range(mapping, idx_from, idx_to); + } + return res > 0 ? res : err; } EXPORT_SYMBOL_GPL(fuse_direct_io); From d9d8ac35a786ff4fddfbd4daeb48e39e5556d82e Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 22 Oct 2025 23:01:27 +0200 Subject: [PATCH 154/161] fuse: Always flush the page cache before FOPEN_DIRECT_IO write This was done as condition on direct_io_allow_mmap, but I believe this is not right, as a file might be open two times - once with write-back enabled another time with FOPEN_DIRECT_IO. Signed-off-by: Bernd Schubert (cherry picked from commit 45908a9db0be3ae7512c12ab0b41b0489f5d604b) (cherry picked from commit c18d10570ec0e2c6b7d2711f6fedba090bc09e57) --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e6680b53614df7..9c49e32688d329 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1586,7 +1586,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (!ia) return -ENOMEM; - if (fopen_direct_io && fc->direct_io_allow_mmap) { + if (fopen_direct_io) { res = filemap_write_and_wait_range(mapping, pos, pos + count - 1); if (res) { fuse_io_free(ia); From d3348e47b5988dc88636ebb62c14a5ca7affe889 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 2 Jun 2025 23:23:43 +0200 Subject: [PATCH 155/161] fuse: {io-uring} Add queue length counters This is another preparation and will be used for decision which queue to add a request to. Signed-off-by: Bernd Schubert Reviewed-by: Joanne Koong (cherry picked from commit e4698faf912435f7f3f28c169f7bb8342d7b1edf) (cherry picked from commit 4c3941bffead914eae41be77b610befbbf2579af) --- fs/fuse/dev_uring.c | 17 +++++++++++++++-- fs/fuse/dev_uring_i.h | 3 +++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index e3215de2af3184..e3eeebe72ab04d 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -104,13 +104,13 @@ static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, lockdep_assert_not_held(&queue->lock); spin_lock(&queue->lock); ent->fuse_req = NULL; + queue->nr_reqs--; if (test_bit(FR_BACKGROUND, &req->flags)) { queue->active_background--; spin_lock(&fc->bg_lock); fuse_uring_flush_queue_bg(queue); spin_unlock(&fc->bg_lock); } - spin_unlock(&queue->lock); if (error) @@ -130,6 +130,7 @@ static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue) list_for_each_entry(req, &queue->fuse_req_queue, list) clear_bit(FR_PENDING, &req->flags); list_splice_init(&queue->fuse_req_queue, &req_list); + queue->nr_reqs = 0; spin_unlock(&queue->lock); /* must not hold queue lock to avoid order issues with fi->lock */ @@ -1551,10 +1552,13 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) req->ring_queue = queue; ent = list_first_entry_or_null(&queue->ent_avail_queue, struct fuse_ring_ent, list); + queue->nr_reqs++; + if (ent) fuse_uring_add_req_to_ring_ent(ent, req); else list_add_tail(&req->list, &queue->fuse_req_queue); + spin_unlock(&queue->lock); if (ent) @@ -1590,6 +1594,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) set_bit(FR_URING, &req->flags); req->ring_queue = queue; list_add_tail(&req->list, &queue->fuse_req_bg_queue); + queue->nr_reqs++; ent = list_first_entry_or_null(&queue->ent_avail_queue, struct fuse_ring_ent, list); @@ -1622,8 +1627,16 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) bool fuse_uring_remove_pending_req(struct fuse_req *req) { struct fuse_ring_queue *queue = req->ring_queue; + bool removed = fuse_remove_pending_req(req, &queue->lock); + + if (removed) { + /* Update counters after successful removal */ + spin_lock(&queue->lock); + queue->nr_reqs--; + spin_unlock(&queue->lock); + } - return fuse_remove_pending_req(req, &queue->lock); + return removed; } static const struct fuse_iqueue_ops fuse_io_uring_ops = { diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 9c1c9a7506e76f..be8046e1b30e01 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -98,6 +98,9 @@ struct fuse_ring_queue { /* background fuse requests */ struct list_head fuse_req_bg_queue; + /* number of requests queued or in userspace */ + unsigned int nr_reqs; + struct fuse_pqueue fpq; unsigned int active_background; From 2e39f564e9a9d60ca94008e184a4230fd05ea989 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 13 Jun 2025 15:12:47 +0200 Subject: [PATCH 156/161] fuse: {io-uring} Rename ring->nr_queues to max_nr_queues This is preparation for follow up commits that allow to run with a reduced number of queues. Signed-off-by: Bernd Schubert (cherry picked from commit 2e27c33ffcf65b434ada1364a4d2ea92b094f0c3) (cherry picked from commit 32d402cb88fa0b8d484107acd4e9174b3b418d45) --- fs/fuse/dev_uring.c | 22 +++++++++++----------- fs/fuse/dev_uring_i.h | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index e3eeebe72ab04d..0197602126472c 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -146,7 +146,7 @@ void fuse_uring_flush_bg(struct fuse_conn *fc) if (!ring) return; - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { queue = READ_ONCE(ring->queues[qid]); if (!queue) continue; @@ -202,7 +202,7 @@ void fuse_uring_destruct(struct fuse_conn *fc) } } - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { struct fuse_ring_queue *queue = ring->queues[qid]; struct fuse_ring_ent *ent, *next; @@ -321,7 +321,7 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) init_waitqueue_head(&ring->stop_waitq); - ring->nr_queues = nr_queues; + ring->max_nr_queues = nr_queues; ring->fc = fc; ring->max_payload_sz = max_payload_size; atomic_set(&ring->queue_refs, 0); @@ -476,7 +476,7 @@ static void fuse_uring_log_ent_state(struct fuse_ring *ring) int qid; struct fuse_ring_ent *ent; - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { struct fuse_ring_queue *queue = ring->queues[qid]; if (!queue) @@ -507,7 +507,7 @@ static void fuse_uring_async_stop_queues(struct work_struct *work) container_of(work, struct fuse_ring, async_teardown_work.work); /* XXX code dup */ - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); if (!queue) @@ -542,7 +542,7 @@ void fuse_uring_stop_queues(struct fuse_ring *ring) { int qid; - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); if (!queue) @@ -1054,7 +1054,7 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, if (!ring) return err; - if (qid >= ring->nr_queues) + if (qid >= ring->max_nr_queues) return -EINVAL; queue = ring->queues[qid]; @@ -1119,7 +1119,7 @@ static bool is_ring_ready(struct fuse_ring *ring, int current_qid) struct fuse_ring_queue *queue; bool ready = true; - for (qid = 0; qid < ring->nr_queues && ready; qid++) { + for (qid = 0; qid < ring->max_nr_queues && ready; qid++) { if (current_qid == qid) continue; @@ -1360,7 +1360,7 @@ static int fuse_uring_register(struct io_uring_cmd *cmd, return err; } - if (qid >= ring->nr_queues) { + if (qid >= ring->max_nr_queues) { pr_info_ratelimited("fuse: Invalid ring qid %u\n", qid); return -EINVAL; } @@ -1491,9 +1491,9 @@ static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring) qid = task_cpu(current); - if (WARN_ONCE(qid >= ring->nr_queues, + if (WARN_ONCE(qid >= ring->max_nr_queues, "Core number (%u) exceeds nr queues (%zu)\n", qid, - ring->nr_queues)) + ring->max_nr_queues)) qid = 0; queue = ring->queues[qid]; diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index be8046e1b30e01..4282db8a40cbff 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -117,7 +117,7 @@ struct fuse_ring { struct fuse_conn *fc; /* number of ring queues */ - size_t nr_queues; + size_t max_nr_queues; /* maximum payload/arg size */ size_t max_payload_sz; From 0f59155a0830690c620108847924d6896f4f0096 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Tue, 10 Jun 2025 16:23:28 +0200 Subject: [PATCH 157/161] fuse: {io-uring} Use bitmaps to track registered queues Add per-CPU and per-NUMA node bitmasks to track which io-uring queues are registered. Signed-off-by: Bernd Schubert (cherry picked from commit be6edce441ecc37ee34a8937f07c01ab99bfb7f7) (cherry picked from commit 83db98796111ef2cd1a68e438fa30f91384effea) --- fs/fuse/dev_uring.c | 73 +++++++++++++++++++++++++++++++++++++++++++ fs/fuse/dev_uring_i.h | 20 ++++++++++++ 2 files changed, 93 insertions(+) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 0197602126472c..37621f50342600 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -177,6 +177,23 @@ static void io_pages_free(struct page ***pages, int npages) *pages = NULL; } +static void fuse_ring_destruct_q_map(struct fuse_queue_map *q_map) +{ + free_cpumask_var(q_map->registered_q_mask); + kfree(q_map->cpu_to_qid); +} + +static void fuse_uring_destruct_q_masks(struct fuse_ring *ring) +{ + int node; + + fuse_ring_destruct_q_map(&ring->q_map); + + if (ring->numa_q_map) + for (node = 0; node < ring->nr_numa_nodes; node++) + fuse_ring_destruct_q_map(&ring->numa_q_map[node]); +} + void fuse_uring_destruct(struct fuse_conn *fc) { struct fuse_ring *ring = fc->ring; @@ -231,6 +248,7 @@ void fuse_uring_destruct(struct fuse_conn *fc) ring->queues[qid] = NULL; } + fuse_uring_destruct_q_masks(ring); kfree(ring->queues); kfree(ring); fc->ring = NULL; @@ -289,6 +307,40 @@ static int fuse_register_daemon(struct fuse_ring *ring) return 0; } +static int fuse_uring_init_q_map(struct fuse_queue_map *q_map, size_t nr_cpu) +{ + if (!zalloc_cpumask_var(&q_map->registered_q_mask, GFP_KERNEL_ACCOUNT)) + return -ENOMEM; + + q_map->cpu_to_qid = kcalloc(nr_cpu, sizeof(*q_map->cpu_to_qid), + GFP_KERNEL_ACCOUNT); + + return 0; +} + +static int fuse_uring_create_q_masks(struct fuse_ring *ring) +{ + int err, node; + + err = fuse_uring_init_q_map(&ring->q_map, ring->max_nr_queues); + if (err) + return err; + + ring->numa_q_map = kcalloc(ring->nr_numa_nodes, + sizeof(*ring->numa_q_map), + GFP_KERNEL_ACCOUNT); + if (!ring->numa_q_map) + return -ENOMEM; + for (node = 0; node < ring->nr_numa_nodes; node++) { + err = fuse_uring_init_q_map(&ring->numa_q_map[node], + ring->max_nr_queues); + if (err) + return err; + } + return 0; +} + + /* * Basic ring setup for this connection based on the provided configuration */ @@ -298,11 +350,14 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) size_t nr_queues = num_possible_cpus(); struct fuse_ring *res = NULL; size_t max_payload_size; + int err; ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT); if (!ring) return NULL; + ring->nr_numa_nodes = num_online_nodes(); + ring->queues = kcalloc(nr_queues, sizeof(struct fuse_ring_queue *), GFP_KERNEL_ACCOUNT); if (!ring->queues) @@ -311,6 +366,10 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) max_payload_size = max_t(unsigned int, FUSE_MIN_READ_BUFFER, fc->max_write); max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE); + err = fuse_uring_create_q_masks(ring); + if (err) + goto out_err; + spin_lock(&fc->lock); if (fc->ring) { /* race, another thread created the ring in the meantime */ @@ -333,6 +392,7 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) return ring; out_err: + fuse_uring_destruct_q_masks(ring); kfree(ring->queues); kfree(ring); return res; @@ -495,6 +555,7 @@ static void fuse_uring_log_ent_state(struct fuse_ring *ring) pr_info(" ent-commit-queue ring=%p qid=%d ent=%p state=%d\n", ring, qid, ent, ent->state); } + spin_unlock(&queue->lock); } ring->stop_debug_log = 1; @@ -541,6 +602,7 @@ static void fuse_uring_async_stop_queues(struct work_struct *work) void fuse_uring_stop_queues(struct fuse_ring *ring) { int qid; + int node; for (qid = 0; qid < ring->max_nr_queues; qid++) { struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); @@ -552,6 +614,13 @@ void fuse_uring_stop_queues(struct fuse_ring *ring) fuse_uring_teardown_entries(queue); } + /* Reset all queue masks, we won't process any more IO */ + cpumask_clear(ring->q_map.registered_q_mask); + for (node = 0; node < ring->nr_numa_nodes; node++) { + if (ring->numa_q_map) + cpumask_clear(ring->numa_q_map[node].registered_q_mask); + } + if (atomic_read(&ring->queue_refs) > 0) { ring->teardown_time = jiffies; INIT_DELAYED_WORK(&ring->async_teardown_work, @@ -1149,6 +1218,10 @@ static void fuse_uring_do_register(struct fuse_ring_ent *ent, struct fuse_ring *ring = queue->ring; struct fuse_conn *fc = ring->fc; struct fuse_iqueue *fiq = &fc->iq; + int node = cpu_to_node(queue->qid); + + if (WARN_ON_ONCE(node >= ring->nr_numa_nodes)) + node = 0; #ifdef IO_URING_F_CANCEL fuse_uring_prepare_cancel(cmd, issue_flags, ent); diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 4282db8a40cbff..13024891b39fb6 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -108,6 +108,17 @@ struct fuse_ring_queue { bool stopped; }; +struct fuse_queue_map { + /* Tracks which queues are registered */ + cpumask_var_t registered_q_mask; + + /* number of registered queues */ + size_t nr_queues; + + /* cpu to qid mapping */ + int *cpu_to_qid; +}; + /** * Describes if uring is for communication and holds alls the data needed * for uring communication @@ -119,6 +130,9 @@ struct fuse_ring { /* number of ring queues */ size_t max_nr_queues; + /* number of numa nodes */ + int nr_numa_nodes; + /* maximum payload/arg size */ size_t max_payload_sz; @@ -129,6 +143,12 @@ struct fuse_ring { */ unsigned int stop_debug_log : 1; + /* per numa node queue tracking */ + struct fuse_queue_map *numa_q_map; + + /* all queue tracking */ + struct fuse_queue_map q_map; + wait_queue_head_t stop_waitq; /* async tear down */ From 611432ace98218d59da4d50863a437d01a359857 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 4 Jun 2025 19:32:39 +0200 Subject: [PATCH 158/161] fuse: {io-uring} Allow reduced number of ring queues Queues selection (fuse_uring_get_queue) can handle reduced number queues - using io-uring is possible now even with a single queue and entry. The FUSE_URING_REDUCED_Q flag is being introduce tell fuse server that reduced queues are possible, i.e. if the flag is set, fuse server is free to reduce number queues. Signed-off-by: Bernd Schubert (cherry picked from commit f620f3d35969bd9a04304b757a18a11a0787dedc) (cherry picked from commit db5c36409ff73513c5a06e67ec1d844f45344051) --- fs/fuse/dev_uring.c | 124 +++++++++++++++++++++++--------------- fs/fuse/inode.c | 6 +- include/uapi/linux/fuse.h | 5 ++ 3 files changed, 84 insertions(+), 51 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 37621f50342600..3fea9e4d1921e7 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -314,15 +314,17 @@ static int fuse_uring_init_q_map(struct fuse_queue_map *q_map, size_t nr_cpu) q_map->cpu_to_qid = kcalloc(nr_cpu, sizeof(*q_map->cpu_to_qid), GFP_KERNEL_ACCOUNT); + if (!q_map->cpu_to_qid) + return -ENOMEM; return 0; } -static int fuse_uring_create_q_masks(struct fuse_ring *ring) +static int fuse_uring_create_q_masks(struct fuse_ring *ring, size_t nr_queues) { int err, node; - err = fuse_uring_init_q_map(&ring->q_map, ring->max_nr_queues); + err = fuse_uring_init_q_map(&ring->q_map, nr_queues); if (err) return err; @@ -333,7 +335,7 @@ static int fuse_uring_create_q_masks(struct fuse_ring *ring) return -ENOMEM; for (node = 0; node < ring->nr_numa_nodes; node++) { err = fuse_uring_init_q_map(&ring->numa_q_map[node], - ring->max_nr_queues); + nr_queues); if (err) return err; } @@ -366,7 +368,7 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) max_payload_size = max_t(unsigned int, FUSE_MIN_READ_BUFFER, fc->max_write); max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE); - err = fuse_uring_create_q_masks(ring); + err = fuse_uring_create_q_masks(ring, nr_queues); if (err) goto out_err; @@ -398,12 +400,37 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) return res; } +static void fuse_uring_cpu_qid_mapping(struct fuse_ring *ring, int qid, + struct fuse_queue_map *q_map) +{ + int cpu, qid_idx; + size_t nr_queues; + + cpumask_set_cpu(qid, q_map->registered_q_mask); + nr_queues = cpumask_weight(q_map->registered_q_mask); + for (cpu = 0; cpu < ring->max_nr_queues; cpu++) { + if (!q_map->cpu_to_qid) + return; + + /* + * Position of this CPU within the registered queue mask, + * handles non-contiguous CPU distributions across NUMA nodes. + */ + qid_idx = bitmap_weight( + cpumask_bits(q_map->registered_q_mask), cpu); + + q_map->cpu_to_qid[cpu] = cpumask_nth(qid_idx % nr_queues, + q_map->registered_q_mask); + } +} + static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, int qid) { struct fuse_conn *fc = ring->fc; struct fuse_ring_queue *queue; struct list_head *pq; + int node; queue = kzalloc(sizeof(*queue), GFP_KERNEL_ACCOUNT); if (!queue) @@ -441,6 +468,22 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, * write_once and lock as the caller mostly doesn't take the lock at all */ WRITE_ONCE(ring->queues[qid], queue); + + /* Static mapping from cpu to per numa queues */ + node = cpu_to_node(qid); + fuse_uring_cpu_qid_mapping(ring, qid, &ring->numa_q_map[node]); + + /* + * smp_store_release, as the variable is read without fc->lock and + * we need to avoid compiler re-ordering of updating the nr_queues + * and setting ring->numa_queues[node].cpu_to_qid above + */ + smp_store_release (&ring->numa_q_map[node].nr_queues, + ring->numa_q_map[node].nr_queues + 1); + + /* global mapping */ + fuse_uring_cpu_qid_mapping(ring, qid, &ring->q_map); + spin_unlock(&fc->lock); return queue; @@ -1182,31 +1225,6 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, return 0; } -static bool is_ring_ready(struct fuse_ring *ring, int current_qid) -{ - int qid; - struct fuse_ring_queue *queue; - bool ready = true; - - for (qid = 0; qid < ring->max_nr_queues && ready; qid++) { - if (current_qid == qid) - continue; - - queue = ring->queues[qid]; - if (!queue) { - ready = false; - break; - } - - spin_lock(&queue->lock); - if (list_empty(&queue->ent_avail_queue)) - ready = false; - spin_unlock(&queue->lock); - } - - return ready; -} - /* * fuse_uring_req_fetch command handling */ @@ -1233,13 +1251,9 @@ static void fuse_uring_do_register(struct fuse_ring_ent *ent, spin_unlock(&queue->lock); if (!ring->ready) { - bool ready = is_ring_ready(ring, queue->qid); - - if (ready) { - WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); - WRITE_ONCE(ring->ready, true); - wake_up_all(&fc->blocked_waitq); - } + WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); + WRITE_ONCE(ring->ready, true); + wake_up_all(&fc->blocked_waitq); } } @@ -1557,22 +1571,36 @@ static void fuse_uring_send_in_task(struct io_uring_cmd *cmd, fuse_uring_send(ent, cmd, err, issue_flags); } -static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring) +static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring) { unsigned int qid; - struct fuse_ring_queue *queue; + int node; + unsigned int nr_queues; + unsigned int cpu = task_cpu(current); - qid = task_cpu(current); + cpu = cpu % ring->max_nr_queues; - if (WARN_ONCE(qid >= ring->max_nr_queues, - "Core number (%u) exceeds nr queues (%zu)\n", qid, - ring->max_nr_queues)) - qid = 0; + /* numa local registered queue bitmap */ + node = cpu_to_node(cpu); + if (WARN_ONCE(node >= ring->nr_numa_nodes, + "Node number (%d) exceeds nr nodes (%d)\n", + node, ring->nr_numa_nodes)) { + node = 0; + } - queue = ring->queues[qid]; - WARN_ONCE(!queue, "Missing queue for qid %d\n", qid); + nr_queues = READ_ONCE(ring->numa_q_map[node].nr_queues); + if (nr_queues) { + qid = ring->numa_q_map[node].cpu_to_qid[cpu]; + if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) + return NULL; + return READ_ONCE(ring->queues[qid]); + } - return queue; + /* global registered queue bitmap */ + qid = ring->q_map.cpu_to_qid[cpu]; + if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) + return NULL; + return READ_ONCE(ring->queues[qid]); } static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent, bool bg) @@ -1612,7 +1640,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) int err; err = -EINVAL; - queue = fuse_uring_task_to_queue(ring); + queue = fuse_uring_select_queue(ring); if (!queue) goto err; @@ -1654,7 +1682,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) struct fuse_ring_queue *queue; struct fuse_ring_ent *ent = NULL; - queue = fuse_uring_task_to_queue(ring); + queue = fuse_uring_select_queue(ring); if (!queue) return false; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index a64882b8342ba1..7195c50e3b22d6 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1455,8 +1455,7 @@ void fuse_send_init(struct fuse_mount *fm) ia->in.major = FUSE_KERNEL_VERSION; ia->in.minor = FUSE_KERNEL_MINOR_VERSION; ia->in.max_readahead = fm->sb->s_bdi->ra_pages * PAGE_SIZE; - flags = - FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | + flags = FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | @@ -1468,7 +1467,8 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP | - FUSE_NO_EXPORT_SUPPORT | FUSE_INVAL_INODE_ENTRY | FUSE_EXPIRE_INODE_ENTRY; + FUSE_NO_EXPORT_SUPPORT | FUSE_INVAL_INODE_ENTRY | + FUSE_EXPIRE_INODE_ENTRY | FUSE_URING_REDUCED_Q; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 0497aa3c5e7c69..437f349076aed8 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -423,6 +423,8 @@ struct fuse_file_lock { * FUSE_OVER_IO_URING: Indicate that client supports io-uring * FUSE_INVAL_INODE_ENTRY: invalidate inode aliases when doing inode invalidation * FUSE_EXPIRE_INODE_ENTRY: expire inode aliases when doing inode invalidation + * FUSE_URING_REDUCED_Q: Client (kernel) supports less queues - Server is free + * to register between 1 and nr-core io-uring queues */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -467,6 +469,9 @@ struct fuse_file_lock { /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ #define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP #define FUSE_OVER_IO_URING (1ULL << 41) + +#define FUSE_ALIGN_PG_ORDER (1ULL << 50) +#define FUSE_URING_REDUCED_Q (1ULL << 59) #define FUSE_INVAL_INODE_ENTRY (1ULL << 60) #define FUSE_EXPIRE_INODE_ENTRY (1ULL << 61) From 38c0ef7b1b43210c6c09625fe9374b542df87c28 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 24 Sep 2025 19:14:19 +0200 Subject: [PATCH 159/161] fuse: {io-uring} Queue background requests on a different core Running background IO on a different core makes quite a difference. fio --directory=/tmp/dest --name=iops.\$jobnum --rw=randread \ --bs=4k --size=1G --numjobs=1 --iodepth=4 --time_based\ --runtime=30s --group_reporting --ioengine=io_uring\ --direct=1 unpatched READ: bw=272MiB/s (285MB/s) ... patched READ: bw=650MiB/s (682MB/s) Reason is easily visible, the fio process is migrating between CPUs when requests are submitted on the queue for the same core. With --iodepth=8 unpatched READ: bw=466MiB/s (489MB/s) patched READ: bw=641MiB/s (672MB/s) Without io-uring (--iodepth=8) READ: bw=729MiB/s (764MB/s) Without fuse (--iodepth=8) READ: bw=2199MiB/s (2306MB/s) (Test were done with /example/passthrough_hp -o allow_other --nopassthrough \ [-o io_uring] /tmp/source /tmp/dest ) Additional notes: With FURING_NEXT_QUEUE_RETRIES=0 (--iodepth=8) READ: bw=903MiB/s (946MB/s) With just a random qid (--iodepth=8) READ: bw=429MiB/s (450MB/s) With --iodepth=1 unpatched READ: bw=195MiB/s (204MB/s) patched READ: bw=232MiB/s (243MB/s) With --iodepth=1 --numjobs=2 unpatched READ: bw=366MiB/s (384MB/s) patched READ: bw=472MiB/s (495MB/s) With --iodepth=1 --numjobs=8 unpatched READ: bw=1437MiB/s (1507MB/s) patched READ: bw=1529MiB/s (1603MB/s) fuse without io-uring READ: bw=1314MiB/s (1378MB/s), 1314MiB/s-1314MiB/s ... no-fuse READ: bw=2566MiB/s (2690MB/s), 2566MiB/s-2566MiB/s ... In summary, for async requests the core doing application IO is busy sending requests and processing IOs should be done on a different core. Spreading the load on random cores is also not desirable, as the core might be frequency scaled down and/or in C1 sleep states. Not shown here, but differnces are much smaller when the system uses performance govenor instead of schedutil (ubuntu default). Obviously at the cost of higher system power consumption for performance govenor - not desirable either. Results without io-uring (which uses fixed libfuse threads per queue) heavily depend on the current number of active threads. Libfuse uses default of max 10 threads, but actual nr max threads is a parameter. Also, no-fuse-io-uring results heavily depend on, if there was already running another workload before, as libfuse starts these threads dynamically - i.e. the more threads are active, the worse the performance. Signed-off-by: Bernd Schubert (cherry picked from commit c6399ea79b104ac79758f2c36f1977b80a02358d) (cherry picked from commit 44eaf774fd26a9d5608c93a7db90139e314514f2) --- fs/fuse/dev_uring.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 3fea9e4d1921e7..be053943eddd0b 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1571,13 +1571,21 @@ static void fuse_uring_send_in_task(struct io_uring_cmd *cmd, fuse_uring_send(ent, cmd, err, issue_flags); } -static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring) +static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, + bool background) { unsigned int qid; int node; unsigned int nr_queues; unsigned int cpu = task_cpu(current); + /* + * Background requests result in better performance on a different + * CPU, unless CPUs are already busy. + */ + if (background) + cpu++; + cpu = cpu % ring->max_nr_queues; /* numa local registered queue bitmap */ @@ -1640,7 +1648,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) int err; err = -EINVAL; - queue = fuse_uring_select_queue(ring); + queue = fuse_uring_select_queue(ring, false); if (!queue) goto err; @@ -1682,7 +1690,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) struct fuse_ring_queue *queue; struct fuse_ring_ent *ent = NULL; - queue = fuse_uring_select_queue(ring); + queue = fuse_uring_select_queue(ring, true); if (!queue) return false; From 1f1b69d23a01ddf82ad99ffcf7c4ac5a8880e87b Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 24 Oct 2025 19:05:07 +0200 Subject: [PATCH 160/161] fuse: Add retry attempts for numa local queues for load distribution This is to further improve performance. fio --directory=/tmp/dest --name=iops.\$jobnum --rw=randread \ --bs=4k --size=1G --numjobs=1 --iodepth=4 --time_based\ --runtime=30s --group_reporting --ioengine=io_uring\ --direct=1 unpatched READ: bw=650MiB/s (682MB/s) patched: READ: bw=995MiB/s (1043MB/s) with --iodepth=8 unpatched READ: bw=641MiB/s (672MB/s) patched READ: bw=966MiB/s (1012MB/s) Reason is that with --iodepth=x (x > 1) fio submits multiple async requests and a single queue might become CPU limited. I.e. spreading the load helps. (cherry picked from commit 2e73b0be1f55d61c2d861a12bf6bb9963b9b877a) (cherry picked from commit 7081fec641cf7dc1ef04bb1f7cdb1502c8efd5b6) --- fs/fuse/dev_uring.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index be053943eddd0b..bfe4a2174f20cf 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -23,6 +23,8 @@ MODULE_PARM_DESC(enable_uring, #define FUSE_RING_HEADER_PG 0 #define FUSE_RING_PAYLOAD_PG 1 +#define FUSE_URING_Q_THRESHOLD 2 + /* redfs only to allow patch backports */ #define IO_URING_F_TASK_DEAD (1 << 13) @@ -1575,9 +1577,10 @@ static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, bool background) { unsigned int qid; - int node; + int node, retries = 0; unsigned int nr_queues; unsigned int cpu = task_cpu(current); + struct fuse_ring_queue *queue, *primary_queue = NULL; /* * Background requests result in better performance on a different @@ -1586,6 +1589,7 @@ static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, if (background) cpu++; +retry: cpu = cpu % ring->max_nr_queues; /* numa local registered queue bitmap */ @@ -1601,12 +1605,35 @@ static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, qid = ring->numa_q_map[node].cpu_to_qid[cpu]; if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) return NULL; - return READ_ONCE(ring->queues[qid]); + queue = READ_ONCE(ring->queues[qid]); + + /* Might happen on teardown */ + if (unlikely(!queue)) + return NULL; + + if (queue->nr_reqs < FUSE_URING_Q_THRESHOLD) + return queue; + + /* Retries help for load balancing */ + if (retries < FUSE_URING_Q_THRESHOLD) { + if (!retries) + primary_queue = queue; + + /* Increase cpu, assuming it will map to a differet qid*/ + cpu++; + retries++; + goto retry; + } } + /* Retries exceeded, take the primary target queue */ + if (primary_queue) + return primary_queue; + /* global registered queue bitmap */ qid = ring->q_map.cpu_to_qid[cpu]; if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) + /* Might happen on teardown */ return NULL; return READ_ONCE(ring->queues[qid]); } From 2407a55beb012359f1532bf50baf53c40c7019b8 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 10 Nov 2025 13:17:38 +0100 Subject: [PATCH 161/161] fuse: Fetch a queued fuse request on command registration With the reduced queue feature io-uring is marked as ready after receiving the 1st ring entry. At this time other queues just might be in the process of registration and then a race happens fuse_uring_queue_fuse_req -> no queue entry registered yet list_add_tail -> fuse request gets queued So far fetching requests from the list only happened from FUSE_IO_URING_CMD_COMMIT_AND_FETCH, but without new requests on the same queue, it would actually never send requests from that queue - the request was stuck. (cherry picked from commit 4cbee2ecb11b6d971412b732affb9b43ce2e289b) --- fs/fuse/dev_uring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index bfe4a2174f20cf..6a7f90c88479ac 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1472,6 +1472,8 @@ static int fuse_uring_register(struct io_uring_cmd *cmd, fuse_uring_do_register(ent, cmd, issue_flags); + fuse_uring_next_fuse_req(ent, queue, issue_flags); + return 0; }