diff --git a/Documentation/filesystems/overlayfs.rst b/Documentation/filesystems/overlayfs.rst index c9d2bf96b02d6b..660dbaf0b9b8cd 100644 --- a/Documentation/filesystems/overlayfs.rst +++ b/Documentation/filesystems/overlayfs.rst @@ -365,8 +365,8 @@ pointed by REDIRECT. This should not be possible on local system as setting "trusted." xattrs will require CAP_SYS_ADMIN. But it should be possible for untrusted layers like from a pen drive. -Note: redirect_dir={off|nofollow|follow[*]} conflicts with metacopy=on, and -results in an error. +Note: redirect_dir={off|nofollow|follow[*]} and nfs_export=on mount options +conflict with metacopy=on, and will result in an error. [*] redirect_dir=follow only conflicts with metacopy=on if upperdir=... is given. @@ -560,6 +560,9 @@ When the NFS export feature is enabled, all directory index entries are verified on mount time to check that upper file handles are not stale. This verification may cause significant overhead in some cases. +Note: the mount options index=off,nfs_export=on are conflicting and will +result in an error. + Testsuite --------- diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 26c09396957363..867036aa90b839 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -858,3 +858,10 @@ be misspelled d_alloc_anon(). [should've been added in 2016] stale comment in finish_open() nonwithstanding, failure exits in ->atomic_open() instances should *NOT* fput() the file, no matter what. Everything is handled by the caller. + +--- + +**mandatory** + +clone_private_mount() returns a longterm mount now, so the proper destructor of +its result is kern_unmount() or kern_unmount_array(). diff --git a/Documentation/filesystems/virtiofs.rst b/Documentation/filesystems/virtiofs.rst index e06e4951cb3953..fd4d2484e9497c 100644 --- a/Documentation/filesystems/virtiofs.rst +++ b/Documentation/filesystems/virtiofs.rst @@ -39,6 +39,20 @@ Mount file system with tag ``myfs`` on ``/mnt``: Please see https://virtio-fs.gitlab.io/ for details on how to configure QEMU and the virtiofsd daemon. +Mount options +------------- + +virtiofs supports general VFS mount options, for example, remount, +ro, rw, context, etc. It also supports FUSE mount options. + +atime behavior +^^^^^^^^^^^^^^ + +The atime-related mount options, for example, noatime, strictatime, +are ignored. The atime behavior for virtiofs is the same as the +underlying filesystem of the directory that has been exported +on the host. + Internals ========= Since the virtio-fs device uses the FUSE protocol for file system requests, the diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 25cbe0aeeec506..aa1d34141ea38f 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -980,7 +980,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, if (!IS_ERR_OR_NULL(inode)) fid = AFS_FS_I(inode)->fid; - _debug("splice %px", dentry->d_inode); + _debug("splice %p", dentry->d_inode); d = d_splice_alias(inode, dentry); if (!IS_ERR_OR_NULL(d)) { d->d_fsdata = dentry->d_fsdata; diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 70e518f7bc19f2..71eea2a908c705 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -71,7 +71,7 @@ static void afs_schedule_lock_extension(struct afs_vnode *vnode) void afs_lock_op_done(struct afs_call *call) { struct afs_operation *op = call->op; - struct afs_vnode *vnode = op->lock.lvnode; + struct afs_vnode *vnode = op->file[0].vnode; if (call->error == 0) { spin_lock(&vnode->lock); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 7dde703df40c48..cd0a0060950bea 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -538,7 +538,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key) * mark the data attached to an inode as obsolete due to a write on the server * - might also want to ditch all the outstanding writes and dirty pages */ -void afs_zap_data(struct afs_vnode *vnode) +static void afs_zap_data(struct afs_vnode *vnode) { _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index e1621b0670cc4b..0c9806ef2a19fa 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -795,7 +795,6 @@ struct afs_operation { struct afs_read *req; } fetch; struct { - struct afs_vnode *lvnode; /* vnode being locked */ afs_lock_type_t type; } lock; struct { @@ -1070,7 +1069,6 @@ extern int afs_ilookup5_test_by_fid(struct inode *, void *); extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool); extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *); extern struct inode *afs_root_iget(struct super_block *, struct key *); -extern void afs_zap_data(struct afs_vnode *); extern bool afs_check_validity(struct afs_vnode *); extern int afs_validate(struct afs_vnode *, struct key *); extern int afs_getattr(const struct path *, struct kstat *, u32, unsigned int); diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 22d00cf1913d1e..e817fc740ba019 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -567,6 +567,7 @@ void afs_put_sysnames(struct afs_sysnames *sysnames) if (sysnames->subs[i] != afs_init_sysname && sysnames->subs[i] != sysnames->blank) kfree(sysnames->subs[i]); + kfree(sysnames); } } diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c index 093895c49c2198..5082ef04e99c5c 100644 --- a/fs/afs/vl_alias.c +++ b/fs/afs/vl_alias.c @@ -28,7 +28,7 @@ static struct afs_volume *afs_sample_volume(struct afs_cell *cell, struct key *k }; volume = afs_create_volume(&fc); - _leave(" = %px", volume); + _leave(" = %p", volume); return volume; } @@ -73,7 +73,8 @@ static int afs_compare_addrs(const struct sockaddr_rxrpc *srx_a, } default: - BUG(); + WARN_ON(1); + diff = 1; } out: diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index b0a6e40b4da3c4..52d5af5fcd44be 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -15,8 +15,6 @@ #include "xdr_fs.h" #include "protocol_yfs.h" -static const struct afs_fid afs_zero_fid; - #define xdr_size(x) (sizeof(*x) / sizeof(__be32)) static void xdr_decode_YFSFid(const __be32 **_bp, struct afs_fid *fid) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8ccc97356cb570..02b3c36b36766a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -342,7 +342,7 @@ static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) list_add_tail(&req->intr_entry, &fiq->interrupts); /* * Pairs with smp_mb() implied by test_and_set_bit() - * from request_end(). + * from fuse_request_end(). */ smp_mb(); if (test_bit(FR_FINISHED, &req->flags)) { @@ -764,16 +764,15 @@ static int fuse_check_page(struct page *page) { if (page_mapcount(page) || page->mapping != NULL || - page_count(page) != 1 || (page->flags & PAGE_FLAGS_CHECK_AT_PREP & ~(1 << PG_locked | 1 << PG_referenced | 1 << PG_uptodate | 1 << PG_lru | 1 << PG_active | - 1 << PG_reclaim))) { - pr_warn("trying to steal weird page\n"); - pr_warn(" page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping); + 1 << PG_reclaim | + 1 << PG_waiters))) { + dump_page(page, "fuse: trying to steal weird page"); return 1; } return 0; @@ -1977,8 +1976,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, struct pipe_buffer *ibuf; struct pipe_buffer *obuf; - BUG_ON(nbuf >= pipe->ring_size); - BUG_ON(tail == head); + if (WARN_ON(nbuf >= count || tail == head)) + goto out_free; + ibuf = &pipe->bufs[tail & mask]; obuf = &bufs[nbuf]; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index de1e2fde60bd4c..26f028bc760b2b 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1689,8 +1689,18 @@ static int fuse_getattr(const struct path *path, struct kstat *stat, struct inode *inode = d_inode(path->dentry); struct fuse_conn *fc = get_fuse_conn(inode); - if (!fuse_allow_current_process(fc)) + if (!fuse_allow_current_process(fc)) { + if (!request_mask) { + /* + * If user explicitly requested *nothing* then don't + * error out, but return st_dev only. + */ + stat->result_mask = 0; + stat->dev = inode->i_sb->s_dev; + return 0; + } return -EACCES; + } return fuse_update_get_attr(inode, NULL, stat, request_mask, flags); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index bac51c32d66026..e573b0cd2737dc 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -357,7 +357,7 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) struct fuse_writepage_args { struct fuse_io_args ia; - struct list_head writepages_entry; + struct rb_node writepages_entry; struct list_head queue_entry; struct fuse_writepage_args *next; struct inode *inode; @@ -366,17 +366,23 @@ struct fuse_writepage_args { static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, pgoff_t idx_from, pgoff_t idx_to) { - struct fuse_writepage_args *wpa; + struct rb_node *n; + + n = fi->writepages.rb_node; - list_for_each_entry(wpa, &fi->writepages, writepages_entry) { + while (n) { + struct fuse_writepage_args *wpa; pgoff_t curr_index; + wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry); WARN_ON(get_fuse_inode(wpa->inode) != fi); curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; - if (idx_from < curr_index + wpa->ia.ap.num_pages && - curr_index <= idx_to) { + if (idx_from >= curr_index + wpa->ia.ap.num_pages) + n = n->rb_right; + else if (idx_to < curr_index) + n = n->rb_left; + else return wpa; - } } return NULL; } @@ -445,9 +451,6 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (is_bad_inode(inode)) return -EIO; - if (fc->no_flush) - return 0; - err = write_inode_now(inode, 1); if (err) return err; @@ -460,6 +463,10 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (err) return err; + err = 0; + if (fc->no_flush) + goto inval_attr_out; + memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; inarg.lock_owner = fuse_lock_owner_id(fc, id); @@ -475,6 +482,14 @@ static int fuse_flush(struct file *file, fl_owner_t id) fc->no_flush = 1; err = 0; } + +inval_attr_out: + /* + * In memory i_blocks is not maintained by fuse, if writeback cache is + * enabled, i_blocks from cached attr may not be accurate. + */ + if (!err && fc->writeback_cache) + fuse_invalidate_attr(inode); return err; } @@ -712,6 +727,7 @@ static ssize_t fuse_async_req_send(struct fuse_conn *fc, spin_unlock(&io->lock); ia->ap.args.end = fuse_aio_complete_req; + ia->ap.args.may_block = io->should_dirty; err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL); if (err) fuse_aio_complete_req(fc, &ia->ap.args, err); @@ -1570,7 +1586,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - list_del(&wpa->writepages_entry); + rb_erase(&wpa->writepages_entry, &fi->writepages); for (i = 0; i < ap->num_pages; i++) { dec_wb_stat(&bdi->wb, WB_WRITEBACK); dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP); @@ -1658,6 +1674,36 @@ __acquires(fi->lock) } } +static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) +{ + pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT; + pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1; + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + + WARN_ON(!wpa->ia.ap.num_pages); + while (*p) { + struct fuse_writepage_args *curr; + pgoff_t curr_index; + + parent = *p; + curr = rb_entry(parent, struct fuse_writepage_args, + writepages_entry); + WARN_ON(curr->inode != wpa->inode); + curr_index = curr->ia.write.in.offset >> PAGE_SHIFT; + + if (idx_from >= curr_index + curr->ia.ap.num_pages) + p = &(*p)->rb_right; + else if (idx_to < curr_index) + p = &(*p)->rb_left; + else + return (void) WARN_ON(true); + } + + rb_link_node(&wpa->writepages_entry, parent, p); + rb_insert_color(&wpa->writepages_entry, root); +} + static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, int error) { @@ -1676,7 +1722,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, wpa->next = next->next; next->next = NULL; next->ia.ff = fuse_file_get(wpa->ia.ff); - list_add(&next->writepages_entry, &fi->writepages); + tree_insert(&fi->writepages, next); /* * Skip fuse_flush_writepages() to make it easy to crop requests @@ -1811,7 +1857,7 @@ static int fuse_writepage_locked(struct page *page) inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fi->lock); - list_add(&wpa->writepages_entry, &fi->writepages); + tree_insert(&fi->writepages, wpa); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); @@ -1923,10 +1969,10 @@ static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa, WARN_ON(new_ap->num_pages != 0); spin_lock(&fi->lock); - list_del(&new_wpa->writepages_entry); + rb_erase(&new_wpa->writepages_entry, &fi->writepages); old_wpa = fuse_find_writeback(fi, page->index, page->index); if (!old_wpa) { - list_add(&new_wpa->writepages_entry, &fi->writepages); + tree_insert(&fi->writepages, new_wpa); spin_unlock(&fi->lock); return false; } @@ -2041,7 +2087,7 @@ static int fuse_writepages_fill(struct page *page, wpa->inode = inode; spin_lock(&fi->lock); - list_add(&wpa->writepages_entry, &fi->writepages); + tree_insert(&fi->writepages, wpa); spin_unlock(&fi->lock); data->wpa = wpa; @@ -3235,13 +3281,11 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) return -EXDEV; - if (fc->writeback_cache) { - inode_lock(inode_in); - err = fuse_writeback_range(inode_in, pos_in, pos_in + len); - inode_unlock(inode_in); - if (err) - return err; - } + inode_lock(inode_in); + err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1); + inode_unlock(inode_in); + if (err) + return err; inode_lock(inode_out); @@ -3249,11 +3293,27 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, if (err) goto out; - if (fc->writeback_cache) { - err = fuse_writeback_range(inode_out, pos_out, pos_out + len); - if (err) - goto out; - } + /* + * Write out dirty pages in the destination file before sending the COPY + * request to userspace. After the request is completed, truncate off + * pages (including partial ones) from the cache that have been copied, + * since these contain stale data at that point. + * + * This should be mostly correct, but if the COPY writes to partial + * pages (at the start or end) and the parts not covered by the COPY are + * written through a memory map after calling fuse_writeback_range(), + * then these partial page modifications will be lost on truncation. + * + * It is unlikely that someone would rely on such mixed style + * modifications. Yet this does give less guarantees than if the + * copying was performed with write(2). + * + * To fix this a i_mmap_sem style lock could be used to prevent new + * faults while the copy is ongoing. + */ + err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1); + if (err) + goto out; if (is_unstable) set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); @@ -3274,6 +3334,10 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, if (err) goto out; + truncate_inode_pages_range(inode_out->i_mapping, + ALIGN_DOWN(pos_out, PAGE_SIZE), + ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1); + if (fc->writeback_cache) { fuse_write_update_size(inode_out, pos_out + outarg.size); file_update_time(file_out); @@ -3351,5 +3415,5 @@ void fuse_init_file_inode(struct inode *inode) INIT_LIST_HEAD(&fi->queued_writes); fi->writectr = 0; init_waitqueue_head(&fi->page_waitq); - INIT_LIST_HEAD(&fi->writepages); + fi->writepages = RB_ROOT; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ca344bf714045a..740a8a7d7ae6f0 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -111,7 +111,7 @@ struct fuse_inode { wait_queue_head_t page_waitq; /* List of writepage requestst (pending or sent) */ - struct list_head writepages; + struct rb_root writepages; }; /* readdir cache (directory only) */ @@ -249,6 +249,7 @@ struct fuse_args { bool out_argvar:1; bool page_zeroing:1; bool page_replace:1; + bool may_block:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; void (*end)(struct fuse_conn *fc, struct fuse_args *args, int error); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 95d712d44ca13a..5b4aebf5821fea 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -321,6 +321,8 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, loff_t offset, loff_t len) { + struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_inode *fi; struct inode *inode; pgoff_t pg_start; pgoff_t pg_end; @@ -329,6 +331,11 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, if (!inode) return -ENOENT; + fi = get_fuse_inode(inode); + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + spin_unlock(&fi->lock); + fuse_invalidate_attr(inode); forget_all_cached_acls(inode); if (offset >= 0) { @@ -1113,7 +1120,7 @@ EXPORT_SYMBOL_GPL(fuse_dev_free); int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) { - struct fuse_dev *fud; + struct fuse_dev *fud = NULL; struct fuse_conn *fc = get_fuse_conn_super(sb); struct inode *root; struct dentry *root_dentry; @@ -1155,9 +1162,12 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) if (sb->s_user_ns != &init_user_ns) sb->s_xattr = fuse_no_acl_xattr_handlers; - fud = fuse_dev_alloc_install(fc); - if (!fud) - goto err; + if (ctx->fudptr) { + err = -ENOMEM; + fud = fuse_dev_alloc_install(fc); + if (!fud) + goto err; + } fc->dev = sb->s_dev; fc->sb = sb; @@ -1191,7 +1201,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) mutex_lock(&fuse_mutex); err = -EINVAL; - if (*ctx->fudptr) + if (ctx->fudptr && *ctx->fudptr) goto err_unlock; err = fuse_ctl_add_conn(fc); @@ -1200,7 +1210,8 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) list_add_tail(&fc->entry, &fuse_conn_list); sb->s_root = root_dentry; - *ctx->fudptr = fud; + if (ctx->fudptr) + *ctx->fudptr = fud; mutex_unlock(&fuse_mutex); return 0; @@ -1208,7 +1219,8 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) mutex_unlock(&fuse_mutex); dput(root_dentry); err_dev_free: - fuse_dev_free(fud); + if (fud) + fuse_dev_free(fud); err: return err; } diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index bade7476890333..4c4ef5d6929813 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -60,6 +60,12 @@ struct virtio_fs_forget { struct virtio_fs_forget_req req; }; +struct virtio_fs_req_work { + struct fuse_req *req; + struct virtio_fs_vq *fsvq; + struct work_struct done_work; +}; + static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, struct fuse_req *req, bool in_flight); @@ -485,19 +491,67 @@ static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req) } /* Work function for request completion */ +static void virtio_fs_request_complete(struct fuse_req *req, + struct virtio_fs_vq *fsvq) +{ + struct fuse_pqueue *fpq = &fsvq->fud->pq; + struct fuse_conn *fc = fsvq->fud->fc; + struct fuse_args *args; + struct fuse_args_pages *ap; + unsigned int len, i, thislen; + struct page *page; + + /* + * TODO verify that server properly follows FUSE protocol + * (oh.uniq, oh.len) + */ + args = req->args; + copy_args_from_argbuf(args, req); + + if (args->out_pages && args->page_zeroing) { + len = args->out_args[args->out_numargs - 1].size; + ap = container_of(args, typeof(*ap), args); + for (i = 0; i < ap->num_pages; i++) { + thislen = ap->descs[i].length; + if (len < thislen) { + WARN_ON(ap->descs[i].offset); + page = ap->pages[i]; + zero_user_segment(page, len, thislen); + len = 0; + } else { + len -= thislen; + } + } + } + + spin_lock(&fpq->lock); + clear_bit(FR_SENT, &req->flags); + spin_unlock(&fpq->lock); + + fuse_request_end(fc, req); + spin_lock(&fsvq->lock); + dec_in_flight_req(fsvq); + spin_unlock(&fsvq->lock); +} + +static void virtio_fs_complete_req_work(struct work_struct *work) +{ + struct virtio_fs_req_work *w = + container_of(work, typeof(*w), done_work); + + virtio_fs_request_complete(w->req, w->fsvq); + kfree(w); +} + static void virtio_fs_requests_done_work(struct work_struct *work) { struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, done_work); struct fuse_pqueue *fpq = &fsvq->fud->pq; - struct fuse_conn *fc = fsvq->fud->fc; struct virtqueue *vq = fsvq->vq; struct fuse_req *req; - struct fuse_args_pages *ap; struct fuse_req *next; - struct fuse_args *args; - unsigned int len, i, thislen; - struct page *page; + unsigned int len; LIST_HEAD(reqs); /* Collect completed requests off the virtqueue */ @@ -515,38 +569,20 @@ static void virtio_fs_requests_done_work(struct work_struct *work) /* End requests */ list_for_each_entry_safe(req, next, &reqs, list) { - /* - * TODO verify that server properly follows FUSE protocol - * (oh.uniq, oh.len) - */ - args = req->args; - copy_args_from_argbuf(args, req); - - if (args->out_pages && args->page_zeroing) { - len = args->out_args[args->out_numargs - 1].size; - ap = container_of(args, typeof(*ap), args); - for (i = 0; i < ap->num_pages; i++) { - thislen = ap->descs[i].length; - if (len < thislen) { - WARN_ON(ap->descs[i].offset); - page = ap->pages[i]; - zero_user_segment(page, len, thislen); - len = 0; - } else { - len -= thislen; - } - } - } - - spin_lock(&fpq->lock); - clear_bit(FR_SENT, &req->flags); list_del_init(&req->list); - spin_unlock(&fpq->lock); - fuse_request_end(fc, req); - spin_lock(&fsvq->lock); - dec_in_flight_req(fsvq); - spin_unlock(&fsvq->lock); + /* blocking async request completes in a worker context */ + if (req->args->may_block) { + struct virtio_fs_req_work *w; + + w = kzalloc(sizeof(*w), GFP_NOFS | __GFP_NOFAIL); + INIT_WORK(&w->done_work, virtio_fs_complete_req_work); + w->fsvq = fsvq; + w->req = req; + schedule_work(&w->done_work); + } else { + virtio_fs_request_complete(req, fsvq); + } } } @@ -1067,7 +1103,7 @@ static int virtio_fs_fill_super(struct super_block *sb) err = -ENOMEM; /* Allocate fuse_dev for hiprio and notification queues */ - for (i = 0; i < VQ_REQUEST; i++) { + for (i = 0; i < fs->nvqs; i++) { struct virtio_fs_vq *fsvq = &fs->vqs[i]; fsvq->fud = fuse_dev_alloc(); @@ -1075,18 +1111,15 @@ static int virtio_fs_fill_super(struct super_block *sb) goto err_free_fuse_devs; } - ctx.fudptr = (void **)&fs->vqs[VQ_REQUEST].fud; + /* virtiofs allocates and installs its own fuse devices */ + ctx.fudptr = NULL; err = fuse_fill_super_common(sb, &ctx); if (err < 0) goto err_free_fuse_devs; - fc = fs->vqs[VQ_REQUEST].fud->fc; - for (i = 0; i < fs->nvqs; i++) { struct virtio_fs_vq *fsvq = &fs->vqs[i]; - if (i == VQ_REQUEST) - continue; /* already initialized */ fuse_dev_install(fsvq->fud, fc); } diff --git a/fs/namespace.c b/fs/namespace.c index 6d499ab254b717..7cd64240916573 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1937,6 +1937,9 @@ struct vfsmount *clone_private_mount(const struct path *path) if (IS_ERR(new_mnt)) return ERR_CAST(new_mnt); + /* Longterm mount to be removed by kern_unmount*() */ + new_mnt->mnt_ns = MNT_NS_INTERNAL; + return &new_mnt->mnt; } EXPORT_SYMBOL_GPL(clone_private_mount); @@ -3863,6 +3866,19 @@ void kern_unmount(struct vfsmount *mnt) } EXPORT_SYMBOL(kern_unmount); +void kern_unmount_array(struct vfsmount *mnt[], unsigned int num) +{ + unsigned int i; + + for (i = 0; i < num; i++) + if (mnt[i]) + real_mount(mnt[i])->mnt_ns = NULL; + synchronize_rcu_expedited(); + for (i = 0; i < num; i++) + mntput(mnt[i]); +} +EXPORT_SYMBOL(kern_unmount_array); + bool our_mnt(struct vfsmount *mnt) { return check_mnt(real_mount(mnt)); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 9709cf22cab3cd..79dd052c7dbf5a 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -47,7 +47,7 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new) { ssize_t list_size, size, value_size = 0; char *buf, *name, *value = NULL; - int uninitialized_var(error); + int error = 0; size_t slen; if (!(old->d_inode->i_opflags & IOP_XATTR) || @@ -584,9 +584,10 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) .link = c->link }; - err = ovl_lock_rename_workdir(c->workdir, c->destdir); - if (err) - return err; + /* workdir and destdir could be the same when copying up to indexdir */ + err = -EIO; + if (lock_rename(c->workdir, c->destdir) != NULL) + goto unlock; err = ovl_prep_cu_creds(c->dentry, &cc); if (err) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 279009dee3669a..1bba4813f9cb00 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -62,35 +62,59 @@ struct dentry *ovl_lookup_temp(struct dentry *workdir) } /* caller holds i_mutex on workdir */ -static struct dentry *ovl_whiteout(struct dentry *workdir) +static struct dentry *ovl_whiteout(struct ovl_fs *ofs) { int err; struct dentry *whiteout; + struct dentry *workdir = ofs->workdir; struct inode *wdir = workdir->d_inode; - whiteout = ovl_lookup_temp(workdir); - if (IS_ERR(whiteout)) - return whiteout; + if (!ofs->whiteout) { + whiteout = ovl_lookup_temp(workdir); + if (IS_ERR(whiteout)) + goto out; - err = ovl_do_whiteout(wdir, whiteout); - if (err) { - dput(whiteout); - whiteout = ERR_PTR(err); + err = ovl_do_whiteout(wdir, whiteout); + if (err) { + dput(whiteout); + whiteout = ERR_PTR(err); + goto out; + } + ofs->whiteout = whiteout; } + if (ofs->share_whiteout) { + whiteout = ovl_lookup_temp(workdir); + if (IS_ERR(whiteout)) + goto out; + + err = ovl_do_link(ofs->whiteout, wdir, whiteout); + if (!err) + goto out; + + if (err != -EMLINK) { + pr_warn("Failed to link whiteout - disabling whiteout inode sharing(nlink=%u, err=%i)\n", + ofs->whiteout->d_inode->i_nlink, err); + ofs->share_whiteout = false; + } + dput(whiteout); + } + whiteout = ofs->whiteout; + ofs->whiteout = NULL; +out: return whiteout; } /* Caller must hold i_mutex on both workdir and dir */ -int ovl_cleanup_and_whiteout(struct dentry *workdir, struct inode *dir, +int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry) { - struct inode *wdir = workdir->d_inode; + struct inode *wdir = ofs->workdir->d_inode; struct dentry *whiteout; int err; int flags = 0; - whiteout = ovl_whiteout(workdir); + whiteout = ovl_whiteout(ofs); err = PTR_ERR(whiteout); if (IS_ERR(whiteout)) return err; @@ -262,6 +286,8 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode, inode = ovl_get_inode(dentry->d_sb, &oip); if (IS_ERR(inode)) return PTR_ERR(inode); + if (inode == oip.newinode) + ovl_set_flag(OVL_UPPERDATA, inode); } else { WARN_ON(ovl_inode_real(inode) != d_inode(newdentry)); dput(newdentry); @@ -715,6 +741,7 @@ static bool ovl_matches_upper(struct dentry *dentry, struct dentry *upper) static int ovl_remove_and_whiteout(struct dentry *dentry, struct list_head *list) { + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *workdir = ovl_workdir(dentry); struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); struct dentry *upper; @@ -748,7 +775,7 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, goto out_dput_upper; } - err = ovl_cleanup_and_whiteout(workdir, d_inode(upperdir), upper); + err = ovl_cleanup_and_whiteout(ofs, d_inode(upperdir), upper); if (err) goto out_d_drop; diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index ed5c1078919ccb..8f4286450f92a5 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -204,7 +204,7 @@ static int ovl_check_encode_origin(struct dentry *dentry) * ovl_connect_layer() will try to make origin's layer "connected" by * copying up a "connectable" ancestor. */ - if (d_is_dir(dentry) && ofs->upper_mnt) + if (d_is_dir(dentry) && ovl_upper_mnt(ofs)) return ovl_connect_layer(dentry); /* Lower file handle for indexed and non-upper dir/non-dir */ @@ -231,12 +231,9 @@ static int ovl_dentry_to_fid(struct dentry *dentry, u32 *fid, int buflen) if (IS_ERR(fh)) return PTR_ERR(fh); - err = -EOVERFLOW; len = OVL_FH_LEN(fh); - if (len > buflen) - goto fail; - - memcpy(fid, fh, len); + if (len <= buflen) + memcpy(fid, fh, len); err = len; out: @@ -244,9 +241,8 @@ static int ovl_dentry_to_fid(struct dentry *dentry, u32 *fid, int buflen) return err; fail: - pr_warn_ratelimited("failed to encode file handle (%pd2, err=%i, buflen=%d, len=%d, type=%d)\n", - dentry, err, buflen, fh ? (int)fh->fb.len : 0, - fh ? fh->fb.type : 0); + pr_warn_ratelimited("failed to encode file handle (%pd2, err=%i)\n", + dentry, err); goto out; } @@ -254,7 +250,7 @@ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, struct inode *parent) { struct dentry *dentry; - int bytes = *max_len << 2; + int bytes, buflen = *max_len << 2; /* TODO: encode connectable file handles */ if (parent) @@ -264,12 +260,14 @@ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, if (WARN_ON(!dentry)) return FILEID_INVALID; - bytes = ovl_dentry_to_fid(dentry, fid, bytes); + bytes = ovl_dentry_to_fid(dentry, fid, buflen); dput(dentry); if (bytes <= 0) return FILEID_INVALID; *max_len = bytes >> 2; + if (bytes > buflen) + return FILEID_INVALID; return OVL_FILEID_V1; } @@ -679,10 +677,10 @@ static struct dentry *ovl_upper_fh_to_d(struct super_block *sb, struct dentry *dentry; struct dentry *upper; - if (!ofs->upper_mnt) + if (!ovl_upper_mnt(ofs)) return ERR_PTR(-EACCES); - upper = ovl_decode_real_fh(fh, ofs->upper_mnt, true); + upper = ovl_decode_real_fh(fh, ovl_upper_mnt(ofs), true); if (IS_ERR_OR_NULL(upper)) return upper; diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 87c362f65448b9..01820e654a2192 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include "overlayfs.h" @@ -39,10 +40,22 @@ static struct file *ovl_open_realfile(const struct file *file, struct file *realfile; const struct cred *old_cred; int flags = file->f_flags | O_NOATIME | FMODE_NONOTIFY; + int acc_mode = ACC_MODE(flags); + int err; + + if (flags & O_APPEND) + acc_mode |= MAY_APPEND; old_cred = ovl_override_creds(inode->i_sb); - realfile = open_with_fake_path(&file->f_path, flags, realinode, - current_cred()); + err = inode_permission(realinode, MAY_OPEN | acc_mode); + if (err) { + realfile = ERR_PTR(err); + } else if (!inode_owner_or_capable(realinode)) { + realfile = ERR_PTR(-EPERM); + } else { + realfile = open_with_fake_path(&file->f_path, flags, realinode, + current_cred()); + } revert_creds(old_cred); pr_debug("open(%p[%pD2/%c], 0%o) -> (%p, 0%o)\n", @@ -219,9 +232,8 @@ static void ovl_file_accessed(struct file *file) touch_atime(&file->f_path); } -static rwf_t ovl_iocb_to_rwf(struct kiocb *iocb) +static rwf_t ovl_iocb_to_rwf(int ifl) { - int ifl = iocb->ki_flags; rwf_t flags = 0; if (ifl & IOCB_NOWAIT) @@ -283,7 +295,7 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) old_cred = ovl_override_creds(file_inode(file)->i_sb); if (is_sync_kiocb(iocb)) { ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, - ovl_iocb_to_rwf(iocb)); + ovl_iocb_to_rwf(iocb->ki_flags)); } else { struct ovl_aio_req *aio_req; @@ -336,7 +348,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (is_sync_kiocb(iocb)) { file_start_write(real.file); ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, - ovl_iocb_to_rwf(iocb)); + ovl_iocb_to_rwf(iocb->ki_flags)); file_end_write(real.file); /* Update size */ ovl_copyattr(ovl_inode_real(inode), inode); @@ -520,7 +532,9 @@ static long ovl_real_ioctl(struct file *file, unsigned int cmd, return ret; old_cred = ovl_override_creds(file_inode(file)->i_sb); - ret = vfs_ioctl(real.file, cmd, arg); + ret = security_file_ioctl(real.file, cmd, arg); + if (!ret) + ret = vfs_ioctl(real.file, cmd, arg); revert_creds(old_cred); fdput(real); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 7af76b9004ebbd..8be6cd264f6650 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -457,7 +457,7 @@ int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags) if (flags & S_ATIME) { struct ovl_fs *ofs = inode->i_sb->s_fs_info; struct path upperpath = { - .mnt = ofs->upper_mnt, + .mnt = ovl_upper_mnt(ofs), .dentry = ovl_upperdentry_dereference(OVL_I(inode)), }; @@ -905,7 +905,7 @@ struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir) * Does overlay inode need to be hashed by lower inode? */ static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper, - struct dentry *lower, struct dentry *index) + struct dentry *lower, bool index) { struct ovl_fs *ofs = sb->s_fs_info; @@ -918,7 +918,7 @@ static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper, return true; /* Yes, if won't be copied up */ - if (!ofs->upper_mnt) + if (!ovl_upper_mnt(ofs)) return true; /* No, if lower hardlink is or will be broken on copy up */ @@ -954,7 +954,7 @@ struct inode *ovl_get_inode(struct super_block *sb, bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, oip->index); int fsid = bylower ? lowerpath->layer->fsid : 0; - bool is_dir, metacopy = false; + bool is_dir; unsigned long ino = 0; int err = oip->newinode ? -EEXIST : -ENOMEM; @@ -1015,15 +1015,6 @@ struct inode *ovl_get_inode(struct super_block *sb, if (oip->index) ovl_set_flag(OVL_INDEX, inode); - if (upperdentry) { - err = ovl_check_metacopy_xattr(upperdentry); - if (err < 0) - goto out_err; - metacopy = err; - if (!metacopy) - ovl_set_flag(OVL_UPPERDATA, inode); - } - OVL_I(inode)->redirect = oip->redirect; if (bylower) diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 0db23baf98e7c5..3566282a9199cb 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -191,16 +191,36 @@ static bool ovl_is_opaquedir(struct dentry *dentry) return ovl_check_dir_xattr(dentry, OVL_XATTR_OPAQUE); } +static struct dentry *ovl_lookup_positive_unlocked(const char *name, + struct dentry *base, int len, + bool drop_negative) +{ + struct dentry *ret = lookup_one_len_unlocked(name, base, len); + + if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { + if (drop_negative && ret->d_lockref.count == 1) { + spin_lock(&ret->d_lock); + /* Recheck condition under lock */ + if (d_is_negative(ret) && ret->d_lockref.count == 1) + __d_drop(ret); + spin_unlock(&ret->d_lock); + } + dput(ret); + ret = ERR_PTR(-ENOENT); + } + return ret; +} + static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, const char *name, unsigned int namelen, size_t prelen, const char *post, - struct dentry **ret) + struct dentry **ret, bool drop_negative) { struct dentry *this; int err; bool last_element = !post[0]; - this = lookup_positive_unlocked(name, base, namelen); + this = ovl_lookup_positive_unlocked(name, base, namelen, drop_negative); if (IS_ERR(this)) { err = PTR_ERR(this); this = NULL; @@ -276,7 +296,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, } static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d, - struct dentry **ret) + struct dentry **ret, bool drop_negative) { /* Counting down from the end, since the prefix can change */ size_t rem = d->name.len - 1; @@ -285,7 +305,7 @@ static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d, if (d->name.name[0] != '/') return ovl_lookup_single(base, d, d->name.name, d->name.len, - 0, "", ret); + 0, "", ret, drop_negative); while (!IS_ERR_OR_NULL(base) && d_can_lookup(base)) { const char *s = d->name.name + d->name.len - rem; @@ -298,7 +318,8 @@ static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d, return -EIO; err = ovl_lookup_single(base, d, s, thislen, - d->name.len - rem, next, &base); + d->name.len - rem, next, &base, + drop_negative); dput(dentry); if (err) return err; @@ -468,7 +489,7 @@ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index) if (IS_ERR_OR_NULL(fh)) return ERR_CAST(fh); - upper = ovl_decode_real_fh(fh, ofs->upper_mnt, true); + upper = ovl_decode_real_fh(fh, ovl_upper_mnt(ofs), true); kfree(fh); if (IS_ERR_OR_NULL(upper)) @@ -484,12 +505,6 @@ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index) return upper; } -/* Is this a leftover from create/whiteout of directory index entry? */ -static bool ovl_is_temp_index(struct dentry *index) -{ - return index->d_name.name[0] == '#'; -} - /* * Verify that an index entry name matches the origin file handle stored in * OVL_XATTR_ORIGIN and that origin file handle can be decoded to lower path. @@ -507,11 +522,6 @@ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index) if (!d_inode(index)) return 0; - /* Cleanup leftover from index create/cleanup attempt */ - err = -ESTALE; - if (ovl_is_temp_index(index)) - goto fail; - err = -EINVAL; if (index->d_name.len < sizeof(struct ovl_fb)*2) goto fail; @@ -823,7 +833,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, struct dentry *this; unsigned int i; int err; - bool metacopy = false; + bool uppermetacopy = false; struct ovl_lookup_data d = { .sb = dentry->d_sb, .name = dentry->d_name, @@ -841,7 +851,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, old_cred = ovl_override_creds(dentry->d_sb); upperdir = ovl_dentry_upper(dentry->d_parent); if (upperdir) { - err = ovl_lookup_layer(upperdir, &d, &upperdentry); + err = ovl_lookup_layer(upperdir, &d, &upperdentry, true); if (err) goto out; @@ -869,7 +879,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, goto out_put_upper; if (d.metacopy) - metacopy = true; + uppermetacopy = true; } if (d.redirect) { @@ -899,13 +909,19 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, else d.last = lower.layer->idx == roe->numlower; - err = ovl_lookup_layer(lower.dentry, &d, &this); + err = ovl_lookup_layer(lower.dentry, &d, &this, false); if (err) goto out_put; if (!this) continue; + if ((uppermetacopy || d.metacopy) && !ofs->config.metacopy) { + err = -EPERM; + pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n", dentry); + goto out_put; + } + /* * If no origin fh is stored in upper of a merge dir, store fh * of lower dir and set upper parent "impure". @@ -940,21 +956,21 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, origin = this; } - if (d.metacopy) - metacopy = true; - /* - * Do not store intermediate metacopy dentries in chain, - * except top most lower metacopy dentry - */ if (d.metacopy && ctr) { + /* + * Do not store intermediate metacopy dentries in + * lower chain, except top most lower metacopy dentry. + * Continue the loop so that if there is an absolute + * redirect on this dentry, poe can be reset to roe. + */ dput(this); - continue; + this = NULL; + } else { + stack[ctr].dentry = this; + stack[ctr].layer = lower.layer; + ctr++; } - stack[ctr].dentry = this; - stack[ctr].layer = lower.layer; - ctr++; - /* * Following redirects can have security consequences: it's like * a symlink into the lower layer without the permission checks. @@ -982,22 +998,17 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, } } - if (metacopy) { - /* - * Found a metacopy dentry but did not find corresponding - * data dentry - */ - if (d.metacopy) { - err = -EIO; - goto out_put; - } - - err = -EPERM; - if (!ofs->config.metacopy) { - pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n", - dentry); - goto out_put; - } + /* + * For regular non-metacopy upper dentries, there is no lower + * path based lookup, hence ctr will be zero. If a dentry is found + * using ORIGIN xattr on upper, install it in stack. + * + * For metacopy dentry, path based lookup will find lower dentries. + * Just make sure a corresponding data dentry has been found. + */ + if (d.metacopy || (uppermetacopy && !ctr)) { + err = -EIO; + goto out_put; } else if (!d.is_dir && upperdentry && !ctr && origin_path) { if (WARN_ON(stack != NULL)) { err = -EIO; @@ -1005,25 +1016,30 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, } stack = origin_path; ctr = 1; + origin = origin_path->dentry; origin_path = NULL; } /* - * Lookup index by lower inode and verify it matches upper inode. - * We only trust dir index if we verified that lower dir matches - * origin, otherwise dir index entries may be inconsistent and we - * ignore them. + * Always lookup index if there is no-upperdentry. + * + * For the case of upperdentry, we have set origin by now if it + * needed to be set. There are basically three cases. + * + * For directories, lookup index by lower inode and verify it matches + * upper inode. We only trust dir index if we verified that lower dir + * matches origin, otherwise dir index entries may be inconsistent + * and we ignore them. + * + * For regular upper, we already set origin if upper had ORIGIN + * xattr. There is no verification though as there is no path + * based dentry lookup in lower in this case. * - * For non-dir upper metacopy dentry, we already set "origin" if we - * verified that lower matched upper origin. If upper origin was - * not present (because lower layer did not support fh encode/decode), - * or indexing is not enabled, do not set "origin" and skip looking up - * index. This case should be handled in same way as a non-dir upper - * without ORIGIN is handled. + * For metacopy upper, we set a verified origin already if index + * is enabled and if upper had an ORIGIN xattr. * - * Always lookup index of non-dir non-metacopy and non-upper. */ - if (ctr && (!upperdentry || (!d.is_dir && !metacopy))) + if (!upperdentry && ctr) origin = stack[0].dentry; if (origin && ovl_indexdir(dentry->d_sb) && @@ -1074,6 +1090,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_free_oe; + if (upperdentry && !uppermetacopy) + ovl_set_flag(OVL_UPPERDATA, inode); } ovl_dentry_update_reval(dentry, upperdentry, diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index e6f3670146ed1d..b725c7f15ff49b 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -355,6 +355,9 @@ int ovl_check_fb_len(struct ovl_fb *fb, int fb_len); static inline int ovl_check_fh_len(struct ovl_fh *fh, int fh_len) { + if (fh_len < sizeof(struct ovl_fh)) + return -EINVAL; + return ovl_check_fb_len(&fh->fb, fh_len - OVL_FH_WIRE_OFFSET); } @@ -394,8 +397,8 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list); void ovl_cache_free(struct list_head *list); void ovl_dir_cache_free(struct inode *inode); int ovl_check_d_type_supported(struct path *realpath); -void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, - struct dentry *dentry, int level); +int ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, + struct dentry *dentry, int level); int ovl_indexdir_cleanup(struct ovl_fs *ofs); /* inode.c */ @@ -421,7 +424,7 @@ struct ovl_inode_params { struct inode *newinode; struct dentry *upperdentry; struct ovl_path *lowerpath; - struct dentry *index; + bool index; unsigned int numlower; char *redirect; struct dentry *lowerdata; @@ -455,7 +458,7 @@ static inline void ovl_copyflags(struct inode *from, struct inode *to) /* dir.c */ extern const struct inode_operations ovl_dir_inode_operations; -int ovl_cleanup_and_whiteout(struct dentry *workdir, struct inode *dir, +int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry); struct ovl_cattr { dev_t rdev; diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 5762d802fe0164..b429c80879ee00 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -46,7 +46,6 @@ struct ovl_path { /* private information held for overlayfs's superblock */ struct ovl_fs { - struct vfsmount *upper_mnt; unsigned int numlayer; /* Number of unique fs among layers including upper fs */ unsigned int numfs; @@ -68,8 +67,8 @@ struct ovl_fs { /* Did we take the inuse lock? */ bool upperdir_locked; bool workdir_locked; + bool share_whiteout; /* Traps in ovl inode cache */ - struct inode *upperdir_trap; struct inode *workbasedir_trap; struct inode *workdir_trap; struct inode *indexdir_trap; @@ -77,8 +76,15 @@ struct ovl_fs { int xino_mode; /* For allocation of non-persistent inode numbers */ atomic_long_t last_ino; + /* Whiteout dentry cache */ + struct dentry *whiteout; }; +static inline struct vfsmount *ovl_upper_mnt(struct ovl_fs *ofs) +{ + return ofs->layers[0].mnt; +} + static inline struct ovl_fs *OVL_FS(struct super_block *sb) { return (struct ovl_fs *)sb->s_fs_info; diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index e452ff7d583d25..6918b98faeb62c 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -297,7 +297,7 @@ static inline int ovl_dir_read(struct path *realpath, struct file *realfile; int err; - realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY); + realfile = ovl_path_open(realpath, O_RDONLY | O_LARGEFILE); if (IS_ERR(realfile)) return PTR_ERR(realfile); @@ -743,8 +743,10 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) struct ovl_dir_file *od = file->private_data; struct dentry *dentry = file->f_path.dentry; struct ovl_cache_entry *p; + const struct cred *old_cred; int err; + old_cred = ovl_override_creds(dentry->d_sb); if (!ctx->pos) ovl_dir_reset(file); @@ -758,17 +760,20 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) (ovl_same_fs(dentry->d_sb) && (ovl_is_impure_dir(file) || OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) { - return ovl_iterate_real(file, ctx); + err = ovl_iterate_real(file, ctx); + } else { + err = iterate_dir(od->realfile, ctx); } - return iterate_dir(od->realfile, ctx); + goto out; } if (!od->cache) { struct ovl_dir_cache *cache; cache = ovl_cache_get(dentry); + err = PTR_ERR(cache); if (IS_ERR(cache)) - return PTR_ERR(cache); + goto out; od->cache = cache; ovl_seek_cursor(od, ctx->pos); @@ -780,7 +785,7 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) if (!p->ino) { err = ovl_cache_update_ino(&file->f_path, p); if (err) - return err; + goto out; } if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) break; @@ -788,7 +793,10 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) od->cursor = p->l_node.next; ctx->pos++; } - return 0; + err = 0; +out: + revert_creds(old_cred); + return err; } static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) @@ -831,6 +839,19 @@ static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) return res; } +static struct file *ovl_dir_open_realfile(struct file *file, + struct path *realpath) +{ + struct file *res; + const struct cred *old_cred; + + old_cred = ovl_override_creds(file_inode(file)->i_sb); + res = ovl_path_open(realpath, O_RDONLY | (file->f_flags & O_LARGEFILE)); + revert_creds(old_cred); + + return res; +} + static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, int datasync) { @@ -853,7 +874,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, struct path upperpath; ovl_path_upper(dentry, &upperpath); - realfile = ovl_path_open(&upperpath, O_RDONLY); + realfile = ovl_dir_open_realfile(file, &upperpath); inode_lock(inode); if (!od->upperfile) { @@ -904,7 +925,7 @@ static int ovl_dir_open(struct inode *inode, struct file *file) return -ENOMEM; type = ovl_path_real(file->f_path.dentry, &realpath); - realfile = ovl_path_open(&realpath, file->f_flags); + realfile = ovl_dir_open_realfile(file, &realpath); if (IS_ERR(realfile)) { kfree(od); return PTR_ERR(realfile); @@ -1071,14 +1092,13 @@ static void ovl_workdir_cleanup_recurse(struct path *path, int level) ovl_cache_free(&list); } -void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, +int ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, struct dentry *dentry, int level) { int err; if (!d_is_dir(dentry) || level > 1) { - ovl_cleanup(dir, dentry); - return; + return ovl_cleanup(dir, dentry); } err = ovl_do_rmdir(dir, dentry); @@ -1088,8 +1108,10 @@ void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, inode_unlock(dir); ovl_workdir_cleanup_recurse(&path, level + 1); inode_lock_nested(dir, I_MUTEX_PARENT); - ovl_cleanup(dir, dentry); + err = ovl_cleanup(dir, dentry); } + + return err; } int ovl_indexdir_cleanup(struct ovl_fs *ofs) @@ -1098,7 +1120,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) struct dentry *indexdir = ofs->indexdir; struct dentry *index = NULL; struct inode *dir = indexdir->d_inode; - struct path path = { .mnt = ofs->upper_mnt, .dentry = indexdir }; + struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = indexdir }; LIST_HEAD(list); struct rb_root root = RB_ROOT; struct ovl_cache_entry *p; @@ -1128,6 +1150,13 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) index = NULL; break; } + /* Cleanup leftover from index create/cleanup attempt */ + if (index->d_name.name[0] == '#') { + err = ovl_workdir_cleanup(dir, path.mnt, index, 1); + if (err) + break; + goto next; + } err = ovl_verify_index(ofs, index); if (!err) { goto next; @@ -1146,7 +1175,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) * Whiteout orphan index to block future open by * handle after overlay nlink dropped to zero. */ - err = ovl_cleanup_and_whiteout(indexdir, dir, index); + err = ovl_cleanup_and_whiteout(ofs, dir, index); } else { /* Cleanup orphan index entries */ err = ovl_cleanup(dir, index); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 732ad5495c9219..91476bc422f964 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -211,24 +211,28 @@ static void ovl_destroy_inode(struct inode *inode) static void ovl_free_fs(struct ovl_fs *ofs) { + struct vfsmount **mounts; unsigned i; iput(ofs->workbasedir_trap); iput(ofs->indexdir_trap); iput(ofs->workdir_trap); - iput(ofs->upperdir_trap); + dput(ofs->whiteout); dput(ofs->indexdir); dput(ofs->workdir); if (ofs->workdir_locked) ovl_inuse_unlock(ofs->workbasedir); dput(ofs->workbasedir); if (ofs->upperdir_locked) - ovl_inuse_unlock(ofs->upper_mnt->mnt_root); - mntput(ofs->upper_mnt); - for (i = 1; i < ofs->numlayer; i++) { + ovl_inuse_unlock(ovl_upper_mnt(ofs)->mnt_root); + + /* Hack! Reuse ofs->layers as a vfsmount array before freeing it */ + mounts = (struct vfsmount **) ofs->layers; + for (i = 0; i < ofs->numlayer; i++) { iput(ofs->layers[i].trap); - mntput(ofs->layers[i].mnt); + mounts[i] = ofs->layers[i].mnt; } + kern_unmount_array(mounts, ofs->numlayer); kfree(ofs->layers); for (i = 0; i < ofs->numfs; i++) free_anon_bdev(ofs->fs[i].pseudo_dev); @@ -257,12 +261,12 @@ static int ovl_sync_fs(struct super_block *sb, int wait) struct super_block *upper_sb; int ret; - if (!ofs->upper_mnt) + if (!ovl_upper_mnt(ofs)) return 0; /* - * If this is a sync(2) call or an emergency sync, all the super blocks - * will be iterated, including upper_sb, so no need to do anything. + * Not called for sync(2) call or an emergency sync (SB_I_SKIP_SYNC). + * All the super blocks will be iterated, including upper_sb. * * If this is a syncfs(2) call, then we do need to call * sync_filesystem() on upper_sb, but enough if we do it when being @@ -271,7 +275,7 @@ static int ovl_sync_fs(struct super_block *sb, int wait) if (!wait) return 0; - upper_sb = ofs->upper_mnt->mnt_sb; + upper_sb = ovl_upper_mnt(ofs)->mnt_sb; down_read(&upper_sb->s_umount); ret = sync_filesystem(upper_sb); @@ -309,7 +313,7 @@ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) /* Will this overlay be forced to mount/remount ro? */ static bool ovl_force_readonly(struct ovl_fs *ofs) { - return (!ofs->upper_mnt || !ofs->workdir); + return (!ovl_upper_mnt(ofs) || !ofs->workdir); } static const char *ovl_redirect_mode_def(void) @@ -364,11 +368,20 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) static int ovl_remount(struct super_block *sb, int *flags, char *data) { struct ovl_fs *ofs = sb->s_fs_info; + struct super_block *upper_sb; + int ret = 0; if (!(*flags & SB_RDONLY) && ovl_force_readonly(ofs)) return -EROFS; - return 0; + if (*flags & SB_RDONLY && !sb_rdonly(sb)) { + upper_sb = ovl_upper_mnt(ofs)->mnt_sb; + down_read(&upper_sb->s_umount); + ret = sync_filesystem(upper_sb); + up_read(&upper_sb->s_umount); + } + + return ret; } static const struct super_operations ovl_super_operations = { @@ -470,6 +483,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) char *p; int err; bool metacopy_opt = false, redirect_opt = false; + bool nfs_export_opt = false, index_opt = false; config->redirect_mode = kstrdup(ovl_redirect_mode_def(), GFP_KERNEL); if (!config->redirect_mode) @@ -519,18 +533,22 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) case OPT_INDEX_ON: config->index = true; + index_opt = true; break; case OPT_INDEX_OFF: config->index = false; + index_opt = true; break; case OPT_NFS_EXPORT_ON: config->nfs_export = true; + nfs_export_opt = true; break; case OPT_NFS_EXPORT_OFF: config->nfs_export = false; + nfs_export_opt = true; break; case OPT_XINO_ON: @@ -552,6 +570,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) case OPT_METACOPY_OFF: config->metacopy = false; + metacopy_opt = true; break; default: @@ -601,6 +620,48 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) } } + /* Resolve nfs_export -> index dependency */ + if (config->nfs_export && !config->index) { + if (nfs_export_opt && index_opt) { + pr_err("conflicting options: nfs_export=on,index=off\n"); + return -EINVAL; + } + if (index_opt) { + /* + * There was an explicit index=off that resulted + * in this conflict. + */ + pr_info("disabling nfs_export due to index=off\n"); + config->nfs_export = false; + } else { + /* Automatically enable index otherwise. */ + config->index = true; + } + } + + /* Resolve nfs_export -> !metacopy dependency */ + if (config->nfs_export && config->metacopy) { + if (nfs_export_opt && metacopy_opt) { + pr_err("conflicting options: nfs_export=on,metacopy=on\n"); + return -EINVAL; + } + if (metacopy_opt) { + /* + * There was an explicit metacopy=on that resulted + * in this conflict. + */ + pr_info("disabling nfs_export due to metacopy=on\n"); + config->nfs_export = false; + } else { + /* + * There was an explicit nfs_export=on that resulted + * in this conflict. + */ + pr_info("disabling metacopy due to nfs_export=on\n"); + config->metacopy = false; + } + } + return 0; } @@ -611,15 +672,12 @@ static struct dentry *ovl_workdir_create(struct ovl_fs *ofs, const char *name, bool persist) { struct inode *dir = ofs->workbasedir->d_inode; - struct vfsmount *mnt = ofs->upper_mnt; + struct vfsmount *mnt = ovl_upper_mnt(ofs); struct dentry *work; int err; bool retried = false; - bool locked = false; inode_lock_nested(dir, I_MUTEX_PARENT); - locked = true; - retry: work = lookup_one_len(name, ofs->workbasedir, strlen(name)); @@ -680,9 +738,7 @@ static struct dentry *ovl_workdir_create(struct ovl_fs *ofs, goto out_err; } out_unlock: - if (locked) - inode_unlock(dir); - + inode_unlock(dir); return work; out_dput: @@ -779,11 +835,11 @@ static int ovl_lower_dir(const char *name, struct path *path, err = ovl_mount_dir_noesc(name, path); if (err) - goto out; + return err; err = ovl_check_namelen(path, ofs, name); if (err) - goto out_put; + return err; *stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth); @@ -805,11 +861,6 @@ static int ovl_lower_dir(const char *name, struct path *path, ofs->xino_mode = -1; return 0; - -out_put: - path_put_init(path); -out: - return err; } /* Workdir should not be subdir of upperdir and vice versa */ @@ -1016,7 +1067,7 @@ static int ovl_report_in_use(struct ovl_fs *ofs, const char *name) } static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs, - struct path *upperpath) + struct ovl_layer *upper_layer, struct path *upperpath) { struct vfsmount *upper_mnt; int err; @@ -1036,7 +1087,7 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs, if (err) goto out; - err = ovl_setup_trap(sb, upperpath->dentry, &ofs->upperdir_trap, + err = ovl_setup_trap(sb, upperpath->dentry, &upper_layer->trap, "upperdir"); if (err) goto out; @@ -1050,9 +1101,23 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs, /* Don't inherit atime flags */ upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME); - ofs->upper_mnt = upper_mnt; + upper_layer->mnt = upper_mnt; + upper_layer->idx = 0; + upper_layer->fsid = 0; - if (ovl_inuse_trylock(ofs->upper_mnt->mnt_root)) { + /* + * Inherit SB_NOSEC flag from upperdir. + * + * This optimization changes behavior when a security related attribute + * (suid/sgid/security.*) is changed on an underlying layer. This is + * okay because we don't yet have guarantees in that case, but it will + * need careful treatment once we want to honour changes to underlying + * filesystems. + */ + if (upper_mnt->mnt_sb->s_flags & SB_NOSEC) + sb->s_flags |= SB_NOSEC; + + if (ovl_inuse_trylock(ovl_upper_mnt(ofs)->mnt_root)) { ofs->upperdir_locked = true; } else { err = ovl_report_in_use(ofs, "upperdir"); @@ -1128,7 +1193,7 @@ static int ovl_check_rename_whiteout(struct dentry *workdir) static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, struct path *workpath) { - struct vfsmount *mnt = ofs->upper_mnt; + struct vfsmount *mnt = ovl_upper_mnt(ofs); struct dentry *temp; bool rename_whiteout; bool d_type; @@ -1272,7 +1337,7 @@ static int ovl_get_workdir(struct super_block *sb, struct ovl_fs *ofs, static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, struct ovl_entry *oe, struct path *upperpath) { - struct vfsmount *mnt = ofs->upper_mnt; + struct vfsmount *mnt = ovl_upper_mnt(ofs); int err; err = mnt_want_write(mnt); @@ -1328,7 +1393,7 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid) { unsigned int i; - if (!ofs->config.nfs_export && !ofs->upper_mnt) + if (!ofs->config.nfs_export && !ovl_upper_mnt(ofs)) return true; for (i = 0; i < ofs->numfs; i++) { @@ -1388,18 +1453,13 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) } static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, - struct path *stack, unsigned int numlower) + struct path *stack, unsigned int numlower, + struct ovl_layer *layers) { int err; unsigned int i; - struct ovl_layer *layers; err = -ENOMEM; - layers = kcalloc(numlower + 1, sizeof(struct ovl_layer), GFP_KERNEL); - if (!layers) - goto out; - ofs->layers = layers; - ofs->fs = kcalloc(numlower + 1, sizeof(struct ovl_sb), GFP_KERNEL); if (ofs->fs == NULL) goto out; @@ -1407,11 +1467,6 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, /* idx/fsid 0 are reserved for upper fs even with lower only overlay */ ofs->numfs++; - layers[0].mnt = ofs->upper_mnt; - layers[0].idx = 0; - layers[0].fsid = 0; - ofs->numlayer = 1; - /* * All lower layers that share the same fs as upper layer, use the same * pseudo_dev as upper layer. Allocate fs[0].pseudo_dev even for lower @@ -1424,8 +1479,8 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, goto out; } - if (ofs->upper_mnt) { - ofs->fs[0].sb = ofs->upper_mnt->mnt_sb; + if (ovl_upper_mnt(ofs)) { + ofs->fs[0].sb = ovl_upper_mnt(ofs)->mnt_sb; ofs->fs[0].is_lower = false; } @@ -1480,7 +1535,7 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, * inode number or a non persistent inode number allocated from a * dedicated range. */ - if (ofs->numfs - !ofs->upper_mnt == 1) { + if (ofs->numfs - !ovl_upper_mnt(ofs) == 1) { if (ofs->config.xino == OVL_XINO_ON) pr_info("\"xino=on\" is useless with all layers on same fs, ignore.\n"); ofs->xino_mode = 0; @@ -1509,44 +1564,30 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, } static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, - struct ovl_fs *ofs) + const char *lower, unsigned int numlower, + struct ovl_fs *ofs, struct ovl_layer *layers) { int err; - char *lowertmp, *lower; struct path *stack = NULL; - unsigned int stacklen, numlower = 0, i; + unsigned int i; struct ovl_entry *oe; - err = -ENOMEM; - lowertmp = kstrdup(ofs->config.lowerdir, GFP_KERNEL); - if (!lowertmp) - goto out_err; - - err = -EINVAL; - stacklen = ovl_split_lowerdirs(lowertmp); - if (stacklen > OVL_MAX_STACK) { - pr_err("too many lower directories, limit is %d\n", - OVL_MAX_STACK); - goto out_err; - } else if (!ofs->config.upperdir && stacklen == 1) { + if (!ofs->config.upperdir && numlower == 1) { pr_err("at least 2 lowerdir are needed while upperdir nonexistent\n"); - goto out_err; + return ERR_PTR(-EINVAL); } else if (!ofs->config.upperdir && ofs->config.nfs_export && ofs->config.redirect_follow) { pr_warn("NFS export requires \"redirect_dir=nofollow\" on non-upper mount, falling back to nfs_export=off.\n"); ofs->config.nfs_export = false; } - err = -ENOMEM; - stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL); + stack = kcalloc(numlower, sizeof(struct path), GFP_KERNEL); if (!stack) - goto out_err; + return ERR_PTR(-ENOMEM); err = -EINVAL; - lower = lowertmp; - for (numlower = 0; numlower < stacklen; numlower++) { - err = ovl_lower_dir(lower, &stack[numlower], ofs, - &sb->s_stack_depth); + for (i = 0; i < numlower; i++) { + err = ovl_lower_dir(lower, &stack[i], ofs, &sb->s_stack_depth); if (err) goto out_err; @@ -1560,7 +1601,7 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, goto out_err; } - err = ovl_get_layers(sb, ofs, stack, numlower); + err = ovl_get_layers(sb, ofs, stack, numlower, layers); if (err) goto out_err; @@ -1578,7 +1619,6 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, for (i = 0; i < numlower; i++) path_put(&stack[i]); kfree(stack); - kfree(lowertmp); return oe; @@ -1629,8 +1669,8 @@ static int ovl_check_overlapping_layers(struct super_block *sb, { int i, err; - if (ofs->upper_mnt) { - err = ovl_check_layer(sb, ofs, ofs->upper_mnt->mnt_root, + if (ovl_upper_mnt(ofs)) { + err = ovl_check_layer(sb, ofs, ovl_upper_mnt(ofs)->mnt_root, "upperdir"); if (err) return err; @@ -1702,7 +1742,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) struct dentry *root_dentry; struct ovl_entry *oe; struct ovl_fs *ofs; + struct ovl_layer *layers; struct cred *cred; + char *splitlower = NULL; + unsigned int numlower; int err; sb->s_d_op = &ovl_dentry_operations; @@ -1716,6 +1759,9 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!cred) goto out_err; + /* Is there a reason anyone would want not to share whiteouts? */ + ofs->share_whiteout = true; + ofs->config.index = ovl_index_def; ofs->config.nfs_export = ovl_nfs_export_def; ofs->config.xino = ovl_xino_def(); @@ -1731,6 +1777,26 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) goto out_err; } + err = -ENOMEM; + splitlower = kstrdup(ofs->config.lowerdir, GFP_KERNEL); + if (!splitlower) + goto out_err; + + numlower = ovl_split_lowerdirs(splitlower); + if (numlower > OVL_MAX_STACK) { + pr_err("too many lower directories, limit is %d\n", + OVL_MAX_STACK); + goto out_err; + } + + layers = kcalloc(numlower + 1, sizeof(struct ovl_layer), GFP_KERNEL); + if (!layers) + goto out_err; + + ofs->layers = layers; + /* Layer 0 is reserved for upper even if there's no upper */ + ofs->numlayer = 1; + sb->s_stack_depth = 0; sb->s_maxbytes = MAX_LFS_FILESIZE; atomic_long_set(&ofs->last_ino, 1); @@ -1752,7 +1818,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) goto out_err; } - err = ovl_get_upper(sb, ofs, &upperpath); + err = ovl_get_upper(sb, ofs, &layers[0], &upperpath); if (err) goto out_err; @@ -1763,31 +1829,35 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!ofs->workdir) sb->s_flags |= SB_RDONLY; - sb->s_stack_depth = ofs->upper_mnt->mnt_sb->s_stack_depth; - sb->s_time_gran = ofs->upper_mnt->mnt_sb->s_time_gran; + sb->s_stack_depth = ovl_upper_mnt(ofs)->mnt_sb->s_stack_depth; + sb->s_time_gran = ovl_upper_mnt(ofs)->mnt_sb->s_time_gran; } - oe = ovl_get_lowerstack(sb, ofs); + oe = ovl_get_lowerstack(sb, splitlower, numlower, ofs, layers); err = PTR_ERR(oe); if (IS_ERR(oe)) goto out_err; /* If the upper fs is nonexistent, we mark overlayfs r/o too */ - if (!ofs->upper_mnt) + if (!ovl_upper_mnt(ofs)) sb->s_flags |= SB_RDONLY; if (!(ovl_force_readonly(ofs)) && ofs->config.index) { + /* index dir will act also as workdir */ + dput(ofs->workdir); + ofs->workdir = NULL; + iput(ofs->workdir_trap); + ofs->workdir_trap = NULL; + err = ovl_get_indexdir(sb, ofs, oe, &upperpath); if (err) goto out_free_oe; /* Force r/o mount with no index dir */ - if (!ofs->indexdir) { - dput(ofs->workdir); - ofs->workdir = NULL; + if (ofs->indexdir) + ofs->workdir = dget(ofs->indexdir); + else sb->s_flags |= SB_RDONLY; - } - } err = ovl_check_overlapping_layers(sb, ofs); @@ -1797,7 +1867,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) /* Show index=off in /proc/mounts for forced r/o mount */ if (!ofs->indexdir) { ofs->config.index = false; - if (ofs->upper_mnt && ofs->config.nfs_export) { + if (ovl_upper_mnt(ofs) && ofs->config.nfs_export) { pr_warn("NFS export requires an index dir, falling back to nfs_export=off.\n"); ofs->config.nfs_export = false; } @@ -1818,6 +1888,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_xattr = ovl_xattr_handlers; sb->s_fs_info = ofs; sb->s_flags |= SB_POSIXACL; + sb->s_iflags |= SB_I_SKIP_SYNC; err = -ENOMEM; root_dentry = ovl_get_root(sb, upperpath.dentry, oe); @@ -1825,6 +1896,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) goto out_free_oe; mntput(upperpath.mnt); + kfree(splitlower); sb->s_root = root_dentry; @@ -1834,6 +1906,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) ovl_entry_stack_free(oe); kfree(oe); out_err: + kfree(splitlower); path_put(&upperpath); ovl_free_fs(ofs); out: diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 36b60788ee473f..56c1f89f20c9fd 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -18,13 +18,13 @@ int ovl_want_write(struct dentry *dentry) { struct ovl_fs *ofs = dentry->d_sb->s_fs_info; - return mnt_want_write(ofs->upper_mnt); + return mnt_want_write(ovl_upper_mnt(ofs)); } void ovl_drop_write(struct dentry *dentry) { struct ovl_fs *ofs = dentry->d_sb->s_fs_info; - mnt_drop_write(ofs->upper_mnt); + mnt_drop_write(ovl_upper_mnt(ofs)); } struct dentry *ovl_workdir(struct dentry *dentry) @@ -150,7 +150,7 @@ void ovl_path_upper(struct dentry *dentry, struct path *path) { struct ovl_fs *ofs = dentry->d_sb->s_fs_info; - path->mnt = ofs->upper_mnt; + path->mnt = ovl_upper_mnt(ofs); path->dentry = ovl_dentry_upper(dentry); } @@ -459,7 +459,32 @@ bool ovl_is_whiteout(struct dentry *dentry) struct file *ovl_path_open(struct path *path, int flags) { - return dentry_open(path, flags | O_NOATIME, current_cred()); + struct inode *inode = d_inode(path->dentry); + int err, acc_mode; + + if (flags & ~(O_ACCMODE | O_LARGEFILE)) + BUG(); + + switch (flags & O_ACCMODE) { + case O_RDONLY: + acc_mode = MAY_READ; + break; + case O_WRONLY: + acc_mode = MAY_WRITE; + break; + default: + BUG(); + } + + err = inode_permission(inode, acc_mode | MAY_OPEN); + if (err) + return ERR_PTR(err); + + /* O_NOATIME is an optimization, don't fail if not permitted */ + if (inode_owner_or_capable(inode)) + flags |= O_NOATIME; + + return dentry_open(path, flags, current_cred()); } /* Caller should hold ovl_inode->lock */ @@ -707,7 +732,8 @@ static void ovl_cleanup_index(struct dentry *dentry) index = NULL; } else if (ovl_index_all(dentry->d_sb)) { /* Whiteout orphan index to block future open by handle */ - err = ovl_cleanup_and_whiteout(indexdir, dir, index); + err = ovl_cleanup_and_whiteout(OVL_FS(dentry->d_sb), + dir, index); } else { /* Cleanup orphan index entries */ err = ovl_cleanup(dir, index); diff --git a/fs/sync.c b/fs/sync.c index c6f6f5be5682a9..1373a610dc784e 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -76,7 +76,8 @@ static void sync_inodes_one_sb(struct super_block *sb, void *arg) static void sync_fs_one_sb(struct super_block *sb, void *arg) { - if (!sb_rdonly(sb) && sb->s_op->sync_fs) + if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC) && + sb->s_op->sync_fs) sb->s_op->sync_fs(sb, *(int *)arg); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 0b026329dbed73..19ef6c88c152e5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1412,6 +1412,8 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 #define SB_I_UNTRUSTED_MOUNTER 0x00000040 +#define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ + /* Possible states of 'frozen' field */ enum { SB_UNFROZEN = 0, /* FS is unfrozen */ diff --git a/include/linux/mount.h b/include/linux/mount.h index 7edac8c7a9c1c9..de657bd211fa64 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -111,4 +111,6 @@ extern unsigned int sysctl_mount_max; extern bool path_is_mountpoint(const struct path *path); +extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); + #endif /* _LINUX_MOUNT_H */ diff --git a/security/security.c b/security/security.c index 545243fae4a641..e0290b7e6a082e 100644 --- a/security/security.c +++ b/security/security.c @@ -1464,6 +1464,7 @@ int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return call_int_hook(file_ioctl, 0, file, cmd, arg); } +EXPORT_SYMBOL_GPL(security_file_ioctl); static inline unsigned long mmap_prot(struct file *file, unsigned long prot) {