author     Andreas Gruenbacher <agruenba@redhat.com>   2022-08-05 19:37:03 +0300
committer  Andreas Gruenbacher <agruenba@redhat.com>   2022-08-05 19:37:03 +0300
commit     446279168e030fd0ed68e2bba336bef8bb3da352 (patch)
tree       0944d4dc64b07f74a6cf73d2ce803a789f8d786d /fs
parent     44dab005fd4298c209ea32ffda6131cad4358d21 (diff)
parent     6feaec81477af0390a41470cbd6b353f68dd853c (diff)
download   linux-446279168e030fd0ed68e2bba336bef8bb3da352.tar.xz
Merge part of branch 'for-next.instantiate' into for-next
Diffstat (limited to 'fs')
393 files changed, 12055 insertions, 9255 deletions
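
Most of the 9p, afs and ceph hunks below apply one mechanical conversion: the old anonymous struct pairing a vfs_inode with a netfs_i_context is replaced by a single embedded struct netfs_inode, every ->vfs_inode access becomes ->netfs.inode, netfs_i_context_init() becomes netfs_inode_init(), and the request ops' .cleanup hook is renamed to .free_request. A minimal sketch of the resulting shape, using a hypothetical myfs_inode rather than any of the real filesystems:

#include <linux/fs.h>
#include <linux/netfs.h>

static const struct netfs_request_ops myfs_req_ops = {
	/* .init_request, .free_request (formerly .cleanup), .issue_read, ... */
};

struct myfs_inode {
	struct netfs_inode netfs;	/* netfslib context and VFS inode; must come first */
	unsigned int cache_validity;	/* filesystem-private fields follow */
};

static inline struct myfs_inode *MYFS_I(const struct inode *inode)
{
	/* The containing inode is now reached through the embedded netfs_inode. */
	return container_of(inode, struct myfs_inode, netfs.inode);
}

static void myfs_set_netfs_context(struct inode *inode)
{
	/* Replaces the old netfs_i_context_init(inode, ops) call. */
	netfs_inode_init(&MYFS_I(inode)->netfs, &myfs_req_ops);
}
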
diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 1c8dc696d516..cebba4eaa0b5 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -62,12 +62,12 @@ void v9fs_cache_inode_get_cookie(struct inode *inode) version = cpu_to_le32(v9inode->qid.version); path = cpu_to_le64(v9inode->qid.path); v9ses = v9fs_inode2v9ses(inode); - v9inode->netfs_ctx.cache = + v9inode->netfs.cache = fscache_acquire_cookie(v9fs_session_cache(v9ses), 0, &path, sizeof(path), &version, sizeof(version), - i_size_read(&v9inode->vfs_inode)); + i_size_read(&v9inode->netfs.inode)); p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", inode, v9fs_inode_cookie(v9inode)); diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 79df61fe0e59..baf2b152229e 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -152,7 +152,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, const unsigned char **wnames, *uname; int i, n, l, clone, access; struct v9fs_session_info *v9ses; - struct p9_fid *fid, *old_fid = NULL; + struct p9_fid *fid, *old_fid; v9ses = v9fs_dentry2v9ses(dentry); access = v9ses->flags & V9FS_ACCESS_MASK; @@ -194,13 +194,12 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, if (IS_ERR(fid)) return fid; + refcount_inc(&fid->count); v9fs_fid_add(dentry->d_sb->s_root, fid); } /* If we are root ourself just return that */ - if (dentry->d_sb->s_root == dentry) { - refcount_inc(&fid->count); + if (dentry->d_sb->s_root == dentry) return fid; - } /* * Do a multipath walk with attached root. * When walking parent we need to make sure we @@ -212,6 +211,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, fid = ERR_PTR(n); goto err_out; } + old_fid = fid; clone = 1; i = 0; while (i < n) { @@ -221,19 +221,15 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, * walk to ensure none of the patch component change */ fid = p9_client_walk(fid, l, &wnames[i], clone); + /* non-cloning walk will return the same fid */ + if (fid != old_fid) { + p9_client_clunk(old_fid); + old_fid = fid; + } if (IS_ERR(fid)) { - if (old_fid) { - /* - * If we fail, clunk fid which are mapping - * to path component and not the last component - * of the path. 
- */ - p9_client_clunk(old_fid); - } kfree(wnames); goto err_out; } - old_fid = fid; i += l; clone = 0; } diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index e28ddf763b3b..0129de2ea31a 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -625,7 +625,7 @@ static void v9fs_inode_init_once(void *foo) struct v9fs_inode *v9inode = (struct v9fs_inode *)foo; memset(&v9inode->qid, 0, sizeof(v9inode->qid)); - inode_init_once(&v9inode->vfs_inode); + inode_init_once(&v9inode->netfs.inode); } /** diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index ec0e8df3b2eb..6acabc2e7dc9 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -109,11 +109,7 @@ struct v9fs_session_info { #define V9FS_INO_INVALID_ATTR 0x01 struct v9fs_inode { - struct { - /* These must be contiguous */ - struct inode vfs_inode; /* the VFS's inode record */ - struct netfs_i_context netfs_ctx; /* Netfslib context */ - }; + struct netfs_inode netfs; /* Netfslib context and vfs inode */ struct p9_qid qid; unsigned int cache_validity; struct p9_fid *writeback_fid; @@ -122,13 +118,13 @@ struct v9fs_inode { static inline struct v9fs_inode *V9FS_I(const struct inode *inode) { - return container_of(inode, struct v9fs_inode, vfs_inode); + return container_of(inode, struct v9fs_inode, netfs.inode); } static inline struct fscache_cookie *v9fs_inode_cookie(struct v9fs_inode *v9inode) { #ifdef CONFIG_9P_FSCACHE - return netfs_i_cookie(&v9inode->vfs_inode); + return netfs_i_cookie(&v9inode->netfs); #else return NULL; #endif diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 8ce82ff1e40a..d0833fa69faf 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -58,21 +58,33 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) */ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file) { + struct inode *inode = file_inode(file); + struct v9fs_inode *v9inode = V9FS_I(inode); struct p9_fid *fid = file->private_data; + BUG_ON(!fid); + + /* we might need to read from a fid that was opened write-only + * for read-modify-write of page cache, use the writeback fid + * for that */ + if (rreq->origin == NETFS_READ_FOR_WRITE && + (fid->mode & O_ACCMODE) == O_WRONLY) { + fid = v9inode->writeback_fid; + BUG_ON(!fid); + } + refcount_inc(&fid->count); rreq->netfs_priv = fid; return 0; } /** - * v9fs_req_cleanup - Cleanup request initialized by v9fs_init_request - * @mapping: unused mapping of request to cleanup - * @priv: private data to cleanup, a fid, guaranted non-null. 
+ * v9fs_free_request - Cleanup request initialized by v9fs_init_rreq + * @rreq: The I/O request to clean up */ -static void v9fs_req_cleanup(struct address_space *mapping, void *priv) +static void v9fs_free_request(struct netfs_io_request *rreq) { - struct p9_fid *fid = priv; + struct p9_fid *fid = rreq->netfs_priv; p9_client_clunk(fid); } @@ -94,9 +106,9 @@ static int v9fs_begin_cache_operation(struct netfs_io_request *rreq) const struct netfs_request_ops v9fs_req_ops = { .init_request = v9fs_init_request, + .free_request = v9fs_free_request, .begin_cache_operation = v9fs_begin_cache_operation, .issue_read = v9fs_issue_read, - .cleanup = v9fs_req_cleanup, }; /** @@ -140,7 +152,7 @@ static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error, transferred_or_error != -ENOBUFS) { version = cpu_to_le32(v9inode->qid.version); fscache_invalidate(v9fs_inode_cookie(v9inode), &version, - i_size_read(&v9inode->vfs_inode), 0); + i_size_read(&v9inode->netfs.inode), 0); } } @@ -274,7 +286,7 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping, * file. We need to do this before we get a lock on the page in case * there's more than one writer competing for the same cache block. */ - retval = netfs_write_begin(filp, mapping, pos, len, &folio, fsdata); + retval = netfs_write_begin(&v9inode->netfs, filp, mapping, pos, len, &folio, fsdata); if (retval < 0) return retval; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 55367ecb9442..3d8297714772 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -234,7 +234,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) v9inode->writeback_fid = NULL; v9inode->cache_validity = 0; mutex_init(&v9inode->v_mutex); - return &v9inode->vfs_inode; + return &v9inode->netfs.inode; } /** @@ -252,7 +252,8 @@ void v9fs_free_inode(struct inode *inode) */ static void v9fs_set_netfs_context(struct inode *inode) { - netfs_i_context_init(inode, &v9fs_req_ops); + struct v9fs_inode *v9inode = V9FS_I(inode); + netfs_inode_init(&v9inode->netfs, &v9fs_req_ops); } int v9fs_init_inode(struct v9fs_session_info *v9ses, @@ -1250,15 +1251,15 @@ static const char *v9fs_vfs_get_link(struct dentry *dentry, return ERR_PTR(-ECHILD); v9ses = v9fs_dentry2v9ses(dentry); - fid = v9fs_fid_lookup(dentry); + if (!v9fs_proto_dotu(v9ses)) + return ERR_PTR(-EBADF); + p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); + fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return ERR_CAST(fid); - if (!v9fs_proto_dotu(v9ses)) - return ERR_PTR(-EBADF); - st = p9_client_stat(fid); p9_client_clunk(fid); if (IS_ERR(st)) diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index d17502a738a9..b6eb1160296c 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -274,6 +274,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, if (IS_ERR(ofid)) { err = PTR_ERR(ofid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + p9_client_clunk(dfid); goto out; } @@ -285,6 +286,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, if (err) { p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n", err); + p9_client_clunk(dfid); goto error; } err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags), @@ -292,6 +294,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, if (err < 0) { p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n", err); + p9_client_clunk(dfid); goto error; } v9fs_invalidate_inode_attr(dir); diff --git a/fs/Kconfig b/fs/Kconfig index 
30b751c7f11a..5976eb33535f 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -245,19 +245,27 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS -config HUGETLB_PAGE_FREE_VMEMMAP +# +# Select this config option from the architecture Kconfig, if it is preferred +# to enable the feature of minimizing overhead of struct page associated with +# each HugeTLB page. +# +config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP + bool + +config HUGETLB_PAGE_OPTIMIZE_VMEMMAP def_bool HUGETLB_PAGE - depends on X86_64 + depends on ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP depends on SPARSEMEM_VMEMMAP -config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON - bool "Default freeing vmemmap pages of HugeTLB to on" +config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON + bool "Default optimizing vmemmap pages of HugeTLB to on" default n - depends on HUGETLB_PAGE_FREE_VMEMMAP + depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP help - When using HUGETLB_PAGE_FREE_VMEMMAP, the freeing unused vmemmap + When using HUGETLB_PAGE_OPTIMIZE_VMEMMAP, the optimizing unused vmemmap pages associated with each HugeTLB page is default off. Say Y here - to enable freeing vmemmap pages of HugeTLB by default. It can then + to enable optimizing vmemmap pages of HugeTLB by default. It can then be disabled on the command line via hugetlb_free_vmemmap=off. config MEMFD_CREATE diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 32dff7ba3dda..21e154516bf2 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -58,7 +58,7 @@ config ARCH_USE_GNU_PROPERTY config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y if !BINFMT_ELF - depends on (ARM || (SUPERH && !MMU)) + depends on ARM || ((M68K || SUPERH) && !MMU) select ELFCORE help ELF FDPIC binaries are based on ELF, but allow the individual load diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 1b4d5809808d..a484fa642808 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -30,7 +30,7 @@ void afs_invalidate_mmap_work(struct work_struct *work) { struct afs_vnode *vnode = container_of(work, struct afs_vnode, cb_work); - unmap_mapping_pages(vnode->vfs_inode.i_mapping, 0, 0, false); + unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false); } void afs_server_init_callback_work(struct work_struct *work) diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 94aa7356248e..56ae5cd5184f 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -109,7 +109,7 @@ struct afs_lookup_cookie { */ static void afs_dir_read_cleanup(struct afs_read *req) { - struct address_space *mapping = req->vnode->vfs_inode.i_mapping; + struct address_space *mapping = req->vnode->netfs.inode.i_mapping; struct folio *folio; pgoff_t last = req->nr_pages - 1; @@ -153,7 +153,7 @@ static bool afs_dir_check_folio(struct afs_vnode *dvnode, struct folio *folio, block = kmap_local_folio(folio, offset); if (block->hdr.magic != AFS_DIR_MAGIC) { printk("kAFS: %s(%lx): [%llx] bad magic %zx/%zx is %04hx\n", - __func__, dvnode->vfs_inode.i_ino, + __func__, dvnode->netfs.inode.i_ino, pos, offset, size, ntohs(block->hdr.magic)); trace_afs_dir_check_failed(dvnode, pos + offset, i_size); kunmap_local(block); @@ -183,7 +183,7 @@ error: static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req) { union afs_xdr_dir_block *block; - struct address_space *mapping = dvnode->vfs_inode.i_mapping; + struct address_space *mapping = dvnode->netfs.inode.i_mapping; struct folio *folio; pgoff_t last = req->nr_pages - 1; size_t offset, size; @@ -217,7 +217,7 @@ static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req) */ 
static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req) { - struct address_space *mapping = dvnode->vfs_inode.i_mapping; + struct address_space *mapping = dvnode->netfs.inode.i_mapping; struct folio *folio; pgoff_t last = req->nr_pages - 1; int ret = 0; @@ -269,7 +269,7 @@ static int afs_dir_open(struct inode *inode, struct file *file) static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) __acquires(&dvnode->validate_lock) { - struct address_space *mapping = dvnode->vfs_inode.i_mapping; + struct address_space *mapping = dvnode->netfs.inode.i_mapping; struct afs_read *req; loff_t i_size; int nr_pages, i; @@ -287,7 +287,7 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) req->cleanup = afs_dir_read_cleanup; expand: - i_size = i_size_read(&dvnode->vfs_inode); + i_size = i_size_read(&dvnode->netfs.inode); if (i_size < 2048) { ret = afs_bad(dvnode, afs_file_error_dir_small); goto error; @@ -305,7 +305,7 @@ expand: req->actual_len = i_size; /* May change */ req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */ req->data_version = dvnode->status.data_version; /* May change */ - iov_iter_xarray(&req->def_iter, READ, &dvnode->vfs_inode.i_mapping->i_pages, + iov_iter_xarray(&req->def_iter, READ, &dvnode->netfs.inode.i_mapping->i_pages, 0, i_size); req->iter = &req->def_iter; @@ -463,8 +463,11 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode, } /* skip if starts before the current position */ - if (offset < curr) + if (offset < curr) { + if (next > curr) + ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent); continue; + } /* found the next entry */ if (!dir_emit(ctx, dire->u.name, nlen, @@ -894,7 +897,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, out_op: if (op->error == 0) { - inode = &op->file[1].vnode->vfs_inode; + inode = &op->file[1].vnode->netfs.inode; op->file[1].vnode = NULL; } @@ -1136,7 +1139,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) afs_stat_v(dir, n_reval); /* search the directory for this vnode */ - ret = afs_do_lookup_one(&dir->vfs_inode, dentry, &fid, key, &dir_version); + ret = afs_do_lookup_one(&dir->netfs.inode, dentry, &fid, key, &dir_version); switch (ret) { case 0: /* the filename maps to something */ @@ -1167,7 +1170,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) _debug("%pd: file deleted (uq %u -> %u I:%u)", dentry, fid.unique, vnode->fid.unique, - vnode->vfs_inode.i_generation); + vnode->netfs.inode.i_generation); goto not_found; } goto out_valid; @@ -1365,7 +1368,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry) if (d_really_is_positive(dentry)) { struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); - clear_nlink(&vnode->vfs_inode); + clear_nlink(&vnode->netfs.inode); set_bit(AFS_VNODE_DELETED, &vnode->flags); clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); @@ -1484,8 +1487,8 @@ static void afs_dir_remove_link(struct afs_operation *op) /* Already done */ } else if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { write_seqlock(&vnode->cb_lock); - drop_nlink(&vnode->vfs_inode); - if (vnode->vfs_inode.i_nlink == 0) { + drop_nlink(&vnode->netfs.inode); + if (vnode->netfs.inode.i_nlink == 0) { set_bit(AFS_VNODE_DELETED, &vnode->flags); __afs_break_callback(vnode, afs_cb_break_for_unlink); } @@ -1501,7 +1504,7 @@ static void afs_dir_remove_link(struct afs_operation *op) op->error = ret; } - _debug("nlink %d [val 
%d]", vnode->vfs_inode.i_nlink, op->error); + _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, op->error); } static void afs_unlink_success(struct afs_operation *op) @@ -1677,8 +1680,8 @@ static void afs_link_success(struct afs_operation *op) afs_update_dentry_version(op, dvp, op->dentry); if (op->dentry_2->d_parent == op->dentry->d_parent) afs_update_dentry_version(op, dvp, op->dentry_2); - ihold(&vp->vnode->vfs_inode); - d_instantiate(op->dentry, &vp->vnode->vfs_inode); + ihold(&vp->vnode->netfs.inode); + d_instantiate(op->dentry, &vp->vnode->netfs.inode); } static void afs_link_put(struct afs_operation *op) diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index d98e109ecee9..0ab7752d1b75 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -109,7 +109,7 @@ static void afs_clear_contig_bits(union afs_xdr_dir_block *block, */ static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index) { - struct address_space *mapping = vnode->vfs_inode.i_mapping; + struct address_space *mapping = vnode->netfs.inode.i_mapping; struct folio *folio; folio = __filemap_get_folio(mapping, index, @@ -216,7 +216,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode, _enter(",,{%d,%s},", name->len, name->name); - i_size = i_size_read(&vnode->vfs_inode); + i_size = i_size_read(&vnode->netfs.inode); if (i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS || (i_size & (AFS_DIR_BLOCK_SIZE - 1))) { clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); @@ -336,7 +336,7 @@ found_space: if (b < AFS_DIR_BLOCKS_WITH_CTR) meta->meta.alloc_ctrs[b] -= need_slots; - inode_inc_iversion_raw(&vnode->vfs_inode); + inode_inc_iversion_raw(&vnode->netfs.inode); afs_stat_v(vnode, n_dir_cr); _debug("Insert %s in %u[%u]", name->name, b, slot); @@ -383,7 +383,7 @@ void afs_edit_dir_remove(struct afs_vnode *vnode, _enter(",,{%d,%s},", name->len, name->name); - i_size = i_size_read(&vnode->vfs_inode); + i_size = i_size_read(&vnode->netfs.inode); if (i_size < AFS_DIR_BLOCK_SIZE || i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS || (i_size & (AFS_DIR_BLOCK_SIZE - 1))) { @@ -463,7 +463,7 @@ found_dirent: if (b < AFS_DIR_BLOCKS_WITH_CTR) meta->meta.alloc_ctrs[b] += need_slots; - inode_set_iversion_raw(&vnode->vfs_inode, vnode->status.data_version); + inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version); afs_stat_v(vnode, n_dir_rm); _debug("Remove %s from %u[%u]", name->name, b, slot); diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index 45cfd50a9521..bb5807e87fa4 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -131,7 +131,7 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode, goto out; } while (!d_is_negative(sdentry)); - ihold(&vnode->vfs_inode); + ihold(&vnode->netfs.inode); ret = afs_do_silly_rename(dvnode, vnode, dentry, sdentry, key); switch (ret) { @@ -148,7 +148,7 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode, d_drop(sdentry); } - iput(&vnode->vfs_inode); + iput(&vnode->netfs.inode); dput(sdentry); out: _leave(" = %d", ret); diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index f120bcb8bf73..d7d9402ff718 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -76,7 +76,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) /* there shouldn't be an existing inode */ BUG_ON(!(inode->i_state & I_NEW)); - netfs_i_context_init(inode, NULL); + netfs_inode_init(&vnode->netfs, NULL); inode->i_size = 0; inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; if (root) { diff --git a/fs/afs/file.c b/fs/afs/file.c index 
a8e8832179e4..42118a4f3383 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -194,7 +194,7 @@ int afs_release(struct inode *inode, struct file *file) afs_put_wb_key(af->wb); if ((file->f_mode & FMODE_WRITE)) { - i_size = i_size_read(&vnode->vfs_inode); + i_size = i_size_read(&vnode->netfs.inode); afs_set_cache_aux(vnode, &aux); fscache_unuse_cookie(afs_vnode_cache(vnode), &aux, &i_size); } else { @@ -325,7 +325,7 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq) fsreq->iter = &fsreq->def_iter; iov_iter_xarray(&fsreq->def_iter, READ, - &fsreq->vnode->vfs_inode.i_mapping->i_pages, + &fsreq->vnode->netfs.inode.i_mapping->i_pages, fsreq->pos, fsreq->len); afs_fetch_data(fsreq->vnode, fsreq); @@ -382,17 +382,17 @@ static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len, return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0; } -static void afs_priv_cleanup(struct address_space *mapping, void *netfs_priv) +static void afs_free_request(struct netfs_io_request *rreq) { - key_put(netfs_priv); + key_put(rreq->netfs_priv); } const struct netfs_request_ops afs_req_ops = { .init_request = afs_init_request, + .free_request = afs_free_request, .begin_cache_operation = afs_begin_cache_operation, .check_write_begin = afs_check_write_begin, .issue_read = afs_issue_read, - .cleanup = afs_priv_cleanup, }; int afs_write_inode(struct inode *inode, struct writeback_control *wbc) diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index d222dfbe976b..7a3803ce3a22 100644 --- a/fs/afs/fs_operation.c +++ b/fs/afs/fs_operation.c @@ -232,14 +232,14 @@ int afs_put_operation(struct afs_operation *op) if (op->file[1].modification && op->file[1].vnode != op->file[0].vnode) clear_bit(AFS_VNODE_MODIFYING, &op->file[1].vnode->flags); if (op->file[0].put_vnode) - iput(&op->file[0].vnode->vfs_inode); + iput(&op->file[0].vnode->netfs.inode); if (op->file[1].put_vnode) - iput(&op->file[1].vnode->vfs_inode); + iput(&op->file[1].vnode->netfs.inode); if (op->more_files) { for (i = 0; i < op->nr_files - 2; i++) if (op->more_files[i].put_vnode) - iput(&op->more_files[i].vnode->vfs_inode); + iput(&op->more_files[i].vnode->netfs.inode); kfree(op->more_files); } diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 30b066299d39..64dab70d4a4f 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -58,7 +58,7 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren */ static void afs_set_netfs_context(struct afs_vnode *vnode) { - netfs_i_context_init(&vnode->vfs_inode, &afs_req_ops); + netfs_inode_init(&vnode->netfs, &afs_req_ops); } /* @@ -96,7 +96,7 @@ static int afs_inode_init_from_status(struct afs_operation *op, inode->i_flags |= S_NOATIME; inode->i_uid = make_kuid(&init_user_ns, status->owner); inode->i_gid = make_kgid(&init_user_ns, status->group); - set_nlink(&vnode->vfs_inode, status->nlink); + set_nlink(&vnode->netfs.inode, status->nlink); switch (status->type) { case AFS_FTYPE_FILE: @@ -139,7 +139,7 @@ static int afs_inode_init_from_status(struct afs_operation *op, afs_set_netfs_context(vnode); vnode->invalid_before = status->data_version; - inode_set_iversion_raw(&vnode->vfs_inode, status->data_version); + inode_set_iversion_raw(&vnode->netfs.inode, status->data_version); if (!vp->scb.have_cb) { /* it's a symlink we just created (the fileserver @@ -163,7 +163,7 @@ static void afs_apply_status(struct afs_operation *op, { struct afs_file_status *status = &vp->scb.status; struct afs_vnode *vnode = vp->vnode; - struct inode *inode = &vnode->vfs_inode; + 
struct inode *inode = &vnode->netfs.inode; struct timespec64 t; umode_t mode; bool data_changed = false; @@ -246,7 +246,7 @@ static void afs_apply_status(struct afs_operation *op, * idea of what the size should be that's not the same as * what's on the server. */ - vnode->netfs_ctx.remote_i_size = status->size; + vnode->netfs.remote_i_size = status->size; if (change_size) { afs_set_i_size(vnode, status->size); inode->i_ctime = t; @@ -289,7 +289,7 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v */ if (vp->scb.status.abort_code == VNOVNODE) { set_bit(AFS_VNODE_DELETED, &vnode->flags); - clear_nlink(&vnode->vfs_inode); + clear_nlink(&vnode->netfs.inode); __afs_break_callback(vnode, afs_cb_break_for_deleted); op->flags &= ~AFS_OPERATION_DIR_CONFLICT; } @@ -306,8 +306,8 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v if (vp->scb.have_cb) afs_apply_callback(op, vp); } else if (vp->op_unlinked && !(op->flags & AFS_OPERATION_DIR_CONFLICT)) { - drop_nlink(&vnode->vfs_inode); - if (vnode->vfs_inode.i_nlink == 0) { + drop_nlink(&vnode->netfs.inode); + if (vnode->netfs.inode.i_nlink == 0) { set_bit(AFS_VNODE_DELETED, &vnode->flags); __afs_break_callback(vnode, afs_cb_break_for_deleted); } @@ -326,7 +326,7 @@ static void afs_fetch_status_success(struct afs_operation *op) struct afs_vnode *vnode = vp->vnode; int ret; - if (vnode->vfs_inode.i_state & I_NEW) { + if (vnode->netfs.inode.i_state & I_NEW) { ret = afs_inode_init_from_status(op, vp, vnode); op->error = ret; if (ret == 0) @@ -430,7 +430,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) struct afs_vnode_cache_aux aux; if (vnode->status.type != AFS_FTYPE_FILE) { - vnode->netfs_ctx.cache = NULL; + vnode->netfs.cache = NULL; return; } @@ -457,7 +457,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) struct inode *afs_iget(struct afs_operation *op, struct afs_vnode_param *vp) { struct afs_vnode_param *dvp = &op->file[0]; - struct super_block *sb = dvp->vnode->vfs_inode.i_sb; + struct super_block *sb = dvp->vnode->netfs.inode.i_sb; struct afs_vnode *vnode; struct inode *inode; int ret; @@ -582,10 +582,10 @@ static void afs_zap_data(struct afs_vnode *vnode) /* nuke all the non-dirty pages that aren't locked, mapped or being * written back in a regular file and completely discard the pages in a * directory or symlink */ - if (S_ISREG(vnode->vfs_inode.i_mode)) - invalidate_remote_inode(&vnode->vfs_inode); + if (S_ISREG(vnode->netfs.inode.i_mode)) + invalidate_remote_inode(&vnode->netfs.inode); else - invalidate_inode_pages2(vnode->vfs_inode.i_mapping); + invalidate_inode_pages2(vnode->netfs.inode.i_mapping); } /* @@ -683,8 +683,8 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) key_serial(key)); if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) { - if (vnode->vfs_inode.i_nlink) - clear_nlink(&vnode->vfs_inode); + if (vnode->netfs.inode.i_nlink) + clear_nlink(&vnode->netfs.inode); goto valid; } @@ -745,7 +745,8 @@ int afs_getattr(struct user_namespace *mnt_userns, const struct path *path, _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); - if (!(query_flags & AT_STATX_DONT_SYNC) && + if (vnode->volume && + !(query_flags & AT_STATX_DONT_SYNC) && !test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { key = afs_request_key(vnode->volume->cell); if (IS_ERR(key)) @@ -826,7 +827,7 @@ void afs_evict_inode(struct inode *inode) static void afs_setattr_success(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; - struct 
inode *inode = &vp->vnode->vfs_inode; + struct inode *inode = &vp->vnode->netfs.inode; loff_t old_i_size = i_size_read(inode); op->setattr.old_i_size = old_i_size; @@ -843,7 +844,7 @@ static void afs_setattr_success(struct afs_operation *op) static void afs_setattr_edit_file(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; - struct inode *inode = &vp->vnode->vfs_inode; + struct inode *inode = &vp->vnode->netfs.inode; if (op->setattr.attr->ia_valid & ATTR_SIZE) { loff_t size = op->setattr.attr->ia_size; @@ -875,7 +876,7 @@ int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, ATTR_MTIME | ATTR_MTIME_SET | ATTR_TIMES_SET | ATTR_TOUCH; struct afs_operation *op; struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); - struct inode *inode = &vnode->vfs_inode; + struct inode *inode = &vnode->netfs.inode; loff_t i_size; int ret; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a30995901266..a6f25d9e75b5 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -619,12 +619,7 @@ enum afs_lock_state { * leak from one inode to another. */ struct afs_vnode { - struct { - /* These must be contiguous */ - struct inode vfs_inode; /* the VFS's inode record */ - struct netfs_i_context netfs_ctx; /* Netfslib context */ - }; - + struct netfs_inode netfs; /* Netfslib context and vfs inode */ struct afs_volume *volume; /* volume on which vnode resides */ struct afs_fid fid; /* the file identifier for this inode */ struct afs_file_status status; /* AFS status info for this file */ @@ -675,7 +670,7 @@ struct afs_vnode { static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode) { #ifdef CONFIG_AFS_FSCACHE - return netfs_i_cookie(&vnode->vfs_inode); + return netfs_i_cookie(&vnode->netfs); #else return NULL; #endif @@ -685,7 +680,7 @@ static inline void afs_vnode_set_cache(struct afs_vnode *vnode, struct fscache_cookie *cookie) { #ifdef CONFIG_AFS_FSCACHE - vnode->netfs_ctx.cache = cookie; + vnode->netfs.cache = cookie; #endif } @@ -892,7 +887,7 @@ static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int fl afs_set_cache_aux(vnode, &aux); fscache_invalidate(afs_vnode_cache(vnode), &aux, - i_size_read(&vnode->vfs_inode), flags); + i_size_read(&vnode->netfs.inode), flags); } /* @@ -1217,7 +1212,7 @@ static inline struct afs_net *afs_i2net(struct inode *inode) static inline struct afs_net *afs_v2net(struct afs_vnode *vnode) { - return afs_i2net(&vnode->vfs_inode); + return afs_i2net(&vnode->netfs.inode); } static inline struct afs_net *afs_sock2net(struct sock *sk) @@ -1593,12 +1588,12 @@ extern void yfs_fs_store_opaque_acl2(struct afs_operation *); */ static inline struct afs_vnode *AFS_FS_I(struct inode *inode) { - return container_of(inode, struct afs_vnode, vfs_inode); + return container_of(inode, struct afs_vnode, netfs.inode); } static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode) { - return &vnode->vfs_inode; + return &vnode->netfs.inode; } /* @@ -1621,8 +1616,8 @@ static inline void afs_update_dentry_version(struct afs_operation *op, */ static inline void afs_set_i_size(struct afs_vnode *vnode, u64 size) { - i_size_write(&vnode->vfs_inode, size); - vnode->vfs_inode.i_blocks = ((size + 1023) >> 10) << 1; + i_size_write(&vnode->netfs.inode, size); + vnode->netfs.inode.i_blocks = ((size + 1023) >> 10) << 1; } /* diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 1d1a8debe472..933e67fcdab1 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -163,8 +163,11 @@ void afs_prioritise_error(struct afs_error *e, int error, 
u32 abort_code) return; case -ECONNABORTED: + error = afs_abort_to_error(abort_code); + fallthrough; + case -ENETRESET: /* Responded, but we seem to have changed address */ e->responded = true; - e->error = afs_abort_to_error(abort_code); + e->error = error; return; } } diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index 79e1a5f6701b..a840c3588ebb 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -292,6 +292,10 @@ bool afs_select_fileserver(struct afs_operation *op) op->error = error; goto iterate_address; + case -ENETRESET: + pr_warn("kAFS: Peer reset %s (op=%x)\n", + op->type ? op->type->name : "???", op->debug_id); + fallthrough; case -ECONNRESET: _debug("call reset"); op->error = error; diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 23a1a92d64bb..a5434f3e57c6 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -537,6 +537,8 @@ static void afs_deliver_to_call(struct afs_call *call) case -ENODATA: case -EBADMSG: case -EMSGSIZE: + case -ENOMEM: + case -EFAULT: abort_code = RXGEN_CC_UNMARSHAL; if (state != AFS_CALL_CL_AWAIT_REPLY) abort_code = RXGEN_SS_UNMARSHAL; @@ -544,7 +546,7 @@ static void afs_deliver_to_call(struct afs_call *call) abort_code, ret, "KUM"); goto local_abort; default: - abort_code = RX_USER_ABORT; + abort_code = RX_CALL_DEAD; rxrpc_kernel_abort_call(call->net->socket, call->rxcall, abort_code, ret, "KER"); goto local_abort; @@ -836,7 +838,7 @@ void afs_send_empty_reply(struct afs_call *call) case -ENOMEM: _debug("oom"); rxrpc_kernel_abort_call(net->socket, call->rxcall, - RX_USER_ABORT, -ENOMEM, "KOO"); + RXGEN_SS_MARSHAL, -ENOMEM, "KOO"); fallthrough; default: _leave(" [error]"); @@ -878,7 +880,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) if (n == -ENOMEM) { _debug("oom"); rxrpc_kernel_abort_call(net->socket, call->rxcall, - RX_USER_ABORT, -ENOMEM, "KOO"); + RXGEN_SS_MARSHAL, -ENOMEM, "KOO"); } _leave(" [error]"); } diff --git a/fs/afs/security.c b/fs/afs/security.c index 3c7a8fc4f93f..7c6a63a30394 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -219,8 +219,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, * yet. */ size++; - new = kzalloc(sizeof(struct afs_permits) + - sizeof(struct afs_permit) * size, GFP_NOFS); + new = kzalloc(struct_size(new, permits, size), GFP_NOFS); if (!new) goto out_put; diff --git a/fs/afs/super.c b/fs/afs/super.c index 1fea195b0b27..95d713074dc8 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -659,7 +659,7 @@ static void afs_i_init_once(void *_vnode) struct afs_vnode *vnode = _vnode; memset(vnode, 0, sizeof(*vnode)); - inode_init_once(&vnode->vfs_inode); + inode_init_once(&vnode->netfs.inode); mutex_init(&vnode->io_lock); init_rwsem(&vnode->validate_lock); spin_lock_init(&vnode->wb_lock); @@ -700,8 +700,8 @@ static struct inode *afs_alloc_inode(struct super_block *sb) init_rwsem(&vnode->rmdir_lock); INIT_WORK(&vnode->cb_work, afs_invalidate_mmap_work); - _leave(" = %p", &vnode->vfs_inode); - return &vnode->vfs_inode; + _leave(" = %p", &vnode->netfs.inode); + return &vnode->netfs.inode; } static void afs_free_inode(struct inode *inode) diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 94a3d247924b..cc665cef0abe 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -9,8 +9,7 @@ #include <linux/slab.h> #include "internal.h" -unsigned __read_mostly afs_volume_gc_delay = 10; -unsigned __read_mostly afs_volume_record_life = 60 * 60; +static unsigned __read_mostly afs_volume_record_life = 60 * 60; /* * Insert a volume into a cell. 
If there's an existing volume record, that is diff --git a/fs/afs/write.c b/fs/afs/write.c index 5224e346fbad..2c885b22de34 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -60,7 +60,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping, * file. We need to do this before we get a lock on the page in case * there's more than one writer competing for the same cache block. */ - ret = netfs_write_begin(file, mapping, pos, len, &folio, fsdata); + ret = netfs_write_begin(&vnode->netfs, file, mapping, pos, len, &folio, fsdata); if (ret < 0) return ret; @@ -146,10 +146,10 @@ int afs_write_end(struct file *file, struct address_space *mapping, write_end_pos = pos + copied; - i_size = i_size_read(&vnode->vfs_inode); + i_size = i_size_read(&vnode->netfs.inode); if (write_end_pos > i_size) { write_seqlock(&vnode->cb_lock); - i_size = i_size_read(&vnode->vfs_inode); + i_size = i_size_read(&vnode->netfs.inode); if (write_end_pos > i_size) afs_set_i_size(vnode, write_end_pos); write_sequnlock(&vnode->cb_lock); @@ -257,7 +257,7 @@ static void afs_redirty_pages(struct writeback_control *wbc, */ static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len) { - struct address_space *mapping = vnode->vfs_inode.i_mapping; + struct address_space *mapping = vnode->netfs.inode.i_mapping; struct folio *folio; pgoff_t end; @@ -354,7 +354,6 @@ static const struct afs_operation_ops afs_store_data_operation = { static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t pos, bool laundering) { - struct netfs_i_context *ictx = &vnode->netfs_ctx; struct afs_operation *op; struct afs_wb_key *wbk = NULL; loff_t size = iov_iter_count(iter); @@ -385,9 +384,9 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t op->store.write_iter = iter; op->store.pos = pos; op->store.size = size; - op->store.i_size = max(pos + size, ictx->remote_i_size); + op->store.i_size = max(pos + size, vnode->netfs.remote_i_size); op->store.laundering = laundering; - op->mtime = vnode->vfs_inode.i_mtime; + op->mtime = vnode->netfs.inode.i_mtime; op->flags |= AFS_OPERATION_UNINTR; op->ops = &afs_store_data_operation; @@ -554,7 +553,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, struct iov_iter iter; unsigned long priv; unsigned int offset, to, len, max_len; - loff_t i_size = i_size_read(&vnode->vfs_inode); + loff_t i_size = i_size_read(&vnode->netfs.inode); bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); bool caching = fscache_cookie_enabled(afs_vnode_cache(vnode)); long count = wbc->nr_to_write; @@ -636,6 +635,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, case -EKEYEXPIRED: case -EKEYREJECTED: case -EKEYREVOKED: + case -ENETRESET: afs_redirty_pages(wbc, mapping, start, len); mapping_set_error(mapping, ret); break; @@ -844,7 +844,7 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from) _enter("{%llx:%llu},{%zu},", vnode->fid.vid, vnode->fid.vnode, count); - if (IS_SWAPFILE(&vnode->vfs_inode)) { + if (IS_SWAPFILE(&vnode->netfs.inode)) { printk(KERN_INFO "AFS: Attempt to write to active swap file!\n"); return -EBUSY; @@ -957,8 +957,8 @@ void afs_prune_wb_keys(struct afs_vnode *vnode) /* Discard unused keys */ spin_lock(&vnode->wb_lock); - if (!mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_WRITEBACK) && - !mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_DIRTY)) { + if (!mapping_tagged(&vnode->netfs.inode.i_data, 
PAGECACHE_TAG_WRITEBACK) && + !mapping_tagged(&vnode->netfs.inode.i_data, PAGECACHE_TAG_DIRTY)) { list_for_each_entry_safe(wbk, tmp, &vnode->wb_keys, vnode_link) { if (refcount_read(&wbk->usage) == 1) list_move(&wbk->vnode_link, &graveyard); @@ -1033,6 +1033,6 @@ static void afs_write_to_cache(struct afs_vnode *vnode, bool caching) { fscache_write_to_cache(afs_vnode_cache(vnode), - vnode->vfs_inode.i_mapping, start, len, i_size, + vnode->netfs.inode.i_mapping, start, len, i_size, afs_write_to_cache_done, vnode, caching); } diff --git a/fs/attr.c b/fs/attr.c index 66899b6e9bd8..dbe996b0dedf 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -61,9 +61,15 @@ static bool chgrp_ok(struct user_namespace *mnt_userns, const struct inode *inode, kgid_t gid) { kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); - if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)) && - (in_group_p(gid) || gid_eq(gid, inode->i_gid))) - return true; + if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) { + kgid_t mapped_gid; + + if (gid_eq(gid, inode->i_gid)) + return true; + mapped_gid = mapped_kgid_fs(mnt_userns, i_user_ns(inode), gid); + if (in_group_p(mapped_gid)) + return true; + } if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) return true; if (gid_eq(kgid, INVALID_GID) && @@ -123,12 +129,20 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry, /* Make sure a caller can chmod. */ if (ia_valid & ATTR_MODE) { + kgid_t mapped_gid; + if (!inode_owner_or_capable(mnt_userns, inode)) return -EPERM; + + if (ia_valid & ATTR_GID) + mapped_gid = mapped_kgid_fs(mnt_userns, + i_user_ns(inode), attr->ia_gid); + else + mapped_gid = i_gid_into_mnt(mnt_userns, inode); + /* Also check the setgid bit! */ - if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : - i_gid_into_mnt(mnt_userns, inode)) && - !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (!in_group_p(mapped_gid) && + !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) attr->ia_mode &= ~S_ISGID; } diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 3ac668ace50a..35e0e860cc0b 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -104,6 +104,7 @@ struct btrfs_block_group { unsigned int relocating_repair:1; unsigned int chunk_item_inserted:1; unsigned int zone_is_active:1; + unsigned int zoned_data_reloc_ongoing:1; int disk_cache_state; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0e49b1a0c071..415bf1823fb3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1330,6 +1330,8 @@ struct btrfs_replace_extent_info { * existing extent into a file range. */ bool is_new_extent; + /* Indicate if we should update the inode's mtime and ctime. */ + bool update_times; /* Meaningful only if is_new_extent is true. */ int qgroup_reserved; /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 89e94ea2fef5..4ba005c41983 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4632,6 +4632,17 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) int ret; set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); + + /* + * We may have the reclaim task running and relocating a data block group, + * in which case it may create delayed iputs. 
So stop it before we park + * the cleaner kthread otherwise we can get new delayed iputs after + * parking the cleaner, and that can make the async reclaim task to hang + * if it's waiting for delayed iputs to complete, since the cleaner is + * parked and can not run delayed iputs - this will make us hang when + * trying to stop the async reclaim task. + */ + cancel_work_sync(&fs_info->reclaim_bgs_work); /* * We don't want the cleaner to start new transactions, add more delayed * iputs, etc. while we're closing. We can't use kthread_stop() yet @@ -4672,8 +4683,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); - cancel_work_sync(&fs_info->reclaim_bgs_work); - /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0867c5cd6e01..4157ecc27d4b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3832,7 +3832,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, block_group->start == fs_info->data_reloc_bg || fs_info->data_reloc_bg == 0); - if (block_group->ro) { + if (block_group->ro || block_group->zoned_data_reloc_ongoing) { ret = 1; goto out; } @@ -3894,8 +3894,24 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, out: if (ret && ffe_ctl->for_treelog) fs_info->treelog_bg = 0; - if (ret && ffe_ctl->for_data_reloc) + if (ret && ffe_ctl->for_data_reloc && + fs_info->data_reloc_bg == block_group->start) { + /* + * Do not allow further allocations from this block group. + * Compared to increasing the ->ro, setting the + * ->zoned_data_reloc_ongoing flag still allows nocow + * writers to come in. See btrfs_inc_nocow_writers(). + * + * We need to disable an allocation to avoid an allocation of + * regular (non-relocation data) extent. With mix of relocation + * extents and regular extents, we can dispatch WRITE commands + * (for relocation extents) and ZONE APPEND commands (for + * regular extents) at the same time to the same zone, which + * easily break the write pointer. + */ + block_group->zoned_data_reloc_ongoing = 1; fs_info->data_reloc_bg = 0; + } spin_unlock(&fs_info->relocation_bg_lock); spin_unlock(&fs_info->treelog_bg_lock); spin_unlock(&block_group->lock); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8f6b544ae616..04e36343da3a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5241,13 +5241,14 @@ int extent_writepages(struct address_space *mapping, */ btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); ret = extent_write_cache_pages(mapping, wbc, &epd); - btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); ASSERT(ret <= 0); if (ret < 0) { + btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); end_write_bio(&epd, ret); return ret; } flush_write_bio(&epd); + btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); return ret; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1fd827b99c1b..9dfde1af8a64 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2323,25 +2323,62 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); - if (ret != BTRFS_NO_LOG_SYNC) { + if (ret == BTRFS_NO_LOG_SYNC) { + ret = btrfs_end_transaction(trans); + goto out; + } + + /* We successfully logged the inode, attempt to sync the log. 
*/ + if (!ret) { + ret = btrfs_sync_log(trans, root, &ctx); if (!ret) { - ret = btrfs_sync_log(trans, root, &ctx); - if (!ret) { - ret = btrfs_end_transaction(trans); - goto out; - } - } - if (!full_sync) { - ret = btrfs_wait_ordered_range(inode, start, len); - if (ret) { - btrfs_end_transaction(trans); - goto out; - } + ret = btrfs_end_transaction(trans); + goto out; } - ret = btrfs_commit_transaction(trans); - } else { + } + + /* + * At this point we need to commit the transaction because we had + * btrfs_need_log_full_commit() or some other error. + * + * If we didn't do a full sync we have to stop the trans handle, wait on + * the ordered extents, start it again and commit the transaction. If + * we attempt to wait on the ordered extents here we could deadlock with + * something like fallocate() that is holding the extent lock trying to + * start a transaction while some other thread is trying to commit the + * transaction while we (fsync) are currently holding the transaction + * open. + */ + if (!full_sync) { ret = btrfs_end_transaction(trans); + if (ret) + goto out; + ret = btrfs_wait_ordered_range(inode, start, len); + if (ret) + goto out; + + /* + * This is safe to use here because we're only interested in + * making sure the transaction that had the ordered extents is + * committed. We aren't waiting on anything past this point, + * we're purely getting the transaction and committing it. + */ + trans = btrfs_attach_transaction_barrier(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + + /* + * We committed the transaction and there's no currently + * running transaction, this means everything we care + * about made it to disk and we are done. + */ + if (ret == -ENOENT) + ret = 0; + goto out; + } } + + ret = btrfs_commit_transaction(trans); out: ASSERT(list_empty(&ctx.list)); err = file_check_and_advance_wb_err(file); @@ -2719,7 +2756,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); - BUG_ON(ret); + if (WARN_ON(ret)) + goto out_trans; trans->block_rsv = rsv; cur_offset = start; @@ -2803,6 +2841,25 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, extent_info->file_offset += replace_len; } + /* + * We are releasing our handle on the transaction, balance the + * dirty pages of the btree inode and flush delayed items, and + * then get a new transaction handle, which may now point to a + * new transaction in case someone else may have committed the + * transaction we used to replace/drop file extent items. So + * bump the inode's iversion and update mtime and ctime except + * if we are called from a dedupe context. This is because a + * power failure/crash may happen after the transaction is + * committed and before we finish replacing/dropping all the + * file extent items we need. 
+ */ + inode_inc_iversion(&inode->vfs_inode); + + if (!extent_info || extent_info->update_times) { + inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode); + inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime; + } + ret = btrfs_update_inode(trans, root, inode); if (ret) break; @@ -2819,7 +2876,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); - BUG_ON(ret); /* shouldn't happen */ + if (WARN_ON(ret)) + break; trans->block_rsv = rsv; cur_offset = drop_args.drop_end; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 81737eff92f3..05e0c4a5affd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3195,6 +3195,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ordered_extent->file_offset, ordered_extent->file_offset + logical_len); + btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); } else { BUG_ON(root == fs_info->tree_root); ret = insert_ordered_extent_file_extent(trans, ordered_extent); @@ -9897,6 +9899,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( extent_info.file_offset = file_offset; extent_info.extent_buf = (char *)&stack_fi; extent_info.is_new_extent = true; + extent_info.update_times = true; extent_info.qgroup_reserved = qgroup_released; extent_info.insertions = 0; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 313d9d685adb..33461b4f9c8b 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -45,7 +45,6 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne start_ns = ktime_get_ns(); down_read_nested(&eb->lock, nest); - eb->lock_owner = current->pid; trace_btrfs_tree_read_lock(eb, start_ns); } @@ -62,7 +61,6 @@ void btrfs_tree_read_lock(struct extent_buffer *eb) int btrfs_try_tree_read_lock(struct extent_buffer *eb) { if (down_read_trylock(&eb->lock)) { - eb->lock_owner = current->pid; trace_btrfs_try_tree_read_lock(eb); return 1; } @@ -90,7 +88,6 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) void btrfs_tree_read_unlock(struct extent_buffer *eb) { trace_btrfs_tree_read_unlock(eb); - eb->lock_owner = 0; up_read(&eb->lock); } diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index c39f8b3a5a4a..a3549d587464 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -344,6 +344,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, int ret; const u64 len = olen_aligned; u64 last_dest_end = destoff; + u64 prev_extent_end = off; ret = -ENOMEM; buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); @@ -363,7 +364,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode, key.offset = off; while (1) { - u64 next_key_min_offset = key.offset + 1; struct btrfs_file_extent_item *extent; u64 extent_gen; int type; @@ -431,14 +431,21 @@ process_slot: * The first search might have left us at an extent item that * ends before our target range's start, can happen if we have * holes and NO_HOLES feature enabled. + * + * Subsequent searches may leave us on a file range we have + * processed before - this happens due to a race with ordered + * extent completion for a file range that is outside our source + * range, but that range was part of a file extent item that + * also covered a leading part of our source range. 
*/ - if (key.offset + datal <= off) { + if (key.offset + datal <= prev_extent_end) { path->slots[0]++; goto process_slot; } else if (key.offset >= off + len) { break; } - next_key_min_offset = key.offset + datal; + + prev_extent_end = key.offset + datal; size = btrfs_item_size(leaf, slot); read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), size); @@ -489,6 +496,7 @@ process_slot: clone_info.file_offset = new_key.offset; clone_info.extent_buf = buf; clone_info.is_new_extent = false; + clone_info.update_times = !no_time_update; ret = btrfs_replace_file_extents(BTRFS_I(inode), path, drop_start, new_key.offset + datal - 1, &clone_info, &trans); @@ -550,7 +558,7 @@ process_slot: break; btrfs_release_path(path); - key.offset = next_key_min_offset; + key.offset = prev_extent_end; if (fatal_signal_pending(current)) { ret = -EINTR; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b1fdc6a26c76..6627dd7875ee 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -763,6 +763,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, compress_force = false; no_compress++; } else { + btrfs_err(info, "unrecognized compression value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -821,8 +823,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_thread_pool: ret = match_int(&args[0], &intarg); if (ret) { + btrfs_err(info, "unrecognized thread_pool value %s", + args[0].from); goto out; } else if (intarg == 0) { + btrfs_err(info, "invalid value 0 for thread_pool"); ret = -EINVAL; goto out; } @@ -883,8 +888,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_ratio: ret = match_int(&args[0], &intarg); - if (ret) + if (ret) { + btrfs_err(info, "unrecognized metadata_ratio value %s", + args[0].from); goto out; + } info->metadata_ratio = intarg; btrfs_info(info, "metadata ratio %u", info->metadata_ratio); @@ -901,6 +909,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_and_info(info, DISCARD_ASYNC, "turning on async discard"); } else { + btrfs_err(info, "unrecognized discard mode value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -933,6 +943,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_and_info(info, FREE_SPACE_TREE, "enabling free space tree"); } else { + btrfs_err(info, "unrecognized space_cache value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -1014,8 +1026,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_check_integrity_print_mask: ret = match_int(&args[0], &intarg); - if (ret) + if (ret) { + btrfs_err(info, + "unrecognized check_integrity_print_mask value %s", + args[0].from); goto out; + } info->check_integrity_print_mask = intarg; btrfs_info(info, "check_integrity_print_mask 0x%x", info->check_integrity_print_mask); @@ -1030,13 +1046,15 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, goto out; #endif case Opt_fatal_errors: - if (strcmp(args[0].from, "panic") == 0) + if (strcmp(args[0].from, "panic") == 0) { btrfs_set_opt(info->mount_opt, PANIC_ON_FATAL_ERROR); - else if (strcmp(args[0].from, "bug") == 0) + } else if (strcmp(args[0].from, "bug") == 0) { btrfs_clear_opt(info->mount_opt, PANIC_ON_FATAL_ERROR); - else { + } else { + btrfs_err(info, "unrecognized fatal_errors value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -1044,8 +1062,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_commit_interval: intarg = 0; ret = 
match_int(&args[0], &intarg); - if (ret) + if (ret) { + btrfs_err(info, "unrecognized commit_interval value %s", + args[0].from); + ret = -EINVAL; goto out; + } if (intarg == 0) { btrfs_info(info, "using default commit interval %us", @@ -1059,8 +1081,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_rescue: ret = parse_rescue_options(info, args[0].from); - if (ret < 0) + if (ret < 0) { + btrfs_err(info, "unrecognized rescue value %s", + args[0].from); goto out; + } break; #ifdef CONFIG_BTRFS_DEBUG case Opt_fragment_all: @@ -1985,6 +2010,14 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; + /* V1 cache is not supported for subpage mount. */ + if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { + btrfs_warn(fs_info, + "v1 space cache is not supported for page size %lu with sectorsize %u", + PAGE_SIZE, fs_info->sectorsize); + ret = -EINVAL; + goto restore; + } btrfs_remount_begin(fs_info, old_opts, *flags); btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, old_thread_pool_size); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 11237a913bee..79e8c8cd75ed 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2139,3 +2139,30 @@ bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) factor = div64_u64(used * 100, total); return factor >= fs_info->bg_reclaim_threshold; } + +void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, + u64 length) +{ + struct btrfs_block_group *block_group; + + if (!btrfs_is_zoned(fs_info)) + return; + + block_group = btrfs_lookup_block_group(fs_info, logical); + /* It should be called on a previous data relocation block group. */ + ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)); + + spin_lock(&block_group->lock); + if (!block_group->zoned_data_reloc_ongoing) + goto out; + + /* All relocation extents are written. */ + if (block_group->start + block_group->alloc_offset == logical + length) { + /* Now, release this block group for further allocations. 
*/ + block_group->zoned_data_reloc_ongoing = 0; + } + +out: + spin_unlock(&block_group->lock); + btrfs_put_block_group(block_group); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index bb1a189e11f9..6b2eec99162b 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -77,6 +77,8 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info); +void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, + u64 length); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -243,6 +245,9 @@ static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) { return false; } + +static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) { } #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 7584aa6e5025..6dee88815491 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -256,6 +256,7 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) struct iov_iter iter; ssize_t err = 0; size_t len; + int mode; __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); @@ -264,7 +265,8 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) goto out; /* We need to fetch the inline data. */ - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); + mode = ceph_try_to_choose_auth_mds(inode, CEPH_STAT_CAP_INLINE_DATA); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; @@ -392,11 +394,10 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) return 0; } -static void ceph_readahead_cleanup(struct address_space *mapping, void *priv) +static void ceph_netfs_free_request(struct netfs_io_request *rreq) { - struct inode *inode = mapping->host; - struct ceph_inode_info *ci = ceph_inode(inode); - int got = (uintptr_t)priv; + struct ceph_inode_info *ci = ceph_inode(rreq->inode); + int got = (uintptr_t)rreq->netfs_priv; if (got) ceph_put_cap_refs(ci, got); @@ -404,12 +405,12 @@ static void ceph_readahead_cleanup(struct address_space *mapping, void *priv) const struct netfs_request_ops ceph_netfs_ops = { .init_request = ceph_init_request, + .free_request = ceph_netfs_free_request, .begin_cache_operation = ceph_begin_cache_operation, .issue_read = ceph_netfs_issue_read, .expand_readahead = ceph_netfs_expand_readahead, .clamp_length = ceph_netfs_clamp_length, .check_write_begin = ceph_netfs_check_write_begin, - .cleanup = ceph_readahead_cleanup, }; #ifdef CONFIG_CEPH_FSCACHE @@ -604,8 +605,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, ceph_wbc.truncate_seq, ceph_wbc.truncate_size, true); - if (IS_ERR(req)) + if (IS_ERR(req)) { + redirty_page_for_writepage(wbc, page); return PTR_ERR(req); + } set_page_writeback(page); if (caching) @@ -1318,10 +1321,11 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); struct folio *folio = NULL; int r; - r = 
netfs_write_begin(file, inode->i_mapping, pos, len, &folio, NULL); + r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, &folio, NULL); if (r == 0) folio_wait_fscache(folio); if (r < 0) { @@ -1644,7 +1648,7 @@ int ceph_uninline_data(struct file *file) struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_osd_request *req; + struct ceph_osd_request *req = NULL; struct ceph_cap_flush *prealloc_cf; struct folio *folio = NULL; u64 inline_version = CEPH_INLINE_NONE; @@ -1652,10 +1656,23 @@ int ceph_uninline_data(struct file *file) int err = 0; u64 len; + spin_lock(&ci->i_ceph_lock); + inline_version = ci->i_inline_version; + spin_unlock(&ci->i_ceph_lock); + + dout("uninline_data %p %llx.%llx inline_version %llu\n", + inode, ceph_vinop(inode), inline_version); + + if (inline_version == CEPH_INLINE_NONE) + return 0; + prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) return -ENOMEM; + if (inline_version == 1) /* initial version, no data */ + goto out_uninline; + folio = read_mapping_folio(inode->i_mapping, 0, file); if (IS_ERR(folio)) { err = PTR_ERR(folio); @@ -1664,17 +1681,6 @@ int ceph_uninline_data(struct file *file) folio_lock(folio); - spin_lock(&ci->i_ceph_lock); - inline_version = ci->i_inline_version; - spin_unlock(&ci->i_ceph_lock); - - dout("uninline_data %p %llx.%llx inline_version %llu\n", - inode, ceph_vinop(inode), inline_version); - - if (inline_version == 1 || /* initial version, no data */ - inline_version == CEPH_INLINE_NONE) - goto out_unlock; - len = i_size_read(inode); if (len > folio_size(folio)) len = folio_size(folio); @@ -1739,6 +1745,7 @@ int ceph_uninline_data(struct file *file) ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, err); +out_uninline: if (!err) { int dirty; @@ -1757,8 +1764,10 @@ out_put_req: if (err == -ECANCELED) err = 0; out_unlock: - folio_unlock(folio); - folio_put(folio); + if (folio) { + folio_unlock(folio); + folio_put(folio); + } out: ceph_free_cap_flush(prealloc_cf); dout("uninline_data %p %llx.%llx inline_version %llu = %d\n", @@ -1777,7 +1786,6 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma) if (!mapping->a_ops->read_folio) return -ENOEXEC; - file_accessed(file); vma->vm_ops = &ceph_vmops; return 0; } @@ -1790,7 +1798,7 @@ enum { static int __ceph_pool_perm_get(struct ceph_inode_info *ci, s64 pool, struct ceph_string *pool_ns) { - struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->netfs.inode); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_osd_request *rd_req = NULL, *wr_req = NULL; struct rb_node **p, *parent; @@ -1905,7 +1913,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, 0, false, true); err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); - wr_req->r_mtime = ci->vfs_inode.i_mtime; + wr_req->r_mtime = ci->netfs.inode.i_mtime; err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); if (!err) diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index ddea99922073..177d8e8d73fe 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -29,9 +29,9 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) if (!(inode->i_state & I_NEW)) return; - WARN_ON_ONCE(ci->netfs_ctx.cache); + WARN_ON_ONCE(ci->netfs.cache); - ci->netfs_ctx.cache = + ci->netfs.cache = fscache_acquire_cookie(fsc->fscache, 0, &ci->i_vino, sizeof(ci->i_vino), &ci->i_version, 
sizeof(ci->i_version), diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index 7255b790a4c1..dc502daac49a 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -28,7 +28,7 @@ void ceph_fscache_invalidate(struct inode *inode, bool dio_write); static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci) { - return netfs_i_cookie(&ci->vfs_inode); + return netfs_i_cookie(&ci->netfs); } static inline void ceph_fscache_resize(struct inode *inode, loff_t to) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 5c14ef04e474..38c930384d41 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -492,7 +492,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, struct ceph_mount_options *opt = mdsc->fsc->mount_options; ci->i_hold_caps_max = round_jiffies(jiffies + opt->caps_wanted_delay_max * HZ); - dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode, + dout("__cap_set_timeouts %p %lu\n", &ci->netfs.inode, ci->i_hold_caps_max - jiffies); } @@ -507,7 +507,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, static void __cap_delay_requeue(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode, + dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->netfs.inode, ci->i_ceph_flags, ci->i_hold_caps_max); if (!mdsc->stopping) { spin_lock(&mdsc->cap_delay_lock); @@ -531,7 +531,7 @@ no_change: static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode); + dout("__cap_delay_requeue_front %p\n", &ci->netfs.inode); spin_lock(&mdsc->cap_delay_lock); ci->i_ceph_flags |= CEPH_I_FLUSH; if (!list_empty(&ci->i_cap_delay_list)) @@ -548,7 +548,7 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, static void __cap_delay_cancel(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - dout("__cap_delay_cancel %p\n", &ci->vfs_inode); + dout("__cap_delay_cancel %p\n", &ci->netfs.inode); if (list_empty(&ci->i_cap_delay_list)) return; spin_lock(&mdsc->cap_delay_lock); @@ -568,7 +568,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, * Each time we receive FILE_CACHE anew, we increment * i_rdcache_gen. 
*/ - if (S_ISREG(ci->vfs_inode.i_mode) && + if (S_ISREG(ci->netfs.inode.i_mode) && (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) { ci->i_rdcache_gen++; @@ -583,14 +583,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) { if (issued & CEPH_CAP_FILE_SHARED) atomic_inc(&ci->i_shared_gen); - if (S_ISDIR(ci->vfs_inode.i_mode)) { - dout(" marking %p NOT complete\n", &ci->vfs_inode); + if (S_ISDIR(ci->netfs.inode.i_mode)) { + dout(" marking %p NOT complete\n", &ci->netfs.inode); __ceph_dir_clear_complete(ci); } } /* Wipe saved layout if we're losing DIR_CREATE caps */ - if (S_ISDIR(ci->vfs_inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) && + if (S_ISDIR(ci->netfs.inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) && !(issued & CEPH_CAP_DIR_CREATE)) { ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); @@ -771,7 +771,7 @@ static int __cap_is_valid(struct ceph_cap *cap) if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { dout("__cap_is_valid %p cap %p issued %s " - "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode, + "but STALE (gen %u vs %u)\n", &cap->ci->netfs.inode, cap, ceph_cap_string(cap->issued), cap->cap_gen, gen); return 0; } @@ -797,7 +797,7 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) if (!__cap_is_valid(cap)) continue; dout("__ceph_caps_issued %p cap %p issued %s\n", - &ci->vfs_inode, cap, ceph_cap_string(cap->issued)); + &ci->netfs.inode, cap, ceph_cap_string(cap->issued)); have |= cap->issued; if (implemented) *implemented |= cap->implemented; @@ -844,12 +844,12 @@ static void __touch_cap(struct ceph_cap *cap) spin_lock(&s->s_cap_lock); if (!s->s_cap_iterator) { - dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap, + dout("__touch_cap %p cap %p mds%d\n", &cap->ci->netfs.inode, cap, s->s_mds); list_move_tail(&cap->session_caps, &s->s_caps); } else { dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n", - &cap->ci->vfs_inode, cap, s->s_mds); + &cap->ci->netfs.inode, cap, s->s_mds); } spin_unlock(&s->s_cap_lock); } @@ -867,7 +867,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) if ((have & mask) == mask) { dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s" - " (mask %s)\n", ceph_ino(&ci->vfs_inode), + " (mask %s)\n", ceph_ino(&ci->netfs.inode), ceph_cap_string(have), ceph_cap_string(mask)); return 1; @@ -879,7 +879,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) continue; if ((cap->issued & mask) == mask) { dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s" - " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap, + " (mask %s)\n", ceph_ino(&ci->netfs.inode), cap, ceph_cap_string(cap->issued), ceph_cap_string(mask)); if (touch) @@ -891,7 +891,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) have |= cap->issued; if ((have & mask) == mask) { dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s" - " (mask %s)\n", ceph_ino(&ci->vfs_inode), + " (mask %s)\n", ceph_ino(&ci->netfs.inode), ceph_cap_string(cap->issued), ceph_cap_string(mask)); if (touch) { @@ -919,7 +919,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask, int touch) { - struct ceph_fs_client *fsc = 
ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb); int r; r = __ceph_caps_issued_mask(ci, mask, touch); @@ -950,7 +950,7 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci, int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; int ret; spin_lock(&ci->i_ceph_lock); @@ -969,8 +969,8 @@ int __ceph_caps_used(struct ceph_inode_info *ci) if (ci->i_rd_ref) used |= CEPH_CAP_FILE_RD; if (ci->i_rdcache_ref || - (S_ISREG(ci->vfs_inode.i_mode) && - ci->vfs_inode.i_data.nrpages)) + (S_ISREG(ci->netfs.inode.i_mode) && + ci->netfs.inode.i_data.nrpages)) used |= CEPH_CAP_FILE_CACHE; if (ci->i_wr_ref) used |= CEPH_CAP_FILE_WR; @@ -993,11 +993,11 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR); const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY); struct ceph_mount_options *opt = - ceph_inode_to_client(&ci->vfs_inode)->mount_options; + ceph_inode_to_client(&ci->netfs.inode)->mount_options; unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ; unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ; - if (S_ISDIR(ci->vfs_inode.i_mode)) { + if (S_ISDIR(ci->netfs.inode.i_mode)) { int want = 0; /* use used_cutoff here, to keep dir's wanted caps longer */ @@ -1050,7 +1050,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) int __ceph_caps_wanted(struct ceph_inode_info *ci) { int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); - if (S_ISDIR(ci->vfs_inode.i_mode)) { + if (S_ISDIR(ci->netfs.inode.i_mode)) { /* we want EXCL if holding caps of dir ops */ if (w & CEPH_CAP_ANY_DIR_OPS) w |= CEPH_CAP_FILE_EXCL; @@ -1116,9 +1116,9 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) lockdep_assert_held(&ci->i_ceph_lock); - dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); + dout("__ceph_remove_cap %p from %p\n", cap, &ci->netfs.inode); - mdsc = ceph_inode_to_client(&ci->vfs_inode)->mdsc; + mdsc = ceph_inode_to_client(&ci->netfs.inode)->mdsc; /* remove from inode's cap rbtree, and clear auth cap */ rb_erase(&cap->ci_node, &ci->i_caps); @@ -1169,7 +1169,7 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) * keep i_snap_realm. 
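 * (Editor's note: "keep" corresponds to the i_wr_ref == 0 check just below - while writes are still in flight the realm must stay attached so dirty data is flushed against the correct snap context.)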
*/ if (ci->i_wr_ref == 0 && ci->i_snap_realm) - ceph_change_snap_realm(&ci->vfs_inode, NULL); + ceph_change_snap_realm(&ci->netfs.inode, NULL); __cap_delay_cancel(mdsc, ci); } @@ -1188,11 +1188,11 @@ void ceph_remove_cap(struct ceph_cap *cap, bool queue_release) lockdep_assert_held(&ci->i_ceph_lock); - fsc = ceph_inode_to_client(&ci->vfs_inode); + fsc = ceph_inode_to_client(&ci->netfs.inode); WARN_ON_ONCE(ci->i_auth_cap == cap && !list_empty(&ci->i_dirty_item) && !fsc->blocklisted && - !ceph_inode_is_shutdown(&ci->vfs_inode)); + !ceph_inode_is_shutdown(&ci->netfs.inode)); __ceph_remove_cap(cap, queue_release); } @@ -1343,7 +1343,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, int flushing, u64 flush_tid, u64 oldest_flush_tid) { struct ceph_inode_info *ci = cap->ci; - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; int held, revoking; lockdep_assert_held(&ci->i_ceph_lock); @@ -1440,7 +1440,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) { struct ceph_msg *msg; - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); if (!msg) { @@ -1528,7 +1528,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_cap_snap *capsnap; u64 oldest_flush_tid = 0; @@ -1577,7 +1577,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, while (first_tid <= last_tid) { struct ceph_cap *cap = ci->i_auth_cap; - struct ceph_cap_flush *cf; + struct ceph_cap_flush *cf = NULL, *iter; int ret; if (!(cap && cap->session == session)) { @@ -1587,8 +1587,9 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, } ret = -ENOENT; - list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { - if (cf->tid >= first_tid) { + list_for_each_entry(iter, &ci->i_cap_flush_list, i_list) { + if (iter->tid >= first_tid) { + cf = iter; ret = 0; break; } @@ -1621,7 +1622,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, void ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_mds_session *session = NULL; int mds; @@ -1681,8 +1682,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, struct ceph_cap_flush **pcf) { struct ceph_mds_client *mdsc = - ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; - struct inode *inode = &ci->vfs_inode; + ceph_sb_to_client(ci->netfs.inode.i_sb)->mdsc; + struct inode *inode = &ci->netfs.inode; int was = ci->i_dirty_caps; int dirty = 0; @@ -1695,7 +1696,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, return 0; } - dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode, + dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->netfs.inode, ceph_cap_string(mask), ceph_cap_string(was), ceph_cap_string(was | mask)); ci->i_dirty_caps |= mask; @@ -1711,7 +1712,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, ci->i_snap_realm->cached_context); } dout(" inode %p now dirty snapc %p auth cap %p\n", - &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); + &ci->netfs.inode, 
ci->i_head_snapc, ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); list_add(&ci->i_dirty_item, &session->s_cap_dirty); spin_unlock(&mdsc->cap_dirty_lock); @@ -1874,7 +1875,7 @@ static int try_nonblocking_invalidate(struct inode *inode) bool __ceph_should_report_size(struct ceph_inode_info *ci) { - loff_t size = i_size_read(&ci->vfs_inode); + loff_t size = i_size_read(&ci->netfs.inode); /* mds will adjust max size according to the reported size */ if (ci->i_flushing_caps & CEPH_CAP_FILE_WR) return false; @@ -1899,7 +1900,7 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci) void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_session *session) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_cap *cap; u64 flush_tid, oldest_flush_tid; @@ -1910,6 +1911,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct rb_node *p; bool queue_invalidate = false; bool tried_invalidate = false; + bool queue_writeback = false; if (session) ceph_get_mds_session(session); @@ -2062,10 +2064,27 @@ retry: } /* completed revocation? going down and there are no caps? */ - if (revoking && (revoking & cap_used) == 0) { - dout("completed revocation of %s\n", - ceph_cap_string(cap->implemented & ~cap->issued)); - goto ack; + if (revoking) { + if ((revoking & cap_used) == 0) { + dout("completed revocation of %s\n", + ceph_cap_string(cap->implemented & ~cap->issued)); + goto ack; + } + + /* + * If the "i_wrbuffer_ref" was increased by mmap or generic + * cache write just before ceph_check_caps() is called, + * revoking the Fb capability will fail this time. Then we + * must wait for the BDI's delayed work to flush the dirty + * pages and to release the "i_wrbuffer_ref", which will cost + * at most 5 seconds. That means the MDS needs to wait at + * most 5 seconds for the Fb capability revocation to finish. + * + * Let's queue a writeback for it. + */ + if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref && + (revoking & CEPH_CAP_FILE_BUFFER)) + queue_writeback = true; } /* want more caps from mds? */ @@ -2135,6 +2154,8 @@ ack: spin_unlock(&ci->i_ceph_lock); ceph_put_mds_session(session); + if (queue_writeback) + ceph_queue_writeback(inode); if (queue_invalidate) ceph_queue_invalidate(inode); } @@ -2218,9 +2239,9 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid) } /* - * wait for any unsafe requests to complete. + * flush the mdlog and wait for any unsafe requests to complete. */ -static int unsafe_request_wait(struct inode *inode) +static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); @@ -2336,7 +2357,7 @@ retry: kfree(sessions); } - dout("unsafe_request_wait %p wait on tid %llu %llu\n", + dout("%s %p wait on tid %llu %llu\n", __func__, inode, req1 ? req1->r_tid : 0ULL, req2 ? 
req2->r_tid : 0ULL); if (req1) { ret = !wait_for_completion_timeout(&req1->r_safe_completion, @@ -2380,7 +2401,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); - err = unsafe_request_wait(inode); + err = flush_mdlog_and_wait_inode_unsafe_requests(inode); /* * only wait on non-file metadata writeback (the mds @@ -2446,7 +2467,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_cap *cap; struct ceph_cap_flush *cf; int ret; @@ -2539,7 +2560,7 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { pr_err("%p auth cap %p not mds%d ???\n", - &ci->vfs_inode, cap, session->s_mds); + &ci->netfs.inode, cap, session->s_mds); spin_unlock(&ci->i_ceph_lock); continue; } @@ -2589,7 +2610,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { pr_err("%p auth cap %p not mds%d ???\n", - &ci->vfs_inode, cap, session->s_mds); + &ci->netfs.inode, cap, session->s_mds); spin_unlock(&ci->i_ceph_lock); continue; } @@ -2609,7 +2630,7 @@ void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, lockdep_assert_held(&ci->i_ceph_lock); - dout("%s %p flushing %s\n", __func__, &ci->vfs_inode, + dout("%s %p flushing %s\n", __func__, &ci->netfs.inode, ceph_cap_string(ci->i_flushing_caps)); if (!list_empty(&ci->i_cap_flush_list)) { @@ -2652,10 +2673,10 @@ void ceph_take_cap_refs(struct ceph_inode_info *ci, int got, } if (got & CEPH_CAP_FILE_BUFFER) { if (ci->i_wb_ref == 0) - ihold(&ci->vfs_inode); + ihold(&ci->netfs.inode); ci->i_wb_ref++; dout("%s %p wb %d -> %d (?)\n", __func__, - &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref); + &ci->netfs.inode, ci->i_wb_ref-1, ci->i_wb_ref); } } @@ -2983,7 +3004,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got return ret; } - if (S_ISREG(ci->vfs_inode.i_mode) && + if (S_ISREG(ci->netfs.inode.i_mode) && ci->i_inline_version != CEPH_INLINE_NONE && (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && i_size_read(inode) > 0) { @@ -3073,7 +3094,7 @@ enum put_cap_refs_mode { static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, enum put_cap_refs_mode mode) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; int last = 0, put = 0, flushsnaps = 0, wake = 0; bool check_flushsnaps = false; @@ -3181,11 +3202,10 @@ void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had) void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc) { - struct inode *inode = &ci->vfs_inode; - struct ceph_cap_snap *capsnap = NULL; + struct inode *inode = &ci->netfs.inode; + struct ceph_cap_snap *capsnap = NULL, *iter; int put = 0; bool last = false; - bool found = false; bool flush_snaps = false; bool complete_capsnap = false; @@ -3212,14 +3232,14 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, last ? 
" LAST" : ""); } else { - list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { - if (capsnap->context == snapc) { - found = true; + list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { + if (iter->context == snapc) { + capsnap = iter; break; } } - if (!found) { + if (!capsnap) { /* * The capsnap should already be removed when removing * auth cap in the case of a forced unmount. @@ -3678,7 +3698,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, session->s_mds, &list_first_entry(&session->s_cap_flushing, struct ceph_inode_info, - i_flushing_item)->vfs_inode); + i_flushing_item)->netfs.inode); } } mdsc->num_cap_flushing--; @@ -3769,8 +3789,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; u64 follows = le64_to_cpu(m->snap_follows); - struct ceph_cap_snap *capsnap; - bool flushed = false; + struct ceph_cap_snap *capsnap = NULL, *iter; bool wake_ci = false; bool wake_mdsc = false; @@ -3778,26 +3797,26 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, inode, ci, session->s_mds, follows); spin_lock(&ci->i_ceph_lock); - list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { - if (capsnap->follows == follows) { - if (capsnap->cap_flush.tid != flush_tid) { + list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { + if (iter->follows == follows) { + if (iter->cap_flush.tid != flush_tid) { dout(" cap_snap %p follows %lld tid %lld !=" - " %lld\n", capsnap, follows, - flush_tid, capsnap->cap_flush.tid); + " %lld\n", iter, follows, + flush_tid, iter->cap_flush.tid); break; } - flushed = true; + capsnap = iter; break; } else { dout(" skipping cap_snap %p follows %lld\n", - capsnap, capsnap->follows); + iter, iter->follows); } } - if (flushed) + if (capsnap) ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc); spin_unlock(&ci->i_ceph_lock); - if (flushed) { + if (capsnap) { ceph_put_snap_context(capsnap->context); ceph_put_cap_snap(capsnap); if (wake_ci) @@ -4326,7 +4345,7 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) break; list_del_init(&ci->i_cap_delay_list); - inode = igrab(&ci->vfs_inode); + inode = igrab(&ci->netfs.inode); if (inode) { spin_unlock(&mdsc->cap_delay_lock); dout("check_delayed_caps on %p\n", inode); @@ -4354,7 +4373,7 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s) while (!list_empty(&s->s_cap_dirty)) { ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info, i_dirty_item); - inode = &ci->vfs_inode; + inode = &ci->netfs.inode; ihold(inode); dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode)); spin_unlock(&mdsc->cap_dirty_lock); @@ -4388,7 +4407,7 @@ void __ceph_touch_fmode(struct ceph_inode_info *ci, void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) { - struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb); int bits = (fmode << 1) | 1; bool already_opened = false; int i; @@ -4422,7 +4441,7 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) */ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count) { - struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb); int bits = (fmode << 1) | 1; bool is_closed = true; int i; @@ -4637,7 +4656,7 @@ int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, 
bool *invalidate) lockdep_assert_held(&ci->i_ceph_lock); dout("removing cap %p, ci is %p, inode is %p\n", - cap, ci, &ci->vfs_inode); + cap, ci, &ci->netfs.inode); is_auth = (cap == ci->i_auth_cap); __ceph_remove_cap(cap, false); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 8c8226c0feac..da59e836a06e 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -205,7 +205,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mount_options *opt = - ceph_inode_to_client(&ci->vfs_inode)->mount_options; + ceph_inode_to_client(&ci->netfs.inode)->mount_options; struct ceph_file_info *fi; int ret; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 63113e2a4890..56c53ab3618e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -176,7 +176,7 @@ static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci, rb_insert_color(&frag->node, &ci->i_fragtree); dout("get_or_create_frag added %llx.%llx frag %x\n", - ceph_vinop(&ci->vfs_inode), f); + ceph_vinop(&ci->netfs.inode), f); return frag; } @@ -457,10 +457,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb) if (!ci) return NULL; - dout("alloc_inode %p\n", &ci->vfs_inode); + dout("alloc_inode %p\n", &ci->netfs.inode); /* Set parameters for the netfs library */ - netfs_i_context_init(&ci->vfs_inode, &ceph_netfs_ops); + netfs_inode_init(&ci->netfs, &ceph_netfs_ops); spin_lock_init(&ci->i_ceph_lock); @@ -547,7 +547,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) INIT_WORK(&ci->i_work, ceph_inode_work); ci->i_work_mask = 0; memset(&ci->i_btime, '\0', sizeof(ci->i_btime)); - return &ci->vfs_inode; + return &ci->netfs.inode; } void ceph_free_inode(struct inode *inode) @@ -578,7 +578,7 @@ void ceph_evict_inode(struct inode *inode) __ceph_remove_caps(ci); - if (__ceph_has_any_quota(ci)) + if (__ceph_has_quota(ci, QUOTA_GET_ANY)) ceph_adjust_quota_realms_count(inode, false); /* @@ -1466,10 +1466,12 @@ retry_lookup: } else if (have_lease) { if (d_unhashed(dn)) d_add(dn, NULL); + } + + if (!d_unhashed(dn) && have_lease) update_dentry_lease(dir, dn, rinfo->dlease, session, req->r_request_started); - } goto done; } @@ -1884,7 +1886,6 @@ static void ceph_do_invalidate_pages(struct inode *inode) orig_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); - ceph_fscache_invalidate(inode, false); if (invalidate_inode_pages2(inode->i_mapping) < 0) { pr_err("invalidate_inode_pages2 %llx.%llx failed\n", ceph_vinop(inode)); @@ -1977,7 +1978,7 @@ static void ceph_inode_work(struct work_struct *work) { struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, i_work); - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) { dout("writeback %p\n", inode); @@ -2258,6 +2259,30 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, return err; } +int ceph_try_to_choose_auth_mds(struct inode *inode, int mask) +{ + int issued = ceph_caps_issued(ceph_inode(inode)); + + /* + * If any 'x' caps are issued we can just choose the auth MDS + * instead of a random replica MDS. Only when the Locker is in + * the LOCK_EXEC state can the loner client get the 'x' caps. + * If we send a getattr request to a replica MDS it must + * auth-pin and try to rdlock from the auth MDS, and then the + * auth MDS needs to do the Locker state transition to + * LOCK_SYNC. After that the lock state will change back. 
+ * + * This costs a lot when doing the Locker state transition and + * will usually need to revoke caps from clients. + */ + if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL)) + || (mask & CEPH_STAT_RSTAT)) + return USE_AUTH_MDS; + else + return USE_ANY_MDS; +} + /* * Verify that we have a lease on the given mask. If not, * do a getattr against an mds. */ @@ -2281,7 +2306,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page, if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1)) return 0; - mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS; + mode = ceph_try_to_choose_auth_mds(inode, mask); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); if (IS_ERR(req)) return PTR_ERR(req); @@ -2423,7 +2448,7 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, return -ESTALE; /* Skip the getattr altogether if we're asked not to sync */ - if (!(flags & AT_STATX_DONT_SYNC)) { + if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) { err = ceph_do_getattr(inode, statx_to_caps(request_mask, inode->i_mode), flags & AT_STATX_FORCE_SYNC); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 00c3de177dd6..33f517d549ce 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -437,7 +437,7 @@ static int ceph_parse_deleg_inos(void **p, void *end, ceph_decode_32_safe(p, end, sets, bad); dout("got %u sets of delegated inodes\n", sets); while (sets--) { - u64 start, len, ino; + u64 start, len; ceph_decode_64_safe(p, end, start, bad); ceph_decode_64_safe(p, end, len, bad); @@ -449,7 +449,7 @@ static int ceph_parse_deleg_inos(void **p, void *end, continue; } while (len--) { - int err = xa_insert(&s->s_delegated_inos, ino = start++, + int err = xa_insert(&s->s_delegated_inos, start++, DELEGATED_INO_AVAILABLE, GFP_KERNEL); if (!err) { @@ -1564,7 +1564,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session, p = session->s_caps.next; while (p != &session->s_caps) { cap = list_entry(p, struct ceph_cap, session_caps); - inode = igrab(&cap->ci->vfs_inode); + inode = igrab(&cap->ci->netfs.inode); if (!inode) { p = p->next; continue; } @@ -1622,7 +1622,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, int iputs; dout("removing cap %p, ci is %p, inode is %p\n", - cap, ci, &ci->vfs_inode); + cap, ci, &ci->netfs.inode); spin_lock(&ci->i_ceph_lock); iputs = ceph_purge_inode_cap(inode, cap, &invalidate); spin_unlock(&ci->i_ceph_lock); @@ -2651,7 +2651,28 @@ static int __prepare_send_request(struct ceph_mds_session *session, struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_mds_request_head_old *rhead; struct ceph_msg *msg; - int flags = 0; + int flags = 0, max_retry; + + /* + * The type of 'r_attempts' in kernel 'ceph_mds_request' + * is 'int', while in 'ceph_mds_request_head' the type of + * 'num_retry' is '__u8'. So if the request is retried + * more than 256 times, the MDS will receive an incorrect + * retry seq. + * + * In this case it's usually a bug in the MDS, and continuing + * to retry the request makes no sense. + * + * In the future this could be fixed in the ceph code, so + * avoid hard-coding the limit here. 
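+ * + * (Editor's note, not part of the patch: num_retry is a __u8, so sizeof_field() below yields 1 and max_retry evaluates to 1 << (1 * BITS_PER_BYTE) = 256.)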
+ */ + max_retry = sizeof_field(struct ceph_mds_request_head, num_retry); + max_retry = 1 << (max_retry * BITS_PER_BYTE); + if (req->r_attempts >= max_retry) { + pr_warn_ratelimited("%s request tid %llu seq overflow\n", + __func__, req->r_tid); + return -EMULTIHOP; + } req->r_attempts++; if (req->r_inode) { @@ -2663,7 +2684,7 @@ static int __prepare_send_request(struct ceph_mds_session *session, else req->r_sent_on_mseq = -1; } - dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, + dout("%s %p tid %lld %s (attempt %d)\n", __func__, req, req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { @@ -3265,6 +3286,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, int err = -EINVAL; void *p = msg->front.iov_base; void *end = p + msg->front.iov_len; + bool aborted = false; ceph_decode_need(&p, end, 2*sizeof(u32), bad); next_mds = ceph_decode_32(&p); @@ -3273,16 +3295,41 @@ static void handle_forward(struct ceph_mds_client *mdsc, mutex_lock(&mdsc->mutex); req = lookup_get_request(mdsc, tid); if (!req) { + mutex_unlock(&mdsc->mutex); dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); - goto out; /* dup reply? */ + return; /* dup reply? */ } if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { dout("forward tid %llu aborted, unregistering\n", tid); __unregister_request(mdsc, req); } else if (fwd_seq <= req->r_num_fwd) { - dout("forward tid %llu to mds%d - old seq %d <= %d\n", - tid, next_mds, req->r_num_fwd, fwd_seq); + /* + * The type of 'num_fwd' in ceph 'MClientRequestForward' + * is 'int32_t', while in 'ceph_mds_request_head' the + * type is '__u8'. So if the request bounces between + * MDSes more than 256 times, the client will get stuck. + * + * In this case it's usually a bug in the MDS, and continuing + * to bounce the request makes no sense. + * + * In the future this could be fixed in the ceph code, so + * avoid hard-coding the limit here. + */ + int max = sizeof_field(struct ceph_mds_request_head, num_fwd); + max = 1 << (max * BITS_PER_BYTE); + if (req->r_num_fwd >= max) { + mutex_lock(&req->r_fill_mutex); + req->r_err = -EMULTIHOP; + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); + mutex_unlock(&req->r_fill_mutex); + aborted = true; + pr_warn_ratelimited("forward tid %llu seq overflow\n", + tid); + } else { + dout("forward tid %llu to mds%d - old seq %d <= %d\n", + tid, next_mds, req->r_num_fwd, fwd_seq); + } } else { /* resend. 
forward race not possible; mds would drop */ dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); @@ -3294,9 +3341,12 @@ static void handle_forward(struct ceph_mds_client *mdsc, put_request_session(req); __do_request(mdsc, req); } - ceph_mdsc_put_request(req); -out: mutex_unlock(&mdsc->mutex); + + /* kick calling process */ + if (aborted) + complete_request(mdsc, req); + ceph_mdsc_put_request(req); return; bad: @@ -3375,13 +3425,17 @@ static void handle_session(struct ceph_mds_session *session, } if (msg_version >= 5) { - u32 flags; - /* version >= 4, struct_v, struct_cv, len, metric_spec */ - ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 2, bad); + u32 flags, len; + + /* version >= 4 */ + ceph_decode_skip_16(&p, end, bad); /* struct_v, struct_cv */ + ceph_decode_32_safe(&p, end, len, bad); /* len */ + ceph_decode_skip_n(&p, end, len, bad); /* metric_spec */ + /* version >= 5, flags */ - ceph_decode_32_safe(&p, end, flags, bad); + ceph_decode_32_safe(&p, end, flags, bad); if (flags & CEPH_SESSION_BLOCKLISTED) { - pr_warn("mds%d session blocklisted\n", session->s_mds); + pr_warn("mds%d session blocklisted\n", session->s_mds); blocklisted = true; } } @@ -4396,12 +4450,6 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dentry->d_name.len); spin_unlock(&dentry->d_lock); - /* - * if this is a preemptive lease RELEASE, no need to - * flush request stream, since the actual request will - * soon follow. - */ - msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE); ceph_con_send(&session->s_con, msg); } @@ -4696,15 +4744,17 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) } /* - * wait for all write mds requests to flush. + * flush the mdlog and wait for all write mds requests to flush. 
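+ * (Editor's note: flushing the MDS journal first - see the send_flush_mdlog() call below - lets pending unsafe requests be journaled and acknowledged as safe sooner, instead of waiting out the MDS's periodic journal flush.)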
*/ -static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) +static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc, + u64 want_tid) { struct ceph_mds_request *req = NULL, *nextreq; + struct ceph_mds_session *last_session = NULL; struct rb_node *n; mutex_lock(&mdsc->mutex); - dout("wait_unsafe_requests want %lld\n", want_tid); + dout("%s want %lld\n", __func__, want_tid); restart: req = __get_oldest_req(mdsc); while (req && req->r_tid <= want_tid) { @@ -4716,14 +4766,32 @@ restart: nextreq = NULL; if (req->r_op != CEPH_MDS_OP_SETFILELOCK && (req->r_op & CEPH_MDS_OP_WRITE)) { + struct ceph_mds_session *s = req->r_session; + + if (!s) { + req = nextreq; + continue; + } + /* write op */ ceph_mdsc_get_request(req); if (nextreq) ceph_mdsc_get_request(nextreq); + s = ceph_get_mds_session(s); mutex_unlock(&mdsc->mutex); - dout("wait_unsafe_requests wait on %llu (want %llu)\n", + + /* send flush mdlog request to MDS */ + if (last_session != s) { + send_flush_mdlog(s); + ceph_put_mds_session(last_session); + last_session = s; + } else { + ceph_put_mds_session(s); + } + dout("%s wait on %llu (want %llu)\n", __func__, req->r_tid, want_tid); wait_for_completion(&req->r_safe_completion); + mutex_lock(&mdsc->mutex); ceph_mdsc_put_request(req); if (!nextreq) @@ -4738,7 +4806,8 @@ restart: req = nextreq; } mutex_unlock(&mdsc->mutex); - dout("wait_unsafe_requests done\n"); + ceph_put_mds_session(last_session); + dout("%s done\n", __func__); } void ceph_mdsc_sync(struct ceph_mds_client *mdsc) @@ -4767,7 +4836,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); - wait_unsafe_requests(mdsc, want_tid); + flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid); wait_caps_flush(mdsc, want_flush); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 33497846e47e..1140aecd82ce 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -579,7 +579,7 @@ static inline int ceph_wait_on_async_create(struct inode *inode) struct ceph_inode_info *ci = ceph_inode(inode); return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT, - TASK_INTERRUPTIBLE); + TASK_KILLABLE); } extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session); diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index a338a3ec0dc4..64592adfe48f 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -195,9 +195,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc) /* * This function walks through the snaprealm for an inode and returns the - * ceph_snap_realm for the first snaprealm that has quotas set (either max_files - * or max_bytes). If the root is reached, return the root ceph_snap_realm - * instead. + * ceph_snap_realm for the first snaprealm that has quotas set (max_files, + * max_bytes, or any, depending on the 'which_quota' argument). If the root is + * reached, return the root ceph_snap_realm instead. * * Note that the caller is responsible for calling ceph_put_snap_realm() on the * returned realm. @@ -209,7 +209,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc) * will be restarted. 
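 * (Editor's note: in the callers below, the old/new realm comparison passes QUOTA_GET_ANY, while ceph_quota_update_statfs() passes QUOTA_GET_MAX_BYTES - only a byte quota can change the statfs numbers.)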
*/ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, - struct inode *inode, bool retry) + struct inode *inode, + enum quota_get_realm which_quota, + bool retry) { struct ceph_inode_info *ci = NULL; struct ceph_snap_realm *realm, *next; @@ -248,7 +250,7 @@ restart: } ci = ceph_inode(in); - has_quota = __ceph_has_any_quota(ci); + has_quota = __ceph_has_quota(ci, which_quota); iput(in); next = realm->parent; @@ -279,8 +281,8 @@ restart: * dropped and we can then restart the whole operation. */ down_read(&mdsc->snap_rwsem); - old_realm = get_quota_realm(mdsc, old, true); - new_realm = get_quota_realm(mdsc, new, false); + old_realm = get_quota_realm(mdsc, old, QUOTA_GET_ANY, true); + new_realm = get_quota_realm(mdsc, new, QUOTA_GET_ANY, false); if (PTR_ERR(new_realm) == -EAGAIN) { up_read(&mdsc->snap_rwsem); if (old_realm) @@ -483,7 +485,8 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) bool is_updated = false; down_read(&mdsc->snap_rwsem); - realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), true); + realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), + QUOTA_GET_MAX_BYTES, true); up_read(&mdsc->snap_rwsem); if (!realm) return false; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 322ee5add942..864cdaa0d2bd 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -521,7 +521,7 @@ static bool has_new_snaps(struct ceph_snap_context *o, static void ceph_queue_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap **pcapsnap) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_snap_context *old_snapc, *new_snapc; struct ceph_cap_snap *capsnap = *pcapsnap; struct ceph_buffer *old_blob = NULL; @@ -652,7 +652,7 @@ update_snapc: int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); BUG_ON(capsnap->writing); @@ -712,7 +712,7 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) spin_lock(&realm->inodes_with_caps_lock); list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) { - struct inode *inode = igrab(&ci->vfs_inode); + struct inode *inode = igrab(&ci->netfs.inode); if (!inode) continue; spin_unlock(&realm->inodes_with_caps_lock); @@ -904,7 +904,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc) while (!list_empty(&mdsc->snap_flush_list)) { ci = list_first_entry(&mdsc->snap_flush_list, struct ceph_inode_info, i_snap_flush_item); - inode = &ci->vfs_inode; + inode = &ci->netfs.inode; ihold(inode); spin_unlock(&mdsc->snap_flush_lock); ceph_flush_snaps(ci, &session); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index e6987d295079..40140805bdcf 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -876,7 +876,7 @@ mempool_t *ceph_wb_pagevec_pool; static void ceph_inode_init_once(void *foo) { struct ceph_inode_info *ci = foo; - inode_init_once(&ci->vfs_inode); + inode_init_once(&ci->netfs.inode); } static int __init init_caches(void) @@ -1119,6 +1119,7 @@ static int ceph_set_super(struct super_block *s, struct fs_context *fc) s->s_time_gran = 1; s->s_time_min = 0; s->s_time_max = U32_MAX; + s->s_flags |= SB_NODIRATIME | SB_NOATIME; ret = set_anon_super_fc(s, fc); if (ret != 0) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 20ceab74e871..f59dac66955b 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -316,11 +316,7 @@ struct ceph_inode_xattrs_info { * Ceph inode. 
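 * (Editor's note: struct netfs_inode bundles the VFS inode with the netfs library context, replacing the "must be contiguous" anonymous struct below; this is why every &ci->vfs_inode in this patch becomes &ci->netfs.inode and ceph_inode() switches its container_of() to the netfs.inode member.)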
*/ struct ceph_inode_info { - struct { - /* These must be contiguous */ - struct inode vfs_inode; - struct netfs_i_context netfs_ctx; /* Netfslib context */ - }; + struct netfs_inode netfs; /* Netfslib context and vfs inode */ struct ceph_vino i_vino; /* ceph ino + snap */ spinlock_t i_ceph_lock; @@ -436,7 +432,7 @@ struct ceph_inode_info { static inline struct ceph_inode_info * ceph_inode(const struct inode *inode) { - return container_of(inode, struct ceph_inode_info, vfs_inode); + return container_of(inode, struct ceph_inode_info, netfs.inode); } static inline struct ceph_fs_client * @@ -1022,6 +1018,7 @@ static inline void ceph_queue_flush_snaps(struct inode *inode) ceph_queue_inode_work(inode, CEPH_I_WORK_FLUSH_SNAPS); } +extern int ceph_try_to_choose_auth_mds(struct inode *inode, int mask); extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page, int mask, bool force); static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) @@ -1278,9 +1275,29 @@ extern void ceph_fs_debugfs_init(struct ceph_fs_client *client); extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); /* quota.c */ -static inline bool __ceph_has_any_quota(struct ceph_inode_info *ci) + +enum quota_get_realm { + QUOTA_GET_MAX_FILES, + QUOTA_GET_MAX_BYTES, + QUOTA_GET_ANY +}; + +static inline bool __ceph_has_quota(struct ceph_inode_info *ci, + enum quota_get_realm which) { - return ci->i_max_files || ci->i_max_bytes; + bool has_quota = false; + + switch (which) { + case QUOTA_GET_MAX_BYTES: + has_quota = !!ci->i_max_bytes; + break; + case QUOTA_GET_MAX_FILES: + has_quota = !!ci->i_max_files; + break; + default: + has_quota = !!(ci->i_max_files || ci->i_max_bytes); + } + return has_quota; } extern void ceph_adjust_quota_realms_count(struct inode *inode, bool inc); @@ -1289,13 +1306,13 @@ static inline void __ceph_update_quota(struct ceph_inode_info *ci, u64 max_bytes, u64 max_files) { bool had_quota, has_quota; - had_quota = __ceph_has_any_quota(ci); + had_quota = __ceph_has_quota(ci, QUOTA_GET_ANY); ci->i_max_bytes = max_bytes; ci->i_max_files = max_files; - has_quota = __ceph_has_any_quota(ci); + has_quota = __ceph_has_quota(ci, QUOTA_GET_ANY); if (had_quota != has_quota) - ceph_adjust_quota_realms_count(&ci->vfs_inode, has_quota); + ceph_adjust_quota_realms_count(&ci->netfs.inode, has_quota); } extern void ceph_handle_quota(struct ceph_mds_client *mdsc, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index afec84088471..f141f5246163 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -57,7 +57,7 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci) static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, size_t size) { - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb); struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_string *pool_ns; s64 pool = ci->i_layout.pool_id; @@ -69,7 +69,7 @@ static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, pool_ns = ceph_try_get_string(ci->i_layout.pool_ns); - dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); + dout("ceph_vxattrcb_layout %p\n", &ci->netfs.inode); down_read(&osdc->lock); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); if (pool_name) { @@ -161,7 +161,7 @@ static ssize_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, char *val, size_t size) { ssize_t ret; - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_fs_client 
*fsc = ceph_sb_to_client(ci->netfs.inode.i_sb); struct ceph_osd_client *osdc = &fsc->client->osdc; s64 pool = ci->i_layout.pool_id; const char *pool_name; @@ -313,7 +313,7 @@ static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val, static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci, char *val, size_t size) { - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb); return ceph_fmt_xattr(val, size, "%pU", &fsc->client->fsid); } @@ -321,7 +321,7 @@ static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci, static ssize_t ceph_vxattrcb_client_id(struct ceph_inode_info *ci, char *val, size_t size) { - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb); return ceph_fmt_xattr(val, size, "client%lld", ceph_client_gid(fsc->client)); @@ -366,6 +366,14 @@ static ssize_t ceph_vxattrcb_auth_mds(struct ceph_inode_info *ci, } #define XATTR_RSTAT_FIELD(_type, _name) \ XATTR_NAME_CEPH(_type, _name, VXATTR_FLAG_RSTAT) +#define XATTR_RSTAT_FIELD_UPDATABLE(_type, _name) \ + { \ + .name = CEPH_XATTR_NAME(_type, _name), \ + .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ + .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ + .exists_cb = NULL, \ + .flags = VXATTR_FLAG_RSTAT, \ + } #define XATTR_LAYOUT_FIELD(_type, _name, _field) \ { \ .name = CEPH_XATTR_NAME2(_type, _name, _field), \ @@ -404,7 +412,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { XATTR_RSTAT_FIELD(dir, rsubdirs), XATTR_RSTAT_FIELD(dir, rsnaps), XATTR_RSTAT_FIELD(dir, rbytes), - XATTR_RSTAT_FIELD(dir, rctime), + XATTR_RSTAT_FIELD_UPDATABLE(dir, rctime), { .name = "ceph.dir.pin", .name_size = sizeof("ceph.dir.pin"), @@ -621,7 +629,7 @@ static int __set_xattr(struct ceph_inode_info *ci, } dout("__set_xattr_val added %llx.%llx xattr %p %.*s=%.*s\n", - ceph_vinop(&ci->vfs_inode), xattr, name_len, name, val_len, val); + ceph_vinop(&ci->netfs.inode), xattr, name_len, name, val_len, val); return 0; } @@ -863,7 +871,7 @@ struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci) struct ceph_buffer *old_blob = NULL; void *dest; - dout("__build_xattrs_blob %p\n", &ci->vfs_inode); + dout("__build_xattrs_blob %p\n", &ci->netfs.inode); if (ci->i_xattrs.dirty) { int need = __get_required_blob_size(ci, 0, 0); diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index cc8fdcb35b71..8c9f2c00be72 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -8,7 +8,7 @@ obj-$(CONFIG_CIFS) += cifs.o cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \ inode.o link.o misc.o netmisc.o smbencrypt.o transport.o \ cifs_unicode.o nterr.o cifsencrypt.o \ - readdir.o ioctl.o sess.o export.o smb1ops.o unc.o winucase.o \ + readdir.o ioctl.o sess.o export.o unc.o winucase.o \ smb2ops.o smb2maperror.o smb2transport.o \ smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o \ dns_resolve.o cifs_spnego_negtokeninit.asn1.o asn1.o @@ -30,3 +30,5 @@ cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cifs-$(CONFIG_CIFS_SMB_DIRECT) += smbdirect.o cifs-$(CONFIG_CIFS_ROOT) += cifsroot.o + +cifs-$(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) += smb1ops.o diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 9d334816eac0..2cfbac8bb965 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -116,7 +116,8 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) 
tcon->ses->server->ops->dump_share_caps(m, tcon); if (tcon->use_witness) seq_puts(m, " Witness"); - + if (tcon->broken_sparse_sup) + seq_puts(m, " nosparse"); if (tcon->need_reconnect) seq_puts(m, "\tDISCONNECTED "); seq_putc(m, '\n'); @@ -161,6 +162,8 @@ cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface) seq_printf(m, "\t\tIPv4: %pI4\n", &ipv4->sin_addr); else if (iface->sockaddr.ss_family == AF_INET6) seq_printf(m, "\t\tIPv6: %pI6\n", &ipv6->sin6_addr); + if (!iface->is_active) + seq_puts(m, "\t\t[for-cleanup]\n"); } static int cifs_debug_files_proc_show(struct seq_file *m, void *v) @@ -220,6 +223,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) struct TCP_Server_Info *server; struct cifs_ses *ses; struct cifs_tcon *tcon; + struct cifs_server_iface *iface; int c, i, j; seq_puts(m, @@ -386,7 +390,7 @@ skip_rdma: (ses->serverNOS == NULL)) { seq_printf(m, "\n\t%d) Address: %s Uses: %d Capability: 0x%x\tSession Status: %d ", i, ses->ip_addr, ses->ses_count, - ses->capabilities, ses->status); + ses->capabilities, ses->ses_status); if (ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST) seq_printf(m, "Guest "); else if (ses->session_flags & SMB2_SESSION_FLAG_IS_NULL) @@ -398,7 +402,7 @@ skip_rdma: "\n\tSMB session status: %d ", i, ses->ip_addr, ses->serverDomain, ses->ses_count, ses->serverOS, ses->serverNOS, - ses->capabilities, ses->status); + ses->capabilities, ses->ses_status); } seq_printf(m, "\n\tSecurity type: %s ", @@ -418,6 +422,8 @@ skip_rdma: spin_lock(&ses->chan_lock); if (CIFS_CHAN_NEEDS_RECONNECT(ses, 0)) seq_puts(m, "\tPrimary channel: DISCONNECTED "); + if (CIFS_CHAN_IN_RECONNECT(ses, 0)) + seq_puts(m, "\t[RECONNECTING] "); if (ses->chan_count > 1) { seq_printf(m, "\n\n\tExtra Channels: %zu ", @@ -426,6 +432,8 @@ skip_rdma: cifs_dump_channel(m, j, &ses->chans[j]); if (CIFS_CHAN_NEEDS_RECONNECT(ses, j)) seq_puts(m, "\tDISCONNECTED "); + if (CIFS_CHAN_IN_RECONNECT(ses, j)) + seq_puts(m, "\t[RECONNECTING] "); } } spin_unlock(&ses->chan_lock); @@ -451,11 +459,10 @@ skip_rdma: if (ses->iface_count) seq_printf(m, "\n\n\tServer interfaces: %zu", ses->iface_count); - for (j = 0; j < ses->iface_count; j++) { - struct cifs_server_iface *iface; - - iface = &ses->iface_list[j]; - seq_printf(m, "\n\t%d)", j+1); + j = 0; + list_for_each_entry(iface, &ses->iface_list, + iface_head) { + seq_printf(m, "\n\t%d)", ++j); cifs_dump_iface(m, iface); if (is_ses_using_iface(ses, iface)) seq_puts(m, "\t\t[CONNECTED]\n"); diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c index 180c234c2f46..1e4c7cc5287f 100644 --- a/fs/cifs/cifs_swn.c +++ b/fs/cifs/cifs_swn.c @@ -465,7 +465,7 @@ static int cifs_swn_reconnect(struct cifs_tcon *tcon, struct sockaddr_storage *a int ret = 0; /* Store the reconnect address */ - mutex_lock(&tcon->ses->server->srv_mutex); + cifs_server_lock(tcon->ses->server); if (cifs_sockaddr_equal(&tcon->ses->server->dstaddr, addr)) goto unlock; @@ -501,7 +501,7 @@ static int cifs_swn_reconnect(struct cifs_tcon *tcon, struct sockaddr_storage *a cifs_signal_cifsd_for_reconnect(tcon->ses->server, false); unlock: - mutex_unlock(&tcon->ses->server->srv_mutex); + cifs_server_unlock(tcon->ses->server); return ret; } diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 0912d8bbbac1..663cb9db4908 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -236,9 +236,9 @@ int cifs_verify_signature(struct smb_rqst *rqst, cpu_to_le32(expected_sequence_number); cifs_pdu->Signature.Sequence.Reserved = 0; - mutex_lock(&server->srv_mutex); + 
cifs_server_lock(server); rc = cifs_calc_signature(rqst, server, what_we_think_sig_should_be); - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); if (rc) return rc; @@ -626,7 +626,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) memcpy(ses->auth_key.response + baselen, tiblob, tilen); - mutex_lock(&ses->server->srv_mutex); + cifs_server_lock(ses->server); rc = cifs_alloc_hash("hmac(md5)", &ses->server->secmech.hmacmd5, @@ -678,7 +678,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); unlock: - mutex_unlock(&ses->server->srv_mutex); + cifs_server_unlock(ses->server); setup_ntlmv2_rsp_ret: kfree(tiblob); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 2b1a1c029c75..8f2e003e0590 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -377,7 +377,7 @@ cifs_alloc_inode(struct super_block *sb) cifs_inode->flags = 0; spin_lock_init(&cifs_inode->writers_lock); cifs_inode->writers = 0; - cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ + cifs_inode->netfs.inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ cifs_inode->server_eof = 0; cifs_inode->uniqueid = 0; cifs_inode->createtime = 0; @@ -389,12 +389,12 @@ cifs_alloc_inode(struct super_block *sb) * Can not set i_flags here - they get immediately overwritten to zero * by the VFS. */ - /* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME; */ + /* cifs_inode->netfs.inode.i_flags = S_NOATIME | S_NOCMTIME; */ INIT_LIST_HEAD(&cifs_inode->openFileList); INIT_LIST_HEAD(&cifs_inode->llist); INIT_LIST_HEAD(&cifs_inode->deferred_closes); spin_lock_init(&cifs_inode->deferred_lock); - return &cifs_inode->vfs_inode; + return &cifs_inode->netfs.inode; } static void @@ -582,6 +582,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",nocase"); if (tcon->nodelete) seq_puts(s, ",nodelete"); + if (cifs_sb->ctx->no_sparse) + seq_puts(s, ",nosparse"); if (tcon->local_lease) seq_puts(s, ",locallease"); if (tcon->retry) @@ -836,7 +838,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type, int flags, struct smb3_fs_context *old_ctx) { int rc; - struct super_block *sb; + struct super_block *sb = NULL; struct cifs_sb_info *cifs_sb = NULL; struct cifs_mnt_data mnt_data; struct dentry *root; @@ -932,9 +934,11 @@ out_super: return root; out: if (cifs_sb) { - kfree(cifs_sb->prepath); - smb3_cleanup_fs_context(cifs_sb->ctx); - kfree(cifs_sb); + if (!sb || IS_ERR(sb)) { /* otherwise kill_sb will handle */ + kfree(cifs_sb->prepath); + smb3_cleanup_fs_context(cifs_sb->ctx); + kfree(cifs_sb); + } } return root; } @@ -1082,7 +1086,7 @@ struct file_system_type cifs_fs_type = { }; MODULE_ALIAS_FS("cifs"); -static struct file_system_type smb3_fs_type = { +struct file_system_type smb3_fs_type = { .owner = THIS_MODULE, .name = "smb3", .init_fs_context = smb3_init_fs_context, @@ -1414,7 +1418,7 @@ cifs_init_once(void *inode) { struct cifsInodeInfo *cifsi = inode; - inode_init_once(&cifsi->vfs_inode); + inode_init_once(&cifsi->netfs.inode); init_rwsem(&cifsi->lock_sem); } diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index c0542bdcd06b..b17be47a8e59 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -38,7 +38,7 @@ static inline unsigned long cifs_get_time(struct dentry *dentry) return (unsigned long) dentry->d_fsdata; } -extern struct file_system_type cifs_fs_type; +extern struct file_system_type cifs_fs_type, smb3_fs_type; extern const struct address_space_operations cifs_addr_ops; extern const 
struct address_space_operations cifs_addr_ops_smallbuf; @@ -152,6 +152,7 @@ extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type, extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define SMB3_PRODUCT_BUILD 35 -#define CIFS_VERSION "2.36" +/* when changing internal version - update following two lines at same time */ +#define SMB3_PRODUCT_BUILD 37 +#define CIFS_VERSION "2.37" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 8de977c359b1..a643c84ff1e9 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -16,6 +16,7 @@ #include <linux/mempool.h> #include <linux/workqueue.h> #include <linux/utsname.h> +#include <linux/sched/mm.h> #include <linux/netfs.h> #include "cifs_fs_sb.h" #include "cifsacl.h" @@ -79,6 +80,9 @@ #define SMB_DNS_RESOLVE_INTERVAL_MIN 120 #define SMB_DNS_RESOLVE_INTERVAL_DEFAULT 600 +/* smb multichannel query server interfaces interval in seconds */ +#define SMB_INTERFACE_POLL_INTERVAL 600 + /* maximum number of PDUs in one compound */ #define MAX_COMPOUND 5 @@ -106,7 +110,7 @@ * CIFS vfs client Status information (based on what we know.) */ -/* associated with each tcp and smb session */ +/* associated with each connection */ enum statusEnum { CifsNew = 0, CifsGood, @@ -114,8 +118,15 @@ enum statusEnum { CifsNeedReconnect, CifsNeedNegotiate, CifsInNegotiate, - CifsNeedSessSetup, - CifsInSessSetup, +}; + +/* associated with each smb session */ +enum ses_status_enum { + SES_NEW = 0, + SES_GOOD, + SES_EXITING, + SES_NEED_RECON, + SES_IN_SETUP }; /* associated with each tree connection to the server */ @@ -621,7 +632,8 @@ struct TCP_Server_Info { unsigned int in_flight; /* number of requests on the wire to server */ unsigned int max_in_flight; /* max number of requests that were on wire */ spinlock_t req_lock; /* protect the two values above */ - struct mutex srv_mutex; + struct mutex _srv_mutex; + unsigned int nofs_flag; struct task_struct *tsk; char server_GUID[16]; __u16 sec_mode; @@ -736,6 +748,22 @@ struct TCP_Server_Info { #endif }; +static inline void cifs_server_lock(struct TCP_Server_Info *server) +{ + unsigned int nofs_flag = memalloc_nofs_save(); + + mutex_lock(&server->_srv_mutex); + server->nofs_flag = nofs_flag; +} + +static inline void cifs_server_unlock(struct TCP_Server_Info *server) +{ + unsigned int nofs_flag = server->nofs_flag; + + mutex_unlock(&server->_srv_mutex); + memalloc_nofs_restore(nofs_flag); +} + struct cifs_credits { unsigned int value; unsigned int instance; @@ -908,14 +936,67 @@ static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net) #endif struct cifs_server_iface { + struct list_head iface_head; + struct kref refcount; size_t speed; unsigned int rdma_capable : 1; unsigned int rss_capable : 1; + unsigned int is_active : 1; /* unset if non existent */ struct sockaddr_storage sockaddr; }; +/* release iface when last ref is dropped */ +static inline void +release_iface(struct kref *ref) +{ + struct cifs_server_iface *iface = container_of(ref, + struct cifs_server_iface, + refcount); + list_del_init(&iface->iface_head); + kfree(iface); +} + +/* + * compare two interfaces a and b + * return 0 if everything matches. + * return 1 if a has higher link speed, or rdma capable, or rss capable + * return -1 otherwise. 
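+ *
+ * (i.e. interfaces order by speed first, then rdma capability, then
+ * rss capability, with the sockaddr comparison only as a tie-breaker:
+ * an RSS-capable 10Gbps interface compares higher than a non-RSS
+ * 10Gbps one, and both compare higher than any 1Gbps interface)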
+ */ +static inline int +iface_cmp(struct cifs_server_iface *a, struct cifs_server_iface *b) +{ + int cmp_ret = 0; + + WARN_ON(!a || !b); + if (a->speed == b->speed) { + if (a->rdma_capable == b->rdma_capable) { + if (a->rss_capable == b->rss_capable) { + cmp_ret = memcmp(&a->sockaddr, &b->sockaddr, + sizeof(a->sockaddr)); + if (!cmp_ret) + return 0; + else if (cmp_ret > 0) + return 1; + else + return -1; + } else if (a->rss_capable > b->rss_capable) + return 1; + else + return -1; + } else if (a->rdma_capable > b->rdma_capable) + return 1; + else + return -1; + } else if (a->speed > b->speed) + return 1; + else + return -1; +} + struct cifs_chan { + unsigned int in_reconnect : 1; /* if session setup in progress for this channel */ struct TCP_Server_Info *server; + struct cifs_server_iface *iface; /* interface in use */ __u8 signkey[SMB3_SIGN_KEY_SIZE]; }; @@ -930,7 +1011,7 @@ struct cifs_ses { struct mutex session_mutex; struct TCP_Server_Info *server; /* pointer to server info */ int ses_count; /* reference counter */ - enum statusEnum status; /* updates protected by cifs_tcp_ses_lock */ + enum ses_status_enum ses_status; /* updates protected by cifs_tcp_ses_lock */ unsigned overrideSecFlg; /* if non-zero override global sec flags */ char *serverOS; /* name of operating system underlying server */ char *serverNOS; /* name of network operating system of server */ @@ -944,7 +1025,7 @@ struct cifs_ses { and after mount option parsing we fill it */ char *domainName; char *password; - char *workstation_name; + char workstation_name[CIFS_MAX_WORKSTATION_LEN]; struct session_key auth_key; struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */ enum securityEnum sectype; /* what security flavor was specified? */ @@ -967,7 +1048,7 @@ struct cifs_ses { */ spinlock_t iface_lock; /* ========= begin: protected by iface_lock ======== */ - struct cifs_server_iface *iface_list; + struct list_head iface_list; size_t iface_count; unsigned long iface_last_update; /* jiffies */ /* ========= end: protected by iface_lock ======== */ @@ -977,12 +1058,16 @@ struct cifs_ses { #define CIFS_MAX_CHANNELS 16 #define CIFS_ALL_CHANNELS_SET(ses) \ ((1UL << (ses)->chan_count) - 1) +#define CIFS_ALL_CHANS_GOOD(ses) \ + (!(ses)->chans_need_reconnect) #define CIFS_ALL_CHANS_NEED_RECONNECT(ses) \ ((ses)->chans_need_reconnect == CIFS_ALL_CHANNELS_SET(ses)) #define CIFS_SET_ALL_CHANS_NEED_RECONNECT(ses) \ ((ses)->chans_need_reconnect = CIFS_ALL_CHANNELS_SET(ses)) #define CIFS_CHAN_NEEDS_RECONNECT(ses, index) \ test_bit((index), &(ses)->chans_need_reconnect) +#define CIFS_CHAN_IN_RECONNECT(ses, index) \ + ((ses)->chans[(index)].in_reconnect) struct cifs_chan chans[CIFS_MAX_CHANNELS]; size_t chan_count; @@ -1009,6 +1094,58 @@ cap_unix(struct cifs_ses *ses) return ses->server->vals->cap_unix & ses->capabilities; } +/* + * common struct for holding inode info when searching for or updating an + * inode with new info + */ + +#define CIFS_FATTR_DFS_REFERRAL 0x1 +#define CIFS_FATTR_DELETE_PENDING 0x2 +#define CIFS_FATTR_NEED_REVAL 0x4 +#define CIFS_FATTR_INO_COLLISION 0x8 +#define CIFS_FATTR_UNKNOWN_NLINK 0x10 +#define CIFS_FATTR_FAKE_ROOT_INO 0x20 + +struct cifs_fattr { + u32 cf_flags; + u32 cf_cifsattrs; + u64 cf_uniqueid; + u64 cf_eof; + u64 cf_bytes; + u64 cf_createtime; + kuid_t cf_uid; + kgid_t cf_gid; + umode_t cf_mode; + dev_t cf_rdev; + unsigned int cf_nlink; + unsigned int cf_dtype; + struct timespec64 cf_atime; + struct timespec64 cf_mtime; + struct timespec64 cf_ctime; + u32 cf_cifstag; +}; + +struct 
cached_dirent { + struct list_head entry; + char *name; + int namelen; + loff_t pos; + + struct cifs_fattr fattr; +}; + +struct cached_dirents { + bool is_valid:1; + bool is_failed:1; + struct dir_context *ctx; /* + * Only used to make sure we only take entries + * from a single context. Never dereferenced. + */ + struct mutex de_mutex; + int pos; /* Expected ctx->pos */ + struct list_head entries; +}; + struct cached_fid { bool is_valid:1; /* Do we have a useable root fid */ bool file_all_info_is_valid:1; @@ -1021,6 +1158,7 @@ struct cached_fid { struct dentry *dentry; struct work_struct lease_break; struct smb2_file_all_info file_all_info; + struct cached_dirents dirents; }; /* @@ -1120,6 +1258,7 @@ struct cifs_tcon { #ifdef CONFIG_CIFS_DFS_UPCALL struct list_head ulist; /* cache update list */ #endif + struct delayed_work query_interfaces; /* query interfaces workqueue job */ }; /* @@ -1396,20 +1535,16 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file); #define CIFS_CACHE_RW_FLG (CIFS_CACHE_READ_FLG | CIFS_CACHE_WRITE_FLG) #define CIFS_CACHE_RHW_FLG (CIFS_CACHE_RW_FLG | CIFS_CACHE_HANDLE_FLG) -#define CIFS_CACHE_READ(cinode) ((cinode->oplock & CIFS_CACHE_READ_FLG) || (CIFS_SB(cinode->vfs_inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE)) +#define CIFS_CACHE_READ(cinode) ((cinode->oplock & CIFS_CACHE_READ_FLG) || (CIFS_SB(cinode->netfs.inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE)) #define CIFS_CACHE_HANDLE(cinode) (cinode->oplock & CIFS_CACHE_HANDLE_FLG) -#define CIFS_CACHE_WRITE(cinode) ((cinode->oplock & CIFS_CACHE_WRITE_FLG) || (CIFS_SB(cinode->vfs_inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE)) +#define CIFS_CACHE_WRITE(cinode) ((cinode->oplock & CIFS_CACHE_WRITE_FLG) || (CIFS_SB(cinode->netfs.inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE)) /* * One of these for each file inode */ struct cifsInodeInfo { - struct { - /* These must be contiguous */ - struct inode vfs_inode; /* the VFS's inode record */ - struct netfs_i_context netfs_ctx; /* Netfslib context */ - }; + struct netfs_inode netfs; /* Netfslib context and vfs inode */ bool can_cache_brlcks; struct list_head llist; /* locks helb by this inode */ /* @@ -1448,7 +1583,7 @@ struct cifsInodeInfo { static inline struct cifsInodeInfo * CIFS_I(struct inode *inode) { - return container_of(inode, struct cifsInodeInfo, vfs_inode); + return container_of(inode, struct cifsInodeInfo, netfs.inode); } static inline struct cifs_sb_info * @@ -1641,37 +1776,6 @@ struct file_list { struct cifsFileInfo *cfile; }; -/* - * common struct for holding inode info when searching for or updating an - * inode with new info - */ - -#define CIFS_FATTR_DFS_REFERRAL 0x1 -#define CIFS_FATTR_DELETE_PENDING 0x2 -#define CIFS_FATTR_NEED_REVAL 0x4 -#define CIFS_FATTR_INO_COLLISION 0x8 -#define CIFS_FATTR_UNKNOWN_NLINK 0x10 -#define CIFS_FATTR_FAKE_ROOT_INO 0x20 - -struct cifs_fattr { - u32 cf_flags; - u32 cf_cifsattrs; - u64 cf_uniqueid; - u64 cf_eof; - u64 cf_bytes; - u64 cf_createtime; - kuid_t cf_uid; - kgid_t cf_gid; - umode_t cf_mode; - dev_t cf_rdev; - unsigned int cf_nlink; - unsigned int cf_dtype; - struct timespec64 cf_atime; - struct timespec64 cf_mtime; - struct timespec64 cf_ctime; - u32 cf_cifstag; -}; - static inline void free_dfs_info_param(struct dfs_info3_param *param) { if (param) { @@ -1911,11 +2015,13 @@ extern mempool_t *cifs_mid_poolp; /* Operations for different SMB versions */ #define SMB1_VERSION_STRING "1.0" +#define SMB20_VERSION_STRING "2.0" +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY extern struct 
smb_version_operations smb1_operations; extern struct smb_version_values smb1_values; -#define SMB20_VERSION_STRING "2.0" extern struct smb_version_operations smb20_operations; extern struct smb_version_values smb20_values; +#endif /* CIFS_ALLOW_INSECURE_LEGACY */ #define SMB21_VERSION_STRING "2.1" extern struct smb_version_operations smb21_operations; extern struct smb_version_values smb21_values; @@ -1979,4 +2085,22 @@ static inline bool cifs_is_referral_server(struct cifs_tcon *tcon, return is_tcon_dfs(tcon) || (ref && (ref->flags & DFSREF_REFERRAL_SERVER)); } +static inline u64 cifs_flock_len(struct file_lock *fl) +{ + return fl->fl_end == OFFSET_MAX ? 0 : fl->fl_end - fl->fl_start + 1; +} + +static inline size_t ntlmssp_workstation_name_size(const struct cifs_ses *ses) +{ + if (WARN_ON_ONCE(!ses || !ses->server)) + return 0; + /* + * Make workstation name no more than 15 chars when using insecure dialects as some legacy + * servers do require it during NTLMSSP. + */ + if (ses->server->dialect <= SMB20_PROT_ID) + return min_t(size_t, sizeof(ses->workstation_name), RFC1001_NAME_LEN_WITH_NULL); + return sizeof(ses->workstation_name); +} + #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 0df3b24a0bf4..d59aebefa71c 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -619,6 +619,15 @@ unsigned int cifs_ses_get_chan_index(struct cifs_ses *ses, struct TCP_Server_Info *server); void +cifs_chan_set_in_reconnect(struct cifs_ses *ses, + struct TCP_Server_Info *server); +void +cifs_chan_clear_in_reconnect(struct cifs_ses *ses, + struct TCP_Server_Info *server); +bool +cifs_chan_in_reconnect(struct cifs_ses *ses, + struct TCP_Server_Info *server); +void cifs_chan_set_need_reconnect(struct cifs_ses *ses, struct TCP_Server_Info *server); void @@ -627,6 +636,13 @@ cifs_chan_clear_need_reconnect(struct cifs_ses *ses, bool cifs_chan_needs_reconnect(struct cifs_ses *ses, struct TCP_Server_Info *server); +bool +cifs_chan_is_iface_active(struct cifs_ses *ses, + struct TCP_Server_Info *server); +int +cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server); +int +SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon); void extract_unc_hostname(const char *unc, const char **h, size_t *len); int copy_path_name(char *dst, const char *src); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 47e927c4ff8d..6371b9eebdad 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -75,7 +75,7 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) /* only send once per connect */ spin_lock(&cifs_tcp_ses_lock); - if ((tcon->ses->status != CifsGood) || (tcon->status != TID_NEED_RECON)) { + if ((tcon->ses->ses_status != SES_GOOD) || (tcon->status != TID_NEED_RECON)) { spin_unlock(&cifs_tcp_ses_lock); return; } @@ -2558,7 +2558,8 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon, pLockData->fl_start = le64_to_cpu(parm_data->start); pLockData->fl_end = pLockData->fl_start + - le64_to_cpu(parm_data->length) - 1; + (le64_to_cpu(parm_data->length) ? 
+ le64_to_cpu(parm_data->length) - 1 : 0); pLockData->fl_pid = -le32_to_cpu(parm_data->pid); } } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 42e14f408856..fa29c9aae24b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -97,6 +97,10 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) if (!server->hostname) return -EINVAL; + /* if server hostname isn't populated, there's nothing to do here */ + if (server->hostname[0] == '\0') + return 0; + len = strlen(server->hostname) + 3; unc = kmalloc(len, GFP_KERNEL); @@ -141,6 +145,25 @@ requeue_resolve: return rc; } +static void smb2_query_server_interfaces(struct work_struct *work) +{ + int rc; + struct cifs_tcon *tcon = container_of(work, + struct cifs_tcon, + query_interfaces.work); + + /* + * query server network interfaces, in case they change + */ + rc = SMB3_request_interfaces(0, tcon); + if (rc) { + cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n", + __func__, rc); + } + + queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, + (SMB_INTERFACE_POLL_INTERVAL * HZ)); +} static void cifs_resolve_server(struct work_struct *work) { @@ -148,7 +171,7 @@ static void cifs_resolve_server(struct work_struct *work) struct TCP_Server_Info *server = container_of(work, struct TCP_Server_Info, resolve.work); - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); /* * Resolve the hostname again to make sure that IP address is up-to-date. @@ -159,7 +182,7 @@ static void cifs_resolve_server(struct work_struct *work) __func__, rc); } - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); } /* @@ -213,7 +236,7 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, bool mark_smb_session) { struct TCP_Server_Info *pserver; - struct cifs_ses *ses; + struct cifs_ses *ses, *nses; struct cifs_tcon *tcon; /* @@ -227,7 +250,20 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) { + /* check if iface is still active */ + if (!cifs_chan_is_iface_active(ses, server)) { + /* + * HACK: drop the lock before calling + * cifs_chan_update_iface to avoid deadlock + */ + ses->ses_count++; + spin_unlock(&cifs_tcp_ses_lock); + cifs_chan_update_iface(ses, server); + spin_lock(&cifs_tcp_ses_lock); + ses->ses_count--; + } + spin_lock(&ses->chan_lock); if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server)) goto next_session; @@ -241,7 +277,7 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, if (!mark_smb_session && !CIFS_ALL_CHANS_NEED_RECONNECT(ses)) goto next_session; - ses->status = CifsNeedReconnect; + ses->ses_status = SES_NEED_RECON; list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { tcon->need_reconnect = true; @@ -267,7 +303,7 @@ cifs_abort_connection(struct TCP_Server_Info *server) /* do not want to be sending data on a socket we are freeing */ cifs_dbg(FYI, "%s: tearing down socket\n", __func__); - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); if (server->ssocket) { cifs_dbg(FYI, "State: 0x%x Flags: 0x%lx\n", server->ssocket->state, server->ssocket->flags); @@ -296,7 +332,7 @@ cifs_abort_connection(struct TCP_Server_Info *server) mid->mid_flags |= MID_DELETED; } spin_unlock(&GlobalMid_Lock); - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__); list_for_each_entry_safe(mid, 
nmid, &retry_list, qhead) { @@ -306,9 +342,9 @@ cifs_abort_connection(struct TCP_Server_Info *server) } if (cifs_rdma_enabled(server)) { - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); smbd_destroy(server); - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); } } @@ -359,7 +395,7 @@ static int __cifs_reconnect(struct TCP_Server_Info *server, do { try_to_freeze(); - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); if (!cifs_swn_set_server_dstaddr(server)) { /* resolve the hostname again to make sure that IP address is up-to-date */ @@ -372,7 +408,7 @@ static int __cifs_reconnect(struct TCP_Server_Info *server, else rc = generic_ip_connect(server); if (rc) { - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); cifs_dbg(FYI, "%s: reconnect error %d\n", __func__, rc); msleep(3000); } else { @@ -383,7 +419,7 @@ static int __cifs_reconnect(struct TCP_Server_Info *server, server->tcpStatus = CifsNeedNegotiate; spin_unlock(&cifs_tcp_ses_lock); cifs_swn_reset_server_dstaddr(server); - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); mod_delayed_work(cifsiod_wq, &server->reconnect, 0); } } while (server->tcpStatus == CifsNeedReconnect); @@ -488,12 +524,12 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server) do { try_to_freeze(); - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); rc = reconnect_target_unlocked(server, &tl, &target_hint); if (rc) { /* Failed to reconnect socket */ - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); cifs_dbg(FYI, "%s: reconnect error %d\n", __func__, rc); msleep(3000); continue; @@ -510,7 +546,7 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server) server->tcpStatus = CifsNeedNegotiate; spin_unlock(&cifs_tcp_ses_lock); cifs_swn_reset_server_dstaddr(server); - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); mod_delayed_work(cifsiod_wq, &server->reconnect, 0); } while (server->tcpStatus == CifsNeedReconnect); @@ -1565,7 +1601,7 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, init_waitqueue_head(&tcp_ses->response_q); init_waitqueue_head(&tcp_ses->request_q); INIT_LIST_HEAD(&tcp_ses->pending_mid_q); - mutex_init(&tcp_ses->srv_mutex); + mutex_init(&tcp_ses->_srv_mutex); memcpy(tcp_ses->workstation_RFC1001_name, ctx->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); memcpy(tcp_ses->server_RFC1001_name, @@ -1789,7 +1825,7 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx) goto out; } - cifs_dbg(FYI, "IPC tcon rc = %d ipc tid = %d\n", rc, tcon->tid); + cifs_dbg(FYI, "IPC tcon rc=%d ipc tid=0x%x\n", rc, tcon->tid); ses->tcon_ipc = tcon; out: @@ -1828,7 +1864,7 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { - if (ses->status == CifsExiting) + if (ses->ses_status == SES_EXITING) continue; if (!match_session(ses, ctx)) continue; @@ -1845,10 +1881,9 @@ void cifs_put_smb_ses(struct cifs_ses *ses) unsigned int rc, xid; unsigned int chan_count; struct TCP_Server_Info *server = ses->server; - cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count); spin_lock(&cifs_tcp_ses_lock); - if (ses->status == CifsExiting) { + if (ses->ses_status == SES_EXITING) { spin_unlock(&cifs_tcp_ses_lock); return; } @@ -1864,13 +1899,13 @@ void cifs_put_smb_ses(struct cifs_ses *ses) /* ses_count can never go negative */ WARN_ON(ses->ses_count < 0); - if (ses->status == CifsGood) - ses->status = CifsExiting; + if 
(ses->ses_status == SES_GOOD) + ses->ses_status = SES_EXITING; spin_unlock(&cifs_tcp_ses_lock); cifs_free_ipc(ses); - if (ses->status == CifsExiting && server->ops->logoff) { + if (ses->ses_status == SES_EXITING && server->ops->logoff) { xid = get_xid(); rc = server->ops->logoff(xid, ses); if (rc) @@ -1891,9 +1926,11 @@ void cifs_put_smb_ses(struct cifs_ses *ses) int i; for (i = 1; i < chan_count; i++) { - spin_unlock(&ses->chan_lock); + if (ses->chans[i].iface) { + kref_put(&ses->chans[i].iface->refcount, release_iface); + ses->chans[i].iface = NULL; + } cifs_put_tcp_session(ses->chans[i].server, 0); - spin_lock(&ses->chan_lock); ses->chans[i].server = NULL; } } @@ -2037,18 +2074,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses) } } - ctx->workstation_name = kstrdup(ses->workstation_name, GFP_KERNEL); - if (!ctx->workstation_name) { - cifs_dbg(FYI, "Unable to allocate memory for workstation_name\n"); - rc = -ENOMEM; - kfree(ctx->username); - ctx->username = NULL; - kfree_sensitive(ctx->password); - ctx->password = NULL; - kfree(ctx->domainname); - ctx->domainname = NULL; - goto out_key_put; - } + strscpy(ctx->workstation_name, ses->workstation_name, sizeof(ctx->workstation_name)); out_key_put: up_read(&key->sem); @@ -2090,7 +2116,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) ses = cifs_find_smb_ses(server, ctx); if (ses) { cifs_dbg(FYI, "Existing smb sess found (status=%d)\n", - ses->status); + ses->ses_status); spin_lock(&ses->chan_lock); if (cifs_chan_needs_reconnect(ses, server)) { @@ -2157,12 +2183,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) if (!ses->domainName) goto get_ses_fail; } - if (ctx->workstation_name) { - ses->workstation_name = kstrdup(ctx->workstation_name, - GFP_KERNEL); - if (!ses->workstation_name) - goto get_ses_fail; - } + + strscpy(ses->workstation_name, ctx->workstation_name, sizeof(ses->workstation_name)); + if (ctx->domainauto) ses->domainAuto = ctx->domainauto; ses->cred_uid = ctx->cred_uid; @@ -2281,6 +2304,9 @@ cifs_put_tcon(struct cifs_tcon *tcon) list_del_init(&tcon->tcon_list); spin_unlock(&cifs_tcp_ses_lock); + /* cancel polling of interfaces */ + cancel_delayed_work_sync(&tcon->query_interfaces); + if (tcon->use_witness) { int rc; @@ -2509,6 +2535,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) */ tcon->retry = ctx->retry; tcon->nocase = ctx->nocase; + tcon->broken_sparse_sup = ctx->no_sparse; if (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING) tcon->nohandlecache = ctx->nohandlecache; else @@ -2517,6 +2544,12 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) tcon->local_lease = ctx->local_lease; INIT_LIST_HEAD(&tcon->pending_opens); + /* schedule query interfaces poll */ + INIT_DELAYED_WORK(&tcon->query_interfaces, + smb2_query_server_interfaces); + queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, + (SMB_INTERFACE_POLL_INTERVAL * HZ)); + spin_lock(&cifs_tcp_ses_lock); list_add(&tcon->tcon_list, &ses->tcon_list); spin_unlock(&cifs_tcp_ses_lock); @@ -3420,8 +3453,9 @@ cifs_are_all_path_components_accessible(struct TCP_Server_Info *server, } /* - * Check if path is remote (e.g. a DFS share). Return -EREMOTE if it is, - * otherwise 0. + * Check if path is remote (i.e. a DFS share). + * + * Return -EREMOTE if it is, otherwise 0 or -errno. 
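+ *
+ * (callers treat -EREMOTE as "chase the DFS referral" rather than as
+ * a hard failure: cifs_mount() continues down its "proceed as DFS
+ * mount" path and calls connect_dfs_root(), while any other nonzero
+ * return fails the mount)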
*/ static int is_path_remote(struct mount_ctx *mnt_ctx) { @@ -3432,6 +3466,9 @@ static int is_path_remote(struct mount_ctx *mnt_ctx) struct cifs_tcon *tcon = mnt_ctx->tcon; struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; char *full_path; +#ifdef CONFIG_CIFS_DFS_UPCALL + bool nodfs = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS; +#endif if (!server->ops->is_path_accessible) return -EOPNOTSUPP; @@ -3449,14 +3486,20 @@ static int is_path_remote(struct mount_ctx *mnt_ctx) rc = server->ops->is_path_accessible(xid, tcon, cifs_sb, full_path); #ifdef CONFIG_CIFS_DFS_UPCALL + if (nodfs) { + if (rc == -EREMOTE) + rc = -EOPNOTSUPP; + goto out; + } + + /* path *might* exist with non-ASCII characters in DFS root + * try again with full path (only if nodfs is not set) */ if (rc == -ENOENT && is_tcon_dfs(tcon)) rc = cifs_dfs_query_info_nonascii_quirk(xid, tcon, cifs_sb, full_path); #endif - if (rc != 0 && rc != -EREMOTE) { - kfree(full_path); - return rc; - } + if (rc != 0 && rc != -EREMOTE) + goto out; if (rc != -EREMOTE) { rc = cifs_are_all_path_components_accessible(server, xid, tcon, @@ -3468,6 +3511,7 @@ static int is_path_remote(struct mount_ctx *mnt_ctx) } } +out: kfree(full_path); return rc; } @@ -3703,6 +3747,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) if (!isdfs) goto out; + /* proceed as DFS mount */ uuid_gen(&mnt_ctx.mount_id); rc = connect_dfs_root(&mnt_ctx, &tl); dfs_cache_free_tgts(&tl); @@ -3960,7 +4005,7 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses, if (rc == 0) { spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus == CifsInNegotiate) - server->tcpStatus = CifsNeedSessSetup; + server->tcpStatus = CifsGood; else rc = -EHOSTDOWN; spin_unlock(&cifs_tcp_ses_lock); @@ -3980,22 +4025,39 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, struct nls_table *nls_info) { int rc = -ENOSYS; + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; + struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; bool is_binding = false; - /* only send once per connect */ spin_lock(&cifs_tcp_ses_lock); - if ((server->tcpStatus != CifsNeedSessSetup) && - (ses->status == CifsGood)) { + if (server->dstaddr.ss_family == AF_INET6) + scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr); + else + scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr); + + if (ses->ses_status != SES_GOOD && + ses->ses_status != SES_NEW && + ses->ses_status != SES_NEED_RECON) { spin_unlock(&cifs_tcp_ses_lock); return 0; } - server->tcpStatus = CifsInSessSetup; - spin_unlock(&cifs_tcp_ses_lock); + /* only send once per connect */ spin_lock(&ses->chan_lock); + if (CIFS_ALL_CHANS_GOOD(ses) || + cifs_chan_in_reconnect(ses, server)) { + spin_unlock(&ses->chan_lock); + spin_unlock(&cifs_tcp_ses_lock); + return 0; + } is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); + cifs_chan_set_in_reconnect(ses, server); spin_unlock(&ses->chan_lock); + if (!is_binding) + ses->ses_status = SES_IN_SETUP; + spin_unlock(&cifs_tcp_ses_lock); + if (!is_binding) { ses->capabilities = server->capabilities; if (!linuxExtEnabled) @@ -4019,20 +4081,21 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, if (rc) { cifs_server_dbg(VFS, "Send error in SessSetup = %d\n", rc); spin_lock(&cifs_tcp_ses_lock); - if (server->tcpStatus == CifsInSessSetup) - server->tcpStatus = CifsNeedSessSetup; + if (ses->ses_status == SES_IN_SETUP) + ses->ses_status = SES_NEED_RECON; + spin_lock(&ses->chan_lock); + 
cifs_chan_clear_in_reconnect(ses, server); + spin_unlock(&ses->chan_lock); spin_unlock(&cifs_tcp_ses_lock); } else { spin_lock(&cifs_tcp_ses_lock); - if (server->tcpStatus == CifsInSessSetup) - server->tcpStatus = CifsGood; - /* Even if one channel is active, session is in good state */ - ses->status = CifsGood; - spin_unlock(&cifs_tcp_ses_lock); - + if (ses->ses_status == SES_IN_SETUP) + ses->ses_status = SES_GOOD; spin_lock(&ses->chan_lock); + cifs_chan_clear_in_reconnect(ses, server); cifs_chan_clear_need_reconnect(ses, server); spin_unlock(&ses->chan_lock); + spin_unlock(&cifs_tcp_ses_lock); } return rc; @@ -4497,7 +4560,7 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru /* only send once per connect */ spin_lock(&cifs_tcp_ses_lock); - if (tcon->ses->status != CifsGood || + if (tcon->ses->ses_status != SES_GOOD || (tcon->status != TID_NEW && tcon->status != TID_NEED_TCON)) { spin_unlock(&cifs_tcp_ses_lock); @@ -4565,7 +4628,7 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru /* only send once per connect */ spin_lock(&cifs_tcp_ses_lock); - if (tcon->ses->status != CifsGood || + if (tcon->ses->ses_status != SES_GOOD || (tcon->status != TID_NEW && tcon->status != TID_NEED_TCON)) { spin_unlock(&cifs_tcp_ses_lock); diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 956f8e5cf3e7..34a8f3baed5e 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -654,7 +654,7 @@ static struct cache_entry *__lookup_cache_entry(const char *path, unsigned int h return ce; } } - return ERR_PTR(-EEXIST); + return ERR_PTR(-ENOENT); } /* @@ -662,7 +662,7 @@ static struct cache_entry *__lookup_cache_entry(const char *path, unsigned int h * * Use whole path components in the match. Must be called with htable_rw_lock held. * - * Return ERR_PTR(-EEXIST) if the entry is not found. + * Return ERR_PTR(-ENOENT) if the entry is not found. 
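+ *
+ * (a cache miss used to surface as the misleading -EEXIST, which
+ * cifs_dfs_query_info_nonascii_quirk() then remapped to -ENOENT;
+ * returning -ENOENT directly lets that remap be dropped)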
*/ static struct cache_entry *lookup_cache_entry(const char *path) { @@ -710,7 +710,7 @@ static struct cache_entry *lookup_cache_entry(const char *path) while (e > s && *e != sep) e--; } - return ERR_PTR(-EEXIST); + return ERR_PTR(-ENOENT); } /** @@ -1229,6 +1229,30 @@ void dfs_cache_put_refsrv_sessions(const uuid_t *mount_id) kref_put(&mg->refcount, mount_group_release); } +/* Extract share from DFS target and return a pointer to prefix path or NULL */ +static const char *parse_target_share(const char *target, char **share) +{ + const char *s, *seps = "/\\"; + size_t len; + + s = strpbrk(target + 1, seps); + if (!s) + return ERR_PTR(-EINVAL); + + len = strcspn(s + 1, seps); + if (!len) + return ERR_PTR(-EINVAL); + s += len; + + len = s - target + 1; + *share = kstrndup(target, len, GFP_KERNEL); + if (!*share) + return ERR_PTR(-ENOMEM); + + s = target + len; + return s + strspn(s, seps); +} + /** * dfs_cache_get_tgt_share - parse a DFS target * @@ -1242,56 +1266,46 @@ void dfs_cache_put_refsrv_sessions(const uuid_t *mount_id) int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it, char **share, char **prefix) { - char *s, sep, *p; - size_t len; - size_t plen1, plen2; + char sep; + char *target_share; + char *ppath = NULL; + const char *target_ppath, *dfsref_ppath; + size_t target_pplen, dfsref_pplen; + size_t len, c; if (!it || !path || !share || !prefix || strlen(path) < it->it_path_consumed) return -EINVAL; - *share = NULL; - *prefix = NULL; - sep = it->it_name[0]; if (sep != '\\' && sep != '/') return -EINVAL; - s = strchr(it->it_name + 1, sep); - if (!s) - return -EINVAL; + target_ppath = parse_target_share(it->it_name, &target_share); + if (IS_ERR(target_ppath)) + return PTR_ERR(target_ppath); - /* point to prefix in target node */ - s = strchrnul(s + 1, sep); + /* point to prefix in DFS referral path */ + dfsref_ppath = path + it->it_path_consumed; + dfsref_ppath += strspn(dfsref_ppath, "/\\"); - /* extract target share */ - *share = kstrndup(it->it_name, s - it->it_name, GFP_KERNEL); - if (!*share) - return -ENOMEM; + target_pplen = strlen(target_ppath); + dfsref_pplen = strlen(dfsref_ppath); - /* skip separator */ - if (*s) - s++; - /* point to prefix in DFS path */ - p = path + it->it_path_consumed; - if (*p == sep) - p++; - - /* merge prefix paths from DFS path and target node */ - plen1 = it->it_name + strlen(it->it_name) - s; - plen2 = path + strlen(path) - p; - if (plen1 || plen2) { - len = plen1 + plen2 + 2; - *prefix = kmalloc(len, GFP_KERNEL); - if (!*prefix) { - kfree(*share); - *share = NULL; + /* merge prefix paths from DFS referral path and target node */ + if (target_pplen || dfsref_pplen) { + len = target_pplen + dfsref_pplen + 2; + ppath = kzalloc(len, GFP_KERNEL); + if (!ppath) { + kfree(target_share); return -ENOMEM; } - if (plen1) - scnprintf(*prefix, len, "%.*s%c%.*s", (int)plen1, s, sep, (int)plen2, p); - else - strscpy(*prefix, p, len); + c = strscpy(ppath, target_ppath, len); + if (c && dfsref_pplen) + ppath[c] = sep; + strlcat(ppath, dfsref_ppath, len); } + *share = target_share; + *prefix = ppath; return 0; } @@ -1327,9 +1341,9 @@ static bool target_share_equal(struct TCP_Server_Info *server, const char *s1, c cifs_dbg(VFS, "%s: failed to convert address \'%s\'. 
skip address matching.\n", __func__, ip); } else { - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); match = cifs_match_ipaddr((struct sockaddr *)&server->dstaddr, &sa); - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); } kfree(ip); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 06003bb9cbe9..e64cda7a7610 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1395,7 +1395,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) cifs_dbg(VFS, "Can't push all brlocks!\n"); break; } - length = 1 + flock->fl_end - flock->fl_start; + length = cifs_flock_len(flock); if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK) type = CIFS_RDLCK; else @@ -1511,7 +1511,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, bool wait_flag, bool posix_lck, unsigned int xid) { int rc = 0; - __u64 length = 1 + flock->fl_end - flock->fl_start; + __u64 length = cifs_flock_len(flock); struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; @@ -1609,7 +1609,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct cifsLockInfo *li, *tmp; - __u64 length = 1 + flock->fl_end - flock->fl_start; + __u64 length = cifs_flock_len(flock); struct list_head tmp_llist; INIT_LIST_HEAD(&tmp_llist); @@ -1713,7 +1713,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, unsigned int xid) { int rc = 0; - __u64 length = 1 + flock->fl_end - flock->fl_start; + __u64 length = cifs_flock_len(flock); struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; @@ -2004,7 +2004,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only) { struct cifsFileInfo *open_file = NULL; - struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->netfs.inode.i_sb); /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) @@ -2060,7 +2060,7 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, int flags, return rc; } - cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + cifs_sb = CIFS_SB(cifs_inode->netfs.inode.i_sb); /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) @@ -2777,8 +2777,11 @@ int cifs_flush(struct file *file, fl_owner_t id) rc = filemap_write_and_wait(inode->i_mapping); cifs_dbg(FYI, "Flush inode %p file %p rc %d\n", inode, file, rc); - if (rc) + if (rc) { + /* get more nuanced writeback errors */ + rc = filemap_check_wb_err(file->f_mapping, 0); trace_cifs_flush_err(inode->i_ino, rc); + } return rc; } @@ -4666,14 +4669,14 @@ bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file) /* This inode is open for write at least once */ struct cifs_sb_info *cifs_sb; - cifs_sb = CIFS_SB(cifsInode->vfs_inode.i_sb); + cifs_sb = CIFS_SB(cifsInode->netfs.inode.i_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) { /* since no page cache to corrupt on directio we can change size safely */ return true; } - if (i_size_read(&cifsInode->vfs_inode) < end_of_file) + if (i_size_read(&cifsInode->netfs.inode) < end_of_file) return true; return false; @@ -4906,6 +4909,10 @@ static int 
cifs_swap_activate(struct swap_info_struct *sis, cifs_dbg(FYI, "swap activate\n"); + if (!swap_file->f_mapping->a_ops->swap_rw) + /* Cannot support swap */ + return -EINVAL; + spin_lock(&inode->i_lock); blocks = inode->i_blocks; isize = inode->i_size; @@ -4934,7 +4941,8 @@ static int cifs_swap_activate(struct swap_info_struct *sis, * from reading or writing the file */ - return 0; + sis->flags |= SWP_FS_OPS; + return add_swap_extent(sis, 0, sis->max, 0); } static void cifs_swap_deactivate(struct file *file) diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index a92e9eec521f..8dc0d923ef6a 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -119,6 +119,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_flag_no("persistenthandles", Opt_persistent), fsparam_flag_no("resilienthandles", Opt_resilient), fsparam_flag_no("tcpnodelay", Opt_tcp_nodelay), + fsparam_flag("nosparse", Opt_nosparse), fsparam_flag("domainauto", Opt_domainauto), fsparam_flag("rdma", Opt_rdma), fsparam_flag("modesid", Opt_modesid), @@ -312,7 +313,6 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx new_ctx->password = NULL; new_ctx->server_hostname = NULL; new_ctx->domainname = NULL; - new_ctx->workstation_name = NULL; new_ctx->UNC = NULL; new_ctx->source = NULL; new_ctx->iocharset = NULL; @@ -327,7 +327,6 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx DUP_CTX_STR(UNC); DUP_CTX_STR(source); DUP_CTX_STR(domainname); - DUP_CTX_STR(workstation_name); DUP_CTX_STR(nodename); DUP_CTX_STR(iocharset); @@ -766,8 +765,7 @@ static int smb3_verify_reconfigure_ctx(struct fs_context *fc, cifs_errorf(fc, "can not change domainname during remount\n"); return -EINVAL; } - if (new_ctx->workstation_name && - (!old_ctx->workstation_name || strcmp(new_ctx->workstation_name, old_ctx->workstation_name))) { + if (strcmp(new_ctx->workstation_name, old_ctx->workstation_name)) { cifs_errorf(fc, "can not change workstation_name during remount\n"); return -EINVAL; } @@ -814,7 +812,6 @@ static int smb3_reconfigure(struct fs_context *fc) STEAL_STRING(cifs_sb, ctx, username); STEAL_STRING(cifs_sb, ctx, password); STEAL_STRING(cifs_sb, ctx, domainname); - STEAL_STRING(cifs_sb, ctx, workstation_name); STEAL_STRING(cifs_sb, ctx, nodename); STEAL_STRING(cifs_sb, ctx, iocharset); @@ -943,6 +940,9 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, case Opt_nolease: ctx->no_lease = 1; break; + case Opt_nosparse: + ctx->no_sparse = 1; + break; case Opt_nodelete: ctx->nodelete = 1; break; @@ -1467,22 +1467,15 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, int smb3_init_fs_context(struct fs_context *fc) { - int rc; struct smb3_fs_context *ctx; char *nodename = utsname()->nodename; int i; ctx = kzalloc(sizeof(struct smb3_fs_context), GFP_KERNEL); - if (unlikely(!ctx)) { - rc = -ENOMEM; - goto err_exit; - } + if (unlikely(!ctx)) + return -ENOMEM; - ctx->workstation_name = kstrdup(nodename, GFP_KERNEL); - if (unlikely(!ctx->workstation_name)) { - rc = -ENOMEM; - goto err_exit; - } + strscpy(ctx->workstation_name, nodename, sizeof(ctx->workstation_name)); /* * does not have to be perfect mapping since field is @@ -1555,14 +1548,6 @@ int smb3_init_fs_context(struct fs_context *fc) fc->fs_private = ctx; fc->ops = &smb3_fs_context_ops; return 0; - -err_exit: - if (ctx) { - kfree(ctx->workstation_name); - kfree(ctx); - } - - return rc; } void @@ -1588,8 +1573,6 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx) ctx->source 
= NULL; kfree(ctx->domainname); ctx->domainname = NULL; - kfree(ctx->workstation_name); - ctx->workstation_name = NULL; kfree(ctx->nodename); ctx->nodename = NULL; kfree(ctx->iocharset); diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h index e54090d9ef36..5f093cb7e9b9 100644 --- a/fs/cifs/fs_context.h +++ b/fs/cifs/fs_context.h @@ -62,6 +62,7 @@ enum cifs_param { Opt_noblocksend, Opt_noautotune, Opt_nolease, + Opt_nosparse, Opt_hard, Opt_soft, Opt_perm, @@ -170,7 +171,7 @@ struct smb3_fs_context { char *server_hostname; char *UNC; char *nodename; - char *workstation_name; + char workstation_name[CIFS_MAX_WORKSTATION_LEN]; char *iocharset; /* local code page for mapping to and from Unicode */ char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */ char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */ @@ -222,6 +223,7 @@ struct smb3_fs_context { bool noautotune:1; bool nostrictsync:1; /* do not force expensive SMBflush on every sync */ bool no_lease:1; /* disable requesting leases */ + bool no_sparse:1; /* do not attempt to set files sparse */ bool fsc:1; /* enable fscache */ bool mfsymlinks:1; /* use Minshall+French Symlinks */ bool multiuser:1; diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index a638b29e9062..23ef56f55ce5 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -101,13 +101,13 @@ void cifs_fscache_get_inode_cookie(struct inode *inode) struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); - cifs_fscache_fill_coherency(&cifsi->vfs_inode, &cd); + cifs_fscache_fill_coherency(&cifsi->netfs.inode, &cd); - cifsi->netfs_ctx.cache = + cifsi->netfs.cache = fscache_acquire_cookie(tcon->fscache, 0, &cifsi->uniqueid, sizeof(cifsi->uniqueid), &cd, sizeof(cd), - i_size_read(&cifsi->vfs_inode)); + i_size_read(&cifsi->netfs.inode)); } void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update) @@ -131,7 +131,7 @@ void cifs_fscache_release_inode_cookie(struct inode *inode) if (cookie) { cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cookie); fscache_relinquish_cookie(cookie, false); - cifsi->netfs_ctx.cache = NULL; + cifsi->netfs.cache = NULL; } } diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h index 52355c0912ae..aa3b941a5555 100644 --- a/fs/cifs/fscache.h +++ b/fs/cifs/fscache.h @@ -52,16 +52,16 @@ void cifs_fscache_fill_coherency(struct inode *inode, struct cifsInodeInfo *cifsi = CIFS_I(inode); memset(cd, 0, sizeof(*cd)); - cd->last_write_time_sec = cpu_to_le64(cifsi->vfs_inode.i_mtime.tv_sec); - cd->last_write_time_nsec = cpu_to_le32(cifsi->vfs_inode.i_mtime.tv_nsec); - cd->last_change_time_sec = cpu_to_le64(cifsi->vfs_inode.i_ctime.tv_sec); - cd->last_change_time_nsec = cpu_to_le32(cifsi->vfs_inode.i_ctime.tv_nsec); + cd->last_write_time_sec = cpu_to_le64(cifsi->netfs.inode.i_mtime.tv_sec); + cd->last_write_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_mtime.tv_nsec); + cd->last_change_time_sec = cpu_to_le64(cifsi->netfs.inode.i_ctime.tv_sec); + cd->last_change_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_ctime.tv_nsec); } static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode) { - return netfs_i_cookie(inode); + return netfs_i_cookie(&CIFS_I(inode)->netfs); } static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 2f9e7d2f81b6..81da81e18553 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -115,7 +115,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) 
__func__, cifs_i->uniqueid); set_bit(CIFS_INO_INVALID_MAPPING, &cifs_i->flags); /* Invalidate fscache cookie */ - cifs_fscache_fill_coherency(&cifs_i->vfs_inode, &cd); + cifs_fscache_fill_coherency(&cifs_i->netfs.inode, &cd); fscache_invalidate(cifs_inode_cookie(inode), &cd, i_size_read(inode), 0); } @@ -2499,7 +2499,7 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start, u64 len) { struct cifsInodeInfo *cifs_i = CIFS_I(inode); - struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_i->vfs_inode.i_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_i->netfs.inode.i_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); struct TCP_Server_Info *server = tcon->ses->server; struct cifsFileInfo *cfile; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index afaf59c22193..0e84e6fcf8ab 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -69,12 +69,13 @@ sesInfoAlloc(void) ret_buf = kzalloc(sizeof(struct cifs_ses), GFP_KERNEL); if (ret_buf) { atomic_inc(&sesInfoAllocCount); - ret_buf->status = CifsNew; + ret_buf->ses_status = SES_NEW; ++ret_buf->ses_count; INIT_LIST_HEAD(&ret_buf->smb_ses_list); INIT_LIST_HEAD(&ret_buf->tcon_list); mutex_init(&ret_buf->session_mutex); spin_lock_init(&ret_buf->iface_lock); + INIT_LIST_HEAD(&ret_buf->iface_list); spin_lock_init(&ret_buf->chan_lock); } return ret_buf; @@ -83,6 +84,8 @@ sesInfoAlloc(void) void sesInfoFree(struct cifs_ses *buf_to_free) { + struct cifs_server_iface *iface = NULL, *niface = NULL; + if (buf_to_free == NULL) { cifs_dbg(FYI, "Null buffer passed to sesInfoFree\n"); return; @@ -95,9 +98,12 @@ sesInfoFree(struct cifs_ses *buf_to_free) kfree_sensitive(buf_to_free->password); kfree(buf_to_free->user_name); kfree(buf_to_free->domainName); - kfree(buf_to_free->workstation_name); kfree_sensitive(buf_to_free->auth_key.response); - kfree(buf_to_free->iface_list); + spin_lock(&buf_to_free->iface_lock); + list_for_each_entry_safe(iface, niface, &buf_to_free->iface_list, + iface_head) + kref_put(&iface->refcount, release_iface); + spin_unlock(&buf_to_free->iface_lock); kfree_sensitive(buf_to_free); } @@ -114,6 +120,8 @@ tconInfoAlloc(void) kfree(ret_buf); return NULL; } + INIT_LIST_HEAD(&ret_buf->crfid.dirents.entries); + mutex_init(&ret_buf->crfid.dirents.de_mutex); atomic_inc(&tconInfoAllocCount); ret_buf->status = TID_NEW; @@ -536,11 +544,11 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) if (oplock == OPLOCK_EXCLUSIVE) { cinode->oplock = CIFS_CACHE_WRITE_FLG | CIFS_CACHE_READ_FLG; cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n", - &cinode->vfs_inode); + &cinode->netfs.inode); } else if (oplock == OPLOCK_READ) { cinode->oplock = CIFS_CACHE_READ_FLG; cifs_dbg(FYI, "Level II Oplock granted on inode %p\n", - &cinode->vfs_inode); + &cinode->netfs.inode); } else cinode->oplock = 0; } @@ -1210,18 +1218,23 @@ static struct super_block *__cifs_get_super(void (*f)(struct super_block *, void .data = data, .sb = NULL, }; + struct file_system_type **fs_type = (struct file_system_type *[]) { + &cifs_fs_type, &smb3_fs_type, NULL, + }; - iterate_supers_type(&cifs_fs_type, f, &sd); - - if (!sd.sb) - return ERR_PTR(-EINVAL); - /* - * Grab an active reference in order to prevent automounts (DFS links) - * of expiring and then freeing up our cifs superblock pointer while - * we're doing failover. 
- */ - cifs_sb_active(sd.sb); - return sd.sb; + for (; *fs_type; fs_type++) { + iterate_supers_type(*fs_type, f, &sd); + if (sd.sb) { + /* + * Grab an active reference in order to prevent automounts (DFS links) + * of expiring and then freeing up our cifs superblock pointer while + * we're doing failover. + */ + cifs_sb_active(sd.sb); + return sd.sb; + } + } + return ERR_PTR(-EINVAL); } static void __cifs_put_super(struct super_block *sb) @@ -1309,7 +1322,7 @@ int cifs_update_super_prepath(struct cifs_sb_info *cifs_sb, char *prefix) * for "\<server>\<dfsname>\<linkpath>" DFS reference, * where <dfsname> contains non-ASCII unicode symbols. * - * Check such DFS reference and emulate -ENOENT if it is actual. + * Check such DFS reference. */ int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid, struct cifs_tcon *tcon, @@ -1341,10 +1354,6 @@ int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid, cifs_dbg(FYI, "DFS ref '%s' is found, emulate -EREMOTE\n", dfspath); rc = -EREMOTE; - } else if (rc == -EEXIST) { - cifs_dbg(FYI, "DFS ref '%s' is not found, emulate -ENOENT\n", - dfspath); - rc = -ENOENT; } else { cifs_dbg(FYI, "%s: dfs_cache_find returned %d\n", __func__, rc); } diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 1929e80c09ee..384cabdf47ca 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -840,9 +840,109 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, return rc; } +static bool emit_cached_dirents(struct cached_dirents *cde, + struct dir_context *ctx) +{ + struct cached_dirent *dirent; + int rc; + + list_for_each_entry(dirent, &cde->entries, entry) { + if (ctx->pos >= dirent->pos) + continue; + ctx->pos = dirent->pos; + rc = dir_emit(ctx, dirent->name, dirent->namelen, + dirent->fattr.cf_uniqueid, + dirent->fattr.cf_dtype); + if (!rc) + return rc; + } + return true; +} + +static void update_cached_dirents_count(struct cached_dirents *cde, + struct dir_context *ctx) +{ + if (cde->ctx != ctx) + return; + if (cde->is_valid || cde->is_failed) + return; + + cde->pos++; +} + +static void finished_cached_dirents_count(struct cached_dirents *cde, + struct dir_context *ctx) +{ + if (cde->ctx != ctx) + return; + if (cde->is_valid || cde->is_failed) + return; + if (ctx->pos != cde->pos) + return; + + cde->is_valid = 1; +} + +static void add_cached_dirent(struct cached_dirents *cde, + struct dir_context *ctx, + const char *name, int namelen, + struct cifs_fattr *fattr) +{ + struct cached_dirent *de; + + if (cde->ctx != ctx) + return; + if (cde->is_valid || cde->is_failed) + return; + if (ctx->pos != cde->pos) { + cde->is_failed = 1; + return; + } + de = kzalloc(sizeof(*de), GFP_ATOMIC); + if (de == NULL) { + cde->is_failed = 1; + return; + } + de->namelen = namelen; + de->name = kstrndup(name, namelen, GFP_ATOMIC); + if (de->name == NULL) { + kfree(de); + cde->is_failed = 1; + return; + } + de->pos = ctx->pos; + + memcpy(&de->fattr, fattr, sizeof(struct cifs_fattr)); + + list_add_tail(&de->entry, &cde->entries); +} + +static bool cifs_dir_emit(struct dir_context *ctx, + const char *name, int namelen, + struct cifs_fattr *fattr, + struct cached_fid *cfid) +{ + bool rc; + ino_t ino = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid); + + rc = dir_emit(ctx, name, namelen, ino, fattr->cf_dtype); + if (!rc) + return rc; + + if (cfid) { + mutex_lock(&cfid->dirents.de_mutex); + add_cached_dirent(&cfid->dirents, ctx, name, namelen, + fattr); + mutex_unlock(&cfid->dirents.de_mutex); + } + + return rc; +} + static int cifs_filldir(char *find_entry, 
struct file *file, - struct dir_context *ctx, - char *scratch_buf, unsigned int max_len) + struct dir_context *ctx, + char *scratch_buf, unsigned int max_len, + struct cached_fid *cfid) { struct cifsFileInfo *file_info = file->private_data; struct super_block *sb = file_inode(file)->i_sb; @@ -851,7 +951,6 @@ static int cifs_filldir(char *find_entry, struct file *file, struct cifs_fattr fattr; struct qstr name; int rc = 0; - ino_t ino; rc = cifs_fill_dirent(&de, find_entry, file_info->srch_inf.info_level, file_info->srch_inf.unicode); @@ -931,8 +1030,8 @@ static int cifs_filldir(char *find_entry, struct file *file, cifs_prime_dcache(file_dentry(file), &name, &fattr); - ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); - return !dir_emit(ctx, name.name, name.len, ino, fattr.cf_dtype); + return !cifs_dir_emit(ctx, name.name, name.len, + &fattr, cfid); } @@ -941,8 +1040,9 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) int rc = 0; unsigned int xid; int i; + struct tcon_link *tlink = NULL; struct cifs_tcon *tcon; - struct cifsFileInfo *cifsFile = NULL; + struct cifsFileInfo *cifsFile; char *current_entry; int num_to_fill = 0; char *tmp_buf = NULL; @@ -950,6 +1050,8 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) unsigned int max_len; const char *full_path; void *page = alloc_dentry_path(); + struct cached_fid *cfid = NULL; + struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); xid = get_xid(); @@ -959,6 +1061,54 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) goto rddir2_exit; } + if (file->private_data == NULL) { + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + goto cache_not_found; + tcon = tlink_tcon(tlink); + } else { + cifsFile = file->private_data; + tcon = tlink_tcon(cifsFile->tlink); + } + + rc = open_cached_dir(xid, tcon, full_path, cifs_sb, &cfid); + cifs_put_tlink(tlink); + if (rc) + goto cache_not_found; + + mutex_lock(&cfid->dirents.de_mutex); + /* + * If this was reading from the start of the directory + * we need to initialize scanning and storing the + * directory content. + */ + if (ctx->pos == 0 && cfid->dirents.ctx == NULL) { + cfid->dirents.ctx = ctx; + cfid->dirents.pos = 2; + } + /* + * If we already have the entire directory cached then + * we can just serve the cache. + */ + if (cfid->dirents.is_valid) { + if (!dir_emit_dots(file, ctx)) { + mutex_unlock(&cfid->dirents.de_mutex); + goto rddir2_exit; + } + emit_cached_dirents(&cfid->dirents, ctx); + mutex_unlock(&cfid->dirents.de_mutex); + goto rddir2_exit; + } + mutex_unlock(&cfid->dirents.de_mutex); + + /* Drop the cache while calling initiate_cifs_search and + * find_cifs_entry in case there will be reconnects during + * query_directory. + */ + close_cached_dir(cfid); + cfid = NULL; + + cache_not_found: /* * Ensure FindFirst doesn't fail before doing filldir() for '.' and * '..'. Otherwise we won't be able to notify VFS in case of failure. @@ -977,7 +1127,6 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) is in current search buffer? 
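+		(i.e. is the requested ctx->pos within the entries returned
+		by the last FindFirst/FindNext response?)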
if it before then restart search if after then keep searching till find it */ - cifsFile = file->private_data; if (cifsFile->srch_inf.endOfSearch) { if (cifsFile->srch_inf.emptyDir) { @@ -993,12 +1142,18 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) tcon = tlink_tcon(cifsFile->tlink); rc = find_cifs_entry(xid, tcon, ctx->pos, file, full_path, ¤t_entry, &num_to_fill); + open_cached_dir(xid, tcon, full_path, cifs_sb, &cfid); if (rc) { cifs_dbg(FYI, "fce error %d\n", rc); goto rddir2_exit; } else if (current_entry != NULL) { cifs_dbg(FYI, "entry %lld found\n", ctx->pos); } else { + if (cfid) { + mutex_lock(&cfid->dirents.de_mutex); + finished_cached_dirents_count(&cfid->dirents, ctx); + mutex_unlock(&cfid->dirents.de_mutex); + } cifs_dbg(FYI, "Could not find entry\n"); goto rddir2_exit; } @@ -1028,7 +1183,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) */ *tmp_buf = 0; rc = cifs_filldir(current_entry, file, ctx, - tmp_buf, max_len); + tmp_buf, max_len, cfid); if (rc) { if (rc > 0) rc = 0; @@ -1036,6 +1191,12 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) } ctx->pos++; + if (cfid) { + mutex_lock(&cfid->dirents.de_mutex); + update_cached_dirents_count(&cfid->dirents, ctx); + mutex_unlock(&cfid->dirents.de_mutex); + } + if (ctx->pos == cifsFile->srch_inf.index_of_last_entry) { cifs_dbg(FYI, "last entry in buf at pos %lld %s\n", @@ -1050,6 +1211,8 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) kfree(tmp_buf); rddir2_exit: + if (cfid) + close_cached_dir(cfid); free_dentry_path(page); free_xid(xid); return rc; diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 32f478c7a66d..b85718f32b53 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -58,7 +58,7 @@ bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface) spin_lock(&ses->chan_lock); for (i = 0; i < ses->chan_count; i++) { - if (is_server_using_iface(ses->chans[i].server, iface)) { + if (ses->chans[i].iface == iface) { spin_unlock(&ses->chan_lock); return true; } @@ -81,11 +81,41 @@ cifs_ses_get_chan_index(struct cifs_ses *ses, } /* If we didn't find the channel, it is likely a bug */ + if (server) + cifs_dbg(VFS, "unable to get chan index for server: 0x%llx", + server->conn_id); WARN_ON(1); return 0; } void +cifs_chan_set_in_reconnect(struct cifs_ses *ses, + struct TCP_Server_Info *server) +{ + unsigned int chan_index = cifs_ses_get_chan_index(ses, server); + + ses->chans[chan_index].in_reconnect = true; +} + +void +cifs_chan_clear_in_reconnect(struct cifs_ses *ses, + struct TCP_Server_Info *server) +{ + unsigned int chan_index = cifs_ses_get_chan_index(ses, server); + + ses->chans[chan_index].in_reconnect = false; +} + +bool +cifs_chan_in_reconnect(struct cifs_ses *ses, + struct TCP_Server_Info *server) +{ + unsigned int chan_index = cifs_ses_get_chan_index(ses, server); + + return CIFS_CHAN_IN_RECONNECT(ses, chan_index); +} + +void cifs_chan_set_need_reconnect(struct cifs_ses *ses, struct TCP_Server_Info *server) { @@ -116,16 +146,24 @@ cifs_chan_needs_reconnect(struct cifs_ses *ses, return CIFS_CHAN_NEEDS_RECONNECT(ses, chan_index); } +bool +cifs_chan_is_iface_active(struct cifs_ses *ses, + struct TCP_Server_Info *server) +{ + unsigned int chan_index = cifs_ses_get_chan_index(ses, server); + + return ses->chans[chan_index].iface && + ses->chans[chan_index].iface->is_active; +} + /* returns number of channels added */ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) { int old_chan_count, new_chan_count; int left; - int i 
= 0; int rc = 0; int tries = 0; - struct cifs_server_iface *ifaces = NULL; - size_t iface_count; + struct cifs_server_iface *iface = NULL, *niface = NULL; spin_lock(&ses->chan_lock); @@ -155,32 +193,16 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) spin_unlock(&ses->chan_lock); /* - * Make a copy of the iface list at the time and use that - * instead so as to not hold the iface spinlock for opening - * channels - */ - spin_lock(&ses->iface_lock); - iface_count = ses->iface_count; - if (iface_count <= 0) { - spin_unlock(&ses->iface_lock); - cifs_dbg(VFS, "no iface list available to open channels\n"); - return 0; - } - ifaces = kmemdup(ses->iface_list, iface_count*sizeof(*ifaces), - GFP_ATOMIC); - if (!ifaces) { - spin_unlock(&ses->iface_lock); - return 0; - } - spin_unlock(&ses->iface_lock); - - /* * Keep connecting to same, fastest, iface for all channels as * long as its RSS. Try next fastest one if not RSS or channel * creation fails. */ + spin_lock(&ses->iface_lock); + iface = list_first_entry(&ses->iface_list, struct cifs_server_iface, + iface_head); + spin_unlock(&ses->iface_lock); + while (left > 0) { - struct cifs_server_iface *iface; tries++; if (tries > 3*ses->chan_max) { @@ -189,31 +211,128 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) break; } - iface = &ifaces[i]; - if (is_ses_using_iface(ses, iface) && !iface->rss_capable) { - i = (i+1) % iface_count; - continue; + spin_lock(&ses->iface_lock); + if (!ses->iface_count) { + spin_unlock(&ses->iface_lock); + break; } - rc = cifs_ses_add_channel(cifs_sb, ses, iface); - if (rc) { - cifs_dbg(FYI, "failed to open extra channel on iface#%d rc=%d\n", - i, rc); - i = (i+1) % iface_count; - continue; + list_for_each_entry_safe_from(iface, niface, &ses->iface_list, + iface_head) { + /* skip ifaces that are unusable */ + if (!iface->is_active || + (is_ses_using_iface(ses, iface) && + !iface->rss_capable)) { + continue; + } + + /* take ref before unlock */ + kref_get(&iface->refcount); + + spin_unlock(&ses->iface_lock); + rc = cifs_ses_add_channel(cifs_sb, ses, iface); + spin_lock(&ses->iface_lock); + + if (rc) { + cifs_dbg(VFS, "failed to open extra channel on iface:%pIS rc=%d\n", + &iface->sockaddr, + rc); + kref_put(&iface->refcount, release_iface); + continue; + } + + cifs_dbg(FYI, "successfully opened new channel on iface:%pIS\n", + &iface->sockaddr); + break; } + spin_unlock(&ses->iface_lock); - cifs_dbg(FYI, "successfully opened new channel on iface#%d\n", - i); left--; new_chan_count++; } - kfree(ifaces); return new_chan_count - old_chan_count; } /* + * update the iface for the channel if necessary. + * will return 0 when iface is updated, 1 if removed, 2 otherwise + * Must be called with chan_lock held. 
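+ * (note: despite the line above, the implementation below acquires
+ * chan_lock and iface_lock itself, so callers must not hold them;
+ * cifs_mark_tcp_ses_conns_for_reconnect() pins the session with
+ * ses_count and drops cifs_tcp_ses_lock around this call. In
+ * practice only 0 and 1 are returned.)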
+ */ +int +cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) +{ + unsigned int chan_index; + struct cifs_server_iface *iface = NULL; + struct cifs_server_iface *old_iface = NULL; + int rc = 0; + + spin_lock(&ses->chan_lock); + chan_index = cifs_ses_get_chan_index(ses, server); + if (!chan_index) { + spin_unlock(&ses->chan_lock); + return 0; + } + + if (ses->chans[chan_index].iface) { + old_iface = ses->chans[chan_index].iface; + if (old_iface->is_active) { + spin_unlock(&ses->chan_lock); + return 1; + } + } + spin_unlock(&ses->chan_lock); + + spin_lock(&ses->iface_lock); + /* then look for a new one */ + list_for_each_entry(iface, &ses->iface_list, iface_head) { + if (!iface->is_active || + (is_ses_using_iface(ses, iface) && + !iface->rss_capable)) { + continue; + } + kref_get(&iface->refcount); + } + + if (!list_entry_is_head(iface, &ses->iface_list, iface_head)) { + rc = 1; + iface = NULL; + cifs_dbg(FYI, "unable to find a suitable iface\n"); + } + + /* now drop the ref to the current iface */ + if (old_iface && iface) { + kref_put(&old_iface->refcount, release_iface); + cifs_dbg(FYI, "replacing iface: %pIS with %pIS\n", + &old_iface->sockaddr, + &iface->sockaddr); + } else if (old_iface) { + kref_put(&old_iface->refcount, release_iface); + cifs_dbg(FYI, "releasing ref to iface: %pIS\n", + &old_iface->sockaddr); + } else { + WARN_ON(!iface); + cifs_dbg(FYI, "adding new iface: %pIS\n", &iface->sockaddr); + } + spin_unlock(&ses->iface_lock); + + spin_lock(&ses->chan_lock); + chan_index = cifs_ses_get_chan_index(ses, server); + ses->chans[chan_index].iface = iface; + + /* No iface is found. if secondary chan, drop connection */ + if (!iface && CIFS_SERVER_IS_CHAN(server)) + ses->chans[chan_index].server = NULL; + + spin_unlock(&ses->chan_lock); + + if (!iface && CIFS_SERVER_IS_CHAN(server)) + cifs_put_tcp_session(server, false); + + return rc; +} + +/* * If server is a channel of ses, return the corresponding enclosing * cifs_chan otherwise return NULL. 
*/ @@ -274,7 +393,10 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, /* Auth */ ctx.domainauto = ses->domainAuto; ctx.domainname = ses->domainName; - ctx.server_hostname = ses->server->hostname; + + /* no hostname for extra channels */ + ctx.server_hostname = ""; + ctx.username = ses->user_name; ctx.password = ses->password; ctx.sectype = ses->sectype; @@ -322,6 +444,7 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, spin_unlock(&ses->chan_lock); goto out; } + chan->iface = iface; ses->chan_count++; atomic_set(&ses->chan_seq, 0); @@ -714,9 +837,9 @@ static int size_of_ntlmssp_blob(struct cifs_ses *ses, int base_size) else sz += sizeof(__le16); - if (ses->workstation_name) + if (ses->workstation_name[0]) sz += sizeof(__le16) * strnlen(ses->workstation_name, - CIFS_MAX_WORKSTATION_LEN); + ntlmssp_workstation_name_size(ses)); else sz += sizeof(__le16); @@ -960,7 +1083,7 @@ int build_ntlmssp_auth_blob(unsigned char **pbuffer, cifs_security_buffer_from_str(&sec_blob->WorkstationName, ses->workstation_name, - CIFS_MAX_WORKSTATION_LEN, + ntlmssp_workstation_name_size(ses), *pbuffer, &tmp, nls_cp); @@ -1093,14 +1216,14 @@ sess_establish_session(struct sess_data *sess_data) struct cifs_ses *ses = sess_data->ses; struct TCP_Server_Info *server = sess_data->server; - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); if (!server->session_estab) { if (server->sign) { server->session_key.response = kmemdup(ses->auth_key.response, ses->auth_key.len, GFP_KERNEL); if (!server->session_key.response) { - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); return -ENOMEM; } server->session_key.len = @@ -1109,7 +1232,7 @@ sess_establish_session(struct sess_data *sess_data) server->sequence_number = 0x2; server->session_estab = true; } - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); cifs_dbg(FYI, "CIFS session established successfully\n"); return 0; diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index c71c9a44bef4..2e20ee4dab7b 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -38,10 +38,10 @@ send_nt_cancel(struct TCP_Server_Info *server, struct smb_rqst *rqst, in_buf->WordCount = 0; put_bcc(0, in_buf); - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); rc = cifs_sign_smb(in_buf, server, &mid->sequence_number); if (rc) { - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); return rc; } @@ -55,7 +55,7 @@ send_nt_cancel(struct TCP_Server_Info *server, struct smb_rqst *rqst, if (rc < 0) server->sequence_number--; - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); cifs_dbg(FYI, "issued NT_CANCEL for mid %u, rc = %d\n", get_mid(in_buf), rc); diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index fe5bfa245fa7..8571a459c710 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -362,8 +362,6 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, num_rqst++; if (cfile) { - cifsFileInfo_put(cfile); - cfile = NULL; rc = compound_send_recv(xid, ses, server, flags, num_rqst - 2, &rqst[1], &resp_buftype[1], @@ -514,8 +512,11 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, if (smb2_data == NULL) return -ENOMEM; + if (strcmp(full_path, "")) + rc = -ENOENT; + else + rc = open_cached_dir(xid, tcon, full_path, cifs_sb, &cfid); /* If it is a root and its handle is cached then use it */ - rc = open_cached_dir(xid, tcon, full_path, cifs_sb, &cfid); if (!rc) { if (tcon->crfid.file_all_info_is_valid) { move_smb2_info_to_cifs(data, diff 
--git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 3fe47a88f47d..17813c3d0c6e 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -656,6 +656,12 @@ smb2_is_valid_lease_break(char *buffer) } spin_unlock(&cifs_tcp_ses_lock); cifs_dbg(FYI, "Can not process lease break - no lease matched\n"); + trace_smb3_lease_not_found(le32_to_cpu(rsp->CurrentLeaseState), + le32_to_cpu(rsp->hdr.Id.SyncId.TreeId), + le64_to_cpu(rsp->hdr.SessionId), + *((u64 *)rsp->LeaseKey), + *((u64 *)&rsp->LeaseKey[8])); + return false; } @@ -726,6 +732,10 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) } spin_unlock(&cifs_tcp_ses_lock); cifs_dbg(FYI, "No file id matched, oplock break ignored\n"); + trace_smb3_oplock_not_found(0 /* no xid */, rsp->PersistentFid, + le32_to_cpu(rsp->hdr.Id.SyncId.TreeId), + le64_to_cpu(rsp->hdr.SessionId)); + return true; } @@ -798,7 +808,7 @@ smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid, if (tcon->ses) server = tcon->ses->server; - cifs_server_dbg(FYI, "tid=%u: tcon is closing, skipping async close retry of fid %llu %llu\n", + cifs_server_dbg(FYI, "tid=0x%x: tcon is closing, skipping async close retry of fid %llu %llu\n", tcon->tid, persistent_fid, volatile_fid); return 0; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index d6aaeff4a30a..8802995b2d3d 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -512,73 +512,41 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) static int parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, size_t buf_len, - struct cifs_server_iface **iface_list, - size_t *iface_count) + struct cifs_ses *ses) { struct network_interface_info_ioctl_rsp *p; struct sockaddr_in *addr4; struct sockaddr_in6 *addr6; struct iface_info_ipv4 *p4; struct iface_info_ipv6 *p6; - struct cifs_server_iface *info; + struct cifs_server_iface *info = NULL, *iface = NULL, *niface = NULL; + struct cifs_server_iface tmp_iface; ssize_t bytes_left; size_t next = 0; int nb_iface = 0; - int rc = 0; - - *iface_list = NULL; - *iface_count = 0; - - /* - * Fist pass: count and sanity check - */ + int rc = 0, ret = 0; bytes_left = buf_len; p = buf; - while (bytes_left >= sizeof(*p)) { - nb_iface++; - next = le32_to_cpu(p->Next); - if (!next) { - bytes_left -= sizeof(*p); - break; - } - p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next); - bytes_left -= next; - } - - if (!nb_iface) { - cifs_dbg(VFS, "%s: malformed interface info\n", __func__); - rc = -EINVAL; - goto out; - } - - /* Azure rounds the buffer size up 8, to a 16 byte boundary */ - if ((bytes_left > 8) || p->Next) - cifs_dbg(VFS, "%s: incomplete interface info\n", __func__); - + spin_lock(&ses->iface_lock); /* - * Second pass: extract info to internal structure + * Go through iface_list and do kref_put to remove + * any unused ifaces. ifaces in use will be removed + * when the last user calls a kref_put on it */ - - *iface_list = kcalloc(nb_iface, sizeof(**iface_list), GFP_KERNEL); - if (!*iface_list) { - rc = -ENOMEM; - goto out; + list_for_each_entry_safe(iface, niface, &ses->iface_list, + iface_head) { + iface->is_active = 0; + kref_put(&iface->refcount, release_iface); } + spin_unlock(&ses->iface_lock); - info = *iface_list; - bytes_left = buf_len; - p = buf; while (bytes_left >= sizeof(*p)) { - info->speed = le64_to_cpu(p->LinkSpeed); - info->rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0; - info->rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE) ? 
1 : 0; - - cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, *iface_count); - cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed); - cifs_dbg(FYI, "%s: capabilities 0x%08x\n", __func__, - le32_to_cpu(p->Capability)); + memset(&tmp_iface, 0, sizeof(tmp_iface)); + tmp_iface.speed = le64_to_cpu(p->LinkSpeed); + tmp_iface.rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0; + tmp_iface.rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE) ? 1 : 0; switch (p->Family) { /* @@ -587,7 +555,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, * conversion explicit in case either one changes. */ case INTERNETWORK: - addr4 = (struct sockaddr_in *)&info->sockaddr; + addr4 = (struct sockaddr_in *)&tmp_iface.sockaddr; p4 = (struct iface_info_ipv4 *)p->Buffer; addr4->sin_family = AF_INET; memcpy(&addr4->sin_addr, &p4->IPv4Address, 4); @@ -599,7 +567,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, &addr4->sin_addr); break; case INTERNETWORKV6: - addr6 = (struct sockaddr_in6 *)&info->sockaddr; + addr6 = (struct sockaddr_in6 *)&tmp_iface.sockaddr; p6 = (struct iface_info_ipv6 *)p->Buffer; addr6->sin6_family = AF_INET6; memcpy(&addr6->sin6_addr, &p6->IPv6Address, 16); @@ -619,46 +587,96 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, goto next_iface; } - (*iface_count)++; - info++; + /* + * The iface_list is assumed to be sorted by speed. + * Check if the new interface exists in that list. + * NEVER change iface. it could be in use. + * Add a new one instead + */ + spin_lock(&ses->iface_lock); + iface = niface = NULL; + list_for_each_entry_safe(iface, niface, &ses->iface_list, + iface_head) { + ret = iface_cmp(iface, &tmp_iface); + if (!ret) { + /* just get a ref so that it doesn't get picked/freed */ + iface->is_active = 1; + kref_get(&iface->refcount); + spin_unlock(&ses->iface_lock); + goto next_iface; + } else if (ret < 0) { + /* all remaining ifaces are slower */ + kref_get(&iface->refcount); + break; + } + } + spin_unlock(&ses->iface_lock); + + /* no match. 
insert the entry in the list */ + info = kmalloc(sizeof(struct cifs_server_iface), + GFP_KERNEL); + if (!info) { + rc = -ENOMEM; + goto out; + } + memcpy(info, &tmp_iface, sizeof(tmp_iface)); + + /* add this new entry to the list */ + kref_init(&info->refcount); + info->is_active = 1; + + cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, ses->iface_count); + cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed); + cifs_dbg(FYI, "%s: capabilities 0x%08x\n", __func__, + le32_to_cpu(p->Capability)); + + spin_lock(&ses->iface_lock); + if (!list_entry_is_head(iface, &ses->iface_list, iface_head)) { + list_add_tail(&info->iface_head, &iface->iface_head); + kref_put(&iface->refcount, release_iface); + } else + list_add_tail(&info->iface_head, &ses->iface_list); + spin_unlock(&ses->iface_lock); + + ses->iface_count++; + ses->iface_last_update = jiffies; next_iface: + nb_iface++; next = le32_to_cpu(p->Next); - if (!next) + if (!next) { + bytes_left -= sizeof(*p); break; + } p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next); bytes_left -= next; } - if (!*iface_count) { + if (!nb_iface) { + cifs_dbg(VFS, "%s: malformed interface info\n", __func__); rc = -EINVAL; goto out; } -out: - if (rc) { - kfree(*iface_list); - *iface_count = 0; - *iface_list = NULL; - } - return rc; -} + /* Azure rounds the buffer size up 8, to a 16 byte boundary */ + if ((bytes_left > 8) || p->Next) + cifs_dbg(VFS, "%s: incomplete interface info\n", __func__); -static int compare_iface(const void *ia, const void *ib) -{ - const struct cifs_server_iface *a = (struct cifs_server_iface *)ia; - const struct cifs_server_iface *b = (struct cifs_server_iface *)ib; - return a->speed == b->speed ? 0 : (a->speed > b->speed ? -1 : 1); + if (!ses->iface_count) { + rc = -EINVAL; + goto out; + } + +out: + return rc; } -static int +int SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) { int rc; unsigned int ret_data_len = 0; struct network_interface_info_ioctl_rsp *out_buf = NULL; - struct cifs_server_iface *iface_list; - size_t iface_count; struct cifs_ses *ses = tcon->ses; rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, @@ -674,21 +692,10 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) goto out; } - rc = parse_server_interfaces(out_buf, ret_data_len, - &iface_list, &iface_count); + rc = parse_server_interfaces(out_buf, ret_data_len, ses); if (rc) goto out; - /* sort interfaces from fastest to slowest */ - sort(iface_list, iface_count, sizeof(*iface_list), compare_iface, NULL); - - spin_lock(&ses->iface_lock); - kfree(ses->iface_list); - ses->iface_list = iface_list; - ses->iface_count = iface_count; - ses->iface_last_update = jiffies; - spin_unlock(&ses->iface_lock); - out: kfree(out_buf); return rc; @@ -699,6 +706,7 @@ smb2_close_cached_fid(struct kref *ref) { struct cached_fid *cfid = container_of(ref, struct cached_fid, refcount); + struct cached_dirent *dirent, *q; if (cfid->is_valid) { cifs_dbg(FYI, "clear cached root file handle\n"); @@ -718,6 +726,21 @@ smb2_close_cached_fid(struct kref *ref) dput(cfid->dentry); cfid->dentry = NULL; } + /* + * Delete all cached dirent names + */ + mutex_lock(&cfid->dirents.de_mutex); + list_for_each_entry_safe(dirent, q, &cfid->dirents.entries, entry) { + list_del(&dirent->entry); + kfree(dirent->name); + kfree(dirent); + } + cfid->dirents.is_valid = 0; + cfid->dirents.is_failed = 0; + cfid->dirents.ctx = NULL; + cfid->dirents.pos = 0; + mutex_unlock(&cfid->dirents.de_mutex); + } void close_cached_dir(struct cached_fid *cfid) @@ 
-754,14 +777,15 @@ smb2_cached_lease_break(struct work_struct *work) /* * Open the and cache a directory handle. * Only supported for the root handle. + * If error then *cfid is not initialized. */ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, const char *path, struct cifs_sb_info *cifs_sb, struct cached_fid **cfid) { - struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = ses->server; + struct cifs_ses *ses; + struct TCP_Server_Info *server; struct cifs_open_parms oparms; struct smb2_create_rsp *o_rsp = NULL; struct smb2_query_info_rsp *qi_rsp = NULL; @@ -776,9 +800,13 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid; struct dentry *dentry; - if (tcon->nohandlecache) + if (tcon == NULL || tcon->nohandlecache || + is_smb1_server(tcon->ses->server)) return -ENOTSUPP; + ses = tcon->ses; + server = ses->server; + if (cifs_sb->root == NULL) return -ENOENT; @@ -824,7 +852,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; oparms.tcon = tcon; - oparms.create_options = cifs_create_options(cifs_sb, 0); + oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_FILE); oparms.desired_access = FILE_READ_ATTRIBUTES; oparms.disposition = FILE_OPEN; oparms.fid = pfid; @@ -2695,7 +2723,8 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; memset(rsp_iov, 0, sizeof(rsp_iov)); - rc = open_cached_dir(xid, tcon, path, cifs_sb, &cfid); + if (!strcmp(path, "")) + open_cached_dir(xid, tcon, path, cifs_sb, &cfid); /* cfid null if open dir failed */ memset(&open_iov, 0, sizeof(open_iov)); rqst[0].rq_iov = open_iov; @@ -3837,7 +3866,7 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, if (rc) goto out; - if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) == 0) + if (cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) smb2_set_sparse(xid, tcon, cfile, inode, false); eof = cpu_to_le64(off + len); @@ -4238,15 +4267,15 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, if (oplock == SMB2_OPLOCK_LEVEL_BATCH) { cinode->oplock = CIFS_CACHE_RHW_FLG; cifs_dbg(FYI, "Batch Oplock granted on inode %p\n", - &cinode->vfs_inode); + &cinode->netfs.inode); } else if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) { cinode->oplock = CIFS_CACHE_RW_FLG; cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n", - &cinode->vfs_inode); + &cinode->netfs.inode); } else if (oplock == SMB2_OPLOCK_LEVEL_II) { cinode->oplock = CIFS_CACHE_READ_FLG; cifs_dbg(FYI, "Level II Oplock granted on inode %p\n", - &cinode->vfs_inode); + &cinode->netfs.inode); } else cinode->oplock = 0; } @@ -4285,7 +4314,7 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, cinode->oplock = new_oplock; cifs_dbg(FYI, "%s Lease granted on inode %p\n", message, - &cinode->vfs_inode); + &cinode->netfs.inode); } static void @@ -4323,11 +4352,13 @@ smb3_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, } } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY static bool smb2_is_read_op(__u32 oplock) { return oplock == SMB2_OPLOCK_LEVEL_II; } +#endif /* CIFS_ALLOW_INSECURE_LEGACY */ static bool smb21_is_read_op(__u32 oplock) @@ -5426,7 +5457,7 @@ out: return rc; } - +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY struct smb_version_operations smb20_operations = { .compare_fids = smb2_compare_fids, .setup_request = smb2_setup_request, @@ -5525,6 +5556,7 @@ struct smb_version_operations smb20_operations = { 
.is_status_io_timeout = smb2_is_status_io_timeout, .is_network_name_deleted = smb2_is_network_name_deleted, }; +#endif /* CIFS_ALLOW_INSECURE_LEGACY */ struct smb_version_operations smb21_operations = { .compare_fids = smb2_compare_fids, @@ -5856,6 +5888,7 @@ struct smb_version_operations smb311_operations = { .is_network_name_deleted = smb2_is_network_name_deleted, }; +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY struct smb_version_values smb20_values = { .version_string = SMB20_VERSION_STRING, .protocol_id = SMB20_PROT_ID, @@ -5876,6 +5909,7 @@ struct smb_version_values smb20_values = { .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, .create_lease_size = sizeof(struct create_lease), }; +#endif /* ALLOW_INSECURE_LEGACY */ struct smb_version_values smb21_values = { .version_string = SMB21_VERSION_STRING, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 1b7ad0c09566..12b4dddaedb0 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -179,7 +179,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, } } spin_unlock(&cifs_tcp_ses_lock); - if ((!tcon->ses) || (tcon->ses->status == CifsExiting) || + if ((!tcon->ses) || (tcon->ses->ses_status == SES_EXITING) || (!tcon->ses->server) || !server) return -EIO; @@ -288,6 +288,9 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, mutex_unlock(&ses->session_mutex); rc = -EHOSTDOWN; goto failed; + } else if (rc) { + mutex_unlock(&ses->session_mutex); + goto out; } } else { mutex_unlock(&ses->session_mutex); @@ -540,6 +543,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, struct TCP_Server_Info *server, unsigned int *total_len) { char *pneg_ctxt; + char *hostname = NULL; unsigned int ctxt_len, neg_context_count; if (*total_len > 200) { @@ -567,16 +571,24 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, *total_len += ctxt_len; pneg_ctxt += ctxt_len; - ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt, - server->hostname); - *total_len += ctxt_len; - pneg_ctxt += ctxt_len; - build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt); *total_len += sizeof(struct smb2_posix_neg_context); pneg_ctxt += sizeof(struct smb2_posix_neg_context); - neg_context_count = 4; + /* + * secondary channels don't have the hostname field populated + * use the hostname field in the primary channel instead + */ + hostname = CIFS_SERVER_IS_CHAN(server) ? 
+		server->primary_server->hostname : server->hostname;
+	if (hostname && (hostname[0] != 0)) {
+		ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt,
+					      hostname);
+		*total_len += ctxt_len;
+		pneg_ctxt += ctxt_len;
+		neg_context_count = 4;
+	} else /* secondary channels do not have a hostname */
+		neg_context_count = 3;
 
 	if (server->compress_algorithm) {
 		build_compression_ctxt((struct smb2_compression_capabilities_context *)
@@ -1369,13 +1381,13 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data)
 	struct cifs_ses *ses = sess_data->ses;
 	struct TCP_Server_Info *server = sess_data->server;
 
-	mutex_lock(&server->srv_mutex);
+	cifs_server_lock(server);
 	if (server->ops->generate_signingkey) {
 		rc = server->ops->generate_signingkey(ses, server);
 		if (rc) {
 			cifs_dbg(FYI, "SMB3 session key generation failed\n");
-			mutex_unlock(&server->srv_mutex);
+			cifs_server_unlock(server);
 			return rc;
 		}
 	}
@@ -1383,7 +1395,7 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data)
 		server->sequence_number = 0x2;
 		server->session_estab = true;
 	}
-	mutex_unlock(&server->srv_mutex);
+	cifs_server_unlock(server);
 
 	cifs_dbg(FYI, "SMB2/3 session established successfully\n");
 	return rc;
@@ -3899,7 +3911,8 @@ SMB2_echo(struct TCP_Server_Info *server)
 	cifs_dbg(FYI, "In echo request for conn_id %lld\n", server->conn_id);
 
 	spin_lock(&cifs_tcp_ses_lock);
-	if (server->tcpStatus == CifsNeedNegotiate) {
+	if (server->ops->need_neg &&
+	    server->ops->need_neg(server)) {
 		spin_unlock(&cifs_tcp_ses_lock);
 		/* No need to send echo on newly established connections */
 		mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
@@ -5150,6 +5163,8 @@ SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 	data = &info;
 	size = sizeof(struct smb2_file_eof_info);
 
+	trace_smb3_set_eof(xid, persistent_fid, tcon->tid, tcon->ses->Suid, le64_to_cpu(*eof));
+
 	return send_set_info(xid, tcon, persistent_fid, volatile_fid, pid,
 			FILE_END_OF_FILE_INFORMATION, SMB2_O_INFO_FILE, 0,
 			1, &data, &size);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index d8c4388b190d..f57881b8464f 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -260,28 +260,6 @@ struct get_retrieval_pointers_refcount_rsp {
 	struct smb3_extents extents[];
 } __packed;
 
-struct fsctl_set_integrity_information_req {
-	__le16	ChecksumAlgorithm;
-	__le16	Reserved;
-	__le32	Flags;
-} __packed;
-
-struct fsctl_get_integrity_information_rsp {
-	__le16	ChecksumAlgorithm;
-	__le16	Reserved;
-	__le32	Flags;
-	__le32	ChecksumChunkSizeInBytes;
-	__le32	ClusterSizeInBytes;
-} __packed;
-
-/* Integrity ChecksumAlgorithm choices for above */
-#define	CHECKSUM_TYPE_NONE	0x0000
-#define	CHECKSUM_TYPE_CRC64	0x0002
-#define	CHECKSUM_TYPE_UNCHANGED	0xFFFF	/* set only */
-
-/* Integrity flags for above */
-#define	FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF	0x00000001
-
 /* See MS-DFSC 2.2.2 */
 struct fsctl_get_dfs_referral_req {
 	__le16 MaxReferralLevel;
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 2af79093b78b..55e79f6ee78d 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -641,7 +641,8 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 	if (!is_signed)
 		return 0;
 	spin_lock(&cifs_tcp_ses_lock);
-	if (server->tcpStatus == CifsNeedNegotiate) {
+	if (server->ops->need_neg &&
+	    server->ops->need_neg(server)) {
 		spin_unlock(&cifs_tcp_ses_lock);
 		return 0;
 	}
@@ -779,7 +780,7 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server,
 		return -EAGAIN;
 	}
 
-	if (ses->status ==
CifsNew) { + if (ses->ses_status == SES_NEW) { if ((shdr->Command != SMB2_SESSION_SETUP) && (shdr->Command != SMB2_NEGOTIATE)) { spin_unlock(&cifs_tcp_ses_lock); @@ -788,7 +789,7 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server, /* else ok - we are setting up session */ } - if (ses->status == CifsExiting) { + if (ses->ses_status == SES_EXITING) { if (shdr->Command != SMB2_LOGOFF) { spin_unlock(&cifs_tcp_ses_lock); return -EAGAIN; diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 31ef64eb7fbb..5fbbec22bcc8 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -649,7 +649,7 @@ static int smbd_ia_open( smbd_max_frmr_depth, info->id->device->attrs.max_fast_reg_page_list_len); info->mr_type = IB_MR_TYPE_MEM_REG; - if (info->id->device->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG) + if (info->id->device->attrs.kernel_cap_flags & IBK_SG_GAPS_REG) info->mr_type = IB_MR_TYPE_SG_GAPS; info->pd = ib_alloc_pd(info->id->device, 0); @@ -1350,7 +1350,7 @@ void smbd_destroy(struct TCP_Server_Info *server) wait_event(info->wait_send_pending, atomic_read(&info->send_pending) == 0); - /* It's not posssible for upper layer to get to reassembly */ + /* It's not possible for upper layer to get to reassembly */ log_rdma_event(INFO, "drain the reassembly queue\n"); do { spin_lock_irqsave(&info->reassembly_queue_lock, flags); @@ -1382,9 +1382,9 @@ void smbd_destroy(struct TCP_Server_Info *server) log_rdma_event(INFO, "freeing mr list\n"); wake_up_interruptible_all(&info->wait_mr); while (atomic_read(&info->mr_used_count)) { - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); msleep(1000); - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); } destroy_mr_list(info); diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index bc279616c513..6b88dc2e364f 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -121,6 +121,44 @@ DEFINE_SMB3_RW_DONE_EVENT(query_dir_done); DEFINE_SMB3_RW_DONE_EVENT(zero_done); DEFINE_SMB3_RW_DONE_EVENT(falloc_done); +/* For logging successful set EOF (truncate) */ +DECLARE_EVENT_CLASS(smb3_eof_class, + TP_PROTO(unsigned int xid, + __u64 fid, + __u32 tid, + __u64 sesid, + __u64 offset), + TP_ARGS(xid, fid, tid, sesid, offset), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u64, fid) + __field(__u32, tid) + __field(__u64, sesid) + __field(__u64, offset) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->fid = fid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->offset = offset; + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx", + __entry->xid, __entry->sesid, __entry->tid, __entry->fid, + __entry->offset) +) + +#define DEFINE_SMB3_EOF_EVENT(name) \ +DEFINE_EVENT(smb3_eof_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u64 fid, \ + __u32 tid, \ + __u64 sesid, \ + __u64 offset), \ + TP_ARGS(xid, fid, tid, sesid, offset)) + +DEFINE_SMB3_EOF_EVENT(set_eof); + /* * For handle based calls other than read and write, and get/set info */ @@ -158,6 +196,7 @@ DEFINE_SMB3_FD_EVENT(flush_enter); DEFINE_SMB3_FD_EVENT(flush_done); DEFINE_SMB3_FD_EVENT(close_enter); DEFINE_SMB3_FD_EVENT(close_done); +DEFINE_SMB3_FD_EVENT(oplock_not_found); DECLARE_EVENT_CLASS(smb3_fd_err_class, TP_PROTO(unsigned int xid, @@ -814,6 +853,7 @@ DEFINE_EVENT(smb3_lease_done_class, smb3_##name, \ TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high)) DEFINE_SMB3_LEASE_DONE_EVENT(lease_done); +DEFINE_SMB3_LEASE_DONE_EVENT(lease_not_found); DECLARE_EVENT_CLASS(smb3_lease_err_class, TP_PROTO(__u32 
lease_state, diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index c667e6ddfe2f..bfc9bd55870a 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -726,7 +726,7 @@ static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, struct mid_q_entry **ppmidQ) { spin_lock(&cifs_tcp_ses_lock); - if (ses->status == CifsNew) { + if (ses->ses_status == SES_NEW) { if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) && (in_buf->Command != SMB_COM_NEGOTIATE)) { spin_unlock(&cifs_tcp_ses_lock); @@ -735,7 +735,7 @@ static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, /* else ok - we are setting up session */ } - if (ses->status == CifsExiting) { + if (ses->ses_status == SES_EXITING) { /* check if SMB session is bad because we are setting it up */ if (in_buf->Command != SMB_COM_LOGOFF_ANDX) { spin_unlock(&cifs_tcp_ses_lock); @@ -822,7 +822,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, } else instance = exist_credits->instance; - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); /* * We can't use credits obtained from the previous session to send this @@ -830,14 +830,14 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, * return -EAGAIN in such cases to let callers handle it. */ if (instance != server->reconnect_instance) { - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); add_credits_and_wake_if(server, &credits, optype); return -EAGAIN; } mid = server->ops->setup_async_request(server, rqst); if (IS_ERR(mid)) { - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); add_credits_and_wake_if(server, &credits, optype); return PTR_ERR(mid); } @@ -868,7 +868,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, cifs_delete_mid(mid); } - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); if (rc == 0) return 0; @@ -1109,7 +1109,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, * of smb data. */ - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); /* * All the parts of the compound chain belong obtained credits from the @@ -1119,7 +1119,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, * handle it. */ if (instance != server->reconnect_instance) { - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); for (j = 0; j < num_rqst; j++) add_credits(server, &credits[j], optype); return -EAGAIN; @@ -1131,7 +1131,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, revert_current_mid(server, i); for (j = 0; j < i; j++) cifs_delete_mid(midQ[j]); - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); /* Update # of requests on wire to server */ for (j = 0; j < num_rqst; j++) @@ -1163,7 +1163,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, server->sequence_number -= 2; } - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); /* * If sending failed for some reason or it is an oplock break that we @@ -1187,12 +1187,12 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, * Compounding is never used during session establish. 
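 	 * (Hence the smb311_update_preauth_hash() call just below runs only
 	 * for session-setup traffic: a SES_NEW session, or a request marked
 	 * CIFS_NEG_OP / CIFS_SESS_OP.)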
*/ spin_lock(&cifs_tcp_ses_lock); - if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) { + if ((ses->ses_status == SES_NEW) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) { spin_unlock(&cifs_tcp_ses_lock); - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); smb311_update_preauth_hash(ses, server, rqst[0].rq_iov, rqst[0].rq_nvec); - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); spin_lock(&cifs_tcp_ses_lock); } @@ -1260,15 +1260,15 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, * Compounding is never used during session establish. */ spin_lock(&cifs_tcp_ses_lock); - if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) { + if ((ses->ses_status == SES_NEW) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) { struct kvec iov = { .iov_base = resp_iov[0].iov_base, .iov_len = resp_iov[0].iov_len }; spin_unlock(&cifs_tcp_ses_lock); - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); smb311_update_preauth_hash(ses, server, &iov, 1); - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); spin_lock(&cifs_tcp_ses_lock); } spin_unlock(&cifs_tcp_ses_lock); @@ -1385,11 +1385,11 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, and avoid races inside tcp sendmsg code that could cause corruption of smb data */ - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); rc = allocate_mid(ses, in_buf, &midQ); if (rc) { - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); /* Update # of requests on wire to server */ add_credits(server, &credits, 0); return rc; @@ -1397,7 +1397,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, rc = cifs_sign_smb(in_buf, server, &midQ->sequence_number); if (rc) { - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); goto out; } @@ -1411,7 +1411,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, if (rc < 0) server->sequence_number -= 2; - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); if (rc < 0) goto out; @@ -1530,18 +1530,18 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, and avoid races inside tcp sendmsg code that could cause corruption of smb data */ - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); rc = allocate_mid(ses, in_buf, &midQ); if (rc) { - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); return rc; } rc = cifs_sign_smb(in_buf, server, &midQ->sequence_number); if (rc) { cifs_delete_mid(midQ); - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); return rc; } @@ -1554,7 +1554,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, if (rc < 0) server->sequence_number -= 2; - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); if (rc < 0) { cifs_delete_mid(midQ); @@ -24,6 +24,7 @@ #include <linux/sizes.h> #include <linux/mmu_notifier.h> #include <linux/iomap.h> +#include <linux/rmap.h> #include <asm/pgalloc.h> #define CREATE_TRACE_POINTS @@ -721,7 +722,8 @@ static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter int id; id = dax_read_lock(); - rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, &kaddr, NULL); + rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS, + &kaddr, NULL); if (rc < 0) { dax_read_unlock(id); return rc; @@ -789,95 +791,12 @@ static void *dax_insert_entry(struct xa_state *xas, return entry; } -static inline -unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) -{ - unsigned 
long address; - - address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); - return address; -} - -/* Walk all mappings of a given index of a file and writeprotect them */ -static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, - unsigned long pfn) -{ - struct vm_area_struct *vma; - pte_t pte, *ptep = NULL; - pmd_t *pmdp = NULL; - spinlock_t *ptl; - - i_mmap_lock_read(mapping); - vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { - struct mmu_notifier_range range; - unsigned long address; - - cond_resched(); - - if (!(vma->vm_flags & VM_SHARED)) - continue; - - address = pgoff_address(index, vma); - - /* - * follow_invalidate_pte() will use the range to call - * mmu_notifier_invalidate_range_start() on our behalf before - * taking any lock. - */ - if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep, - &pmdp, &ptl)) - continue; - - /* - * No need to call mmu_notifier_invalidate_range() as we are - * downgrading page table protection not changing it to point - * to a new page. - * - * See Documentation/vm/mmu_notifier.rst - */ - if (pmdp) { -#ifdef CONFIG_FS_DAX_PMD - pmd_t pmd; - - if (pfn != pmd_pfn(*pmdp)) - goto unlock_pmd; - if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) - goto unlock_pmd; - - flush_cache_page(vma, address, pfn); - pmd = pmdp_invalidate(vma, address, pmdp); - pmd = pmd_wrprotect(pmd); - pmd = pmd_mkclean(pmd); - set_pmd_at(vma->vm_mm, address, pmdp, pmd); -unlock_pmd: -#endif - spin_unlock(ptl); - } else { - if (pfn != pte_pfn(*ptep)) - goto unlock_pte; - if (!pte_dirty(*ptep) && !pte_write(*ptep)) - goto unlock_pte; - - flush_cache_page(vma, address, pfn); - pte = ptep_clear_flush(vma, address, ptep); - pte = pte_wrprotect(pte); - pte = pte_mkclean(pte); - set_pte_at(vma->vm_mm, address, ptep, pte); -unlock_pte: - pte_unmap_unlock(ptep, ptl); - } - - mmu_notifier_invalidate_range_end(&range); - } - i_mmap_unlock_read(mapping); -} - static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, struct address_space *mapping, void *entry) { - unsigned long pfn, index, count; + unsigned long pfn, index, count, end; long ret = 0; + struct vm_area_struct *vma; /* * A page got tagged dirty in DAX mapping? Something is seriously @@ -935,8 +854,16 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, pfn = dax_to_pfn(entry); count = 1UL << dax_entry_order(entry); index = xas->xa_index & ~(count - 1); + end = index + count - 1; + + /* Walk all mappings of a given index of a file and writeprotect them */ + i_mmap_lock_read(mapping); + vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) { + pfn_mkclean_range(pfn, count, index, vma); + cond_resched(); + } + i_mmap_unlock_read(mapping); - dax_entry_mkclean(mapping, index, pfn); dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE); /* * After we have flushed the cache, we can clear the dirty tag. 
There @@ -1013,7 +940,7 @@ static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size, id = dax_read_lock(); length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size), - NULL, pfnp); + DAX_ACCESS, NULL, pfnp); if (length < 0) { rc = length; goto out; @@ -1122,7 +1049,7 @@ static int dax_memzero(struct dax_device *dax_dev, pgoff_t pgoff, void *kaddr; long ret; - ret = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL); + ret = dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL); if (ret > 0) { memset(kaddr + offset, 0, size); dax_flush(dax_dev, kaddr + offset, size); @@ -1239,6 +1166,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, const size_t size = ALIGN(length + offset, PAGE_SIZE); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); ssize_t map_len; + bool recovery = false; void *kaddr; if (fatal_signal_pending(current)) { @@ -1247,7 +1175,14 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, } map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), - &kaddr, NULL); + DAX_ACCESS, &kaddr, NULL); + if (map_len == -EIO && iov_iter_rw(iter) == WRITE) { + map_len = dax_direct_access(dax_dev, pgoff, + PHYS_PFN(size), DAX_RECOVERY_WRITE, + &kaddr, NULL); + if (map_len > 0) + recovery = true; + } if (map_len < 0) { ret = map_len; break; @@ -1259,7 +1194,10 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, if (map_len > end - pos) map_len = end - pos; - if (iov_iter_rw(iter) == WRITE) + if (recovery) + xfer = dax_recovery_write(dax_dev, pgoff, kaddr, + map_len, iter); + else if (iov_iter_rw(iter) == WRITE) xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr, map_len, iter); else diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index a5cc4ed2cd0d..8e01d89c3319 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -17,6 +17,7 @@ static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space rreq->start = start; rreq->len = len; rreq->mapping = mapping; + rreq->inode = mapping->host; INIT_LIST_HEAD(&rreq->subrequests); refcount_set(&rreq->ref, 1); return rreq; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index bcc8335b46b3..95a403720e8c 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -288,7 +288,10 @@ static int erofs_fill_inode(struct inode *inode, int isdir) } if (erofs_inode_is_data_compressed(vi->datalayout)) { - err = z_erofs_fill_inode(inode); + if (!erofs_is_fscache_mode(inode->i_sb)) + err = z_erofs_fill_inode(inode); + else + err = -EOPNOTSUPP; goto out_unlock; } inode->i_mapping->a_ops = &erofs_raw_access_aops; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 95efc127b2ba..724bb57075f6 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -199,7 +199,6 @@ struct z_erofs_decompress_frontend { struct z_erofs_pagevec_ctor vector; struct z_erofs_pcluster *pcl, *tailpcl; - struct z_erofs_collection *cl; /* a pointer used to pick up inplace I/O pages */ struct page **icpage_ptr; z_erofs_next_pcluster_t owned_head; @@ -214,7 +213,7 @@ struct z_erofs_decompress_frontend { #define DECOMPRESS_FRONTEND_INIT(__i) { \ .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ - .mode = COLLECT_PRIMARY_FOLLOWED } + .mode = COLLECT_PRIMARY_FOLLOWED, .backmost = true } static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES]; static DEFINE_MUTEX(z_pagemap_global_lock); @@ -357,7 +356,7 @@ static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, return false; } -/* callers must be with collection lock held */ +/* callers must be with pcluster lock held */ static int 
z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, struct page *page, enum z_erofs_page_type type, bool pvec_safereuse) @@ -372,7 +371,7 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, ret = z_erofs_pagevec_enqueue(&fe->vector, page, type, pvec_safereuse); - fe->cl->vcnt += (unsigned int)ret; + fe->pcl->vcnt += (unsigned int)ret; return ret ? 0 : -EAGAIN; } @@ -405,12 +404,11 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) f->mode = COLLECT_PRIMARY; } -static int z_erofs_lookup_collection(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe, + struct inode *inode, + struct erofs_map_blocks *map) { struct z_erofs_pcluster *pcl = fe->pcl; - struct z_erofs_collection *cl; unsigned int length; /* to avoid unexpected loop formed by corrupted images */ @@ -419,8 +417,7 @@ static int z_erofs_lookup_collection(struct z_erofs_decompress_frontend *fe, return -EFSCORRUPTED; } - cl = z_erofs_primarycollection(pcl); - if (cl->pageofs != (map->m_la & ~PAGE_MASK)) { + if (pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) { DBG_BUGON(1); return -EFSCORRUPTED; } @@ -443,23 +440,21 @@ static int z_erofs_lookup_collection(struct z_erofs_decompress_frontend *fe, length = READ_ONCE(pcl->length); } } - mutex_lock(&cl->lock); + mutex_lock(&pcl->lock); /* used to check tail merging loop due to corrupted images */ if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) fe->tailpcl = pcl; z_erofs_try_to_claim_pcluster(fe); - fe->cl = cl; return 0; } -static int z_erofs_register_collection(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, + struct inode *inode, + struct erofs_map_blocks *map) { bool ztailpacking = map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl; - struct z_erofs_collection *cl; struct erofs_workgroup *grp; int err; @@ -482,17 +477,15 @@ static int z_erofs_register_collection(struct z_erofs_decompress_frontend *fe, /* new pclusters should be claimed as type 1, primary and followed */ pcl->next = fe->owned_head; + pcl->pageofs_out = map->m_la & ~PAGE_MASK; fe->mode = COLLECT_PRIMARY_FOLLOWED; - cl = z_erofs_primarycollection(pcl); - cl->pageofs = map->m_la & ~PAGE_MASK; - /* * lock all primary followed works before visible to others * and mutex_trylock *never* fails for a new pcluster. 
*/ - mutex_init(&cl->lock); - DBG_BUGON(!mutex_trylock(&cl->lock)); + mutex_init(&pcl->lock); + DBG_BUGON(!mutex_trylock(&pcl->lock)); if (ztailpacking) { pcl->obj.index = 0; /* which indicates ztailpacking */ @@ -519,11 +512,10 @@ static int z_erofs_register_collection(struct z_erofs_decompress_frontend *fe, fe->tailpcl = pcl; fe->owned_head = &pcl->next; fe->pcl = pcl; - fe->cl = cl; return 0; err_out: - mutex_unlock(&cl->lock); + mutex_unlock(&pcl->lock); z_erofs_free_pcluster(pcl); return err; } @@ -535,9 +527,9 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, struct erofs_workgroup *grp; int ret; - DBG_BUGON(fe->cl); + DBG_BUGON(fe->pcl); - /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous collection */ + /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */ DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); @@ -554,14 +546,14 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); } else { tailpacking: - ret = z_erofs_register_collection(fe, inode, map); + ret = z_erofs_register_pcluster(fe, inode, map); if (!ret) goto out; if (ret != -EEXIST) return ret; } - ret = z_erofs_lookup_collection(fe, inode, map); + ret = z_erofs_lookup_pcluster(fe, inode, map); if (ret) { erofs_workgroup_put(&fe->pcl->obj); return ret; @@ -569,7 +561,7 @@ tailpacking: out: z_erofs_pagevec_ctor_init(&fe->vector, Z_EROFS_NR_INLINE_PAGEVECS, - fe->cl->pagevec, fe->cl->vcnt); + fe->pcl->pagevec, fe->pcl->vcnt); /* since file-backed online pages are traversed in reverse order */ fe->icpage_ptr = fe->pcl->compressed_pages + z_erofs_pclusterpages(fe->pcl); @@ -582,48 +574,36 @@ out: */ static void z_erofs_rcu_callback(struct rcu_head *head) { - struct z_erofs_collection *const cl = - container_of(head, struct z_erofs_collection, rcu); - - z_erofs_free_pcluster(container_of(cl, struct z_erofs_pcluster, - primary_collection)); + z_erofs_free_pcluster(container_of(head, + struct z_erofs_pcluster, rcu)); } void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) { struct z_erofs_pcluster *const pcl = container_of(grp, struct z_erofs_pcluster, obj); - struct z_erofs_collection *const cl = z_erofs_primarycollection(pcl); - - call_rcu(&cl->rcu, z_erofs_rcu_callback); -} -static void z_erofs_collection_put(struct z_erofs_collection *cl) -{ - struct z_erofs_pcluster *const pcl = - container_of(cl, struct z_erofs_pcluster, primary_collection); - - erofs_workgroup_put(&pcl->obj); + call_rcu(&pcl->rcu, z_erofs_rcu_callback); } static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) { - struct z_erofs_collection *cl = fe->cl; + struct z_erofs_pcluster *pcl = fe->pcl; - if (!cl) + if (!pcl) return false; z_erofs_pagevec_ctor_exit(&fe->vector, false); - mutex_unlock(&cl->lock); + mutex_unlock(&pcl->lock); /* * if all pending pages are added, don't hold its reference * any longer if the pcluster isn't hosted by ourselves. 
*/ if (fe->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE) - z_erofs_collection_put(cl); + erofs_workgroup_put(&pcl->obj); - fe->cl = NULL; + fe->pcl = NULL; return true; } @@ -663,28 +643,23 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, repeat: cur = end - 1; - /* lucky, within the range of the current map_blocks */ - if (offset + cur >= map->m_la && - offset + cur < map->m_la + map->m_llen) { - /* didn't get a valid collection previously (very rare) */ - if (!fe->cl) - goto restart_now; - goto hitted; - } - - /* go ahead the next map_blocks */ - erofs_dbg("%s: [out-of-range] pos %llu", __func__, offset + cur); - - if (z_erofs_collector_end(fe)) - fe->backmost = false; + if (offset + cur < map->m_la || + offset + cur >= map->m_la + map->m_llen) { + erofs_dbg("out-of-range map @ pos %llu", offset + cur); - map->m_la = offset + cur; - map->m_llen = 0; - err = z_erofs_map_blocks_iter(inode, map, 0); - if (err) - goto err_out; + if (z_erofs_collector_end(fe)) + fe->backmost = false; + map->m_la = offset + cur; + map->m_llen = 0; + err = z_erofs_map_blocks_iter(inode, map, 0); + if (err) + goto err_out; + } else { + if (fe->pcl) + goto hitted; + /* didn't get a valid pcluster previously (very rare) */ + } -restart_now: if (!(map->m_flags & EROFS_MAP_MAPPED)) goto hitted; @@ -766,7 +741,7 @@ retry: /* bump up the number of spiltted parts of a page */ ++spiltted; /* also update nr_pages */ - fe->cl->nr_pages = max_t(pgoff_t, fe->cl->nr_pages, index + 1); + fe->pcl->nr_pages = max_t(pgoff_t, fe->pcl->nr_pages, index + 1); next_part: /* can be used for verification */ map->m_llen = offset + cur - map->m_la; @@ -821,15 +796,13 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, enum z_erofs_page_type page_type; bool overlapped, partial; - struct z_erofs_collection *cl; int err; might_sleep(); - cl = z_erofs_primarycollection(pcl); - DBG_BUGON(!READ_ONCE(cl->nr_pages)); + DBG_BUGON(!READ_ONCE(pcl->nr_pages)); - mutex_lock(&cl->lock); - nr_pages = cl->nr_pages; + mutex_lock(&pcl->lock); + nr_pages = pcl->nr_pages; if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) { pages = pages_onstack; @@ -857,9 +830,9 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, err = 0; z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS, - cl->pagevec, 0); + pcl->pagevec, 0); - for (i = 0; i < cl->vcnt; ++i) { + for (i = 0; i < pcl->vcnt; ++i) { unsigned int pagenr; page = z_erofs_pagevec_dequeue(&ctor, &page_type); @@ -945,11 +918,11 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, goto out; llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT; - if (nr_pages << PAGE_SHIFT >= cl->pageofs + llen) { + if (nr_pages << PAGE_SHIFT >= pcl->pageofs_out + llen) { outputsize = llen; partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH); } else { - outputsize = (nr_pages << PAGE_SHIFT) - cl->pageofs; + outputsize = (nr_pages << PAGE_SHIFT) - pcl->pageofs_out; partial = true; } @@ -963,7 +936,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, .in = compressed_pages, .out = pages, .pageofs_in = pcl->pageofs_in, - .pageofs_out = cl->pageofs, + .pageofs_out = pcl->pageofs_out, .inputsize = inputsize, .outputsize = outputsize, .alg = pcl->algorithmformat, @@ -1012,16 +985,12 @@ out: else if (pages != pages_onstack) kvfree(pages); - cl->nr_pages = 0; - cl->vcnt = 0; + pcl->nr_pages = 0; + pcl->vcnt = 0; - /* all cl locks MUST be taken before the following line */ + /* pcluster lock MUST be taken before the following line */ WRITE_ONCE(pcl->next, 
Z_EROFS_PCLUSTER_NIL); - - /* all cl locks SHOULD be released right now */ - mutex_unlock(&cl->lock); - - z_erofs_collection_put(cl); + mutex_unlock(&pcl->lock); return err; } @@ -1043,6 +1012,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, owned = READ_ONCE(pcl->next); z_erofs_decompress_pcluster(io->sb, pcl, pagepool); + erofs_workgroup_put(&pcl->obj); } } @@ -1466,22 +1436,19 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, struct page *page; page = erofs_grab_cache_page_nowait(inode->i_mapping, index); - if (!page) - goto skip; - - if (PageUptodate(page)) { - unlock_page(page); + if (page) { + if (PageUptodate(page)) { + unlock_page(page); + } else { + err = z_erofs_do_read_page(f, page, pagepool); + if (err) + erofs_err(inode->i_sb, + "readmore error at page %lu @ nid %llu", + index, EROFS_I(inode)->nid); + } put_page(page); - goto skip; } - err = z_erofs_do_read_page(f, page, pagepool); - if (err) - erofs_err(inode->i_sb, - "readmore error at page %lu @ nid %llu", - index, EROFS_I(inode)->nid); - put_page(page); -skip: if (cur < PAGE_SIZE) break; cur = (index << PAGE_SHIFT) - 1; diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 800b11c53f57..58053bb5066f 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -12,21 +12,40 @@ #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) #define Z_EROFS_NR_INLINE_PAGEVECS 3 +#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001 +#define Z_EROFS_PCLUSTER_LENGTH_BIT 1 + +/* + * let's leave a type here in case of introducing + * another tagged pointer later. + */ +typedef void *z_erofs_next_pcluster_t; + /* * Structure fields follow one of the following exclusion rules. * * I: Modifiable by initialization/destruction paths and read-only * for everyone else; * - * L: Field should be protected by pageset lock; + * L: Field should be protected by the pcluster lock; * * A: Field should be accessed / updated in atomic for parallelized code. */ -struct z_erofs_collection { +struct z_erofs_pcluster { + struct erofs_workgroup obj; struct mutex lock; + /* A: point to next chained pcluster or TAILs */ + z_erofs_next_pcluster_t next; + + /* A: lower limit of decompressed length and if full length or not */ + unsigned int length; + /* I: page offset of start position of decompression */ - unsigned short pageofs; + unsigned short pageofs_out; + + /* I: page offset of inline compressed data */ + unsigned short pageofs_in; /* L: maximum relative page index in pagevec[] */ unsigned short nr_pages; @@ -41,29 +60,6 @@ struct z_erofs_collection { /* I: can be used to free the pcluster by RCU. */ struct rcu_head rcu; }; -}; - -#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001 -#define Z_EROFS_PCLUSTER_LENGTH_BIT 1 - -/* - * let's leave a type here in case of introducing - * another tagged pointer later. 
- */ -typedef void *z_erofs_next_pcluster_t; - -struct z_erofs_pcluster { - struct erofs_workgroup obj; - struct z_erofs_collection primary_collection; - - /* A: point to next chained pcluster or TAILs */ - z_erofs_next_pcluster_t next; - - /* A: lower limit of decompressed length and if full length or not */ - unsigned int length; - - /* I: page offset of inline compressed data */ - unsigned short pageofs_in; union { /* I: physical cluster size in pages */ @@ -80,8 +76,6 @@ struct z_erofs_pcluster { struct page *compressed_pages[]; }; -#define z_erofs_primarycollection(pcluster) (&(pcluster)->primary_collection) - /* let's avoid the valid 32-bit kernel addresses */ /* the chained workgroup has't submitted io (still open) */ diff --git a/fs/exec.c b/fs/exec.c index e3e55d5e0be1..0989fb8472a1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -758,6 +758,7 @@ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_size; unsigned long stack_expand; unsigned long rlim_stack; + struct mmu_gather tlb; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ @@ -812,8 +813,11 @@ int setup_arg_pages(struct linux_binprm *bprm, vm_flags |= mm->def_flags; vm_flags |= VM_STACK_INCOMPLETE_SETUP; - ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, + tlb_gather_mmu(&tlb, mm); + ret = mprotect_fixup(&tlb, vma, &prev, vma->vm_start, vma->vm_end, vm_flags); + tlb_finish_mmu(&tlb); + if (ret) goto out_unlock; BUG_ON(prev != vma); @@ -1308,9 +1312,7 @@ int begin_new_exec(struct linux_binprm * bprm) if (retval) goto out_unlock; - if (me->flags & PF_KTHREAD) - free_kthread_struct(me); - me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | + me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); me->personality &= ~bprm->per_clear; @@ -1955,6 +1957,10 @@ int kernel_execve(const char *kernel_filename, int fd = AT_FDCWD; int retval; + /* It is non-sense for kernel threads to call execve */ + if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) + return -EINVAL; + filename = getname_kernel(kernel_filename); if (IS_ERR(filename)) return PTR_ERR(filename); diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 76acc3721951..c6eaf7e9ea74 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -1198,7 +1198,9 @@ static int __exfat_rename(struct inode *old_parent_inode, return -ENOENT; } - exfat_chain_dup(&olddir, &ei->dir); + exfat_chain_set(&olddir, EXFAT_I(old_parent_inode)->start_clu, + EXFAT_B_TO_CLU_ROUND_UP(i_size_read(old_parent_inode), sbi), + EXFAT_I(old_parent_inode)->flags); dentry = ei->entry; ep = exfat_get_dentry(sb, &olddir, dentry, &old_bh); diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 0106eba46d5a..3ef80d000e13 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -145,7 +145,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, if (err) goto out_err; dprintk("%s: found name: %s\n", __func__, nbuf); - tmp = lookup_one_len_unlocked(nbuf, parent, strlen(nbuf)); + tmp = lookup_one_unlocked(mnt_user_ns(mnt), nbuf, parent, strlen(nbuf)); if (IS_ERR(tmp)) { dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp)); err = PTR_ERR(tmp); @@ -525,7 +525,8 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, } inode_lock(target_dir->d_inode); - nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf)); + nresult = lookup_one(mnt_user_ns(mnt), nbuf, + target_dir, strlen(nbuf)); if (!IS_ERR(nresult)) { if (unlikely(nresult->d_inode != result->d_inode)) { dput(nresult); diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c 
index 2c2f179b6977..43de293cef56 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -672,17 +672,14 @@ int ext2_empty_dir (struct inode * inode)
 	void *page_addr = NULL;
 	struct page *page = NULL;
 	unsigned long i, npages = dir_pages(inode);
-	int dir_has_error = 0;
 
 	for (i = 0; i < npages; i++) {
 		char *kaddr;
 		ext2_dirent * de;
-		page = ext2_get_page(inode, i, dir_has_error, &page_addr);
+		page = ext2_get_page(inode, i, 0, &page_addr);
 
-		if (IS_ERR(page)) {
-			dir_has_error = 1;
-			continue;
-		}
+		if (IS_ERR(page))
+			goto not_empty;
 
 		kaddr = page_addr;
 		de = (ext2_dirent *)kaddr;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 9e1ecd89f47f..e6b932219803 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -36,7 +36,6 @@
 #include <linux/iomap.h>
 #include <linux/namei.h>
 #include <linux/uio.h>
-#include <linux/dax.h>
 #include "ext2.h"
 #include "acl.h"
 #include "xattr.h"
@@ -1550,7 +1549,7 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
 	if (IS_ERR(raw_inode))
 		return -EIO;
 
-	/* For fields not not tracking in the in-memory inode,
+	/* For fields not tracking in the in-memory inode,
 	 * initialise them to zero for new inodes. */
 	if (ei->i_state & EXT2_STATE_NEW)
 		memset(raw_inode, 0, EXT2_SB(sb)->s_inode_size);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3dce7d058985..84c0eb55071d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -829,7 +829,7 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
 	ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
 		   inode->i_ino, create);
 	return _ext4_get_block(inode, iblock, bh_result,
-			       EXT4_GET_BLOCKS_IO_CREATE_EXT);
+			       EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
 }
 
 /* Maximum number of blocks we map for direct IO at once. */
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 9f12f29bc346..9e06334771a3 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4104,6 +4104,15 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 	size = size >> bsbits;
 	start = start_off >> bsbits;
 
+	/*
+	 * For tiny groups (smaller than 8MB) the chosen allocation
+	 * alignment may be larger than group size. Make sure the
+	 * alignment does not move allocation to a different group which
+	 * makes mballoc fail assertions later.
+	 */
+	start = max(start, rounddown(ac->ac_o_ex.fe_logical,
+			(ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb)));
+
 	/* don't cover already allocated blocks in selected range */
 	if (ar->pleft && start <= ar->lleft) {
 		size -= ar->lleft + 1 - start;
@@ -4176,7 +4185,22 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 	}
 	rcu_read_unlock();
 
-	if (start + size <= ac->ac_o_ex.fe_logical &&
+	/*
+	 * In this function "start" and "size" are normalized for better
+	 * alignment and length such that we could preallocate more blocks.
+	 * This normalization is done such that original request of
+	 * ac->ac_o_ex.fe_logical & fe_len should always lie within "start" and
+	 * "size" boundaries.
+	 * (Note fe_len can be relaxed since FS block allocation API does not
+	 * provide guarantee on number of contiguous blocks allocation since that
+	 * depends upon free space left, etc).
+	 * In case of inode pa, later we use the allocated blocks
+	 * [pa_start + fe_logical - pa_lstart, fe_len/size] from the preallocated
+	 * range of goal/best blocks [start, size] to put it at the
+	 * ac_o_ex.fe_logical extent of this inode.
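+	 * In short: the normalized [start, start + size) range must still
+	 * contain the original ac_o_ex.fe_logical block, and the check below
+	 * reports an error when it does not.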
+ * (See ext4_mb_use_inode_pa() for more details) + */ + if (start + size <= ac->ac_o_ex.fe_logical || start > ac->ac_o_ex.fe_logical) { ext4_msg(ac->ac_sb, KERN_ERR, "start %lu, size %lu, fe_logical %lu", diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 7a5353a8cfd7..42f590518b4c 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -438,7 +438,7 @@ int ext4_ext_migrate(struct inode *inode) /* * Worst case we can touch the allocation bitmaps and a block - * group descriptor block. We do need need to worry about + * group descriptor block. We do need to worry about * credits for modifying the quota inode. */ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 47d0ca4c795b..db4ba99d1ceb 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1929,7 +1929,8 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, struct dx_hash_info *hinfo) { unsigned blocksize = dir->i_sb->s_blocksize; - unsigned count, continued; + unsigned continued; + int count; struct buffer_head *bh2; ext4_lblk_t newblock; u32 hash2; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 14695e2b5042..97fa7b4c645f 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -465,7 +465,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, /* * In the first loop we prepare and mark buffers to submit. We have to * mark all buffers in the page before submitting so that - * end_page_writeback() cannot be called from ext4_bio_end_io() when IO + * end_page_writeback() cannot be called from ext4_end_bio() when IO * on the first buffer finishes and we are still working on submitting * the second buffer. */ diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 90a941d20dff..8b70a4701293 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -54,6 +54,16 @@ int ext4_resize_begin(struct super_block *sb) return -EPERM; /* + * If the reserved GDT blocks is non-zero, the resize_inode feature + * should always be set. + */ + if (EXT4_SB(sb)->s_es->s_reserved_gdt_blocks && + !ext4_has_feature_resize_inode(sb)) { + ext4_error(sb, "resize_inode disabled but reserved GDT blocks non-zero"); + return -EFSCORRUPTED; + } + + /* * If we are not using the primary superblock/GDT copy don't resize, * because the user tools have no way of handling this. Probably a * bad time to do it anyways. 
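The ext4_resize_begin() hunk just above turns a silent inconsistency into an explicit failure: reserved GDT blocks exist only to back the resize_inode feature, so a superblock advertising one without the other is corrupt. A minimal sketch of that invariant in isolation, with hypothetical parameters standing in for s_es->s_reserved_gdt_blocks and ext4_has_feature_resize_inode(sb):

#include <errno.h>
#include <stdbool.h>

/* Hedged sketch of the new ext4_resize_begin() consistency check; the
 * parameter names are illustrative stand-ins, not the real ext4 fields. */
static int check_resize_inode_consistency(unsigned int reserved_gdt_blocks,
                                          bool has_resize_inode)
{
        /* Reserved GDT blocks imply resize_inode; one without the
         * other means the on-disk superblock is corrupted. */
        if (reserved_gdt_blocks && !has_resize_inode)
                return -EUCLEAN;   /* EFSCORRUPTED is defined as EUCLEAN */
        return 0;
}

Failing fast at resize-begin time means the later resize logic can rely on the invariant instead of discovering the corruption mid-operation.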
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 450c918d68fc..845f2f8aee5f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -87,7 +87,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, static int ext4_validate_options(struct fs_context *fc); static int ext4_check_opt_consistency(struct fs_context *fc, struct super_block *sb); -static int ext4_apply_options(struct fs_context *fc, struct super_block *sb); +static void ext4_apply_options(struct fs_context *fc, struct super_block *sb); static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param); static int ext4_get_tree(struct fs_context *fc); static int ext4_reconfigure(struct fs_context *fc); @@ -1870,31 +1870,12 @@ ext4_sb_read_encoding(const struct ext4_super_block *es) } #endif -static int ext4_set_test_dummy_encryption(struct super_block *sb, char *arg) -{ -#ifdef CONFIG_FS_ENCRYPTION - struct ext4_sb_info *sbi = EXT4_SB(sb); - int err; - - err = fscrypt_set_test_dummy_encryption(sb, arg, - &sbi->s_dummy_enc_policy); - if (err) { - ext4_msg(sb, KERN_WARNING, - "Error while setting test dummy encryption [%d]", err); - return err; - } - ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); -#endif - return 0; -} - #define EXT4_SPEC_JQUOTA (1 << 0) #define EXT4_SPEC_JQFMT (1 << 1) #define EXT4_SPEC_DATAJ (1 << 2) #define EXT4_SPEC_SB_BLOCK (1 << 3) #define EXT4_SPEC_JOURNAL_DEV (1 << 4) #define EXT4_SPEC_JOURNAL_IOPRIO (1 << 5) -#define EXT4_SPEC_DUMMY_ENCRYPTION (1 << 6) #define EXT4_SPEC_s_want_extra_isize (1 << 7) #define EXT4_SPEC_s_max_batch_time (1 << 8) #define EXT4_SPEC_s_min_batch_time (1 << 9) @@ -1911,7 +1892,7 @@ static int ext4_set_test_dummy_encryption(struct super_block *sb, char *arg) struct ext4_fs_context { char *s_qf_names[EXT4_MAXQUOTAS]; - char *test_dummy_enc_arg; + struct fscrypt_dummy_policy dummy_enc_policy; int s_jquota_fmt; /* Format of quota to use */ #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; @@ -1953,7 +1934,7 @@ static void ext4_fc_free(struct fs_context *fc) for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(ctx->s_qf_names[i]); - kfree(ctx->test_dummy_enc_arg); + fscrypt_free_dummy_policy(&ctx->dummy_enc_policy); kfree(ctx); } @@ -2029,6 +2010,29 @@ static int unnote_qf_name(struct fs_context *fc, int qtype) } #endif +static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param, + struct ext4_fs_context *ctx) +{ + int err; + + if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) { + ext4_msg(NULL, KERN_WARNING, + "test_dummy_encryption option not supported"); + return -EINVAL; + } + err = fscrypt_parse_test_dummy_encryption(param, + &ctx->dummy_enc_policy); + if (err == -EINVAL) { + ext4_msg(NULL, KERN_WARNING, + "Value of option \"%s\" is unrecognized", param->key); + } else if (err == -EEXIST) { + ext4_msg(NULL, KERN_WARNING, + "Conflicting test_dummy_encryption options"); + return -EINVAL; + } + return err; +} + #define EXT4_SET_CTX(name) \ static inline void ctx_set_##name(struct ext4_fs_context *ctx, \ unsigned long flag) \ @@ -2291,29 +2295,7 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO; return 0; case Opt_test_dummy_encryption: -#ifdef CONFIG_FS_ENCRYPTION - if (param->type == fs_value_is_flag) { - ctx->spec |= EXT4_SPEC_DUMMY_ENCRYPTION; - ctx->test_dummy_enc_arg = NULL; - return 0; - } - if (*param->string && - !(!strcmp(param->string, "v1") || - !strcmp(param->string, "v2"))) { - ext4_msg(NULL, KERN_WARNING, - "Value of option \"%s\" is unrecognized", - 
param->key); - return -EINVAL; - } - ctx->spec |= EXT4_SPEC_DUMMY_ENCRYPTION; - ctx->test_dummy_enc_arg = kmemdup_nul(param->string, param->size, - GFP_KERNEL); - return 0; -#else - ext4_msg(NULL, KERN_WARNING, - "test_dummy_encryption option not supported"); - return -EINVAL; -#endif + return ext4_parse_test_dummy_encryption(param, ctx); case Opt_dax: case Opt_dax_type: #ifdef CONFIG_FS_DAX @@ -2504,7 +2486,8 @@ parse_failed: if (s_ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO) m_ctx->journal_ioprio = s_ctx->journal_ioprio; - ret = ext4_apply_options(fc, sb); + ext4_apply_options(fc, sb); + ret = 0; out_free: if (fc) { @@ -2673,11 +2656,11 @@ err_jquota_specified: static int ext4_check_test_dummy_encryption(const struct fs_context *fc, struct super_block *sb) { -#ifdef CONFIG_FS_ENCRYPTION const struct ext4_fs_context *ctx = fc->fs_private; const struct ext4_sb_info *sbi = EXT4_SB(sb); + int err; - if (!(ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION)) + if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy)) return 0; if (!ext4_has_feature_encrypt(sb)) { @@ -2691,14 +2674,46 @@ static int ext4_check_test_dummy_encryption(const struct fs_context *fc, * needed to allow it to be set or changed during remount. We do allow * it to be specified during remount, but only if there is no change. */ - if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE && - !sbi->s_dummy_enc_policy.policy) { + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy, + &ctx->dummy_enc_policy)) + return 0; ext4_msg(NULL, KERN_WARNING, - "Can't set test_dummy_encryption on remount"); + "Can't set or change test_dummy_encryption on remount"); return -EINVAL; } -#endif /* CONFIG_FS_ENCRYPTION */ - return 0; + /* Also make sure s_mount_opts didn't contain a conflicting value. */ + if (fscrypt_is_dummy_policy_set(&sbi->s_dummy_enc_policy)) { + if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy, + &ctx->dummy_enc_policy)) + return 0; + ext4_msg(NULL, KERN_WARNING, + "Conflicting test_dummy_encryption options"); + return -EINVAL; + } + /* + * fscrypt_add_test_dummy_key() technically changes the super_block, so + * technically it should be delayed until ext4_apply_options() like the + * other changes. But since we never get here for remounts (see above), + * and this is the last chance to report errors, we do it here. 
+ */ + err = fscrypt_add_test_dummy_key(sb, &ctx->dummy_enc_policy); + if (err) + ext4_msg(NULL, KERN_WARNING, + "Error adding test dummy encryption key [%d]", err); + return err; +} + +static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx, + struct super_block *sb) +{ + if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy) || + /* if already set, it was already verified to be the same */ + fscrypt_is_dummy_policy_set(&EXT4_SB(sb)->s_dummy_enc_policy)) + return; + EXT4_SB(sb)->s_dummy_enc_policy = ctx->dummy_enc_policy; + memset(&ctx->dummy_enc_policy, 0, sizeof(ctx->dummy_enc_policy)); + ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); } static int ext4_check_opt_consistency(struct fs_context *fc, @@ -2785,11 +2800,10 @@ fail_dax_change_remount: return ext4_check_quota_consistency(fc, sb); } -static int ext4_apply_options(struct fs_context *fc, struct super_block *sb) +static void ext4_apply_options(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = fc->s_fs_info; - int ret = 0; sbi->s_mount_opt &= ~ctx->mask_s_mount_opt; sbi->s_mount_opt |= ctx->vals_s_mount_opt; @@ -2825,11 +2839,7 @@ static int ext4_apply_options(struct fs_context *fc, struct super_block *sb) #endif ext4_apply_quota_options(fc, sb); - - if (ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION) - ret = ext4_set_test_dummy_encryption(sb, ctx->test_dummy_enc_arg); - - return ret; + ext4_apply_test_dummy_encryption(ctx, sb); } @@ -4552,9 +4562,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (err < 0) goto failed_mount; - err = ext4_apply_options(fc, sb); - if (err < 0) - goto failed_mount; + ext4_apply_options(fc, sb); #if IS_ENABLED(CONFIG_UNICODE) if (ext4_has_feature_casefold(sb) && !sb->s_encoding) { @@ -5302,14 +5310,6 @@ no_journal: err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, GFP_KERNEL); } - /* - * Update the checksum after updating free space/inode - * counters. Otherwise the superblock can have an incorrect - * checksum in the buffer cache until it is written out and - * e2fsprogs programs trying to open a file system immediately - * after it is mounted can fail. - */ - ext4_superblock_csum_set(sb); if (!err) err = percpu_counter_init(&sbi->s_dirs_counter, ext4_count_dirs(sb), GFP_KERNEL); @@ -5367,6 +5367,14 @@ no_journal: EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; + /* + * Update the checksum after updating free space/inode counters and + * ext4_orphan_cleanup. Otherwise the superblock can have an incorrect + * checksum in the buffer cache until it is written out and + * e2fsprogs programs trying to open a file system immediately + * after it is mounted can fail. + */ + ext4_superblock_csum_set(sb); if (needs_recovery) { ext4_msg(sb, KERN_INFO, "recovery complete"); err = ext4_mark_recovery_complete(sb, es); @@ -5898,7 +5906,6 @@ static void ext4_update_super(struct super_block *sb) static int ext4_commit_super(struct super_block *sb) { struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; - int error = 0; if (!sbh) return -EINVAL; @@ -5907,6 +5914,13 @@ static int ext4_commit_super(struct super_block *sb) ext4_update_super(sb); + lock_buffer(sbh); + /* Buffer got discarded which means block device got invalidated */ + if (!buffer_mapped(sbh)) { + unlock_buffer(sbh); + return -EIO; + } + if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { /* * Oh, dear. 
A previous attempt to write the @@ -5921,17 +5935,21 @@ static int ext4_commit_super(struct super_block *sb) clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } - BUFFER_TRACE(sbh, "marking dirty"); - mark_buffer_dirty(sbh); - error = __sync_dirty_buffer(sbh, - REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0)); + get_bh(sbh); + /* Clear potential dirty bit if it was journalled update */ + clear_buffer_dirty(sbh); + sbh->b_end_io = end_buffer_write_sync; + submit_bh(REQ_OP_WRITE, + REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh); + wait_on_buffer(sbh); if (buffer_write_io_error(sbh)) { ext4_msg(sb, KERN_ERR, "I/O error while writing " "superblock"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); + return -EIO; } - return error; + return 0; } /* diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 042325349098..564e28a1aa94 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1895,11 +1895,10 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, unlock_buffer(bs->bh); ea_bdebug(bs->bh, "cloning"); - s->base = kmalloc(bs->bh->b_size, GFP_NOFS); + s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS); error = -ENOMEM; if (s->base == NULL) goto cleanup; - memcpy(s->base, BHDR(bs->bh), bs->bh->b_size); s->first = ENTRY(header(s->base)+1); header(s->base)->h_refcount = cpu_to_le32(1); s->here = ENTRY(s->base + offset); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 456c1e89386a..6d8b2bf14de0 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -98,13 +98,7 @@ repeat: } if (unlikely(!PageUptodate(page))) { - if (page->index == sbi->metapage_eio_ofs) { - if (sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO) - set_ckpt_flags(sbi, CP_ERROR_FLAG); - } else { - sbi->metapage_eio_ofs = page->index; - sbi->metapage_eio_cnt = 0; - } + f2fs_handle_page_eio(sbi, page->index, META); f2fs_put_page(page, 1); return ERR_PTR(-EIO); } @@ -158,7 +152,7 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", blkaddr, exist); set_sbi_flag(sbi, SBI_NEED_FSCK); - WARN_ON(1); + dump_stack(); } return exist; } @@ -196,7 +190,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, f2fs_warn(sbi, "access invalid blkaddr:%u", blkaddr); set_sbi_flag(sbi, SBI_NEED_FSCK); - WARN_ON(1); + dump_stack(); return false; } else { return __is_bitmap_valid(sbi, blkaddr, type); @@ -1010,9 +1004,7 @@ static void __add_dirty_inode(struct inode *inode, enum inode_type type) return; set_inode_flag(inode, flag); - if (!f2fs_is_volatile_file(inode)) - list_add_tail(&F2FS_I(inode)->dirty_list, - &sbi->inode_list[type]); + list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); stat_inc_dirty_inode(sbi, type); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8f38c26bb16c..7fcbcf979737 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -69,8 +69,7 @@ static bool __is_cp_guaranteed(struct page *page) if (f2fs_is_compressed_page(page)) return false; - if ((S_ISREG(inode->i_mode) && - (f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) || + if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) || page_private_gcing(page)) return true; return false; @@ -585,6 +584,34 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode, return false; } +int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < NR_PAGE_TYPE; i++) { + int n = (i == META) ? 
1 : NR_TEMP_TYPE; + int j; + + sbi->write_io[i] = f2fs_kmalloc(sbi, + array_size(n, sizeof(struct f2fs_bio_info)), + GFP_KERNEL); + if (!sbi->write_io[i]) + return -ENOMEM; + + for (j = HOT; j < n; j++) { + init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem); + sbi->write_io[i][j].sbi = sbi; + sbi->write_io[i][j].bio = NULL; + spin_lock_init(&sbi->write_io[i][j].io_lock); + INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); + INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); + init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock); + } + } + + return 0; +} + static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp) { @@ -2564,7 +2591,12 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) bool ipu_force = false; int err = 0; - set_new_dnode(&dn, inode, NULL, NULL, 0); + /* Use COW inode to make dnode_of_data for atomic write */ + if (f2fs_is_atomic_file(inode)) + set_new_dnode(&dn, F2FS_I(inode)->cow_inode, NULL, NULL, 0); + else + set_new_dnode(&dn, inode, NULL, NULL, 0); + if (need_inplace_update(fio) && f2fs_lookup_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; @@ -2601,6 +2633,7 @@ got_it: err = -EFSCORRUPTED; goto out_writepage; } + /* * If current allocation needs SSR, * it had better in-place writes for updated data. @@ -2737,11 +2770,6 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, write: if (f2fs_is_drop_cache(inode)) goto out; - /* we should not write 0'th page having journal header */ - if (f2fs_is_volatile_file(inode) && (!page->index || - (!wbc->for_reclaim && - f2fs_available_free_memory(sbi, BASE_CHECK)))) - goto redirty_out; /* Dentry/quota blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) { @@ -3314,6 +3342,100 @@ unlock_out: return err; } +static int __find_data_block(struct inode *inode, pgoff_t index, + block_t *blk_addr) +{ + struct dnode_of_data dn; + struct page *ipage; + struct extent_info ei = {0, }; + int err = 0; + + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + } else { + /* hole case */ + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err) { + dn.data_blkaddr = NULL_ADDR; + err = 0; + } + } + *blk_addr = dn.data_blkaddr; + f2fs_put_dnode(&dn); + return err; +} + +static int __reserve_data_block(struct inode *inode, pgoff_t index, + block_t *blk_addr, bool *node_changed) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + struct page *ipage; + int err = 0; + + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + + ipage = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto unlock_out; + } + set_new_dnode(&dn, inode, ipage, ipage, 0); + + err = f2fs_get_block(&dn, index); + + *blk_addr = dn.data_blkaddr; + *node_changed = dn.node_changed; + f2fs_put_dnode(&dn); + +unlock_out: + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + return err; +} + +static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, + struct page *page, loff_t pos, unsigned int len, + block_t *blk_addr, bool *node_changed) +{ + struct inode *inode = page->mapping->host; + struct inode *cow_inode = F2FS_I(inode)->cow_inode; + pgoff_t index = page->index; + int err = 0; + block_t ori_blk_addr; + + /* If pos is beyond the end of file, reserve a new block 
in COW inode */ + if ((pos & PAGE_MASK) >= i_size_read(inode)) + return __reserve_data_block(cow_inode, index, blk_addr, + node_changed); + + /* Look for the block in COW inode first */ + err = __find_data_block(cow_inode, index, blk_addr); + if (err) + return err; + else if (*blk_addr != NULL_ADDR) + return 0; + + /* Look for the block in the original inode */ + err = __find_data_block(inode, index, &ori_blk_addr); + if (err) + return err; + + /* Finally, we should reserve a new block in COW inode for the update */ + err = __reserve_data_block(cow_inode, index, blk_addr, node_changed); + if (err) + return err; + + if (ori_blk_addr != NULL_ADDR) + *blk_addr = ori_blk_addr; + return 0; +} + static int f2fs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) { @@ -3321,7 +3443,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page = NULL; pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; - bool need_balance = false, drop_atomic = false; + bool need_balance = false; block_t blkaddr = NULL_ADDR; int err = 0; @@ -3332,14 +3454,6 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, goto fail; } - if ((f2fs_is_atomic_file(inode) && - !f2fs_available_free_memory(sbi, INMEM_PAGES)) || - is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { - err = -ENOMEM; - drop_atomic = true; - goto fail; - } - /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. The locking rule for inline_data conversion should be: @@ -3387,7 +3501,11 @@ repeat: *pagep = page; - err = prepare_write_begin(sbi, page, pos, len, + if (f2fs_is_atomic_file(inode)) + err = prepare_atomic_write_begin(sbi, page, pos, len, + &blkaddr, &need_balance); + else + err = prepare_write_begin(sbi, page, pos, len, &blkaddr, &need_balance); if (err) goto fail; @@ -3443,8 +3561,6 @@ repeat: fail: f2fs_put_page(page, 1); f2fs_write_failed(inode, pos + len); - if (drop_atomic) - f2fs_drop_inmem_pages_all(sbi, false); return err; } @@ -3488,8 +3604,12 @@ static int f2fs_write_end(struct file *file, set_page_dirty(page); if (pos + copied > i_size_read(inode) && - !f2fs_verity_in_progress(inode)) + !f2fs_verity_in_progress(inode)) { f2fs_i_size_write(inode, pos + copied); + if (f2fs_is_atomic_file(inode)) + f2fs_i_size_write(F2FS_I(inode)->cow_inode, + pos + copied); + } unlock_out: f2fs_put_page(page, 1); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); @@ -3522,9 +3642,6 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length) inode->i_ino == F2FS_COMPRESS_INO(sbi)) clear_page_private_data(&folio->page); - if (page_private_atomic(&folio->page)) - return f2fs_drop_inmem_page(inode, &folio->page); - folio_detach_private(folio); } @@ -3536,10 +3653,6 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait) if (folio_test_dirty(folio)) return false; - /* This is atomic written page, keep Private */ - if (page_private_atomic(&folio->page)) - return false; - sbi = F2FS_M_SB(folio->mapping); if (test_opt(sbi, COMPRESS_CACHE)) { struct inode *inode = folio->mapping->host; @@ -3565,18 +3678,6 @@ static bool f2fs_dirty_data_folio(struct address_space *mapping, folio_mark_uptodate(folio); BUG_ON(folio_test_swapcache(folio)); - if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { - if (!page_private_atomic(&folio->page)) { - f2fs_register_inmem_page(inode, &folio->page); - return true; - } 
- /* - * Previously, this page has been registered, we just - * return here. - */ - return false; - } - if (!folio_test_dirty(folio)) { filemap_dirty_folio(mapping, folio); f2fs_update_dirty_folio(inode, folio); @@ -3656,42 +3757,14 @@ out: int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode) { - int rc, extra_count; - struct f2fs_inode_info *fi = F2FS_I(mapping->host); - bool atomic_written = page_private_atomic(page); + int rc, extra_count = 0; BUG_ON(PageWriteback(page)); - /* migrating an atomic written page is safe with the inmem_lock hold */ - if (atomic_written) { - if (mode != MIGRATE_SYNC) - return -EBUSY; - if (!mutex_trylock(&fi->inmem_lock)) - return -EAGAIN; - } - - /* one extra reference was held for atomic_write page */ - extra_count = atomic_written ? 1 : 0; rc = migrate_page_move_mapping(mapping, newpage, page, extra_count); - if (rc != MIGRATEPAGE_SUCCESS) { - if (atomic_written) - mutex_unlock(&fi->inmem_lock); + if (rc != MIGRATEPAGE_SUCCESS) return rc; - } - - if (atomic_written) { - struct inmem_pages *cur; - - list_for_each_entry(cur, &fi->inmem_pages, list) - if (cur->page == page) { - cur->page = newpage; - break; - } - mutex_unlock(&fi->inmem_lock); - put_page(page); - get_page(newpage); - } /* guarantee to start from no stale private field */ set_page_private(newpage, 0); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index fcdf253cd211..c92625ef16d0 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -91,11 +91,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; - si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->aw_cnt = sbi->atomic_files; - si->vw_cnt = atomic_read(&sbi->vw_cnt); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); - si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); si->nr_dio_read = get_pages(sbi, F2FS_DIO_READ); si->nr_dio_write = get_pages(sbi, F2FS_DIO_WRITE); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); @@ -167,8 +164,6 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; si->io_skip_bggc = sbi->io_skip_bggc; si->other_skip_bggc = sbi->other_skip_bggc; - si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC]; - si->skipped_atomic_files[FG_GC] = sbi->skipped_atomic_files[FG_GC]; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) / 2; @@ -296,7 +291,6 @@ get_cache: sizeof(struct nat_entry); si->cache_mem += NM_I(sbi)->nat_cnt[DIRTY_NAT] * sizeof(struct nat_entry_set); - si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); for (i = 0; i < MAX_INO_ENTRY; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); si->cache_mem += atomic_read(&sbi->total_ext_tree) * @@ -491,10 +485,6 @@ static int stat_show(struct seq_file *s, void *v) si->bg_data_blks); seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, si->bg_node_blks); - seq_printf(s, "Skipped : atomic write %llu (%llu)\n", - si->skipped_atomic_files[BG_GC] + - si->skipped_atomic_files[FG_GC], - si->skipped_atomic_files[BG_GC]); seq_printf(s, "BG skip : IO: %u, Other: %u\n", si->io_skip_bggc, si->other_skip_bggc); seq_puts(s, "\nExtent Cache:\n"); @@ -519,10 +509,8 @@ static int stat_show(struct seq_file *s, void *v) si->flush_list_empty, si->nr_discarding, si->nr_discarded, 
si->nr_discard_cmd, si->undiscard_blks); - seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " - "volatile IO: %4d (Max. %4d)\n", - si->inmem_pages, si->aw_cnt, si->max_aw_cnt, - si->vw_cnt, si->max_vw_cnt); + seq_printf(s, " - atomic IO: %4d (Max. %4d)\n", + si->aw_cnt, si->max_aw_cnt); seq_printf(s, " - compress: %4d, hit:%8d\n", si->compress_pages, si->compress_page_hit); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); @@ -623,9 +611,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) for (i = META_CP; i < META_MAX; i++) atomic_set(&sbi->meta_count[i], 0); - atomic_set(&sbi->vw_cnt, 0); atomic_set(&sbi->max_aw_cnt, 0); - atomic_set(&sbi->max_vw_cnt, 0); raw_spin_lock_irqsave(&f2fs_stat_lock, flags); list_add_tail(&si->stat_list, &f2fs_stat_list); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index a0e51937d92e..d5bd7932fb64 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -82,7 +82,8 @@ int f2fs_init_casefolded_name(const struct inode *dir, #if IS_ENABLED(CONFIG_UNICODE) struct super_block *sb = dir->i_sb; - if (IS_CASEFOLDED(dir)) { + if (IS_CASEFOLDED(dir) && + !is_dot_dotdot(fname->usr_fname->name, fname->usr_fname->len)) { fname->cf_name.name = f2fs_kmem_cache_alloc(f2fs_cf_name_slab, GFP_NOFS, false, F2FS_SB(sb)); if (!fname->cf_name.name) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 10d1f138d14f..d9bbecd008d2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -509,11 +509,11 @@ struct f2fs_filename { #if IS_ENABLED(CONFIG_UNICODE) /* * For casefolded directories: the casefolded name, but it's left NULL - * if the original name is not valid Unicode, if the directory is both - * casefolded and encrypted and its encryption key is unavailable, or if - * the filesystem is doing an internal operation where usr_fname is also - * NULL. In all these cases we fall back to treating the name as an - * opaque byte sequence. + * if the original name is not valid Unicode, if the original name is + * "." or "..", if the directory is both casefolded and encrypted and + * its encryption key is unavailable, or if the filesystem is doing an + * internal operation where usr_fname is also NULL. In all these cases + * we fall back to treating the name as an opaque byte sequence. 
*/ struct fscrypt_str cf_name; #endif @@ -579,8 +579,8 @@ enum { /* maximum retry quota flush count */ #define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8 -/* maximum retry of EIO'ed meta page */ -#define MAX_RETRY_META_PAGE_EIO 100 +/* maximum retry of EIO'ed page */ +#define MAX_RETRY_PAGE_EIO 100 #define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */ @@ -717,7 +717,6 @@ enum { enum { GC_FAILURE_PIN, - GC_FAILURE_ATOMIC, MAX_GC_FAILURE }; @@ -739,8 +738,6 @@ enum { FI_UPDATE_WRITE, /* inode has in-place-update data */ FI_NEED_IPU, /* used for ipu per file */ FI_ATOMIC_FILE, /* indicate atomic file */ - FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */ - FI_VOLATILE_FILE, /* indicate volatile file */ FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ FI_DROP_CACHE, /* drop dirty page cache */ FI_DATA_EXIST, /* indicate data exists */ @@ -753,7 +750,6 @@ enum { FI_EXTRA_ATTR, /* indicate file has extra attribute */ FI_PROJ_INHERIT, /* indicate file inherits projectid */ FI_PIN_FILE, /* indicate file should not be gced */ - FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ FI_COMPRESS_CORRUPT, /* indicate compressed cluster is corrupted */ @@ -795,11 +791,9 @@ struct f2fs_inode_info { #endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ - struct list_head inmem_ilist; /* list for inmem inodes */ - struct list_head inmem_pages; /* inmemory pages managed by f2fs */ - struct task_struct *inmem_task; /* store inmemory task */ - struct mutex inmem_lock; /* lock for inmemory pages */ + struct task_struct *atomic_write_task; /* store atomic write task */ struct extent_tree *extent_tree; /* cached extent_tree entry */ + struct inode *cow_inode; /* copy-on-write inode for atomic write */ /* avoid racing between foreground op and gc */ struct f2fs_rwsem i_gc_rwsem[2]; @@ -1093,7 +1087,6 @@ enum count_type { F2FS_DIRTY_QDATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, - F2FS_INMEM_PAGES, F2FS_DIRTY_IMETA, F2FS_WB_CP_DATA, F2FS_WB_DATA, @@ -1118,16 +1111,12 @@ enum count_type { */ #define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type)) enum page_type { - DATA, - NODE, + DATA = 0, + NODE = 1, /* should not change this */ META, NR_PAGE_TYPE, META_FLUSH, - INMEM, /* the below types are used by tracepoints only. */ - INMEM_DROP, - INMEM_INVALIDATE, - INMEM_REVOKE, - IPU, + IPU, /* the below types are used by tracepoints only. 
*/ OPU, }; @@ -1277,6 +1266,15 @@ struct atgc_management { unsigned long long age_threshold; /* age threshold */ }; +struct f2fs_gc_control { + unsigned int victim_segno; /* target victim segment number */ + int init_gc_type; /* FG_GC or BG_GC */ + bool no_bg_gc; /* check the space and stop bg_gc */ + bool should_migrate_blocks; /* should migrate blocks */ + bool err_gc_skipped; /* return EAGAIN if GC skipped */ + unsigned int nr_free_secs; /* # of free sections to do GC */ +}; + /* For s_flag in struct f2fs_sb_info */ enum { SBI_IS_DIRTY, /* dirty flag for checkpoint */ @@ -1615,8 +1613,8 @@ struct f2fs_sb_info { /* keep migration IO order for LFS mode */ struct f2fs_rwsem io_order_lock; mempool_t *write_io_dummy; /* Dummy pages */ - pgoff_t metapage_eio_ofs; /* EIO page offset */ - int metapage_eio_cnt; /* EIO count */ + pgoff_t page_eio_ofs[NR_PAGE_TYPE]; /* EIO page offset */ + int page_eio_cnt[NR_PAGE_TYPE]; /* EIO count */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ @@ -1719,7 +1717,6 @@ struct f2fs_sb_info { /* for skip statistic */ unsigned int atomic_files; /* # of opened atomic file */ - unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* threshold for gc trials on pinned files */ @@ -1750,9 +1747,7 @@ struct f2fs_sb_info { atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t compr_inode; /* # of compressed inodes */ atomic64_t compr_blocks; /* # of compressed blocks */ - atomic_t vw_cnt; /* # of volatile writes */ atomic_t max_aw_cnt; /* max # of atomic writes */ - atomic_t max_vw_cnt; /* max # of volatile writes */ unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ unsigned int other_skip_bggc; /* skip background gc for other reasons */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ @@ -1763,7 +1758,7 @@ struct f2fs_sb_info { unsigned int data_io_flag; unsigned int node_io_flag; - /* For sysfs suppport */ + /* For sysfs support */ struct kobject s_kobj; /* /sys/fs/f2fs/<devname> */ struct completion s_kobj_unregister; @@ -2606,11 +2601,17 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, { spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, !sbi->total_valid_block_count); - f2fs_bug_on(sbi, !sbi->total_valid_node_count); + if (unlikely(!sbi->total_valid_block_count || + !sbi->total_valid_node_count)) { + f2fs_warn(sbi, "dec_valid_node_count: inconsistent block counts, total_valid_block:%u, total_valid_node:%u", + sbi->total_valid_block_count, + sbi->total_valid_node_count); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } else { + sbi->total_valid_block_count--; + sbi->total_valid_node_count--; + } - sbi->total_valid_node_count--; - sbi->total_valid_block_count--; if (sbi->reserved_blocks && sbi->current_reserved_blocks < sbi->reserved_blocks) sbi->current_reserved_blocks++; @@ -3173,6 +3174,10 @@ static inline int inline_xattr_size(struct inode *inode) return 0; } +/* + * Notice: check inline_data flag without inode page lock is unsafe. + * It could change at any time by f2fs_convert_inline_page(). 
+ */ static inline int f2fs_has_inline_data(struct inode *inode) { return is_inode_flag_set(inode, FI_INLINE_DATA); @@ -3203,16 +3208,6 @@ static inline bool f2fs_is_atomic_file(struct inode *inode) return is_inode_flag_set(inode, FI_ATOMIC_FILE); } -static inline bool f2fs_is_commit_atomic_write(struct inode *inode) -{ - return is_inode_flag_set(inode, FI_ATOMIC_COMMIT); -} - -static inline bool f2fs_is_volatile_file(struct inode *inode) -{ - return is_inode_flag_set(inode, FI_VOLATILE_FILE); -} - static inline bool f2fs_is_first_block_written(struct inode *inode) { return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN); @@ -3445,6 +3440,8 @@ void f2fs_handle_failed_inode(struct inode *inode); int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool hot, bool set); struct dentry *f2fs_get_parent(struct dentry *child); +int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct inode **new_inode); /* * dir.c @@ -3580,11 +3577,8 @@ void f2fs_destroy_node_manager_caches(void); * segment.c */ bool f2fs_need_SSR(struct f2fs_sb_info *sbi); -void f2fs_register_inmem_page(struct inode *inode, struct page *page); -void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure); -void f2fs_drop_inmem_pages(struct inode *inode); -void f2fs_drop_inmem_page(struct inode *inode, struct page *page); -int f2fs_commit_inmem_pages(struct inode *inode); +int f2fs_commit_atomic_write(struct inode *inode); +void f2fs_abort_atomic_write(struct inode *inode, bool clean); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg); int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); @@ -3726,6 +3720,7 @@ int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type); +int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, @@ -3787,8 +3782,7 @@ extern const struct iomap_ops f2fs_iomap_ops; int f2fs_start_gc_thread(struct f2fs_sb_info *sbi); void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi); block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode); -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, bool force, - unsigned int segno); +int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control); void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count); int __init f2fs_create_garbage_collection_cache(void); @@ -3816,7 +3810,6 @@ struct f2fs_stat_info { int ext_tree, zombie_tree, ext_node; int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, ndirty_qdata; - int inmem_pages; unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits; int free_nids, avail_nids, alloc_nids; @@ -3834,7 +3827,7 @@ struct f2fs_stat_info { int inline_xattr, inline_inode, inline_dir, append, update, orphans; int compr_inode; unsigned long long compr_blocks; - int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; + int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -3846,7 +3839,6 @@ struct f2fs_stat_info { int bg_node_segs, bg_data_segs; int 
tot_blks, data_blks, node_blks; int bg_data_blks, bg_node_blks; - unsigned long long skipped_atomic_files[2]; int curseg[NR_CURSEG_TYPE]; int cursec[NR_CURSEG_TYPE]; int curzone[NR_CURSEG_TYPE]; @@ -3946,17 +3938,6 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ } while (0) -#define stat_inc_volatile_write(inode) \ - (atomic_inc(&F2FS_I_SB(inode)->vw_cnt)) -#define stat_dec_volatile_write(inode) \ - (atomic_dec(&F2FS_I_SB(inode)->vw_cnt)) -#define stat_update_max_volatile_write(inode) \ - do { \ - int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \ - int max = atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \ - if (cur > max) \ - atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \ - } while (0) #define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ @@ -4018,9 +3999,6 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_add_compr_blocks(inode, blocks) do { } while (0) #define stat_sub_compr_blocks(inode, blocks) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) -#define stat_inc_volatile_write(inode) do { } while (0) -#define stat_dec_volatile_write(inode) do { } while (0) -#define stat_update_max_volatile_write(inode) do { } while (0) #define stat_inc_meta_count(sbi, blkaddr) do { } while (0) #define stat_inc_seg_type(sbi, curseg) do { } while (0) #define stat_inc_block_count(sbi, curseg) do { } while (0) @@ -4053,6 +4031,7 @@ extern struct kmem_cache *f2fs_inode_entry_slab; * inline.c */ bool f2fs_may_inline_data(struct inode *inode); +bool f2fs_sanity_check_inline_data(struct inode *inode); bool f2fs_may_inline_dentry(struct inode *inode); void f2fs_do_read_inline_data(struct page *page, struct page *ipage); void f2fs_truncate_inline_inode(struct inode *inode, @@ -4422,8 +4401,7 @@ static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi) static inline bool f2fs_may_compress(struct inode *inode) { if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) || - f2fs_is_atomic_file(inode) || - f2fs_is_volatile_file(inode)) + f2fs_is_atomic_file(inode)) return false; return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); } @@ -4431,8 +4409,8 @@ static inline bool f2fs_may_compress(struct inode *inode) static inline void f2fs_i_compr_blocks_update(struct inode *inode, u64 blocks, bool add) { - int diff = F2FS_I(inode)->i_cluster_size - blocks; struct f2fs_inode_info *fi = F2FS_I(inode); + int diff = fi->i_cluster_size - blocks; /* don't update i_compr_blocks if saved blocks were released */ if (!add && !atomic_read(&fi->i_compr_blocks)) @@ -4540,6 +4518,21 @@ static inline void f2fs_io_schedule_timeout(long timeout) io_schedule_timeout(timeout); } +static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, pgoff_t ofs, + enum page_type type) +{ + if (unlikely(f2fs_cp_error(sbi))) + return; + + if (ofs == sbi->page_eio_ofs[type]) { + if (sbi->page_eio_cnt[type]++ == MAX_RETRY_PAGE_EIO) + set_ckpt_flags(sbi, CP_ERROR_FLAG); + } else { + sbi->page_eio_ofs[type] = ofs; + sbi->page_eio_cnt[type] = 0; + } +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 100637b1adb3..bd14cef1b08f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -372,7 +372,8 @@ sync_nodes: f2fs_remove_ino_entry(sbi, ino, APPEND_INO); clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: - if (!atomic && F2FS_OPTION(sbi).fsync_mode != 
FSYNC_MODE_NOBARRIER) + if ((!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) || + (atomic && !test_opt(sbi, NOBARRIER) && f2fs_sb_has_blkzoned(sbi))) ret = f2fs_issue_flush(sbi, inode->i_ino); if (!ret) { f2fs_remove_ino_entry(sbi, ino, UPDATE_INO); @@ -1437,11 +1438,19 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, ret = -ENOSPC; break; } - if (dn->data_blkaddr != NEW_ADDR) { - f2fs_invalidate_blocks(sbi, dn->data_blkaddr); - dn->data_blkaddr = NEW_ADDR; - f2fs_set_data_blkaddr(dn); + + if (dn->data_blkaddr == NEW_ADDR) + continue; + + if (!f2fs_is_valid_blkaddr(sbi, dn->data_blkaddr, + DATA_GENERIC_ENHANCE)) { + ret = -EFSCORRUPTED; + break; } + + f2fs_invalidate_blocks(sbi, dn->data_blkaddr); + dn->data_blkaddr = NEW_ADDR; + f2fs_set_data_blkaddr(dn); } f2fs_update_extent_cache_range(dn, start, 0, index - start); @@ -1638,6 +1647,11 @@ static int expand_inode_data(struct inode *inode, loff_t offset, struct f2fs_map_blocks map = { .m_next_pgofs = NULL, .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, .m_may_create = true }; + struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, + .init_gc_type = FG_GC, + .should_migrate_blocks = false, + .err_gc_skipped = true, + .nr_free_secs = 0 }; pgoff_t pg_start, pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; @@ -1675,8 +1689,8 @@ next_alloc: if (has_not_enough_free_secs(sbi, 0, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { f2fs_down_write(&sbi->gc_lock); - err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); - if (err && err != -ENODATA && err != -EAGAIN) + err = f2fs_gc(sbi, &gc_control); + if (err && err != -ENODATA) goto out_err; } @@ -1766,6 +1780,10 @@ static long f2fs_fallocate(struct file *file, int mode, inode_lock(inode); + ret = file_modified(file); + if (ret) + goto out; + if (mode & FALLOC_FL_PUNCH_HOLE) { if (offset >= inode->i_size) goto out; @@ -1804,16 +1822,8 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) atomic_read(&inode->i_writecount) != 1) return 0; - /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); - if (f2fs_is_volatile_file(inode)) { - set_inode_flag(inode, FI_DROP_CACHE); - filemap_fdatawrite(inode->i_mapping); - clear_inode_flag(inode, FI_DROP_CACHE); - clear_inode_flag(inode, FI_VOLATILE_FILE); - stat_dec_volatile_write(inode); - } + f2fs_abort_atomic_write(inode, true); return 0; } @@ -1828,8 +1838,8 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id) * before dropping file lock, it needs to do in ->flush. 
*/ if (f2fs_is_atomic_file(inode) && - F2FS_I(inode)->inmem_task == current) - f2fs_drop_inmem_pages(inode); + F2FS_I(inode)->atomic_write_task == current) + f2fs_abort_atomic_write(inode, true); return 0; } @@ -1992,6 +2002,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) struct user_namespace *mnt_userns = file_mnt_user_ns(filp); struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct inode *pinode; int ret; if (!inode_owner_or_capable(mnt_userns, inode)) @@ -2014,44 +2025,55 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; } - if (f2fs_is_atomic_file(inode)) { - if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) - ret = -EINVAL; + if (f2fs_is_atomic_file(inode)) goto out; - } ret = f2fs_convert_inline_inode(inode); if (ret) goto out; - f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); /* * Should wait end_io to count F2FS_WB_CP_DATA correctly by * f2fs_is_atomic_file. */ if (get_dirty_pages(inode)) - f2fs_warn(F2FS_I_SB(inode), "Unexpected flush for atomic writes: ino=%lu, npages=%u", + f2fs_warn(sbi, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) { - f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + goto out; + } + + /* Create a COW inode for atomic write */ + pinode = f2fs_iget(inode->i_sb, fi->i_pino); + if (IS_ERR(pinode)) { + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + ret = PTR_ERR(pinode); + goto out; + } + + ret = f2fs_get_tmpfile(mnt_userns, pinode, &fi->cow_inode); + iput(pinode); + if (ret) { + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); goto out; } + f2fs_i_size_write(fi->cow_inode, i_size_read(inode)); spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (list_empty(&fi->inmem_ilist)) - list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]); sbi->atomic_files++; spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - /* add inode in inmem_list first and set atomic_file */ set_inode_flag(inode, FI_ATOMIC_FILE); - clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + set_inode_flag(fi->cow_inode, FI_ATOMIC_FILE); + clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - F2FS_I(inode)->inmem_task = current; + f2fs_update_time(sbi, REQ_TIME); + fi->atomic_write_task = current; stat_update_max_atomic_write(inode); out: inode_unlock(inode); @@ -2076,127 +2098,20 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) inode_lock(inode); - if (f2fs_is_volatile_file(inode)) { - ret = -EINVAL; - goto err_out; - } - if (f2fs_is_atomic_file(inode)) { - ret = f2fs_commit_inmem_pages(inode); + ret = f2fs_commit_atomic_write(inode); if (ret) - goto err_out; + goto unlock_out; ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, false); } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } -err_out: - if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { - clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - ret = -EINVAL; - } - inode_unlock(inode); - mnt_drop_write_file(filp); - return ret; -} - -static int f2fs_ioc_start_volatile_write(struct file *filp) -{ - struct inode *inode = file_inode(filp); - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); - int ret; - - if 
(!inode_owner_or_capable(mnt_userns, inode)) - return -EACCES; - - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - inode_lock(inode); - - if (f2fs_is_volatile_file(inode)) - goto out; - - ret = f2fs_convert_inline_inode(inode); - if (ret) - goto out; - - stat_inc_volatile_write(inode); - stat_update_max_volatile_write(inode); - - set_inode_flag(inode, FI_VOLATILE_FILE); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); -out: - inode_unlock(inode); - mnt_drop_write_file(filp); - return ret; -} - -static int f2fs_ioc_release_volatile_write(struct file *filp) -{ - struct inode *inode = file_inode(filp); - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); - int ret; - - if (!inode_owner_or_capable(mnt_userns, inode)) - return -EACCES; - - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - inode_lock(inode); - - if (!f2fs_is_volatile_file(inode)) - goto out; - - if (!f2fs_is_first_block_written(inode)) { - ret = truncate_partial_data_page(inode, 0, true); - goto out; - } - - ret = punch_hole(inode, 0, F2FS_BLKSIZE); -out: - inode_unlock(inode); - mnt_drop_write_file(filp); - return ret; -} - -static int f2fs_ioc_abort_volatile_write(struct file *filp) -{ - struct inode *inode = file_inode(filp); - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); - int ret; - - if (!inode_owner_or_capable(mnt_userns, inode)) - return -EACCES; - - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - inode_lock(inode); - - if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); - if (f2fs_is_volatile_file(inode)) { - clear_inode_flag(inode, FI_VOLATILE_FILE); - stat_dec_volatile_write(inode); - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); - } - - clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - +unlock_out: inode_unlock(inode); - mnt_drop_write_file(filp); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; } @@ -2437,6 +2352,10 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, + .no_bg_gc = false, + .should_migrate_blocks = false, + .nr_free_secs = 0 }; __u32 sync; int ret; @@ -2462,7 +2381,9 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) f2fs_down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, sync, true, false, NULL_SEGNO); + gc_control.init_gc_type = sync ? FG_GC : BG_GC; + gc_control.err_gc_skipped = sync; + ret = f2fs_gc(sbi, &gc_control); out: mnt_drop_write_file(filp); return ret; @@ -2471,6 +2392,12 @@ out: static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) { struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp)); + struct f2fs_gc_control gc_control = { + .init_gc_type = range->sync ? 
FG_GC : BG_GC, + .no_bg_gc = false, + .should_migrate_blocks = false, + .err_gc_skipped = range->sync, + .nr_free_secs = 0 }; u64 end; int ret; @@ -2498,8 +2425,8 @@ do_more: f2fs_down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, range->sync, true, false, - GET_SEGNO(sbi, range->start)); + gc_control.victim_segno = GET_SEGNO(sbi, range->start); + ret = f2fs_gc(sbi, &gc_control); if (ret) { if (ret == -EBUSY) ret = -EAGAIN; @@ -2674,6 +2601,7 @@ do_map: } set_page_dirty(page); + set_page_private_gcing(page); f2fs_put_page(page, 1); idx++; @@ -2913,6 +2841,11 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) unsigned int start_segno = 0, end_segno = 0; unsigned int dev_start_segno = 0, dev_end_segno = 0; struct f2fs_flush_device range; + struct f2fs_gc_control gc_control = { + .init_gc_type = FG_GC, + .should_migrate_blocks = true, + .err_gc_skipped = true, + .nr_free_secs = 0 }; int ret; if (!capable(CAP_SYS_ADMIN)) @@ -2956,7 +2889,9 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) sm->last_victim[GC_CB] = end_segno + 1; sm->last_victim[GC_GREEDY] = end_segno + 1; sm->last_victim[ALLOC_NEXT] = end_segno + 1; - ret = f2fs_gc(sbi, true, true, true, start_segno); + + gc_control.victim_segno = start_segno; + ret = f2fs_gc(sbi, &gc_control); if (ret == -EAGAIN) ret = 0; else if (ret < 0) @@ -3017,7 +2952,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) kprojid = make_kprojid(&init_user_ns, (projid_t)projid); - if (projid_eq(kprojid, F2FS_I(inode)->i_projid)) + if (projid_eq(kprojid, fi->i_projid)) return 0; err = -EPERM; @@ -3037,7 +2972,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) if (err) goto out_unlock; - F2FS_I(inode)->i_projid = kprojid; + fi->i_projid = kprojid; inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, true); out_unlock: @@ -3987,7 +3922,7 @@ static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg) struct f2fs_inode_info *fi = F2FS_I(inode); pgoff_t page_idx = 0, last_idx; unsigned int blk_per_seg = sbi->blocks_per_seg; - int cluster_size = F2FS_I(inode)->i_cluster_size; + int cluster_size = fi->i_cluster_size; int count, ret; if (!f2fs_sb_has_compression(sbi) || @@ -4010,11 +3945,6 @@ static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg) goto out; } - if (f2fs_is_mmap_file(inode)) { - ret = -EBUSY; - goto out; - } - ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) goto out; @@ -4082,11 +4012,6 @@ static int f2fs_ioc_compress_file(struct file *filp, unsigned long arg) goto out; } - if (f2fs_is_mmap_file(inode)) { - ret = -EBUSY; - goto out; - } - ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) goto out; @@ -4136,11 +4061,9 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case F2FS_IOC_COMMIT_ATOMIC_WRITE: return f2fs_ioc_commit_atomic_write(filp); case F2FS_IOC_START_VOLATILE_WRITE: - return f2fs_ioc_start_volatile_write(filp); case F2FS_IOC_RELEASE_VOLATILE_WRITE: - return f2fs_ioc_release_volatile_write(filp); case F2FS_IOC_ABORT_VOLATILE_WRITE: - return f2fs_ioc_abort_volatile_write(filp); + return -EOPNOTSUPP; case F2FS_IOC_SHUTDOWN: return f2fs_ioc_shutdown(filp, arg); case FITRIM: @@ -4328,17 +4251,39 @@ out: static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); + const loff_t pos = iocb->ki_pos; ssize_t ret; if (!f2fs_is_compress_backend_ready(inode)) 
return -EOPNOTSUPP; - if (f2fs_should_use_dio(inode, iocb, to)) - return f2fs_dio_read_iter(iocb, to); + if (trace_f2fs_dataread_start_enabled()) { + char *p = f2fs_kmalloc(F2FS_I_SB(inode), PATH_MAX, GFP_KERNEL); + char *path; + + if (!p) + goto skip_read_trace; + + path = dentry_path_raw(file_dentry(iocb->ki_filp), p, PATH_MAX); + if (IS_ERR(path)) { + kfree(p); + goto skip_read_trace; + } - ret = filemap_read(iocb, to, 0); - if (ret > 0) - f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret); + trace_f2fs_dataread_start(inode, pos, iov_iter_count(to), + current->pid, path, current->comm); + kfree(p); + } +skip_read_trace: + if (f2fs_should_use_dio(inode, iocb, to)) { + ret = f2fs_dio_read_iter(iocb, to); + } else { + ret = filemap_read(iocb, to, 0); + if (ret > 0) + f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret); + } + if (trace_f2fs_dataread_end_enabled()) + trace_f2fs_dataread_end(inode, pos, ret); return ret; } @@ -4630,14 +4575,36 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) /* Possibly preallocate the blocks for the write. */ target_size = iocb->ki_pos + iov_iter_count(from); preallocated = f2fs_preallocate_blocks(iocb, from, dio); - if (preallocated < 0) + if (preallocated < 0) { ret = preallocated; - else + } else { + if (trace_f2fs_datawrite_start_enabled()) { + char *p = f2fs_kmalloc(F2FS_I_SB(inode), + PATH_MAX, GFP_KERNEL); + char *path; + + if (!p) + goto skip_write_trace; + path = dentry_path_raw(file_dentry(iocb->ki_filp), + p, PATH_MAX); + if (IS_ERR(path)) { + kfree(p); + goto skip_write_trace; + } + trace_f2fs_datawrite_start(inode, orig_pos, orig_count, + current->pid, path, current->comm); + kfree(p); + } +skip_write_trace: /* Do the actual write. */ ret = dio ? f2fs_dio_write_iter(iocb, from, &may_need_sync): f2fs_buffered_write_iter(iocb, from); + if (trace_f2fs_datawrite_end_enabled()) + trace_f2fs_datawrite_end(inode, orig_pos, ret); + } + /* Don't leave any preallocated blocks around past i_size. */ if (preallocated && i_size_read(inode) < target_size) { f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ea5b93b689cd..d5fb426e0747 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -35,6 +35,10 @@ static int gc_thread_func(void *data) wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; wait_queue_head_t *fggc_wq = &sbi->gc_thread->fggc_wq; unsigned int wait_ms; + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .should_migrate_blocks = false, + .err_gc_skipped = false }; wait_ms = gc_th->min_sleep_time; @@ -141,8 +145,12 @@ do_gc: if (foreground) sync_mode = false; + gc_control.init_gc_type = sync_mode ? FG_GC : BG_GC; + gc_control.no_bg_gc = foreground; + gc_control.nr_free_secs = foreground ? 
1 : 0; + /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, sync_mode, !foreground, false, NULL_SEGNO)) + if (f2fs_gc(sbi, &gc_control)) wait_ms = gc_th->no_gc_sleep_time; if (foreground) @@ -646,6 +654,54 @@ static void release_victim_entry(struct f2fs_sb_info *sbi) f2fs_bug_on(sbi, !list_empty(&am->victim_list)); } +static bool f2fs_pin_section(struct f2fs_sb_info *sbi, unsigned int segno) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + + if (!dirty_i->enable_pin_section) + return false; + if (!test_and_set_bit(secno, dirty_i->pinned_secmap)) + dirty_i->pinned_secmap_cnt++; + return true; +} + +static bool f2fs_pinned_section_exists(struct dirty_seglist_info *dirty_i) +{ + return dirty_i->pinned_secmap_cnt; +} + +static bool f2fs_section_is_pinned(struct dirty_seglist_info *dirty_i, + unsigned int secno) +{ + return dirty_i->enable_pin_section && + f2fs_pinned_section_exists(dirty_i) && + test_bit(secno, dirty_i->pinned_secmap); +} + +static void f2fs_unpin_all_sections(struct f2fs_sb_info *sbi, bool enable) +{ + unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); + + if (f2fs_pinned_section_exists(DIRTY_I(sbi))) { + memset(DIRTY_I(sbi)->pinned_secmap, 0, bitmap_size); + DIRTY_I(sbi)->pinned_secmap_cnt = 0; + } + DIRTY_I(sbi)->enable_pin_section = enable; +} + +static int f2fs_gc_pinned_control(struct inode *inode, int gc_type, + unsigned int segno) +{ + if (!f2fs_is_pinned_file(inode)) + return 0; + if (gc_type != FG_GC) + return -EBUSY; + if (!f2fs_pin_section(F2FS_I_SB(inode), segno)) + f2fs_pin_file_control(inode, true); + return -EAGAIN; +} + /* * This function is called from two paths. * One is garbage collection and the other is SSR segment selection. 
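Taken together, the gc.c hunks replace the positional arguments of f2fs_gc(sbi, sync, background, force, segno) with the struct f2fs_gc_control introduced in the f2fs.h hunk above. A hedged sketch of how a foreground caller migrates, modelled on the converted call sites rather than quoting any single one; the wrapper name is illustrative:

/* Sketch: the old call f2fs_gc(sbi, true, false, false, NULL_SEGNO)
 * expressed through the new control structure. Assumes the
 * f2fs_gc_control definition shown in the f2fs.h hunk. */
static int f2fs_gc_foreground(struct f2fs_sb_info *sbi)
{
        struct f2fs_gc_control gc_control = {
                .victim_segno          = NULL_SEGNO, /* let GC pick the victim */
                .init_gc_type          = FG_GC,      /* was "sync == true" */
                .no_bg_gc              = false,
                .should_migrate_blocks = false,      /* was the old "force" flag */
                .err_gc_skipped        = true,       /* old "sync" error policy */
                .nr_free_secs          = 0,          /* no free-section target */
        };

        return f2fs_gc(sbi, &gc_control);
}

Packing the policy into one struct lets later knobs such as no_bg_gc and nr_free_secs be added without retouching every call site.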
@@ -787,6 +843,9 @@ retry: if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; + if (gc_type == FG_GC && f2fs_section_is_pinned(dirty_i, secno)) + goto next; + if (is_atgc) { add_victim_entry(sbi, &p, segno); goto next; @@ -1194,18 +1253,9 @@ static int move_data_block(struct inode *inode, block_t bidx, goto out; } - if (f2fs_is_atomic_file(inode)) { - F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; - F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; - err = -EAGAIN; - goto out; - } - - if (f2fs_is_pinned_file(inode)) { - f2fs_pin_file_control(inode, true); - err = -EAGAIN; + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err) goto out; - } set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE); @@ -1344,18 +1394,9 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, goto out; } - if (f2fs_is_atomic_file(inode)) { - F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; - F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; - err = -EAGAIN; - goto out; - } - if (f2fs_is_pinned_file(inode)) { - if (gc_type == FG_GC) - f2fs_pin_file_control(inode, true); - err = -EAGAIN; + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err) goto out; - } if (gc_type == BG_GC) { if (PageWriteback(page)) { @@ -1475,11 +1516,19 @@ next_step: ofs_in_node = le16_to_cpu(entry->ofs_in_node); if (phase == 3) { + int err; + inode = f2fs_iget(sb, dni.ino); if (IS_ERR(inode) || is_bad_inode(inode) || special_file(inode->i_mode)) continue; + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err == -EAGAIN) { + iput(inode); + return submitted; + } + if (!f2fs_down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { iput(inode); @@ -1699,23 +1748,21 @@ skip: return seg_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, - bool background, bool force, unsigned int segno) +int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control) { - int gc_type = sync ? FG_GC : BG_GC; + int gc_type = gc_control->init_gc_type; + unsigned int segno = gc_control->victim_segno; int sec_freed = 0, seg_freed = 0, total_freed = 0; int ret = 0; struct cp_control cpc; - unsigned int init_segno = segno; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; - unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC]; - unsigned long long first_skipped; unsigned int skipped_round = 0, round = 0; - trace_f2fs_gc_begin(sbi->sb, sync, background, + trace_f2fs_gc_begin(sbi->sb, gc_type, gc_control->no_bg_gc, + gc_control->nr_free_secs, get_pages(sbi, F2FS_DIRTY_NODES), get_pages(sbi, F2FS_DIRTY_DENTS), get_pages(sbi, F2FS_DIRTY_IMETA), @@ -1726,7 +1773,6 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, cpc.reason = __get_cp_reason(sbi); sbi->skipped_gc_rwsem = 0; - first_skipped = last_skipped; gc_more: if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { ret = -EINVAL; @@ -1743,8 +1789,7 @@ gc_more: * threshold, we can make them free by checkpoint. Then, we * secure free segments which doesn't need fggc any more. */ - if (prefree_segments(sbi) && - !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { + if (prefree_segments(sbi)) { ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) goto stop; @@ -1754,54 +1799,69 @@ gc_more: } /* f2fs_balance_fs doesn't need to do BG_GC in critical path. 
*/ - if (gc_type == BG_GC && !background) { + if (gc_type == BG_GC && gc_control->no_bg_gc) { ret = -EINVAL; goto stop; } +retry: ret = __get_victim(sbi, &segno, gc_type); - if (ret) + if (ret) { + /* allow to search victim from sections has pinned data */ + if (ret == -ENODATA && gc_type == FG_GC && + f2fs_pinned_section_exists(DIRTY_I(sbi))) { + f2fs_unpin_all_sections(sbi, false); + goto retry; + } goto stop; + } - seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, force); - if (gc_type == FG_GC && - seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) - sec_freed++; + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, + gc_control->should_migrate_blocks); total_freed += seg_freed; - if (gc_type == FG_GC) { - if (sbi->skipped_atomic_files[FG_GC] > last_skipped || - sbi->skipped_gc_rwsem) - skipped_round++; - last_skipped = sbi->skipped_atomic_files[FG_GC]; - round++; - } + if (seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) + sec_freed++; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - if (sync) + if (gc_control->init_gc_type == FG_GC || + !has_not_enough_free_secs(sbi, + (gc_type == FG_GC) ? sec_freed : 0, 0)) { + if (gc_type == FG_GC && sec_freed < gc_control->nr_free_secs) + goto go_gc_more; goto stop; + } - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { - if (skipped_round <= MAX_SKIP_GC_COUNT || - skipped_round * 2 < round) { - segno = NULL_SEGNO; - goto gc_more; + /* FG_GC stops GC by skip_count */ + if (gc_type == FG_GC) { + if (sbi->skipped_gc_rwsem) + skipped_round++; + round++; + if (skipped_round > MAX_SKIP_GC_COUNT && + skipped_round * 2 >= round) { + ret = f2fs_write_checkpoint(sbi, &cpc); + goto stop; } + } - if (first_skipped < last_skipped && - (last_skipped - first_skipped) > - sbi->skipped_gc_rwsem) { - f2fs_drop_inmem_pages_all(sbi, true); - segno = NULL_SEGNO; - goto gc_more; - } - if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) - ret = f2fs_write_checkpoint(sbi, &cpc); + /* Write checkpoint to reclaim prefree segments */ + if (free_sections(sbi) < NR_CURSEG_PERSIST_TYPE && + prefree_segments(sbi)) { + ret = f2fs_write_checkpoint(sbi, &cpc); + if (ret) + goto stop; } +go_gc_more: + segno = NULL_SEGNO; + goto gc_more; + stop: SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; - SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; + SIT_I(sbi)->last_victim[FLUSH_DEVICE] = gc_control->victim_segno; + + if (gc_type == FG_GC) + f2fs_unpin_all_sections(sbi, true); trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed, get_pages(sbi, F2FS_DIRTY_NODES), @@ -1816,7 +1876,7 @@ stop: put_gc_inode(&gc_list); - if (sync && !ret) + if (gc_control->err_gc_skipped && !ret) ret = sec_freed ? 0 : -EAGAIN; return ret; } diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 3cb1e7a24740..049ce50cec9b 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -91,7 +91,7 @@ static u32 TEA_hash_name(const u8 *p, size_t len) /* * Compute @fname->hash. For all directories, @fname->disk_name must be set. * For casefolded directories, @fname->usr_fname must be set, and also - * @fname->cf_name if the filename is valid Unicode. + * @fname->cf_name if the filename is valid Unicode and is not "." or "..". */ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) { @@ -110,10 +110,11 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) /* * If the casefolded name is provided, hash it instead of the * on-disk name. 
If the casefolded name is *not* provided, that - * should only be because the name wasn't valid Unicode, so fall - * back to treating the name as an opaque byte sequence. Note - * that to handle encrypted directories, the fallback must use - * usr_fname (plaintext) rather than disk_name (ciphertext). + * should only be because the name wasn't valid Unicode or was + * "." or "..", so fall back to treating the name as an opaque + * byte sequence. Note that to handle encrypted directories, + * the fallback must use usr_fname (plaintext) rather than + * disk_name (ciphertext). */ WARN_ON_ONCE(!fname->usr_fname->name); if (fname->cf_name.name) { diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index a578bf83b803..bf46a7dfbea2 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -14,21 +14,40 @@ #include "node.h" #include <trace/events/f2fs.h> -bool f2fs_may_inline_data(struct inode *inode) +static bool support_inline_data(struct inode *inode) { if (f2fs_is_atomic_file(inode)) return false; - if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return false; - if (i_size_read(inode) > MAX_INLINE_DATA(inode)) return false; + return true; +} - if (f2fs_post_read_required(inode)) +bool f2fs_may_inline_data(struct inode *inode) +{ + if (!support_inline_data(inode)) return false; - return true; + return !f2fs_post_read_required(inode); +} + +bool f2fs_sanity_check_inline_data(struct inode *inode) +{ + if (!f2fs_has_inline_data(inode)) + return false; + + if (!support_inline_data(inode)) + return true; + + /* + * used by sanity_check_inode(), when disk layout fields has not + * been synchronized to inmem fields. + */ + return (S_ISREG(inode->i_mode) && + (file_is_encrypt(inode) || file_is_verity(inode) || + (F2FS_I(inode)->i_flags & F2FS_COMPR_FL))); } bool f2fs_may_inline_dentry(struct inode *inode) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 83639238a1fe..fc55f5bd1fcc 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -260,8 +260,8 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } - if (F2FS_I(inode)->extent_tree) { - struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest; + if (fi->extent_tree) { + struct extent_info *ei = &fi->extent_tree->largest; if (ei->len && (!f2fs_is_valid_blkaddr(sbi, ei->blk, @@ -276,8 +276,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) } } - if (f2fs_has_inline_data(inode) && - (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))) { + if (f2fs_sanity_check_inline_data(inode)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix", __func__, inode->i_ino, inode->i_mode); @@ -466,10 +465,10 @@ static int do_read_inode(struct inode *inode) } } - F2FS_I(inode)->i_disk_time[0] = inode->i_atime; - F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; - F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; - F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; + fi->i_disk_time[0] = inode->i_atime; + fi->i_disk_time[1] = inode->i_ctime; + fi->i_disk_time[2] = inode->i_mtime; + fi->i_disk_time[3] = fi->i_crtime; f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -745,9 +744,8 @@ void f2fs_evict_inode(struct inode *inode) nid_t xnid = F2FS_I(inode)->i_xattr_nid; int err = 0; - /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, true); trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); @@ 
-796,8 +794,22 @@ retry: f2fs_lock_op(sbi); err = f2fs_remove_inode_page(inode); f2fs_unlock_op(sbi); - if (err == -ENOENT) + if (err == -ENOENT) { err = 0; + + /* + * in fuzzed image, another node may has the same + * block address as inode's, if it was truncated + * previously, truncation of inode node will fail. + */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { + f2fs_warn(F2FS_I_SB(inode), + "f2fs_evict_inode: inconsistent node id, ino:%lu", + inode->i_ino); + f2fs_inode_synced(inode); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } + } } /* give more chances, if ENOMEM case */ diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c index be599f31d3c4..d84c5f6cc09d 100644 --- a/fs/f2fs/iostat.c +++ b/fs/f2fs/iostat.c @@ -91,8 +91,9 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi) unsigned int cnt; struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; struct iostat_lat_info *io_lat = sbi->iostat_io_lat; + unsigned long flags; - spin_lock_bh(&sbi->iostat_lat_lock); + spin_lock_irqsave(&sbi->iostat_lat_lock, flags); for (idx = 0; idx < MAX_IO_TYPE; idx++) { for (io = 0; io < NR_PAGE_TYPE; io++) { cnt = io_lat->bio_cnt[idx][io]; @@ -106,7 +107,7 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi) io_lat->bio_cnt[idx][io] = 0; } } - spin_unlock_bh(&sbi->iostat_lat_lock); + spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags); trace_f2fs_iostat_latency(sbi, iostat_lat); } @@ -115,14 +116,15 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) { unsigned long long iostat_diff[NR_IO_TYPE]; int i; + unsigned long flags; if (time_is_after_jiffies(sbi->iostat_next_period)) return; /* Need double check under the lock */ - spin_lock_bh(&sbi->iostat_lock); + spin_lock_irqsave(&sbi->iostat_lock, flags); if (time_is_after_jiffies(sbi->iostat_next_period)) { - spin_unlock_bh(&sbi->iostat_lock); + spin_unlock_irqrestore(&sbi->iostat_lock, flags); return; } sbi->iostat_next_period = jiffies + @@ -133,7 +135,7 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) sbi->prev_rw_iostat[i]; sbi->prev_rw_iostat[i] = sbi->rw_iostat[i]; } - spin_unlock_bh(&sbi->iostat_lock); + spin_unlock_irqrestore(&sbi->iostat_lock, flags); trace_f2fs_iostat(sbi, iostat_diff); @@ -145,25 +147,27 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi) struct iostat_lat_info *io_lat = sbi->iostat_io_lat; int i; - spin_lock_bh(&sbi->iostat_lock); + spin_lock_irq(&sbi->iostat_lock); for (i = 0; i < NR_IO_TYPE; i++) { sbi->rw_iostat[i] = 0; sbi->prev_rw_iostat[i] = 0; } - spin_unlock_bh(&sbi->iostat_lock); + spin_unlock_irq(&sbi->iostat_lock); - spin_lock_bh(&sbi->iostat_lat_lock); + spin_lock_irq(&sbi->iostat_lat_lock); memset(io_lat, 0, sizeof(struct iostat_lat_info)); - spin_unlock_bh(&sbi->iostat_lat_lock); + spin_unlock_irq(&sbi->iostat_lat_lock); } void f2fs_update_iostat(struct f2fs_sb_info *sbi, enum iostat_type type, unsigned long long io_bytes) { + unsigned long flags; + if (!sbi->iostat_enable) return; - spin_lock_bh(&sbi->iostat_lock); + spin_lock_irqsave(&sbi->iostat_lock, flags); sbi->rw_iostat[type] += io_bytes; if (type == APP_BUFFERED_IO || type == APP_DIRECT_IO) @@ -172,7 +176,7 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi, if (type == APP_BUFFERED_READ_IO || type == APP_DIRECT_READ_IO) sbi->rw_iostat[APP_READ_IO] += io_bytes; - spin_unlock_bh(&sbi->iostat_lock); + spin_unlock_irqrestore(&sbi->iostat_lock, flags); f2fs_record_iostat(sbi); } @@ -185,6 +189,7 @@ static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx, 
struct f2fs_sb_info *sbi = iostat_ctx->sbi; struct iostat_lat_info *io_lat = sbi->iostat_io_lat; int idx; + unsigned long flags; if (!sbi->iostat_enable) return; @@ -202,12 +207,12 @@ static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx, idx = WRITE_ASYNC_IO; } - spin_lock_bh(&sbi->iostat_lat_lock); + spin_lock_irqsave(&sbi->iostat_lat_lock, flags); io_lat->sum_lat[idx][iotype] += ts_diff; io_lat->bio_cnt[idx][iotype]++; if (ts_diff > io_lat->peak_lat[idx][iotype]) io_lat->peak_lat[idx][iotype] = ts_diff; - spin_unlock_bh(&sbi->iostat_lat_lock); + spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags); } void iostat_update_and_unbind_ctx(struct bio *bio, int rw) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 5ed79b29999f..bf00d5057abb 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -37,13 +37,10 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, if (!inode) return ERR_PTR(-ENOMEM); - f2fs_lock_op(sbi); if (!f2fs_alloc_nid(sbi, &ino)) { - f2fs_unlock_op(sbi); err = -ENOSPC; goto fail; } - f2fs_unlock_op(sbi); nid_free = true; @@ -92,8 +89,6 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, if (test_opt(sbi, INLINE_XATTR)) set_inode_flag(inode, FI_INLINE_XATTR); - if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) - set_inode_flag(inode, FI_INLINE_DATA); if (f2fs_may_inline_dentry(inode)) set_inode_flag(inode, FI_INLINE_DENTRY); @@ -110,10 +105,6 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, f2fs_init_extent_tree(inode, NULL); - stat_inc_inline_xattr(inode); - stat_inc_inline_inode(inode); - stat_inc_inline_dir(inode); - F2FS_I(inode)->i_flags = f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); @@ -130,6 +121,14 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, set_compress_context(inode); } + /* Should enable inline_data after compression set */ + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) + set_inode_flag(inode, FI_INLINE_DATA); + + stat_inc_inline_xattr(inode); + stat_inc_inline_inode(inode); + stat_inc_inline_dir(inode); + f2fs_set_inode_flags(inode); trace_f2fs_new_inode(inode, 0); @@ -328,6 +327,9 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, if (!is_extension_exist(name, ext[i], false)) continue; + /* Do not use inline_data with compression */ + stat_dec_inline_inode(inode); + clear_inode_flag(inode, FI_INLINE_DATA); set_compress_context(inode); return; } @@ -461,6 +463,13 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) return 0; } + if (!S_ISDIR(dir->i_mode)) { + f2fs_err(sbi, "inconsistent inode status, skip recovering inline_dots inode (ino:%lu, i_mode:%u, pino:%u)", + dir->i_ino, dir->i_mode, pino); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return -ENOTDIR; + } + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -836,8 +845,8 @@ out: } static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode, - struct inode **whiteout) + struct dentry *dentry, umode_t mode, bool is_whiteout, + struct inode **new_inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; @@ -851,7 +860,7 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (IS_ERR(inode)) return PTR_ERR(inode); - if (whiteout) { + if (is_whiteout) { init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); inode->i_op = &f2fs_special_inode_operations; } else { @@ -876,21 +885,25 @@ static int 
__f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, f2fs_add_orphan_inode(inode); f2fs_alloc_nid_done(sbi, inode->i_ino); - if (whiteout) { + if (is_whiteout) { f2fs_i_links_write(inode, false); spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); - - *whiteout = inode; } else { - d_tmpfile(dentry, inode); + if (dentry) + d_tmpfile(dentry, inode); + else + f2fs_i_links_write(inode, false); } /* link_count was changed by d_tmpfile as well. */ f2fs_unlock_op(sbi); unlock_new_inode(inode); + if (new_inode) + *new_inode = inode; + f2fs_balance_fs(sbi, true); return 0; @@ -911,7 +924,7 @@ static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, NULL); + return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, false, NULL); } static int f2fs_create_whiteout(struct user_namespace *mnt_userns, @@ -921,7 +934,13 @@ static int f2fs_create_whiteout(struct user_namespace *mnt_userns, return -EIO; return __f2fs_tmpfile(mnt_userns, dir, NULL, - S_IFCHR | WHITEOUT_MODE, whiteout); + S_IFCHR | WHITEOUT_MODE, true, whiteout); +} + +int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct inode **new_inode) +{ + return __f2fs_tmpfile(mnt_userns, dir, NULL, S_IFREG, false, new_inode); } static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 8ccff18560ff..cf6f7fc83c08 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -90,10 +90,6 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); - } else if (type == INMEM_PAGES) { - /* it allows 20% / total_ram for inmemory pages */ - mem_size = get_pages(sbi, F2FS_INMEM_PAGES); - res = mem_size < (val.totalram / 5); } else if (type == DISCARD_CACHE) { mem_size = (atomic_read(&dcc->discard_cmd_cnt) * sizeof(struct discard_cmd)) >> PAGE_SHIFT; @@ -1416,8 +1412,7 @@ repeat: err = read_node_page(page, 0); if (err < 0) { - f2fs_put_page(page, 1); - return ERR_PTR(err); + goto out_put_err; } else if (err == LOCKED_PAGE) { err = 0; goto page_hit; @@ -1443,19 +1438,23 @@ repeat: goto out_err; } page_hit: - if (unlikely(nid != nid_of_node(page))) { - f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + if (likely(nid == nid_of_node(page))) + return page; + + f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", nid, nid_of_node(page), ino_of_node(page), ofs_of_node(page), cpver_of_node(page), next_blkaddr_of_node(page)); - set_sbi_flag(sbi, SBI_NEED_FSCK); - err = -EINVAL; + set_sbi_flag(sbi, SBI_NEED_FSCK); + err = -EINVAL; out_err: - ClearPageUptodate(page); - f2fs_put_page(page, 1); - return ERR_PTR(err); - } - return page; + ClearPageUptodate(page); +out_put_err: + /* ENOENT comes from read_node_page which is not an error. 
*/ + if (err != -ENOENT) + f2fs_handle_page_eio(sbi, page->index, NODE); + f2fs_put_page(page, 1); + return ERR_PTR(err); } struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) @@ -1631,7 +1630,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, goto redirty_out; } - if (atomic && !test_opt(sbi, NOBARRIER)) + if (atomic && !test_opt(sbi, NOBARRIER) && !f2fs_sb_has_blkzoned(sbi)) fio.op_flags |= REQ_PREFLUSH | REQ_FUA; /* should add to global list before clearing PAGECACHE status */ diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 4c1d34bfea78..3c09cae058b0 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -147,7 +147,6 @@ enum mem_type { DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ EXTENT_CACHE, /* indicates extent cache */ - INMEM_PAGES, /* indicates inmemory pages */ DISCARD_CACHE, /* indicates memory of cached discard cmds */ COMPRESS_PAGE, /* indicates memory of cached compressed pages */ BASE_CHECK, /* check kernel status */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7225ce09f3ab..874c1b9c41a2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -30,7 +30,7 @@ static struct kmem_cache *discard_entry_slab; static struct kmem_cache *discard_cmd_slab; static struct kmem_cache *sit_entry_set_slab; -static struct kmem_cache *inmem_entry_slab; +static struct kmem_cache *revoke_entry_slab; static unsigned long __reverse_ulong(unsigned char *str) { @@ -185,301 +185,175 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi) SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); } -void f2fs_register_inmem_page(struct inode *inode, struct page *page) +void f2fs_abort_atomic_write(struct inode *inode, bool clean) { - struct inmem_pages *new; - - set_page_private_atomic(page); - - new = f2fs_kmem_cache_alloc(inmem_entry_slab, - GFP_NOFS, true, NULL); - - /* add atomic page indices to the list */ - new->page = page; - INIT_LIST_HEAD(&new->list); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); - /* increase reference count with clean state */ - get_page(page); - mutex_lock(&F2FS_I(inode)->inmem_lock); - list_add_tail(&new->list, &F2FS_I(inode)->inmem_pages); - inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); - mutex_unlock(&F2FS_I(inode)->inmem_lock); + if (f2fs_is_atomic_file(inode)) { + if (clean) + truncate_inode_pages_final(inode->i_mapping); + clear_inode_flag(fi->cow_inode, FI_ATOMIC_FILE); + iput(fi->cow_inode); + fi->cow_inode = NULL; + clear_inode_flag(inode, FI_ATOMIC_FILE); - trace_f2fs_register_inmem_page(page, INMEM); + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + sbi->atomic_files--; + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + } } -static int __revoke_inmem_pages(struct inode *inode, - struct list_head *head, bool drop, bool recover, - bool trylock) +static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, + block_t new_addr, block_t *old_addr, bool recover) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inmem_pages *cur, *tmp; - int err = 0; - - list_for_each_entry_safe(cur, tmp, head, list) { - struct page *page = cur->page; - - if (drop) - trace_f2fs_commit_inmem_page(page, INMEM_DROP); - - if (trylock) { - /* - * to avoid deadlock in between page lock and - * inmem_lock. 
- */ - if (!trylock_page(page)) - continue; - } else { - lock_page(page); - } - - f2fs_wait_on_page_writeback(page, DATA, true, true); - - if (recover) { - struct dnode_of_data dn; - struct node_info ni; + struct dnode_of_data dn; + struct node_info ni; + int err; - trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); retry: - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = f2fs_get_dnode_of_data(&dn, page->index, - LOOKUP_NODE); - if (err) { - if (err == -ENOMEM) { - memalloc_retry_wait(GFP_NOFS); - goto retry; - } - err = -EAGAIN; - goto next; - } - - err = f2fs_get_node_info(sbi, dn.nid, &ni, false); - if (err) { - f2fs_put_dnode(&dn); - return err; - } - - if (cur->old_addr == NEW_ADDR) { - f2fs_invalidate_blocks(sbi, dn.data_blkaddr); - f2fs_update_data_blkaddr(&dn, NEW_ADDR); - } else - f2fs_replace_block(sbi, &dn, dn.data_blkaddr, - cur->old_addr, ni.version, true, true); - f2fs_put_dnode(&dn); - } -next: - /* we don't need to invalidate this in the sccessful status */ - if (drop || recover) { - ClearPageUptodate(page); - clear_page_private_gcing(page); + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE_RA); + if (err) { + if (err == -ENOMEM) { + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); + goto retry; } - detach_page_private(page); - set_page_private(page, 0); - f2fs_put_page(page, 1); - - list_del(&cur->list); - kmem_cache_free(inmem_entry_slab, cur); - dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); + return err; } - return err; -} -void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure) -{ - struct list_head *head = &sbi->inode_list[ATOMIC_FILE]; - struct inode *inode; - struct f2fs_inode_info *fi; - unsigned int count = sbi->atomic_files; - unsigned int looped = 0; -next: - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (list_empty(head)) { - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - return; + err = f2fs_get_node_info(sbi, dn.nid, &ni, false); + if (err) { + f2fs_put_dnode(&dn); + return err; } - fi = list_first_entry(head, struct f2fs_inode_info, inmem_ilist); - inode = igrab(&fi->vfs_inode); - if (inode) - list_move_tail(&fi->inmem_ilist, head); - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - if (inode) { - if (gc_failure) { - if (!fi->i_gc_failures[GC_FAILURE_ATOMIC]) - goto skip; + if (recover) { + /* dn.data_blkaddr is always valid */ + if (!__is_valid_data_blkaddr(new_addr)) { + if (new_addr == NULL_ADDR) + dec_valid_block_count(sbi, inode, 1); + f2fs_invalidate_blocks(sbi, dn.data_blkaddr); + f2fs_update_data_blkaddr(&dn, new_addr); + } else { + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + new_addr, ni.version, true, true); } - set_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - f2fs_drop_inmem_pages(inode); -skip: - iput(inode); - } - f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); - if (gc_failure) { - if (++looped >= count) - return; - } - goto next; -} - -void f2fs_drop_inmem_pages(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); + } else { + blkcnt_t count = 1; - do { - mutex_lock(&fi->inmem_lock); - if (list_empty(&fi->inmem_pages)) { - fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; - - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(inode, FI_ATOMIC_FILE); - sbi->atomic_files--; - } - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + *old_addr = dn.data_blkaddr; + f2fs_truncate_data_blocks_range(&dn, 1); + 
dec_valid_block_count(sbi, F2FS_I(inode)->cow_inode, count); + inc_valid_block_count(sbi, inode, &count); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, + ni.version, true, false); + } - mutex_unlock(&fi->inmem_lock); - break; - } - __revoke_inmem_pages(inode, &fi->inmem_pages, - true, false, true); - mutex_unlock(&fi->inmem_lock); - } while (1); + f2fs_put_dnode(&dn); + return 0; } -void f2fs_drop_inmem_page(struct inode *inode, struct page *page) +static void __complete_revoke_list(struct inode *inode, struct list_head *head, + bool revoke) { - struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct list_head *head = &fi->inmem_pages; - struct inmem_pages *cur = NULL; - - f2fs_bug_on(sbi, !page_private_atomic(page)); + struct revoke_entry *cur, *tmp; - mutex_lock(&fi->inmem_lock); - list_for_each_entry(cur, head, list) { - if (cur->page == page) - break; + list_for_each_entry_safe(cur, tmp, head, list) { + if (revoke) + __replace_atomic_write_block(inode, cur->index, + cur->old_addr, NULL, true); + list_del(&cur->list); + kmem_cache_free(revoke_entry_slab, cur); } - - f2fs_bug_on(sbi, list_empty(head) || cur->page != page); - list_del(&cur->list); - mutex_unlock(&fi->inmem_lock); - - dec_page_count(sbi, F2FS_INMEM_PAGES); - kmem_cache_free(inmem_entry_slab, cur); - - ClearPageUptodate(page); - clear_page_private_atomic(page); - f2fs_put_page(page, 0); - - detach_page_private(page); - set_page_private(page, 0); - - trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); } -static int __f2fs_commit_inmem_pages(struct inode *inode) +static int __f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - struct inmem_pages *cur, *tmp; - struct f2fs_io_info fio = { - .sbi = sbi, - .ino = inode->i_ino, - .type = DATA, - .op = REQ_OP_WRITE, - .op_flags = REQ_SYNC | REQ_PRIO, - .io_type = FS_DATA_IO, - }; + struct inode *cow_inode = fi->cow_inode; + struct revoke_entry *new; struct list_head revoke_list; - bool submit_bio = false; - int err = 0; + block_t blkaddr; + struct dnode_of_data dn; + pgoff_t len = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + pgoff_t off = 0, blen, index; + int ret = 0, i; INIT_LIST_HEAD(&revoke_list); - list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { - struct page *page = cur->page; + while (len) { + blen = min_t(pgoff_t, ADDRS_PER_BLOCK(cow_inode), len); - lock_page(page); - if (page->mapping == inode->i_mapping) { - trace_f2fs_commit_inmem_page(page, INMEM); + set_new_dnode(&dn, cow_inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); + if (ret && ret != -ENOENT) { + goto out; + } else if (ret == -ENOENT) { + ret = 0; + if (dn.max_level == 0) + goto out; + goto next; + } - f2fs_wait_on_page_writeback(page, DATA, true, true); + blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, cow_inode), + len); + index = off; + for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) { + blkaddr = f2fs_data_blkaddr(&dn); - set_page_dirty(page); - if (clear_page_dirty_for_io(page)) { - inode_dec_dirty_pages(inode); - f2fs_remove_dirty_inode(inode); - } -retry: - fio.page = page; - fio.old_blkaddr = NULL_ADDR; - fio.encrypted_page = NULL; - fio.need_lock = LOCK_DONE; - err = f2fs_do_write_data_page(&fio); - if (err) { - if (err == -ENOMEM) { - memalloc_retry_wait(GFP_NOFS); - goto retry; - } - unlock_page(page); - break; + if (!__is_valid_data_blkaddr(blkaddr)) { + continue; + } else if (!f2fs_is_valid_blkaddr(sbi, 
blkaddr, + DATA_GENERIC_ENHANCE)) { + f2fs_put_dnode(&dn); + ret = -EFSCORRUPTED; + goto out; } - /* record old blkaddr for revoking */ - cur->old_addr = fio.old_blkaddr; - submit_bio = true; - } - unlock_page(page); - list_move_tail(&cur->list, &revoke_list); - } - if (submit_bio) - f2fs_submit_merged_write_cond(sbi, inode, NULL, 0, DATA); + new = f2fs_kmem_cache_alloc(revoke_entry_slab, GFP_NOFS, + true, NULL); - if (err) { - /* - * try to revoke all committed pages, but still we could fail - * due to no memory or other reason, if that happened, EAGAIN - * will be returned, which means in such case, transaction is - * already not integrity, caller should use journal to do the - * recovery or rewrite & commit last transaction. For other - * error number, revoking was done by filesystem itself. - */ - err = __revoke_inmem_pages(inode, &revoke_list, - false, true, false); + ret = __replace_atomic_write_block(inode, index, blkaddr, + &new->old_addr, false); + if (ret) { + f2fs_put_dnode(&dn); + kmem_cache_free(revoke_entry_slab, new); + goto out; + } - /* drop all uncommitted pages */ - __revoke_inmem_pages(inode, &fi->inmem_pages, - true, false, false); - } else { - __revoke_inmem_pages(inode, &revoke_list, - false, false, false); + f2fs_update_data_blkaddr(&dn, NULL_ADDR); + new->index = index; + list_add_tail(&new->list, &revoke_list); + } + f2fs_put_dnode(&dn); +next: + off += blen; + len -= blen; } - return err; +out: + __complete_revoke_list(inode, &revoke_list, ret ? true : false); + + return ret; } -int f2fs_commit_inmem_pages(struct inode *inode) +int f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); int err; - f2fs_balance_fs(sbi, true); + err = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (err) + return err; f2fs_down_write(&fi->i_gc_rwsem[WRITE]); - f2fs_lock_op(sbi); - set_inode_flag(inode, FI_ATOMIC_COMMIT); - - mutex_lock(&fi->inmem_lock); - err = __f2fs_commit_inmem_pages(inode); - mutex_unlock(&fi->inmem_lock); - clear_inode_flag(inode, FI_ATOMIC_COMMIT); + err = __f2fs_commit_atomic_write(inode); f2fs_unlock_op(sbi); f2fs_up_write(&fi->i_gc_rwsem[WRITE]); @@ -520,8 +394,15 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) io_schedule(); finish_wait(&sbi->gc_thread->fggc_wq, &wait); } else { + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .init_gc_type = BG_GC, + .no_bg_gc = true, + .should_migrate_blocks = false, + .err_gc_skipped = false, + .nr_free_secs = 1 }; f2fs_down_write(&sbi->gc_lock); - f2fs_gc(sbi, false, false, false, NULL_SEGNO); + f2fs_gc(sbi, &gc_control); } } } @@ -1664,33 +1545,32 @@ static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? 
&(dcc->fstrim_list) : &(dcc->wait_list); - struct discard_cmd *dc, *tmp; - bool need_wait; + struct discard_cmd *dc = NULL, *iter, *tmp; unsigned int trimmed = 0; next: - need_wait = false; + dc = NULL; mutex_lock(&dcc->cmd_lock); - list_for_each_entry_safe(dc, tmp, wait_list, list) { - if (dc->lstart + dc->len <= start || end <= dc->lstart) + list_for_each_entry_safe(iter, tmp, wait_list, list) { + if (iter->lstart + iter->len <= start || end <= iter->lstart) continue; - if (dc->len < dpolicy->granularity) + if (iter->len < dpolicy->granularity) continue; - if (dc->state == D_DONE && !dc->ref) { - wait_for_completion_io(&dc->wait); - if (!dc->error) - trimmed += dc->len; - __remove_discard_cmd(sbi, dc); + if (iter->state == D_DONE && !iter->ref) { + wait_for_completion_io(&iter->wait); + if (!iter->error) + trimmed += iter->len; + __remove_discard_cmd(sbi, iter); } else { - dc->ref++; - need_wait = true; + iter->ref++; + dc = iter; break; } } mutex_unlock(&dcc->cmd_lock); - if (need_wait) { + if (dc) { trimmed += __wait_one_discard_bio(sbi, dc); goto next; } @@ -3286,8 +3166,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) return CURSEG_COLD_DATA; if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || - f2fs_is_atomic_file(inode) || - f2fs_is_volatile_file(inode)) + f2fs_is_atomic_file(inode)) return CURSEG_HOT_DATA; return f2fs_rw_hint_to_seg_type(inode->i_write_hint); } else { @@ -4084,10 +3963,12 @@ static void adjust_sit_entry_set(struct sit_entry_set *ses, return; list_for_each_entry_continue(next, head, set_list) - if (ses->entry_cnt <= next->entry_cnt) - break; + if (ses->entry_cnt <= next->entry_cnt) { + list_move_tail(&ses->set_list, &next->set_list); + return; + } - list_move_tail(&ses->set_list, &next->set_list); + list_move_tail(&ses->set_list, head); } static void add_sit_entry(unsigned int segno, struct list_head *head) @@ -4455,7 +4336,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) unsigned int i, start, end; unsigned int readed, start_blk = 0; int err = 0; - block_t total_node_blocks = 0; + block_t sit_valid_blocks[2] = {0, 0}; do { readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_VECS, @@ -4480,8 +4361,8 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (err) return err; seg_info_from_raw_sit(se, &sit); - if (IS_NODESEG(se->type)) - total_node_blocks += se->valid_blocks; + + sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; if (f2fs_block_unit_discard(sbi)) { /* build discard map only one time */ @@ -4521,15 +4402,15 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) sit = sit_in_journal(journal, i); old_valid_blocks = se->valid_blocks; - if (IS_NODESEG(se->type)) - total_node_blocks -= old_valid_blocks; + + sit_valid_blocks[SE_PAGETYPE(se)] -= old_valid_blocks; err = check_block_count(sbi, start, &sit); if (err) break; seg_info_from_raw_sit(se, &sit); - if (IS_NODESEG(se->type)) - total_node_blocks += se->valid_blocks; + + sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; if (f2fs_block_unit_discard(sbi)) { if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { @@ -4551,13 +4432,24 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) } up_read(&curseg->journal_rwsem); - if (!err && total_node_blocks != valid_node_count(sbi)) { + if (err) + return err; + + if (sit_valid_blocks[NODE] != valid_node_count(sbi)) { f2fs_err(sbi, "SIT is corrupted node# %u vs %u", - total_node_blocks, valid_node_count(sbi)); - err = -EFSCORRUPTED; + sit_valid_blocks[NODE], valid_node_count(sbi)); + return -EFSCORRUPTED; } - 
return err;
+
+	if (sit_valid_blocks[DATA] + sit_valid_blocks[NODE] >
+				valid_user_blocks(sbi)) {
+		f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u",
+			 sit_valid_blocks[DATA], sit_valid_blocks[NODE],
+			 valid_user_blocks(sbi));
+		return -EFSCORRUPTED;
+	}
+
+	return 0;
 }
 
 static void init_free_segmap(struct f2fs_sb_info *sbi)
@@ -4637,6 +4529,13 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi)
 	dirty_i->victim_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL);
 	if (!dirty_i->victim_secmap)
 		return -ENOMEM;
+
+	dirty_i->pinned_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL);
+	if (!dirty_i->pinned_secmap)
+		return -ENOMEM;
+
+	dirty_i->pinned_secmap_cnt = 0;
+	dirty_i->enable_pin_section = true;
 	return 0;
 }
 
@@ -5225,6 +5124,7 @@ static void destroy_victim_secmap(struct f2fs_sb_info *sbi)
 {
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
 
+	kvfree(dirty_i->pinned_secmap);
 	kvfree(dirty_i->victim_secmap);
 }
 
@@ -5335,9 +5235,9 @@ int __init f2fs_create_segment_manager_caches(void)
 	if (!sit_entry_set_slab)
 		goto destroy_discard_cmd;
 
-	inmem_entry_slab = f2fs_kmem_cache_create("f2fs_inmem_page_entry",
-			sizeof(struct inmem_pages));
-	if (!inmem_entry_slab)
+	revoke_entry_slab = f2fs_kmem_cache_create("f2fs_revoke_entry",
+			sizeof(struct revoke_entry));
+	if (!revoke_entry_slab)
 		goto destroy_sit_entry_set;
 	return 0;
 
@@ -5356,5 +5256,5 @@ void f2fs_destroy_segment_manager_caches(void)
 	kmem_cache_destroy(sit_entry_set_slab);
 	kmem_cache_destroy(discard_cmd_slab);
 	kmem_cache_destroy(discard_entry_slab);
-	kmem_cache_destroy(inmem_entry_slab);
+	kmem_cache_destroy(revoke_entry_slab);
 }
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 5c94caf0c0a1..3f277dfcb131 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -24,6 +24,7 @@
 
 #define IS_DATASEG(t)	((t) <= CURSEG_COLD_DATA)
 #define IS_NODESEG(t)	((t) >= CURSEG_HOT_NODE && (t) <= CURSEG_COLD_NODE)
+#define SE_PAGETYPE(se)	((IS_NODESEG((se)->type) ? NODE : DATA))
 
 static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
						unsigned short seg_type)
@@ -224,10 +225,10 @@ struct segment_allocation {
 
 #define MAX_SKIP_GC_COUNT			16
 
-struct inmem_pages {
+struct revoke_entry {
 	struct list_head list;
-	struct page *page;
 	block_t old_addr;		/* for revoking when fail to commit */
+	pgoff_t index;
 };
 
 struct sit_info {
@@ -294,6 +295,9 @@ struct dirty_seglist_info {
 	struct mutex seglist_lock;	/* lock for segment bitmaps */
 	int nr_dirty[NR_DIRTY_TYPE];	/* # of dirty segments */
 	unsigned long *victim_secmap;	/* background GC victims */
+	unsigned long *pinned_secmap;	/* pinned victims from foreground GC */
+	unsigned int pinned_secmap_cnt;	/* count of victims which has pinned data */
+	bool enable_pin_section;	/* enable pinning section */
 };
 
 /* victim selection function for cleaning and SSR */
@@ -572,11 +576,10 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi)
 	return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi));
 }
 
-static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi)
+static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi,
+			unsigned int node_blocks, unsigned int dent_blocks)
 {
-	unsigned int node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) +
-					get_pages(sbi, F2FS_DIRTY_DENTS);
-	unsigned int dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS);
+
 	unsigned int segno, left_blocks;
 	int i;
 
@@ -602,19 +605,28 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
					int freed, int needed)
 {
-	int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
-	int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
-	int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
+	unsigned int total_node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) +
+					get_pages(sbi, F2FS_DIRTY_DENTS) +
+					get_pages(sbi, F2FS_DIRTY_IMETA);
+	unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS);
+	unsigned int node_secs = total_node_blocks / BLKS_PER_SEC(sbi);
+	unsigned int dent_secs = total_dent_blocks / BLKS_PER_SEC(sbi);
+	unsigned int node_blocks = total_node_blocks % BLKS_PER_SEC(sbi);
+	unsigned int dent_blocks = total_dent_blocks % BLKS_PER_SEC(sbi);
+	unsigned int free, need_lower, need_upper;
 
 	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
 		return false;
 
-	if (free_sections(sbi) + freed == reserved_sections(sbi) + needed &&
-		has_curseg_enough_space(sbi))
+	free = free_sections(sbi) + freed;
+	need_lower = node_secs + dent_secs + reserved_sections(sbi) + needed;
+	need_upper = need_lower + (node_blocks ? 1 : 0) + (dent_blocks ?
1 : 0); + + if (free > need_upper) return false; - return (free_sections(sbi) + freed) <= - (node_secs + 2 * dent_secs + imeta_secs + - reserved_sections(sbi) + needed); + else if (free <= need_lower) + return true; + return !has_curseg_enough_space(sbi, node_blocks, dent_blocks); } static inline bool f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ed3e8b7a8260..37221e94e5ef 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -525,10 +525,11 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, return -EINVAL; } f2fs_warn(sbi, "Test dummy encryption mode enabled"); + return 0; #else - f2fs_warn(sbi, "Test dummy encryption mount option ignored"); + f2fs_warn(sbi, "test_dummy_encryption option not supported"); + return -EINVAL; #endif - return 0; } #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -1339,9 +1340,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) spin_lock_init(&fi->i_size_lock); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); - INIT_LIST_HEAD(&fi->inmem_ilist); - INIT_LIST_HEAD(&fi->inmem_pages); - mutex_init(&fi->inmem_lock); init_f2fs_rwsem(&fi->i_gc_rwsem[READ]); init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]); init_f2fs_rwsem(&fi->i_xattr_sem); @@ -1382,9 +1380,8 @@ static int f2fs_drop_inode(struct inode *inode) atomic_inc(&inode->i_count); spin_unlock(&inode->i_lock); - /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, true); /* should remain fi->extent_tree for writepage */ f2fs_destroy_extent_node(inode); @@ -1707,18 +1704,23 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 id = huge_encode_dev(sb->s_bdev->bd_dev); block_t total_count, user_block_count, start_count; u64 avail_node_count; + unsigned int total_valid_node_count; total_count = le64_to_cpu(sbi->raw_super->block_count); - user_block_count = sbi->user_block_count; start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr); buf->f_type = F2FS_SUPER_MAGIC; buf->f_bsize = sbi->blocksize; buf->f_blocks = total_count - start_count; + + spin_lock(&sbi->stat_lock); + + user_block_count = sbi->user_block_count; + total_valid_node_count = valid_node_count(sbi); + avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; buf->f_bfree = user_block_count - valid_user_blocks(sbi) - sbi->current_reserved_blocks; - spin_lock(&sbi->stat_lock); if (unlikely(buf->f_bfree <= sbi->unusable_block_count)) buf->f_bfree = 0; else @@ -1731,14 +1733,12 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) else buf->f_bavail = 0; - avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - if (avail_node_count > user_block_count) { buf->f_files = user_block_count; buf->f_ffree = buf->f_bavail; } else { buf->f_files = avail_node_count; - buf->f_ffree = min(avail_node_count - valid_node_count(sbi), + buf->f_ffree = min(avail_node_count - total_valid_node_count, buf->f_bavail); } @@ -2055,7 +2055,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) { unsigned int s_flags = sbi->sb->s_flags; struct cp_control cpc; - unsigned int gc_mode; + unsigned int gc_mode = sbi->gc_mode; int err = 0; int ret; block_t unusable; @@ -2066,14 +2066,25 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) } sbi->sb->s_flags |= SB_ACTIVE; + /* check if we need more GC first */ + unusable = f2fs_get_unusable_blocks(sbi); + if (!f2fs_disable_cp_again(sbi, unusable)) + goto skip_gc; + 
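The has_not_enough_free_secs() rework in the segment.h hunk above replaces the old single inequality with two cheap bounds around the expensive per-curseg probe: need_lower counts only whole dirty sections, and need_upper allows at most one partial node section plus one partial dentry section. A worked example, with numbers invented purely for illustration:

	/*
	 * Illustrative arithmetic only; none of these figures come from
	 * the patch. Assume BLKS_PER_SEC(sbi) == 512, reserved_sections()
	 * == 20, freed == needed == 0, 1040 dirty node+dent+imeta blocks,
	 * 30 dirty dentry blocks among them:
	 *
	 *   node_secs  = 1040 / 512 = 2    node_blocks = 1040 % 512 = 16
	 *   dent_secs  =   30 / 512 = 0    dent_blocks =   30 % 512 = 30
	 *   need_lower = 2 + 0 + 20 + 0 = 22
	 *   need_upper = 22 + 1 + 1     = 24
	 *
	 *   free > 24        -> clearly enough space, return false at once
	 *   free <= 22       -> clearly too little, return true at once
	 *   free == 23 or 24 -> only this borderline band pays for a
	 *                       has_curseg_enough_space(sbi, 16, 30) call
	 */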
f2fs_update_time(sbi, DISABLE_TIME); - gc_mode = sbi->gc_mode; sbi->gc_mode = GC_URGENT_HIGH; while (!f2fs_time_over(sbi, DISABLE_TIME)) { + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .init_gc_type = FG_GC, + .should_migrate_blocks = false, + .err_gc_skipped = true, + .nr_free_secs = 1 }; + f2fs_down_write(&sbi->gc_lock); - err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); + err = f2fs_gc(sbi, &gc_control); if (err == -ENODATA) { err = 0; break; @@ -2094,6 +2105,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) goto restore_flag; } +skip_gc: f2fs_down_write(&sbi->gc_lock); cpc.reason = CP_PAUSE; set_sbi_flag(sbi, SBI_CP_DISABLED); @@ -2684,7 +2696,8 @@ int f2fs_quota_sync(struct super_block *sb, int type) if (!sb_has_quota_active(sb, cnt)) continue; - inode_lock(dqopt->files[cnt]); + if (!f2fs_sb_has_quota_ino(sbi)) + inode_lock(dqopt->files[cnt]); /* * do_quotactl @@ -2703,7 +2716,8 @@ int f2fs_quota_sync(struct super_block *sb, int type) f2fs_up_read(&sbi->quota_sem); f2fs_unlock_op(sbi); - inode_unlock(dqopt->files[cnt]); + if (!f2fs_sb_has_quota_ino(sbi)) + inode_unlock(dqopt->files[cnt]); if (ret) break; @@ -3648,22 +3662,29 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) struct block_device *bdev = FDEV(devi).bdev; sector_t nr_sectors = bdev_nr_sectors(bdev); struct f2fs_report_zones_args rep_zone_arg; + u64 zone_sectors; int ret; if (!f2fs_sb_has_blkzoned(sbi)) return 0; + zone_sectors = bdev_zone_sectors(bdev); + if (!is_power_of_2(zone_sectors)) { + f2fs_err(sbi, "F2FS does not support non power of 2 zone sizes\n"); + return -EINVAL; + } + if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != - SECTOR_TO_BLOCK(bdev_zone_sectors(bdev))) + SECTOR_TO_BLOCK(zone_sectors)) return -EINVAL; - sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)); + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(zone_sectors); if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != __ilog2_u32(sbi->blocks_per_blkz)) return -EINVAL; sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> sbi->log_blocks_per_blkz; - if (nr_sectors & (bdev_zone_sectors(bdev) - 1)) + if (nr_sectors & (zone_sectors - 1)) FDEV(devi).nr_blkz++; FDEV(devi).blkz_seq = f2fs_kvzalloc(sbi, @@ -4070,30 +4091,9 @@ try_onemore: set_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); - for (i = 0; i < NR_PAGE_TYPE; i++) { - int n = (i == META) ? 
1 : NR_TEMP_TYPE; - int j; - - sbi->write_io[i] = - f2fs_kmalloc(sbi, - array_size(n, - sizeof(struct f2fs_bio_info)), - GFP_KERNEL); - if (!sbi->write_io[i]) { - err = -ENOMEM; - goto free_bio_info; - } - - for (j = HOT; j < n; j++) { - init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem); - sbi->write_io[i][j].sbi = sbi; - sbi->write_io[i][j].bio = NULL; - spin_lock_init(&sbi->write_io[i][j].io_lock); - INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); - INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); - init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock); - } - } + err = f2fs_init_write_merge_io(sbi); + if (err) + goto free_bio_info; init_f2fs_rwsem(&sbi->cp_rwsem); init_f2fs_rwsem(&sbi->quota_sem); diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 65395ae188aa..7b8f2b41c29b 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -129,7 +129,7 @@ static int f2fs_begin_enable_verity(struct file *filp) if (f2fs_verity_in_progress(inode)) return -EBUSY; - if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode)) + if (f2fs_is_atomic_file(inode)) return -EOPNOTSUPP; /* diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 02d4d4234956..a415c02ede39 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -126,6 +126,7 @@ struct msdos_inode_info { struct hlist_node i_fat_hash; /* hash by i_location */ struct hlist_node i_dir_hash; /* hash by i_logstart */ struct rw_semaphore truncate_lock; /* protect bmap against truncate */ + struct timespec64 i_crtime; /* File creation (birth) time */ struct inode vfs_inode; }; @@ -433,8 +434,15 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...); __fat_fs_error(sb, 1, fmt , ## args) #define fat_fs_error_ratelimit(sb, fmt, args...) \ __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args) + +#define FAT_PRINTK_PREFIX "%sFAT-fs (%s): " +#define fat_msg(sb, level, fmt, args...) \ +do { \ + printk_index_subsys_emit(FAT_PRINTK_PREFIX, level, fmt, ##args);\ + _fat_msg(sb, level, fmt, ##args); \ +} while (0) __printf(3, 4) __cold -void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...); +void _fat_msg(struct super_block *sb, const char *level, const char *fmt, ...); #define fat_msg_ratelimit(sb, level, fmt, args...) 
\ do { \ if (__ratelimit(&MSDOS_SB(sb)->ratelimit)) \ @@ -446,6 +454,10 @@ extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 __time, __le16 __date, u8 time_cs); extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 *time, __le16 *date, u8 *time_cs); +extern struct timespec64 fat_truncate_atime(const struct msdos_sb_info *sbi, + const struct timespec64 *ts); +extern struct timespec64 fat_truncate_mtime(const struct msdos_sb_info *sbi, + const struct timespec64 *ts); extern int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags); extern int fat_update_time(struct inode *inode, struct timespec64 *now, diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 978ac6751aeb..1db348f8f887 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -94,7 +94,8 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent, err_brelse: brelse(bhs[0]); err: - fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr); + fat_msg_ratelimit(sb, KERN_ERR, "FAT read failed (blocknr %llu)", + (llu)blocknr); return -EIO; } @@ -107,8 +108,8 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent, fatent->fat_inode = MSDOS_SB(sb)->fat_inode; fatent->bhs[0] = sb_bread(sb, blocknr); if (!fatent->bhs[0]) { - fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", - (llu)blocknr); + fat_msg_ratelimit(sb, KERN_ERR, "FAT read failed (blocknr %llu)", + (llu)blocknr); return -EIO; } fatent->nr_bhs = 1; diff --git a/fs/fat/file.c b/fs/fat/file.c index bf91f977debe..3dae3ed60f3a 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -398,13 +398,21 @@ int fat_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + generic_fillattr(mnt_userns, inode, stat); - stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size; + stat->blksize = sbi->cluster_size; - if (MSDOS_SB(inode->i_sb)->options.nfs == FAT_NFS_NOSTALE_RO) { + if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) { /* Use i_pos for ino. This is used as fileid of nfs. 
*/ - stat->ino = fat_i_pos_read(MSDOS_SB(inode->i_sb), inode); + stat->ino = fat_i_pos_read(sbi, inode); } + + if (sbi->options.isvfat && request_mask & STATX_BTIME) { + stat->result_mask |= STATX_BTIME; + stat->btime = MSDOS_I(inode)->i_crtime; + } + return 0; } EXPORT_SYMBOL_GPL(fat_getattr); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 69b4d4ae64d7..a38238d75c08 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -567,12 +567,13 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) & ~((loff_t)sbi->cluster_size - 1)) >> 9; fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0); + inode->i_ctime = inode->i_mtime; if (sbi->options.isvfat) { - fat_time_fat2unix(sbi, &inode->i_ctime, de->ctime, - de->cdate, de->ctime_cs); fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0); + fat_time_fat2unix(sbi, &MSDOS_I(inode)->i_crtime, de->ctime, + de->cdate, de->ctime_cs); } else - fat_truncate_time(inode, &inode->i_mtime, S_ATIME|S_CTIME); + inode->i_atime = fat_truncate_atime(sbi, &inode->i_mtime); return 0; } @@ -757,6 +758,8 @@ static struct inode *fat_alloc_inode(struct super_block *sb) ei->i_logstart = 0; ei->i_attrs = 0; ei->i_pos = 0; + ei->i_crtime.tv_sec = 0; + ei->i_crtime.tv_nsec = 0; return &ei->vfs_inode; } @@ -888,10 +891,10 @@ retry: &raw_entry->date, NULL); if (sbi->options.isvfat) { __le16 atime; - fat_time_unix2fat(sbi, &inode->i_ctime, &raw_entry->ctime, - &raw_entry->cdate, &raw_entry->ctime_cs); fat_time_unix2fat(sbi, &inode->i_atime, &atime, &raw_entry->adate, NULL); + fat_time_unix2fat(sbi, &MSDOS_I(inode)->i_crtime, &raw_entry->ctime, + &raw_entry->cdate, &raw_entry->ctime_cs); } spin_unlock(&sbi->inode_hash_lock); mark_buffer_dirty(bh); @@ -1885,10 +1888,8 @@ out_invalid: fat_msg(sb, KERN_INFO, "Can't find a valid FAT filesystem"); out_fail: - if (fsinfo_inode) - iput(fsinfo_inode); - if (fat_inode) - iput(fat_inode); + iput(fsinfo_inode); + iput(fat_inode); unload_nls(sbi->nls_io); unload_nls(sbi->nls_disk); fat_reset_iocharset(&sbi->options); diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 91ca3c304211..7e5d6ae305f2 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -42,10 +42,16 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) EXPORT_SYMBOL_GPL(__fat_fs_error); /** - * fat_msg() - print preformated FAT specific messages. Every thing what is - * not fat_fs_error() should be fat_msg(). + * _fat_msg() - Print a preformatted FAT message based on a superblock. + * @sb: A pointer to a &struct super_block + * @level: A Kernel printk level constant + * @fmt: The printf-style format string to print. + * + * Everything that is not fat_fs_error() should be fat_msg(). + * + * fat_msg() wraps _fat_msg() for printk indexing. */ -void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) +void _fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) { struct va_format vaf; va_list args; @@ -53,7 +59,7 @@ void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk("%sFAT-fs (%s): %pV\n", level, sb->s_id, &vaf); + _printk(FAT_PRINTK_PREFIX "%pV\n", level, sb->s_id, &vaf); va_end(args); } @@ -187,7 +193,7 @@ static long days_in_year[] = { 0, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, }; -static inline int fat_tz_offset(struct msdos_sb_info *sbi) +static inline int fat_tz_offset(const struct msdos_sb_info *sbi) { return (sbi->options.tz_set ? 
-sbi->options.time_offset : @@ -275,23 +281,35 @@ static inline struct timespec64 fat_timespec64_trunc_2secs(struct timespec64 ts) return (struct timespec64){ ts.tv_sec & ~1ULL, 0 }; } -static inline struct timespec64 fat_timespec64_trunc_10ms(struct timespec64 ts) +/* + * truncate atime to 24 hour granularity (00:00:00 in local timezone) + */ +struct timespec64 fat_truncate_atime(const struct msdos_sb_info *sbi, + const struct timespec64 *ts) +{ + /* to localtime */ + time64_t seconds = ts->tv_sec - fat_tz_offset(sbi); + s32 remainder; + + div_s64_rem(seconds, SECS_PER_DAY, &remainder); + /* to day boundary, and back to unix time */ + seconds = seconds + fat_tz_offset(sbi) - remainder; + + return (struct timespec64){ seconds, 0 }; +} + +/* + * truncate mtime to 2 second granularity + */ +struct timespec64 fat_truncate_mtime(const struct msdos_sb_info *sbi, + const struct timespec64 *ts) { - if (ts.tv_nsec) - ts.tv_nsec -= ts.tv_nsec % 10000000UL; - return ts; + return fat_timespec64_trunc_2secs(*ts); } /* * truncate the various times with appropriate granularity: - * root inode: - * all times always 0 - * all other inodes: - * mtime - 2 seconds - * ctime - * msdos - 2 seconds - * vfat - 10 milliseconds - * atime - 24 hours (00:00:00 in local timezone) + * all times in root node are always 0 */ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags) { @@ -306,25 +324,15 @@ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags) ts = current_time(inode); } - if (flags & S_ATIME) { - /* to localtime */ - time64_t seconds = now->tv_sec - fat_tz_offset(sbi); - s32 remainder; - - div_s64_rem(seconds, SECS_PER_DAY, &remainder); - /* to day boundary, and back to unix time */ - seconds = seconds + fat_tz_offset(sbi) - remainder; - - inode->i_atime = (struct timespec64){ seconds, 0 }; - } - if (flags & S_CTIME) { - if (sbi->options.isvfat) - inode->i_ctime = fat_timespec64_trunc_10ms(*now); - else - inode->i_ctime = fat_timespec64_trunc_2secs(*now); - } + if (flags & S_ATIME) + inode->i_atime = fat_truncate_atime(sbi, now); + /* + * ctime and mtime share the same on-disk field, and should be + * identical in memory. all mtime updates will be applied to ctime, + * but ctime updates are ignored. + */ if (flags & S_MTIME) - inode->i_mtime = fat_timespec64_trunc_2secs(*now); + inode->i_mtime = inode->i_ctime = fat_truncate_mtime(sbi, now); return 0; } diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 5369d82e0bfb..c573314806cf 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -780,8 +780,6 @@ static int vfat_create(struct user_namespace *mnt_userns, struct inode *dir, goto out; } inode_inc_iversion(inode); - fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME); - /* timestamp is already written, so mark_inode_dirty() is unneeded. */ d_instantiate(dentry, inode); out: @@ -878,8 +876,6 @@ static int vfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir, } inode_inc_iversion(inode); set_nlink(inode, 2); - fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME); - /* timestamp is already written, so mark_inode_dirty() is unneeded. 
*/ d_instantiate(dentry, inode); diff --git a/fs/fcntl.c b/fs/fcntl.c index f15d885b9796..34a3faa4886d 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -56,11 +56,10 @@ static int setfl(int fd, struct file * filp, unsigned long arg) arg |= O_NONBLOCK; /* Pipe packetized mode is controlled by O_DIRECT flag */ - if (!S_ISFIFO(inode->i_mode) && (arg & O_DIRECT)) { - if (!filp->f_mapping || !filp->f_mapping->a_ops || - !filp->f_mapping->a_ops->direct_IO) - return -EINVAL; - } + if (!S_ISFIFO(inode->i_mode) && + (arg & O_DIRECT) && + !(filp->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; if (filp->f_op->check_flags) error = filp->f_op->check_flags(arg); diff --git a/fs/file.c b/fs/file.c index ee9317346702..3bcc1ecc314a 100644 --- a/fs/file.c +++ b/fs/file.c @@ -630,32 +630,23 @@ EXPORT_SYMBOL(fd_install); * @files: file struct to retrieve file from * @fd: file descriptor to retrieve file for * - * If this functions returns an EINVAL error pointer the fd was beyond the - * current maximum number of file descriptors for that fdtable. + * Context: files_lock must be held. * - * Returns: The file associated with @fd, on error returns an error pointer. + * Returns: The file associated with @fd (NULL if @fd is not open) */ static struct file *pick_file(struct files_struct *files, unsigned fd) { + struct fdtable *fdt = files_fdtable(files); struct file *file; - struct fdtable *fdt; - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - if (fd >= fdt->max_fds) { - file = ERR_PTR(-EINVAL); - goto out_unlock; - } + if (fd >= fdt->max_fds) + return NULL; + file = fdt->fd[fd]; - if (!file) { - file = ERR_PTR(-EBADF); - goto out_unlock; + if (file) { + rcu_assign_pointer(fdt->fd[fd], NULL); + __put_unused_fd(files, fd); } - rcu_assign_pointer(fdt->fd[fd], NULL); - __put_unused_fd(files, fd); - -out_unlock: - spin_unlock(&files->file_lock); return file; } @@ -664,8 +655,10 @@ int close_fd(unsigned fd) struct files_struct *files = current->files; struct file *file; + spin_lock(&files->file_lock); file = pick_file(files, fd); - if (IS_ERR(file)) + spin_unlock(&files->file_lock); + if (!file) return -EBADF; return filp_close(file, files); @@ -702,20 +695,25 @@ static inline void __range_cloexec(struct files_struct *cur_fds, static inline void __range_close(struct files_struct *cur_fds, unsigned int fd, unsigned int max_fd) { + unsigned n; + + rcu_read_lock(); + n = last_fd(files_fdtable(cur_fds)); + rcu_read_unlock(); + max_fd = min(max_fd, n); + while (fd <= max_fd) { struct file *file; + spin_lock(&cur_fds->file_lock); file = pick_file(cur_fds, fd++); - if (!IS_ERR(file)) { + spin_unlock(&cur_fds->file_lock); + + if (file) { /* found a valid file to close */ filp_close(file, cur_fds); cond_resched(); - continue; } - - /* beyond the last fd in that table */ - if (PTR_ERR(file) == -EINVAL) - return; } } @@ -795,43 +793,25 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) * See close_fd_get_file() below, this variant assumes current->files->file_lock * is held. 
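The pick_file()/__range_close() rework above is the kernel side of close_range(2). A hedged userspace sketch of the call it services (assumes glibc 2.34+ for the close_range() wrapper; older systems need syscall(__NR_close_range, ...)):

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Close every descriptor from 3 upward, e.g. before exec'ing
         * a helper; each fd in range goes through pick_file() above.
         * Flags of 0 mean plain close; CLOSE_RANGE_CLOEXEC would only
         * mark them close-on-exec instead. */
        if (close_range(3, ~0U, 0) != 0) {
                perror("close_range");
                return 1;
        }
        return 0;
}

With the patch, __range_close() clamps max_fd to the fdtable's actual size up front, so a huge upper bound like ~0U no longer has to be detected per-fd inside the loop.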
*/ -int __close_fd_get_file(unsigned int fd, struct file **res) +struct file *__close_fd_get_file(unsigned int fd) { - struct files_struct *files = current->files; - struct file *file; - struct fdtable *fdt; - - fdt = files_fdtable(files); - if (fd >= fdt->max_fds) - goto out_err; - file = fdt->fd[fd]; - if (!file) - goto out_err; - rcu_assign_pointer(fdt->fd[fd], NULL); - __put_unused_fd(files, fd); - get_file(file); - *res = file; - return 0; -out_err: - *res = NULL; - return -ENOENT; + return pick_file(current->files, fd); } /* * variant of close_fd that gets a ref on the file for later fput. - * The caller must ensure that filp_close() called on the file, and then - * an fput(). + * The caller must ensure that filp_close() called on the file. */ -int close_fd_get_file(unsigned int fd, struct file **res) +struct file *close_fd_get_file(unsigned int fd) { struct files_struct *files = current->files; - int ret; + struct file *file; spin_lock(&files->file_lock); - ret = __close_fd_get_file(fd, res); + file = pick_file(files, fd); spin_unlock(&files->file_lock); - return ret; + return file; } void do_close_on_exec(struct files_struct *files) @@ -871,7 +851,7 @@ void do_close_on_exec(struct files_struct *files) } static inline struct file *__fget_files_rcu(struct files_struct *files, - unsigned int fd, fmode_t mask, unsigned int refs) + unsigned int fd, fmode_t mask) { for (;;) { struct file *file; @@ -897,10 +877,9 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, * Such a race can take two forms: * * (a) the file ref already went down to zero, - * and get_file_rcu_many() fails. Just try - * again: + * and get_file_rcu() fails. Just try again: */ - if (unlikely(!get_file_rcu_many(file, refs))) + if (unlikely(!get_file_rcu(file))) continue; /* @@ -909,11 +888,11 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, * pointer having changed, because it always goes * hand-in-hand with 'fdt'. * - * If so, we need to put our refs and try again. + * If so, we need to put our ref and try again. 
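The retry logic this hunk trims down to a single reference is a classic lockless pattern: take a reference only while the count is still non-zero, then confirm the slot still points at the same object. A generic C11-atomics rendition follows (illustrative userspace code compiled with -std=c11, not the kernel implementation; all names here are made up):

#include <stdatomic.h>
#include <stddef.h>

struct ref_obj {
        atomic_long count;
};

/* Succeed only if the object has not already dropped to zero refs. */
static _Bool try_get(struct ref_obj *o)
{
        long c = atomic_load(&o->count);

        while (c > 0)
                if (atomic_compare_exchange_weak(&o->count, &c, c + 1))
                        return 1;
        return 0;
}

static struct ref_obj *lookup(struct ref_obj *_Atomic *slot)
{
        for (;;) {
                struct ref_obj *o = atomic_load(slot);

                if (!o)
                        return NULL;
                if (!try_get(o))
                        continue;               /* died under us: retry */
                if (atomic_load(slot) != o) {   /* slot was reassigned */
                        atomic_fetch_sub(&o->count, 1);
                        continue;               /* put our ref, retry */
                }
                return o;
        }
}

int main(void)
{
        static struct ref_obj file = { 1 };
        static struct ref_obj *_Atomic slot = &file;

        return lookup(&slot) ? 0 : 1;
}

The kernel version layers RCU on top so the object cannot be freed in the window between loading the slot and attempting the refcount.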
*/ if (unlikely(rcu_dereference_raw(files->fdt) != fdt) || unlikely(rcu_dereference_raw(*fdentry) != file)) { - fput_many(file, refs); + fput(file); continue; } @@ -926,37 +905,31 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, } static struct file *__fget_files(struct files_struct *files, unsigned int fd, - fmode_t mask, unsigned int refs) + fmode_t mask) { struct file *file; rcu_read_lock(); - file = __fget_files_rcu(files, fd, mask, refs); + file = __fget_files_rcu(files, fd, mask); rcu_read_unlock(); return file; } -static inline struct file *__fget(unsigned int fd, fmode_t mask, - unsigned int refs) -{ - return __fget_files(current->files, fd, mask, refs); -} - -struct file *fget_many(unsigned int fd, unsigned int refs) +static inline struct file *__fget(unsigned int fd, fmode_t mask) { - return __fget(fd, FMODE_PATH, refs); + return __fget_files(current->files, fd, mask); } struct file *fget(unsigned int fd) { - return __fget(fd, FMODE_PATH, 1); + return __fget(fd, FMODE_PATH); } EXPORT_SYMBOL(fget); struct file *fget_raw(unsigned int fd) { - return __fget(fd, 0, 1); + return __fget(fd, 0); } EXPORT_SYMBOL(fget_raw); @@ -966,7 +939,7 @@ struct file *fget_task(struct task_struct *task, unsigned int fd) task_lock(task); if (task->files) - file = __fget_files(task->files, fd, 0, 1); + file = __fget_files(task->files, fd, 0); task_unlock(task); return file; @@ -1035,7 +1008,7 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask) return 0; return (unsigned long)file; } else { - file = __fget(fd, mask, 1); + file = __fget(fd, mask); if (!file) return 0; return FDPUT_FPUT | (unsigned long)file; diff --git a/fs/file_table.c b/fs/file_table.c index ada8fe814db9..5424e3a8df5f 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -368,9 +368,9 @@ EXPORT_SYMBOL_GPL(flush_delayed_fput); static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); -void fput_many(struct file *file, unsigned int refs) +void fput(struct file *file) { - if (atomic_long_sub_and_test(refs, &file->f_count)) { + if (atomic_long_dec_and_test(&file->f_count)) { struct task_struct *task = current; if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { @@ -389,11 +389,6 @@ void fput_many(struct file *file, unsigned int refs) } } -void fput(struct file *file) -{ - fput_many(file, 1); -} - /* * synchronous analog of fput(); for kernel threads that might be needed * in some umount() (and thus can't use flush_delayed_fput() without diff --git a/fs/freevxfs/vxfs.h b/fs/freevxfs/vxfs.h index a41ea0ba6943..bffd156d6434 100644 --- a/fs/freevxfs/vxfs.h +++ b/fs/freevxfs/vxfs.h @@ -1,32 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2000-2001 Christoph Hellwig. * Copyright (c) 2016 Krzysztof Blaszkowski - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * */ #ifndef _VXFS_SUPER_H_ #define _VXFS_SUPER_H_ diff --git a/fs/freevxfs/vxfs_bmap.c b/fs/freevxfs/vxfs_bmap.c index 1fd41cf98b9f..de2a5bccb930 100644 --- a/fs/freevxfs/vxfs_bmap.c +++ b/fs/freevxfs/vxfs_bmap.c @@ -1,30 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2000-2001 Christoph Hellwig. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. */ /* diff --git a/fs/freevxfs/vxfs_dir.h b/fs/freevxfs/vxfs_dir.h index acc5477b3f23..fbcd603365ad 100644 --- a/fs/freevxfs/vxfs_dir.h +++ b/fs/freevxfs/vxfs_dir.h @@ -1,31 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2000-2001 Christoph Hellwig. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * */ #ifndef _VXFS_DIR_H_ #define _VXFS_DIR_H_ diff --git a/fs/freevxfs/vxfs_extern.h b/fs/freevxfs/vxfs_extern.h index f5c428e21024..3a2180c5e208 100644 --- a/fs/freevxfs/vxfs_extern.h +++ b/fs/freevxfs/vxfs_extern.h @@ -1,31 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2000-2001 Christoph Hellwig. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * */ #ifndef _VXFS_EXTERN_H_ #define _VXFS_EXTERN_H_ diff --git a/fs/freevxfs/vxfs_fshead.c b/fs/freevxfs/vxfs_fshead.c index a4610a77649e..c1174a3f8990 100644 --- a/fs/freevxfs/vxfs_fshead.c +++ b/fs/freevxfs/vxfs_fshead.c @@ -1,31 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2000-2001 Christoph Hellwig. * Copyright (c) 2016 Krzysztof Blaszkowski - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. */ /* diff --git a/fs/freevxfs/vxfs_fshead.h b/fs/freevxfs/vxfs_fshead.h index e026f0c49159..dfd2147599c4 100644 --- a/fs/freevxfs/vxfs_fshead.h +++ b/fs/freevxfs/vxfs_fshead.h @@ -1,32 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2000-2001 Christoph Hellwig. * Copyright (c) 2016 Krzysztof Blaszkowski - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * */ #ifndef _VXFS_FSHEAD_H_ #define _VXFS_FSHEAD_H_ diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c index a37431e443d3..c2ef9f0debbd 100644 --- a/fs/freevxfs/vxfs_immed.c +++ b/fs/freevxfs/vxfs_immed.c @@ -1,30 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2000-2001 Christoph Hellwig. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. */ /* diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 1f41b25ef38b..ceb6a12649ba 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -1,31 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2000-2001 Christoph Hellwig. * Copyright (c) 2016 Krzysztof Blaszkowski - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. */ /* diff --git a/fs/freevxfs/vxfs_inode.h b/fs/freevxfs/vxfs_inode.h index f012abed125d..1e9e138d2b33 100644 --- a/fs/freevxfs/vxfs_inode.h +++ b/fs/freevxfs/vxfs_inode.h @@ -1,32 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2000-2001 Christoph Hellwig. * Copyright (c) 2016 Krzysztof Blaszkowski - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * */ #ifndef _VXFS_INODE_H_ #define _VXFS_INODE_H_ diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index a51425634f65..f04ba2ed1e1a 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -1,31 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2000-2001 Christoph Hellwig. * Copyright (c) 2016 Krzysztof Blaszkowski - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. */ /* diff --git a/fs/freevxfs/vxfs_olt.c b/fs/freevxfs/vxfs_olt.c index 813da6685151..23f35187c289 100644 --- a/fs/freevxfs/vxfs_olt.c +++ b/fs/freevxfs/vxfs_olt.c @@ -1,30 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2000-2001 Christoph Hellwig. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. */ /* diff --git a/fs/freevxfs/vxfs_olt.h b/fs/freevxfs/vxfs_olt.h index 0c0b0c9fa557..53afba08d617 100644 --- a/fs/freevxfs/vxfs_olt.h +++ b/fs/freevxfs/vxfs_olt.h @@ -1,31 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2000-2001 Christoph Hellwig. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * */ #ifndef _VXFS_OLT_H_ #define _VXFS_OLT_H_ diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c index 6143ebab940d..0e633d2bfc7d 100644 --- a/fs/freevxfs/vxfs_subr.c +++ b/fs/freevxfs/vxfs_subr.c @@ -1,30 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2000-2001 Christoph Hellwig. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. */ /* diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 22eed5a73ac2..c3b82f716f9a 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -1,31 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2000-2001 Christoph Hellwig. * Copyright (c) 2016 Krzysztof Blaszkowski - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. */ /* diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index a1074a26e784..05221366a16d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -120,6 +120,7 @@ static bool inode_io_list_move_locked(struct inode *inode, struct list_head *head) { assert_spin_locked(&wb->list_lock); + assert_spin_locked(&inode->i_lock); list_move(&inode->i_io_list, head); @@ -738,7 +739,7 @@ EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode); * incorrectly attributed). * * To resolve this issue, cgroup writeback detects the majority dirtier of - * an inode and transfers the ownership to it. To avoid unnnecessary + * an inode and transfers the ownership to it. To avoid unnecessary * oscillation, the detection mechanism keeps track of history and gives * out the switch verdict only if the foreign usage pattern is stable over * a certain amount of time and/or writeback attempts. 
@@ -1365,9 +1366,9 @@ static int move_expired_inodes(struct list_head *delaying_queue, inode = wb_inode(delaying_queue->prev); if (inode_dirtied_after(inode, dirtied_before)) break; + spin_lock(&inode->i_lock); list_move(&inode->i_io_list, &tmp); moved++; - spin_lock(&inode->i_lock); inode->i_state |= I_SYNC_QUEUED; spin_unlock(&inode->i_lock); if (sb_is_blkdev_sb(inode->i_sb)) @@ -1383,7 +1384,12 @@ static int move_expired_inodes(struct list_head *delaying_queue, goto out; } - /* Move inodes from one superblock together */ + /* + * Although inode's i_io_list is moved from 'tmp' to 'dispatch_queue', + * we don't take inode->i_lock here because it is just a pointless overhead. + * Inode is already marked as I_SYNC_QUEUED so writeback list handling is + * fully under our control. + */ while (!list_empty(&tmp)) { sb = wb_inode(tmp.prev)->i_sb; list_for_each_prev_safe(pos, node, &tmp) { @@ -1826,8 +1832,8 @@ static long writeback_sb_inodes(struct super_block *sb, * We'll have another go at writing back this inode * when we completed a full scan of b_io. */ - spin_unlock(&inode->i_lock); requeue_io(inode, wb); + spin_unlock(&inode->i_lock); trace_writeback_sb_inodes_requeue(inode); continue; } @@ -2358,6 +2364,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; int dirtytime = 0; + struct bdi_writeback *wb = NULL; trace_writeback_mark_inode_dirty(inode, flags); @@ -2410,13 +2417,24 @@ void __mark_inode_dirty(struct inode *inode, int flags) inode->i_state |= flags; /* + * Grab inode's wb early because it requires dropping i_lock and we + * need to make sure following checks happen atomically with dirty + * list handling so that we don't move inodes under flush worker's + * hands. + */ + if (!was_dirty) { + wb = locked_inode_to_wb_and_lock_list(inode); + spin_lock(&inode->i_lock); + } + + /* * If the inode is queued for writeback by flush worker, just * update its dirty state. Once the flush worker is done with * the inode it will place it on the appropriate superblock * list, based upon its state. */ if (inode->i_state & I_SYNC_QUEUED) - goto out_unlock_inode; + goto out_unlock; /* * Only add valid (hashed) inodes to the superblock's @@ -2424,22 +2442,19 @@ void __mark_inode_dirty(struct inode *inode, int flags) */ if (!S_ISBLK(inode->i_mode)) { if (inode_unhashed(inode)) - goto out_unlock_inode; + goto out_unlock; } if (inode->i_state & I_FREEING) - goto out_unlock_inode; + goto out_unlock; /* * If the inode was already on b_dirty/b_io/b_more_io, don't * reposition it (that would break b_dirty time-ordering). 
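The "grab inode's wb early" comment above describes a drop-and-retake dance imposed by lock ordering: wb->list_lock ranks before inode->i_lock, so the code releases i_lock, takes the list lock, retakes i_lock, and only then redoes the checks that could have gone stale in between. A generic pthread illustration of the same pattern (not the writeback code itself; link with -lpthread):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

struct item {
        pthread_mutex_t lock;
        bool queued;                    /* protected by item->lock */
};

static void queue_item(struct item *it)
{
        pthread_mutex_lock(&it->lock);
        if (it->queued) {               /* fast path, nothing to do */
                pthread_mutex_unlock(&it->lock);
                return;
        }
        /* Ordering is list_lock -> item->lock, so drop and retake. */
        pthread_mutex_unlock(&it->lock);
        pthread_mutex_lock(&list_lock);
        pthread_mutex_lock(&it->lock);
        if (!it->queued) {              /* revalidate: state may have moved */
                it->queued = true;
                /* ...insert into the shared list here... */
        }
        pthread_mutex_unlock(&it->lock);
        pthread_mutex_unlock(&list_lock);
}

int main(void)
{
        struct item it = { PTHREAD_MUTEX_INITIALIZER, false };

        queue_item(&it);
        return !it.queued;
}

__mark_inode_dirty() does the same kind of revalidation for I_SYNC_QUEUED, I_FREEING and the unhashed check once both locks are held.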
*/ if (!was_dirty) { - struct bdi_writeback *wb; struct list_head *dirty_list; bool wakeup_bdi = false; - wb = locked_inode_to_wb_and_lock_list(inode); - inode->dirtied_when = jiffies; if (dirtytime) inode->dirtied_time_when = jiffies; @@ -2453,6 +2468,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) dirty_list); spin_unlock(&wb->list_lock); + spin_unlock(&inode->i_lock); trace_writeback_dirty_inode_enqueue(inode); /* @@ -2467,6 +2483,9 @@ void __mark_inode_dirty(struct inode *inode, int flags) return; } } +out_unlock: + if (wb) + spin_unlock(&wb->list_lock); out_unlock_inode: spin_unlock(&inode->i_lock); } diff --git a/fs/fsopen.c b/fs/fsopen.c index 27a890aa493a..fc9d2d9fd234 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -119,7 +119,7 @@ SYSCALL_DEFINE2(fsopen, const char __user *, _fs_name, unsigned int, flags) const char *fs_name; int ret; - if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN)) + if (!may_mount()) return -EPERM; if (flags & ~FSOPEN_CLOEXEC) @@ -162,7 +162,7 @@ SYSCALL_DEFINE3(fspick, int, dfd, const char __user *, path, unsigned int, flags unsigned int lookup_flags; int ret; - if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN)) + if (!may_mount()) return -EPERM; if ((flags & ~(FSPICK_CLOEXEC | diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index d7d3a7f06862..10eb50cbf398 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -1241,8 +1241,8 @@ static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd) INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker); id = dax_read_lock(); - nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), NULL, - NULL); + nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), + DAX_ACCESS, NULL, NULL); dax_read_unlock(id); if (nr_pages < 0) { pr_debug("dax_direct_access() returned %ld\n", nr_pages); diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 86b7dbb6a0d4..8db53fa67359 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -752,7 +752,8 @@ static void virtio_fs_cleanup_vqs(struct virtio_device *vdev, * offset. */ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) + long nr_pages, enum dax_access_mode mode, + void **kaddr, pfn_t *pfn) { struct virtio_fs *fs = dax_get_private(dax_dev); phys_addr_t offset = PFN_PHYS(pgoff); @@ -772,7 +773,8 @@ static int virtio_fs_zero_page_range(struct dax_device *dax_dev, long rc; void *kaddr; - rc = dax_direct_access(dax_dev, pgoff, nr_pages, &kaddr, NULL); + rc = dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS, &kaddr, + NULL); if (rc < 0) return rc; memset(kaddr, 0, nr_pages << PAGE_SHIFT); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index e540d1290099..0b36a16659b6 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -405,10 +405,13 @@ static void do_error(struct gfs2_glock *gl, const int ret) /** * demote_incompat_holders - demote incompatible demoteable holders * @gl: the glock we want to promote - * @new_gh: the new holder to be promoted + * @current_gh: the newly promoted holder + * + * We're passing the newly promoted holder in @current_gh, but actually, any of + * the strong holders would do. 
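The fsopen()/fspick() hunks above swap an open-coded ns_capable() test for may_mount(), the same gate the classic mount(2) path uses. A hedged sketch of calling fsopen(2) from userspace (there is no glibc wrapper, so this goes through raw syscall(2); assumes <sys/syscall.h> defines SYS_fsopen on this kernel):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define FSOPEN_CLOEXEC 0x00000001       /* from <linux/mount.h> */

int main(void)
{
        /* may_mount() is the check that fails here with EPERM for a
         * caller without CAP_SYS_ADMIN in its mount namespace. */
        int fsfd = syscall(SYS_fsopen, "tmpfs", FSOPEN_CLOEXEC);

        if (fsfd < 0) {
                perror("fsopen");
                return 1;
        }
        /* fsconfig() and fsmount() would follow to configure the
         * superblock and turn it into an attachable mount fd. */
        close(fsfd);
        return 0;
}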
*/ static void demote_incompat_holders(struct gfs2_glock *gl, - struct gfs2_holder *new_gh) + struct gfs2_holder *current_gh) { struct gfs2_holder *gh, *tmp; @@ -424,8 +427,10 @@ static void demote_incompat_holders(struct gfs2_glock *gl, */ if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) return; + if (gh == current_gh) + continue; if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags) && - !may_grant(gl, new_gh, gh)) { + !may_grant(gl, current_gh, gh)) { /* * We should not recurse into do_promote because * __gfs2_glock_dq only calls handle_callback, @@ -478,8 +483,7 @@ find_first_strong_holder(struct gfs2_glock *gl) * gfs2_instantiate - Call the glops instantiate function * @gh: The glock holder * - * Returns: 0 if instantiate was successful, 2 if type specific operation is - * underway, or error. + * Returns: 0 if instantiate was successful, or error. */ int gfs2_instantiate(struct gfs2_holder *gh) { @@ -489,7 +493,7 @@ int gfs2_instantiate(struct gfs2_holder *gh) again: if (!test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags)) - return 0; + goto done; /* * Since we unlock the lockref lock, we set a flag to indicate @@ -508,37 +512,36 @@ again: goto again; } - ret = glops->go_instantiate(gh); + ret = glops->go_instantiate(gl); if (!ret) clear_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags); clear_and_wake_up_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags); - return ret; + if (ret) + return ret; + +done: + if (glops->go_held) + return glops->go_held(gh); + return 0; } /** * do_promote - promote as many requests as possible on the current queue * @gl: The glock * - * Returns: 1 if there is a blocked holder at the head of the list, or 2 - * if a type specific operation is underway. + * Returns: 1 if there is a blocked holder at the head of the list */ static int do_promote(struct gfs2_glock *gl) -__releases(&gl->gl_lockref.lock) -__acquires(&gl->gl_lockref.lock) { - struct gfs2_holder *gh, *tmp, *first_gh; + struct gfs2_holder *gh, *current_gh; bool incompat_holders_demoted = false; - bool lock_released; - int ret; -restart: - first_gh = find_first_strong_holder(gl); - list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { - lock_released = false; + current_gh = find_first_strong_holder(gl); + list_for_each_entry(gh, &gl->gl_holders, gh_list) { if (test_bit(HIF_HOLDER, &gh->gh_iflags)) continue; - if (!may_grant(gl, first_gh, gh)) { + if (!may_grant(gl, current_gh, gh)) { /* * If we get here, it means we may not grant this * holder for some reason. If this holder is at the @@ -550,37 +553,14 @@ restart: do_error(gl, 0); break; } - if (!incompat_holders_demoted) { - demote_incompat_holders(gl, first_gh); - incompat_holders_demoted = true; - first_gh = gh; - } - if (test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags) && - !(gh->gh_flags & GL_SKIP) && gl->gl_ops->go_instantiate) { - lock_released = true; - spin_unlock(&gl->gl_lockref.lock); - ret = gfs2_instantiate(gh); - spin_lock(&gl->gl_lockref.lock); - if (ret) { - if (ret == 1) - return 2; - gh->gh_error = ret; - list_del_init(&gh->gh_list); - trace_gfs2_glock_queue(gh, 0); - gfs2_holder_wake(gh); - goto restart; - } - } set_bit(HIF_HOLDER, &gh->gh_iflags); trace_gfs2_promote(gh); gfs2_holder_wake(gh); - /* - * If we released the gl_lockref.lock the holders list may have - * changed. For that reason, we start again at the start of - * the holders queue. 
- */ - if (lock_released) - goto restart; + if (!incompat_holders_demoted) { + current_gh = gh; + demote_incompat_holders(gl, current_gh); + incompat_holders_demoted = true; + } } return 0; } @@ -658,7 +638,6 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) const struct gfs2_glock_operations *glops = gl->gl_ops; struct gfs2_holder *gh; unsigned state = ret & LM_OUT_ST_MASK; - int rv; spin_lock(&gl->gl_lockref.lock); trace_gfs2_glock_state_change(gl, state); @@ -716,6 +695,8 @@ retry: gfs2_demote_wake(gl); if (state != LM_ST_UNLOCKED) { if (glops->go_xmote_bh) { + int rv; + spin_unlock(&gl->gl_lockref.lock); rv = glops->go_xmote_bh(gl); spin_lock(&gl->gl_lockref.lock); @@ -724,13 +705,10 @@ retry: goto out; } } - rv = do_promote(gl); - if (rv == 2) - goto out_locked; + do_promote(gl); } out: clear_bit(GLF_LOCK, &gl->gl_flags); -out_locked: spin_unlock(&gl->gl_lockref.lock); } @@ -887,7 +865,6 @@ __releases(&gl->gl_lockref.lock) __acquires(&gl->gl_lockref.lock) { struct gfs2_holder *gh = NULL; - int ret; if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) return; @@ -906,18 +883,14 @@ __acquires(&gl->gl_lockref.lock) } else { if (test_bit(GLF_DEMOTE, &gl->gl_flags)) gfs2_demote_wake(gl); - ret = do_promote(gl); - if (ret == 0) + if (do_promote(gl) == 0) goto out_unlock; - if (ret == 2) - goto out; gh = find_first_waiter(gl); gl->gl_target = gh->gh_state; if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) do_error(gl, 0); /* Fail queued try locks */ } do_xmote(gl, gh, gl->gl_target); -out: return; out_sched: @@ -1315,6 +1288,25 @@ static void gfs2_glock_update_hold_time(struct gfs2_glock *gl, } /** + * gfs2_glock_holder_ready - holder is ready and its error code can be collected + * @gh: the glock holder + * + * Called when a glock holder no longer needs to be waited for because it is + * now either held (HIF_HOLDER set; gh_error == 0), or acquiring the lock has + * failed (gh_error != 0). + */ + +int gfs2_glock_holder_ready(struct gfs2_holder *gh) +{ + if (gh->gh_error || (gh->gh_flags & GL_SKIP)) + return gh->gh_error; + gh->gh_error = gfs2_instantiate(gh); + if (gh->gh_error) + gfs2_glock_dq(gh); + return gh->gh_error; +} + +/** * gfs2_glock_wait - wait on a glock acquisition * @gh: the glock holder * @@ -1328,7 +1320,7 @@ int gfs2_glock_wait(struct gfs2_holder *gh) might_sleep(); wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE); gfs2_glock_update_hold_time(gh->gh_gl, start_time); - return gh->gh_error; + return gfs2_glock_holder_ready(gh); } static int glocks_pending(unsigned int num_gh, struct gfs2_holder *ghs) @@ -1356,7 +1348,6 @@ int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs) struct gfs2_sbd *sdp = ghs[0].gh_gl->gl_name.ln_sbd; int i, ret = 0, timeout = 0; unsigned long start_time = jiffies; - bool keep_waiting; might_sleep(); /* @@ -1366,53 +1357,33 @@ int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs) for (i = 0; i < num_gh; i++) timeout += ghs[i].gh_gl->gl_hold_time << 1; -wait_for_dlm: if (!wait_event_timeout(sdp->sd_async_glock_wait, - !glocks_pending(num_gh, ghs), timeout)) + !glocks_pending(num_gh, ghs), timeout)) { ret = -ESTALE; /* request timed out. */ + goto out; + } - /* - * If dlm granted all our requests, we need to adjust the glock - * minimum hold time values according to how long we waited. - * - * If our request timed out, we need to repeatedly release any held - * glocks we acquired thus far to allow dlm to acquire the remaining - * glocks without deadlocking. 
We cannot currently cancel outstanding - * glock acquisitions. - * - * The HIF_WAIT bit tells us which requests still need a response from - * dlm. - * - * If dlm sent us any errors, we return the first error we find. - */ - keep_waiting = false; for (i = 0; i < num_gh; i++) { - /* Skip holders we have already dequeued below. */ - if (!gfs2_holder_queued(&ghs[i])) - continue; - /* Skip holders with a pending DLM response. */ - if (test_bit(HIF_WAIT, &ghs[i].gh_iflags)) { - keep_waiting = true; - continue; - } + struct gfs2_holder *gh = &ghs[i]; + int ret2; - if (test_bit(HIF_HOLDER, &ghs[i].gh_iflags)) { - if (ret == -ESTALE) - gfs2_glock_dq(&ghs[i]); - else - gfs2_glock_update_hold_time(ghs[i].gh_gl, - start_time); + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) { + gfs2_glock_update_hold_time(gh->gh_gl, + start_time); } + ret2 = gfs2_glock_holder_ready(gh); if (!ret) - ret = ghs[i].gh_error; + ret = ret2; } - if (keep_waiting) - goto wait_for_dlm; +out: + if (ret) { + for (i = 0; i < num_gh; i++) { + struct gfs2_holder *gh = &ghs[i]; - /* - * At this point, we've either acquired all locks or released them all. - */ + gfs2_glock_dq(gh); + } + } return ret; } @@ -1491,10 +1462,10 @@ __acquires(&gl->gl_lockref.lock) if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { if (test_bit(GLF_LOCK, &gl->gl_flags)) { - struct gfs2_holder *first_gh; + struct gfs2_holder *current_gh; - first_gh = find_first_strong_holder(gl); - try_futile = !may_grant(gl, first_gh, gh); + current_gh = find_first_strong_holder(gl); + try_futile = !may_grant(gl, current_gh, gh); } if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) goto fail; @@ -2242,20 +2213,6 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) glock_hash_walk(dump_glock_func, sdp); } -void gfs2_glock_finish_truncate(struct gfs2_inode *ip) -{ - struct gfs2_glock *gl = ip->i_gl; - int ret; - - ret = gfs2_truncatei_resume(ip); - gfs2_glock_assert_withdraw(gl, ret == 0); - - spin_lock(&gl->gl_lockref.lock); - clear_bit(GLF_LOCK, &gl->gl_flags); - run_queue(gl, 1); - spin_unlock(&gl->gl_lockref.lock); -} - static const char *state2str(unsigned state) { switch(state) { diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index c0ae9100a0bc..5aed8b500cf5 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -213,6 +213,7 @@ extern void gfs2_holder_uninit(struct gfs2_holder *gh); extern int gfs2_glock_nq(struct gfs2_holder *gh); extern int gfs2_glock_poll(struct gfs2_holder *gh); extern int gfs2_instantiate(struct gfs2_holder *gh); +extern int gfs2_glock_holder_ready(struct gfs2_holder *gh); extern int gfs2_glock_wait(struct gfs2_holder *gh); extern int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs); extern void gfs2_glock_dq(struct gfs2_holder *gh); @@ -273,7 +274,6 @@ extern void gfs2_cancel_delete_work(struct gfs2_glock *gl); extern bool gfs2_delete_work_queued(const struct gfs2_glock *gl); extern void gfs2_flush_delete_work(struct gfs2_sbd *sdp); extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); -extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip); extern void gfs2_glock_thaw(struct gfs2_sbd *sdp); extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl); extern void gfs2_glock_free(struct gfs2_glock *gl); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 392800f082a6..49210a2e7ce7 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -485,35 +485,33 @@ int gfs2_inode_refresh(struct gfs2_inode *ip) * Returns: errno */ -static int inode_go_instantiate(struct gfs2_holder *gh) +static int inode_go_instantiate(struct gfs2_glock 
*gl) +{ + struct gfs2_inode *ip = gl->gl_object; + + if (!ip) /* no inode to populate - read it in later */ + return 0; + + return gfs2_inode_refresh(ip); +} + +static int inode_go_held(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; - struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_inode *ip = gl->gl_object; int error = 0; if (!ip) /* no inode to populate - read it in later */ - goto out; - - error = gfs2_inode_refresh(ip); - if (error) - goto out; + return 0; if (gh->gh_state != LM_ST_DEFERRED) inode_dio_wait(&ip->i_inode); if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) && (gl->gl_state == LM_ST_EXCLUSIVE) && - (gh->gh_state == LM_ST_EXCLUSIVE)) { - spin_lock(&sdp->sd_trunc_lock); - if (list_empty(&ip->i_trunc_list)) - list_add(&ip->i_trunc_list, &sdp->sd_trunc_list); - spin_unlock(&sdp->sd_trunc_lock); - wake_up(&sdp->sd_quota_wait); - error = 1; - } + (gh->gh_state == LM_ST_EXCLUSIVE)) + error = gfs2_truncatei_resume(ip); -out: return error; } @@ -737,6 +735,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = { .go_inval = inode_go_inval, .go_demote_ok = inode_go_demote_ok, .go_instantiate = inode_go_instantiate, + .go_held = inode_go_held, .go_dump = inode_go_dump, .go_type = LM_TYPE_INODE, .go_flags = GLOF_ASPACE | GLOF_LRU | GLOF_LVB, diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 8c00fb389ae5..d09d9892cd05 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -219,7 +219,8 @@ struct gfs2_glock_operations { int (*go_xmote_bh)(struct gfs2_glock *gl); void (*go_inval) (struct gfs2_glock *gl, int flags); int (*go_demote_ok) (const struct gfs2_glock *gl); - int (*go_instantiate) (struct gfs2_holder *gh); + int (*go_instantiate) (struct gfs2_glock *gl); + int (*go_held)(struct gfs2_holder *gh); void (*go_dump)(struct seq_file *seq, struct gfs2_glock *gl, const char *fs_id_buf); void (*go_callback)(struct gfs2_glock *gl, bool remote); @@ -396,7 +397,6 @@ struct gfs2_inode { atomic_t i_sizehint; /* hint of the write size */ struct rw_semaphore i_rw_mutex; struct list_head i_ordered; - struct list_head i_trunc_list; __be64 *i_hash_cache; u32 i_entries; u32 i_diskflags; @@ -784,8 +784,6 @@ struct gfs2_sbd { struct mutex sd_quota_mutex; struct mutex sd_quota_sync_mutex; wait_queue_head_t sd_quota_wait; - struct list_head sd_trunc_list; - spinlock_t sd_trunc_lock; unsigned int sd_quota_slots; unsigned long *sd_quota_bitmap; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 244187e3e70f..d94791527dcb 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -38,7 +38,6 @@ static void gfs2_init_inode_once(void *foo) inode_init_once(&ip->i_inode); atomic_set(&ip->i_sizehint, 0); init_rwsem(&ip->i_rw_mutex); - INIT_LIST_HEAD(&ip->i_trunc_list); INIT_LIST_HEAD(&ip->i_ordered); ip->i_qadata = NULL; gfs2_holder_mark_uninitialized(&ip->i_rgd_gh); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index c9b423c874a3..549879929c84 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -106,8 +106,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) mutex_init(&sdp->sd_quota_mutex); mutex_init(&sdp->sd_quota_sync_mutex); init_waitqueue_head(&sdp->sd_quota_wait); - INIT_LIST_HEAD(&sdp->sd_trunc_list); - spin_lock_init(&sdp->sd_trunc_lock); spin_lock_init(&sdp->sd_bitmap_lock); INIT_LIST_HEAD(&sdp->sd_sc_inodes_list); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 59d727a4ae2c..a6667e8d781f 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1517,25 +1517,6 @@ static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg, } } -static 
void quotad_check_trunc_list(struct gfs2_sbd *sdp) -{ - struct gfs2_inode *ip; - - while(1) { - ip = NULL; - spin_lock(&sdp->sd_trunc_lock); - if (!list_empty(&sdp->sd_trunc_list)) { - ip = list_first_entry(&sdp->sd_trunc_list, - struct gfs2_inode, i_trunc_list); - list_del_init(&ip->i_trunc_list); - } - spin_unlock(&sdp->sd_trunc_lock); - if (ip == NULL) - return; - gfs2_glock_finish_truncate(ip); - } -} - void gfs2_wake_up_statfs(struct gfs2_sbd *sdp) { if (!sdp->sd_statfs_force_sync) { sdp->sd_statfs_force_sync = 1; @@ -1558,7 +1539,6 @@ int gfs2_quotad(void *data) unsigned long quotad_timeo = 0; unsigned long t = 0; DEFINE_WAIT(wait); - int empty; while (!kthread_should_stop()) { @@ -1579,19 +1559,13 @@ int gfs2_quotad(void *data) quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t, "ad_timeo, &tune->gt_quota_quantum); - /* Check for & recover partially truncated inodes */ - quotad_check_trunc_list(sdp); - try_to_freeze(); bypass: t = min(quotad_timeo, statfs_timeo); prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE); - spin_lock(&sdp->sd_trunc_lock); - empty = list_empty(&sdp->sd_trunc_list); - spin_unlock(&sdp->sd_trunc_lock); - if (empty && !sdp->sd_statfs_force_sync) + if (!sdp->sd_statfs_force_sync) t -= schedule_timeout(t); else t = 0; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index b9b8faefbe84..f602fb844951 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1196,9 +1196,8 @@ static void rgrp_set_bitmap_flags(struct gfs2_rgrpd *rgd) * Returns: errno */ -int gfs2_rgrp_go_instantiate(struct gfs2_holder *gh) +int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl) { - struct gfs2_glock *gl = gh->gh_gl; struct gfs2_rgrpd *rgd = gl->gl_object; struct gfs2_sbd *sdp = rgd->rd_sbd; unsigned int length = rgd->rd_length; diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index fd6ee8bfe18d..00b30cf893af 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -31,7 +31,7 @@ extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); extern int gfs2_rindex_update(struct gfs2_sbd *sdp); extern void gfs2_free_clones(struct gfs2_rgrpd *rgd); -extern int gfs2_rgrp_go_instantiate(struct gfs2_holder *gh); +extern int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl); extern void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd); extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index bdb773e5c88f..b5b0f285b27f 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1196,7 +1196,7 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode) gfs2_glock_dq(gh); return false; } - return true; + return gfs2_glock_holder_ready(gh) == 0; } /** diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 2de9ca5d260d..02eb72351b15 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -195,7 +195,6 @@ out: * Called under mmap_write_lock(mm). 
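The fs/hugetlbfs/inode.c hunk starting here lifts the unmapped-area search into generic_hugetlb_get_unmapped_area() so architectures can reuse it. The simplest userspace trigger for that path is an anonymous huge mapping; a minimal sketch, assuming hugepages have been reserved (e.g. via /proc/sys/vm/nr_hugepages) and a 2 MiB default hugepage size:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

#define LEN (2UL * 1024 * 1024)         /* one default-size hugepage */

int main(void)
{
        /* With addr == NULL the kernel places the mapping via the
         * get_unmapped_area path reworked in this hunk. */
        void *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap(MAP_HUGETLB)");
                return 1;
        }
        printf("hugepage mapping at %p\n", p);
        munmap(p, LEN);
        return 0;
}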
*/ -#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) @@ -206,7 +205,7 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, info.flags = 0; info.length = len; info.low_limit = current->mm->mmap_base; - info.high_limit = arch_get_mmap_end(addr); + info.high_limit = arch_get_mmap_end(addr, len, flags); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); @@ -237,21 +236,22 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = current->mm->mmap_base; - info.high_limit = arch_get_mmap_end(addr); + info.high_limit = arch_get_mmap_end(addr, len, flags); addr = vm_unmapped_area(&info); } return addr; } -static unsigned long -hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) +unsigned long +generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct hstate *h = hstate_file(file); - const unsigned long mmap_end = arch_get_mmap_end(addr); + const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); if (len & ~huge_page_mask(h)) return -EINVAL; @@ -283,6 +283,15 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return hugetlb_get_unmapped_area_bottomup(file, addr, len, pgoff, flags); } + +#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA +static unsigned long +hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags); +} #endif static size_t @@ -405,7 +414,8 @@ static void remove_huge_page(struct page *page) } static void -hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) +hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, + zap_flags_t zap_flags) { struct vm_area_struct *vma; @@ -439,7 +449,7 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) } unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, - NULL); + NULL, zap_flags); } } @@ -517,7 +527,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vmdelete_list(&mapping->i_mmap, index * pages_per_huge_page(h), - (index + 1) * pages_per_huge_page(h)); + (index + 1) * pages_per_huge_page(h), + ZAP_FLAG_DROP_MARKER); i_mmap_unlock_write(mapping); } @@ -583,46 +594,85 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset) i_mmap_lock_write(mapping); i_size_write(inode, offset); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) - hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0); + hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0, + ZAP_FLAG_DROP_MARKER); i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, offset, LLONG_MAX); } +static void hugetlbfs_zero_partial_page(struct hstate *h, + struct address_space *mapping, + loff_t start, + loff_t end) +{ + pgoff_t idx = start >> huge_page_shift(h); + struct folio *folio; + + folio = filemap_lock_folio(mapping, idx); + if (!folio) + return; + + start = start & ~huge_page_mask(h); + end = end & ~huge_page_mask(h); + if (!end) + end = 
huge_page_size(h); + + folio_zero_segment(folio, (size_t)start, (size_t)end); + + folio_unlock(folio); + folio_put(folio); +} + static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); + struct address_space *mapping = inode->i_mapping; struct hstate *h = hstate_inode(inode); loff_t hpage_size = huge_page_size(h); loff_t hole_start, hole_end; /* - * For hole punch round up the beginning offset of the hole and - * round down the end. + * hole_start and hole_end indicate the full pages within the hole. */ hole_start = round_up(offset, hpage_size); hole_end = round_down(offset + len, hpage_size); - if (hole_end > hole_start) { - struct address_space *mapping = inode->i_mapping; - struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); + inode_lock(inode); - inode_lock(inode); + /* protected by i_rwsem */ + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { + inode_unlock(inode); + return -EPERM; + } - /* protected by i_rwsem */ - if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { - inode_unlock(inode); - return -EPERM; - } + i_mmap_lock_write(mapping); - i_mmap_lock_write(mapping); + /* If range starts before first full page, zero partial page. */ + if (offset < hole_start) + hugetlbfs_zero_partial_page(h, mapping, + offset, min(offset + len, hole_start)); + + /* Unmap users of full pages in the hole. */ + if (hole_end > hole_start) { if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, - hole_start >> PAGE_SHIFT, - hole_end >> PAGE_SHIFT); - i_mmap_unlock_write(mapping); - remove_inode_hugepages(inode, hole_start, hole_end); - inode_unlock(inode); + hole_start >> PAGE_SHIFT, + hole_end >> PAGE_SHIFT, 0); } + /* If range extends beyond last full page, zero partial page. */ + if ((offset + len) > hole_end && (offset + len) > hole_start) + hugetlbfs_zero_partial_page(h, mapping, + hole_end, offset + len); + + i_mmap_unlock_write(mapping); + + /* Remove full pages from the file. 
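+ *
+ * (Hedged worked example, not patch text: with 2 MiB huge pages,
+ * punching offset = 3 MiB, len = 6 MiB gives
+ *
+ *	hole_start = round_up(3 MiB, 2 MiB)   = 4 MiB
+ *	hole_end   = round_down(9 MiB, 2 MiB) = 8 MiB
+ *
+ * so hugetlbfs_zero_partial_page() zeroes [3 MiB, 4 MiB) and
+ * [8 MiB, 9 MiB) in place, while only the full pages in [4 MiB, 8 MiB)
+ * are unmapped above and removed below.)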
*/ + if (hole_end > hole_start) + remove_inode_hugepages(inode, hole_start, hole_end); + + inode_unlock(inode); + return 0; } @@ -1048,12 +1098,12 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) if (sbinfo->spool) { long free_pages; - spin_lock(&sbinfo->spool->lock); + spin_lock_irq(&sbinfo->spool->lock); buf->f_blocks = sbinfo->spool->max_hpages; free_pages = sbinfo->spool->max_hpages - sbinfo->spool->used_hpages; buf->f_bavail = buf->f_bfree = free_pages; - spin_unlock(&sbinfo->spool->lock); + spin_unlock_irq(&sbinfo->spool->lock); buf->f_files = sbinfo->max_inodes; buf->f_ffree = sbinfo->free_inodes; } diff --git a/fs/inode.c b/fs/inode.c index 9d9b422504d1..bd4da9c5207e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -27,7 +27,7 @@ * Inode locking rules: * * inode->i_lock protects: - * inode->i_state, inode->i_hash, __iget() + * inode->i_state, inode->i_hash, __iget(), inode->i_io_list * Inode LRU list locks protect: * inode->i_sb->s_inode_lru, inode->i_lru * inode->i_sb->s_inode_list_lock protects: diff --git a/fs/internal.h b/fs/internal.h index 9a6c233ee7f1..87e96b9024ce 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -84,6 +84,7 @@ extern int __mnt_want_write_file(struct file *); extern void __mnt_drop_write_file(struct file *); extern void dissolve_on_fput(struct vfsmount *); +extern bool may_mount(void); int path_mount(const char *dev_name, struct path *path, const char *type_page, unsigned long flags, void *data_page); @@ -125,7 +126,7 @@ extern struct file *do_file_open_root(const struct path *, const char *, const struct open_flags *); extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); -extern int __close_fd_get_file(unsigned int fd, struct file **res); +extern struct file *__close_fd_get_file(unsigned int fd); long do_sys_ftruncate(unsigned int fd, loff_t length, int small); int chmod_common(const struct path *path, umode_t mode); diff --git a/fs/io_uring.c b/fs/io_uring.c index 9f1c682d7caf..5ff2cdb425bc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -112,7 +112,8 @@ IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS) #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ - REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA) + REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ + REQ_F_ASYNC_DATA) #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ IO_REQ_CLEAN_FLAGS) @@ -297,8 +298,8 @@ struct io_buffer_list { /* below is for ring provided buffers */ __u16 buf_nr_pages; __u16 nr_entries; - __u32 head; - __u32 mask; + __u16 head; + __u16 mask; }; struct io_buffer { @@ -540,6 +541,7 @@ struct io_uring_task { const struct io_ring_ctx *last; struct io_wq *io_wq; struct percpu_counter inflight; + atomic_t inflight_tracked; atomic_t in_idle; spinlock_t task_lock; @@ -781,12 +783,6 @@ struct io_msg { u32 len; }; -struct io_nop { - struct file *file; - u64 extra1; - u64 extra2; -}; - struct io_async_connect { struct sockaddr_storage address; }; @@ -848,6 +844,7 @@ enum { REQ_F_SINGLE_POLL_BIT, REQ_F_DOUBLE_POLL_BIT, REQ_F_PARTIAL_IO_BIT, + REQ_F_CQE32_INIT_BIT, REQ_F_APOLL_MULTISHOT_BIT, /* keep async read/write and isreg together and in order */ REQ_F_SUPPORT_NOWAIT_BIT, @@ -917,6 +914,8 @@ enum { REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), /* fast poll multishot mode */ REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), + /* ->extra1 and ->extra2 are initialised */ + REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT), }; struct 
async_poll { @@ -991,7 +990,6 @@ struct io_kiocb { struct io_msg msg; struct io_xattr xattr; struct io_socket sock; - struct io_nop nop; struct io_uring_cmd uring_cmd; }; @@ -1118,7 +1116,6 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_NOP] = { .audit_skip = 1, .iopoll = 1, - .buffer_select = 1, }, [IORING_OP_READV] = { .needs_file = 1, @@ -1355,8 +1352,6 @@ static void io_clean_op(struct io_kiocb *req); static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); static struct file *io_file_get_normal(struct io_kiocb *req, int fd); -static void io_drop_inflight_file(struct io_kiocb *req); -static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags); static void io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); @@ -1366,7 +1361,9 @@ static int io_req_prep_async(struct io_kiocb *req); static int io_install_fixed_file(struct io_kiocb *req, struct file *file, unsigned int issue_flags, u32 slot_index); -static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); +static int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, + unsigned int offset); +static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); static void io_eventfd_signal(struct io_ring_ctx *ctx); @@ -1726,9 +1723,16 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return; - /* don't recycle if we already did IO to this buffer */ - if (req->flags & REQ_F_PARTIAL_IO) + /* + * For legacy provided buffer mode, don't recycle if we already did + * IO to this buffer. For ring-mapped provided buffer mode, we should + * increment ring->head to explicitly monopolize the buffer to avoid + * multiple use. + */ + if ((req->flags & REQ_F_BUFFER_SELECTED) && + (req->flags & REQ_F_PARTIAL_IO)) return; + /* * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear * the flag and hence ensure that bl->head doesn't get incremented. 
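 *
 * (Hedged aside, not patch text: ring-provided buffers form a
 * single-producer/single-consumer ring. The application publishes
 * entries by advancing br->tail, the kernel consumes them by advancing
 * bl->head, and selection treats
 *
 *	if (unlikely(smp_load_acquire(&br->tail) == head))
 *		return NULL;	// ring empty
 *
 * as the empty test, as in io_ring_buffer_select() below. Recycling a
 * buffer means not committing the head advance; the partial-IO branch
 * added below commits it instead, permanently consuming the entry so
 * it cannot be handed out twice.)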
@@ -1736,8 +1740,13 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) */ if (req->flags & REQ_F_BUFFER_RING) { if (req->buf_list) { - req->buf_index = req->buf_list->bgid; - req->flags &= ~REQ_F_BUFFER_RING; + if (req->flags & REQ_F_PARTIAL_IO) { + req->buf_list->head++; + req->buf_list = NULL; + } else { + req->buf_index = req->buf_list->bgid; + req->flags &= ~REQ_F_BUFFER_RING; + } } return; } @@ -1757,9 +1766,29 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task, bool cancel_all) __must_hold(&req->ctx->timeout_lock) { + struct io_kiocb *req; + if (task && head->task != task) return false; - return cancel_all; + if (cancel_all) + return true; + + io_for_each_link(req, head) { + if (req->flags & REQ_F_INFLIGHT) + return true; + } + return false; +} + +static bool io_match_linked(struct io_kiocb *head) +{ + struct io_kiocb *req; + + io_for_each_link(req, head) { + if (req->flags & REQ_F_INFLIGHT) + return true; + } + return false; } /* @@ -1769,9 +1798,24 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task, static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all) { + bool matched; + if (task && head->task != task) return false; - return cancel_all; + if (cancel_all) + return true; + + if (head->flags & REQ_F_LINK_TIMEOUT) { + struct io_ring_ctx *ctx = head->ctx; + + /* protect against races with linked timeouts */ + spin_lock_irq(&ctx->timeout_lock); + matched = io_match_linked(head); + spin_unlock_irq(&ctx->timeout_lock); + } else { + matched = io_match_linked(head); + } + return matched; } static inline bool req_has_async_data(struct io_kiocb *req) @@ -1927,6 +1971,14 @@ static inline bool io_req_ffs_set(struct io_kiocb *req) return req->flags & REQ_F_FIXED_FILE; } +static inline void io_req_track_inflight(struct io_kiocb *req) +{ + if (!(req->flags & REQ_F_INFLIGHT)) { + req->flags |= REQ_F_INFLIGHT; + atomic_inc(&req->task->io_uring->inflight_tracked); + } +} + static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) { if (WARN_ON_ONCE(!req->link)) @@ -2395,94 +2447,66 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, return true; } -static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags) +static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, + struct io_kiocb *req) { struct io_uring_cqe *cqe; - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. - */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { - WRITE_ONCE(cqe->user_data, user_data); - WRITE_ONCE(cqe->res, res); - WRITE_ONCE(cqe->flags, cflags); - return true; - } - return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); -} + if (!(ctx->flags & IORING_SETUP_CQE32)) { + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags, 0, 0); -static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - struct io_uring_cqe *cqe; + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. 
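+ *
+ * (Hedged aside, not patch text: with IORING_SETUP_CQE32 each CQE is
+ * 32 bytes instead of 16, roughly
+ *
+ *	struct io_uring_cqe {
+ *		__u64	user_data;
+ *		__s32	res;
+ *		__u32	flags;
+ *		__u64	big_cqe[];	// two extra u64s in CQE32 mode
+ *	};
+ *
+ * with extra1/extra2 landing in big_cqe[0]/big_cqe[1]. After this patch
+ * they are only meaningful when REQ_F_CQE32_INIT was set at completion
+ * time; otherwise zeroes are written.)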
+ */ + cqe = io_get_cqe(ctx); + if (likely(cqe)) { + memcpy(cqe, &req->cqe, sizeof(*cqe)); + return true; + } - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, 0, 0); + return io_cqring_event_overflow(ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + 0, 0); + } else { + u64 extra1 = 0, extra2 = 0; - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. - */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { - memcpy(cqe, &req->cqe, sizeof(*cqe)); - return true; - } - return io_cqring_event_overflow(ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, 0, 0); -} + if (req->flags & REQ_F_CQE32_INIT) { + extra1 = req->extra1; + extra2 = req->extra2; + } -static inline bool __io_fill_cqe32_req_filled(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - struct io_uring_cqe *cqe; - u64 extra1 = req->extra1; - u64 extra2 = req->extra2; + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags, extra1, extra2); - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, extra1, extra2); + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. + */ + cqe = io_get_cqe(ctx); + if (likely(cqe)) { + memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe)); + WRITE_ONCE(cqe->big_cqe[0], extra1); + WRITE_ONCE(cqe->big_cqe[1], extra2); + return true; + } - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. - */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { - memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe)); - cqe->big_cqe[0] = extra1; - cqe->big_cqe[1] = extra2; - return true; + return io_cqring_event_overflow(ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + extra1, extra2); } - - return io_cqring_event_overflow(ctx, req->cqe.user_data, req->cqe.res, - req->cqe.flags, extra1, extra2); } -static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) -{ - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags, 0, 0); - return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags); -} - -static inline void __io_fill_cqe32_req(struct io_kiocb *req, s32 res, u32 cflags, - u64 extra1, u64 extra2) +static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, + s32 res, u32 cflags) { - struct io_ring_ctx *ctx = req->ctx; struct io_uring_cqe *cqe; - if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32))) - return; - if (req->flags & REQ_F_CQE_SKIP) - return; - - trace_io_uring_complete(ctx, req, req->cqe.user_data, res, cflags, - extra1, extra2); + ctx->cq_extra++; + trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); /* * If we can't get a cq entry, userspace overflowed the @@ -2491,23 +2515,17 @@ static inline void __io_fill_cqe32_req(struct io_kiocb *req, s32 res, u32 cflags */ cqe = io_get_cqe(ctx); if (likely(cqe)) { - WRITE_ONCE(cqe->user_data, req->cqe.user_data); + WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); - WRITE_ONCE(cqe->big_cqe[0], extra1); - WRITE_ONCE(cqe->big_cqe[1], extra2); - return; - } - - io_cqring_event_overflow(ctx, req->cqe.user_data, res, cflags, extra1, extra2); -} -static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags) -{ - ctx->cq_extra++; - 
trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); - return __io_fill_cqe(ctx, user_data, res, cflags); + if (ctx->flags & IORING_SETUP_CQE32) { + WRITE_ONCE(cqe->big_cqe[0], 0); + WRITE_ONCE(cqe->big_cqe[1], 0); + } + return true; + } + return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); } static void __io_req_complete_put(struct io_kiocb *req) @@ -2544,16 +2562,11 @@ static void __io_req_complete_put(struct io_kiocb *req) static void __io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags) { - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(req, res, cflags); - __io_req_complete_put(req); -} - -static void __io_req_complete_post32(struct io_kiocb *req, s32 res, - u32 cflags, u64 extra1, u64 extra2) -{ - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe32_req(req, res, cflags, extra1, extra2); + if (!(req->flags & REQ_F_CQE_SKIP)) { + req->cqe.res = res; + req->cqe.flags = cflags; + __io_fill_cqe_req(req->ctx, req); + } __io_req_complete_put(req); } @@ -2568,18 +2581,6 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags) io_cqring_ev_posted(ctx); } -static void io_req_complete_post32(struct io_kiocb *req, s32 res, - u32 cflags, u64 extra1, u64 extra2) -{ - struct io_ring_ctx *ctx = req->ctx; - - spin_lock(&ctx->completion_lock); - __io_req_complete_post32(req, res, cflags, extra1, extra2); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); -} - static inline void io_req_complete_state(struct io_kiocb *req, s32 res, u32 cflags) { @@ -2597,19 +2598,6 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, io_req_complete_post(req, res, cflags); } -static inline void __io_req_complete32(struct io_kiocb *req, - unsigned int issue_flags, s32 res, - u32 cflags, u64 extra1, u64 extra2) -{ - if (issue_flags & IO_URING_F_COMPLETE_DEFER) { - io_req_complete_state(req, res, cflags); - req->extra1 = extra1; - req->extra2 = extra2; - } else { - io_req_complete_post32(req, res, cflags, extra1, extra2); - } -} - static inline void io_req_complete(struct io_kiocb *req, s32 res) { if (res < 0) @@ -2988,8 +2976,6 @@ static void __io_req_task_work_add(struct io_kiocb *req, unsigned long flags; bool running; - io_drop_inflight_file(req); - spin_lock_irqsave(&tctx->task_lock, flags); wq_list_add_tail(&req->io_task_work.node, list); running = tctx->task_running; @@ -3158,12 +3144,8 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); - if (!(req->flags & REQ_F_CQE_SKIP)) { - if (!(ctx->flags & IORING_SETUP_CQE32)) - __io_fill_cqe_req_filled(ctx, req); - else - __io_fill_cqe32_req_filled(ctx, req); - } + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe_req(ctx, req); } io_commit_cqring(ctx); @@ -3282,7 +3264,9 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) nr_events++; if (unlikely(req->flags & REQ_F_CQE_SKIP)) continue; - __io_fill_cqe_req(req, req->cqe.res, io_put_kbuf(req, 0)); + + req->cqe.flags = io_put_kbuf(req, 0); + __io_fill_cqe_req(req->ctx, req); } if (unlikely(!nr_events)) @@ -3453,7 +3437,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) if (unlikely(res != req->cqe.res)) { if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) { - req->flags |= REQ_F_REISSUE; + req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; return true; } req_set_fail(req); @@ -3503,7 +3487,7 @@ static void 
io_complete_rw_iopoll(struct kiocb *kiocb, long res) kiocb_end_write(req); if (unlikely(res != req->cqe.res)) { if (res == -EAGAIN && io_rw_should_reissue(req)) { - req->flags |= REQ_F_REISSUE; + req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; return; } req->cqe.res = res; @@ -3633,6 +3617,20 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) int ret; kiocb->ki_pos = READ_ONCE(sqe->off); + /* used for fixed read/write too - just read unconditionally */ + req->buf_index = READ_ONCE(sqe->buf_index); + + if (req->opcode == IORING_OP_READ_FIXED || + req->opcode == IORING_OP_WRITE_FIXED) { + struct io_ring_ctx *ctx = req->ctx; + u16 index; + + if (unlikely(req->buf_index >= ctx->nr_user_bufs)) + return -EFAULT; + index = array_index_nospec(req->buf_index, ctx->nr_user_bufs); + req->imu = ctx->user_bufs[index]; + io_req_set_rsrc_node(req, ctx, 0); + } ioprio = READ_ONCE(sqe->ioprio); if (ioprio) { @@ -3645,12 +3643,9 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) kiocb->ki_ioprio = get_current_ioprio(); } - req->imu = NULL; req->rw.addr = READ_ONCE(sqe->addr); req->rw.len = READ_ONCE(sqe->len); req->rw.flags = READ_ONCE(sqe->rw_flags); - /* used for fixed read/write too - just read unconditionally */ - req->buf_index = READ_ONCE(sqe->buf_index); return 0; } @@ -3782,20 +3777,9 @@ static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, unsigned int issue_flags) { - struct io_mapped_ubuf *imu = req->imu; - u16 index, buf_index = req->buf_index; - - if (likely(!imu)) { - struct io_ring_ctx *ctx = req->ctx; - - if (unlikely(buf_index >= ctx->nr_user_bufs)) - return -EFAULT; - io_req_set_rsrc_node(req, ctx, issue_flags); - index = array_index_nospec(buf_index, ctx->nr_user_bufs); - imu = READ_ONCE(ctx->user_bufs[index]); - req->imu = imu; - } - return __io_import_fixed(req, rw, iter, imu); + if (WARN_ON_ONCE(!req->imu)) + return -EFAULT; + return __io_import_fixed(req, rw, iter, req->imu); } static int io_buffer_add_list(struct io_ring_ctx *ctx, @@ -3832,19 +3816,17 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, { struct io_uring_buf_ring *br = bl->buf_ring; struct io_uring_buf *buf; - __u32 head = bl->head; + __u16 head = bl->head; - if (unlikely(smp_load_acquire(&br->tail) == head)) { - io_ring_submit_unlock(req->ctx, issue_flags); + if (unlikely(smp_load_acquire(&br->tail) == head)) return NULL; - } head &= bl->mask; if (head < IO_BUFFER_LIST_BUF_PER_PAGE) { buf = &br->bufs[head]; } else { int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); - int index = head / IO_BUFFER_LIST_BUF_PER_PAGE - 1; + int index = head / IO_BUFFER_LIST_BUF_PER_PAGE; buf = page_address(bl->buf_pages[index]); buf += off; } @@ -3854,7 +3836,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, req->buf_list = bl; req->buf_index = buf->bid; - if (issue_flags & IO_URING_F_UNLOCKED) { + if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) { /* * If we came in unlocked, we have no choice but to consume the * buffer here. 
This does mean it'll be pinned until the IO @@ -4176,6 +4158,16 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw) return 0; } +static int io_readv_prep_async(struct io_kiocb *req) +{ + return io_rw_prep_async(req, READ); +} + +static int io_writev_prep_async(struct io_kiocb *req) +{ + return io_rw_prep_async(req, WRITE); +} + /* * This is our waitqueue callback handler, registered through __folio_lock_async() * when we initially tried to do the IO with the iocb armed our waitqueue. @@ -5025,10 +5017,18 @@ void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, req->uring_cmd.task_work_cb = task_work_cb; req->io_task_work.func = io_uring_cmd_work; - io_req_task_prio_work_add(req); + io_req_task_work_add(req); } EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task); +static inline void io_req_set_cqe32_extra(struct io_kiocb *req, + u64 extra1, u64 extra2) +{ + req->extra1 = extra1; + req->extra2 = extra2; + req->flags |= REQ_F_CQE32_INIT; +} + /* * Called by consumers of io_uring_cmd, if they originally returned * -EIOCBQUEUED upon receiving the command. @@ -5039,10 +5039,10 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) if (ret < 0) req_set_fail(req); + if (req->ctx->flags & IORING_SETUP_CQE32) - __io_req_complete32(req, 0, ret, 0, res2, 0); - else - io_req_complete(req, ret); + io_req_set_cqe32_extra(req, res2, 0); + io_req_complete(req, ret); } EXPORT_SYMBOL_GPL(io_uring_cmd_done); @@ -5103,42 +5103,6 @@ static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) return 0; } -static int io_shutdown_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ -#if defined(CONFIG_NET) - if (unlikely(sqe->off || sqe->addr || sqe->rw_flags || - sqe->buf_index || sqe->splice_fd_in)) - return -EINVAL; - - req->shutdown.how = READ_ONCE(sqe->len); - return 0; -#else - return -EOPNOTSUPP; -#endif -} - -static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) -{ -#if defined(CONFIG_NET) - struct socket *sock; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - ret = __sys_shutdown_sock(sock, req->shutdown.how); - io_req_complete(req, ret); - return 0; -#else - return -EOPNOTSUPP; -#endif -} - static int __io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -5240,14 +5204,6 @@ done: static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - /* - * If the ring is setup with CQE32, relay back addr/addr - */ - if (req->ctx->flags & IORING_SETUP_CQE32) { - req->nop.extra1 = READ_ONCE(sqe->addr); - req->nop.extra2 = READ_ONCE(sqe->addr2); - } - return 0; } @@ -5256,23 +5212,7 @@ static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) */ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) { - unsigned int cflags; - void __user *buf; - - if (req->flags & REQ_F_BUFFER_SELECT) { - size_t len = 1; - - buf = io_buffer_select(req, &len, issue_flags); - if (!buf) - return -ENOBUFS; - } - - cflags = io_put_kbuf(req, issue_flags); - if (!(req->ctx->flags & IORING_SETUP_CQE32)) - __io_req_complete(req, issue_flags, 0, cflags); - else - __io_req_complete32(req, issue_flags, 0, cflags, - req->nop.extra1, req->nop.extra2); + __io_req_complete(req, issue_flags, 0, 0); return 0; } @@ -5445,15 +5385,11 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx) unsigned long nr = ctx->nr_user_files; int ret; - if (table->alloc_hint >= nr) - table->alloc_hint 
= 0; - do { ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint); - if (ret != nr) { - table->alloc_hint = ret + 1; + if (ret != nr) return ret; - } + if (!table->alloc_hint) break; @@ -5464,6 +5400,10 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx) return -ENFILE; } +/* + * Note when io_fixed_fd_install() returns error value, it will ensure + * fput() is called correspondingly. + */ static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, struct file *file, unsigned int file_slot) { @@ -5471,26 +5411,24 @@ static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, struct io_ring_ctx *ctx = req->ctx; int ret; + io_ring_submit_lock(ctx, issue_flags); + if (alloc_slot) { - io_ring_submit_lock(ctx, issue_flags); ret = io_file_bitmap_get(ctx); - if (unlikely(ret < 0)) { - io_ring_submit_unlock(ctx, issue_flags); - return ret; - } - + if (unlikely(ret < 0)) + goto err; file_slot = ret; } else { file_slot--; } ret = io_install_fixed_file(req, file, issue_flags, file_slot); - if (alloc_slot) { - io_ring_submit_unlock(ctx, issue_flags); - if (!ret) - return file_slot; - } - + if (!ret && alloc_slot) + ret = file_slot; +err: + io_ring_submit_unlock(ctx, issue_flags); + if (unlikely(ret < 0)) + fput(file); return ret; } @@ -5990,7 +5928,7 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) struct files_struct *files = current->files; struct io_close *close = &req->close; struct fdtable *fdt; - struct file *file = NULL; + struct file *file; int ret = -EBADF; if (req->close.file_slot) { @@ -6008,7 +5946,6 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) lockdep_is_held(&files->file_lock)); if (!file || file->f_op == &io_uring_fops) { spin_unlock(&files->file_lock); - file = NULL; goto err; } @@ -6018,21 +5955,16 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; } - ret = __close_fd_get_file(close->fd, &file); + file = __close_fd_get_file(close->fd); spin_unlock(&files->file_lock); - if (ret < 0) { - if (ret == -ENOENT) - ret = -EBADF; + if (!file) goto err; - } /* No ->flush() or already async, safely close from here */ ret = filp_close(file, current->files); err: if (ret < 0) req_set_fail(req); - if (file) - fput(file); __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -6063,6 +5995,34 @@ static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) } #if defined(CONFIG_NET) +static int io_shutdown_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (unlikely(sqe->off || sqe->addr || sqe->rw_flags || + sqe->buf_index || sqe->splice_fd_in)) + return -EINVAL; + + req->shutdown.how = READ_ONCE(sqe->len); + return 0; +} + +static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) +{ + struct socket *sock; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + ret = __sys_shutdown_sock(sock, req->shutdown.how); + io_req_complete(req, ret); + return 0; +} + static bool io_net_retry(struct socket *sock, int flags) { if (!(flags & MSG_WAITALL)) @@ -6117,8 +6077,6 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(sqe->file_index)) return -EINVAL; - if (unlikely(sqe->addr2 || sqe->file_index)) - return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); @@ -6355,8 +6313,6 @@ static int io_recvmsg_prep(struct io_kiocb *req, const 
struct io_uring_sqe *sqe) if (unlikely(sqe->file_index)) return -EINVAL; - if (unlikely(sqe->addr2 || sqe->file_index)) - return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); @@ -6674,8 +6630,8 @@ static int io_socket(struct io_kiocb *req, unsigned int issue_flags) fd_install(fd, file); ret = fd; } else { - ret = io_install_fixed_file(req, file, issue_flags, - sock->file_slot - 1); + ret = io_fixed_fd_install(req, issue_flags, file, + sock->file_slot); } __io_req_complete(req, issue_flags, ret, 0); return 0; @@ -6767,6 +6723,7 @@ IO_NETOP_PREP_ASYNC(recvmsg); IO_NETOP_PREP_ASYNC(connect); IO_NETOP_PREP(accept); IO_NETOP_PREP(socket); +IO_NETOP_PREP(shutdown); IO_NETOP_FN(send); IO_NETOP_FN(recv); #endif /* CONFIG_NET */ @@ -6905,10 +6862,6 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) if (!req->cqe.res) { struct poll_table_struct pt = { ._key = req->apoll_events }; - unsigned flags = locked ? 0 : IO_URING_F_UNLOCKED; - - if (unlikely(!io_assign_file(req, flags))) - return -EBADF; req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; } @@ -6997,7 +6950,8 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) io_req_complete_failed(req, ret); } -static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events) +static void __io_poll_execute(struct io_kiocb *req, int mask, + __poll_t __maybe_unused events) { req->cqe.res = mask; /* @@ -7006,7 +6960,6 @@ static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events) * CPU. We want to avoid pulling in req->apoll->events for that * case. */ - req->apoll_events = events; if (req->opcode == IORING_OP_POLL_ADD) req->io_task_work.func = io_poll_task_func; else @@ -7157,6 +7110,8 @@ static int __io_arm_poll_handler(struct io_kiocb *req, io_init_poll_iocb(poll, mask, io_poll_wake); poll->file = req->file; + req->apoll_events = poll->events; + ipt->pt._key = mask; ipt->req = req; ipt->error = 0; @@ -7187,8 +7142,11 @@ static int __io_arm_poll_handler(struct io_kiocb *req, if (mask) { /* can't multishot if failed, just queue the event we've got */ - if (unlikely(ipt->error || !ipt->nr_entries)) + if (unlikely(ipt->error || !ipt->nr_entries)) { poll->events |= EPOLLONESHOT; + req->apoll_events |= EPOLLONESHOT; + ipt->error = 0; + } __io_poll_execute(req, mask, poll->events); return 0; } @@ -7250,6 +7208,7 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) mask |= EPOLLEXCLUSIVE; if (req->flags & REQ_F_POLLED) { apoll = req->apoll; + kfree(apoll->double_poll); } else if (!(issue_flags & IO_URING_F_UNLOCKED) && !list_empty(&ctx->apoll_cache)) { apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, @@ -7390,7 +7349,7 @@ static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); } -static int io_poll_update_prep(struct io_kiocb *req, +static int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_poll_update *upd = &req->poll_update; @@ -7435,7 +7394,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return -EINVAL; io_req_set_refcount(req); - req->apoll_events = poll->events = io_poll_parse_events(sqe, flags); + poll->events = io_poll_parse_events(sqe, flags); return 0; } @@ -7448,13 +7407,15 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) ipt.pt._qproc = io_poll_queue_proc; ret = __io_arm_poll_handler(req, &req->poll, &ipt, 
poll->events); + if (!ret && ipt.error) + req_set_fail(req); ret = ret ?: ipt.error; if (ret) __io_req_complete(req, issue_flags, ret, 0); return 0; } -static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) +static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) { struct io_cancel_data cd = { .data = req->poll_update.old_user_data, }; struct io_ring_ctx *ctx = req->ctx; @@ -7698,8 +7659,9 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) return 0; } -static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool is_timeout_link) +static int __io_timeout_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe, + bool is_timeout_link) { struct io_timeout_data *data; unsigned flags; @@ -7754,6 +7716,18 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } +static int io_timeout_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + return __io_timeout_prep(req, sqe, false); +} + +static int io_link_timeout_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + return __io_timeout_prep(req, sqe, true); +} + static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; @@ -7970,7 +7944,7 @@ done: return 0; } -static int io_rsrc_update_prep(struct io_kiocb *req, +static int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) @@ -7986,6 +7960,41 @@ static int io_rsrc_update_prep(struct io_kiocb *req, return 0; } +static int io_files_update_with_index_alloc(struct io_kiocb *req, + unsigned int issue_flags) +{ + __s32 __user *fds = u64_to_user_ptr(req->rsrc_update.arg); + unsigned int done; + struct file *file; + int ret, fd; + + for (done = 0; done < req->rsrc_update.nr_args; done++) { + if (copy_from_user(&fd, &fds[done], sizeof(fd))) { + ret = -EFAULT; + break; + } + + file = fget(fd); + if (!file) { + ret = -EBADF; + break; + } + ret = io_fixed_fd_install(req, issue_flags, file, + IORING_FILE_INDEX_ALLOC); + if (ret < 0) + break; + if (copy_to_user(&fds[done], &ret, sizeof(ret))) { + __io_close_fixed(req, issue_flags, ret); + ret = -EFAULT; + break; + } + } + + if (done) + return done; + return ret; +} + static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; @@ -7999,10 +8008,14 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) up.resv = 0; up.resv2 = 0; - io_ring_submit_lock(ctx, issue_flags); - ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, - &up, req->rsrc_update.nr_args); - io_ring_submit_unlock(ctx, issue_flags); + if (req->rsrc_update.offset == IORING_FILE_INDEX_ALLOC) { + ret = io_files_update_with_index_alloc(req, issue_flags); + } else { + io_ring_submit_lock(ctx, issue_flags); + ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, + &up, req->rsrc_update.nr_args); + io_ring_submit_unlock(ctx, issue_flags); + } if (ret < 0) req_set_fail(req); @@ -8025,7 +8038,7 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) case IORING_OP_POLL_ADD: return io_poll_add_prep(req, sqe); case IORING_OP_POLL_REMOVE: - return io_poll_update_prep(req, sqe); + return io_poll_remove_prep(req, sqe); case IORING_OP_FSYNC: return io_fsync_prep(req, sqe); case IORING_OP_SYNC_FILE_RANGE: @@ -8039,13 +8052,13 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) case 
IORING_OP_CONNECT: return io_connect_prep(req, sqe); case IORING_OP_TIMEOUT: - return io_timeout_prep(req, sqe, false); + return io_timeout_prep(req, sqe); case IORING_OP_TIMEOUT_REMOVE: return io_timeout_remove_prep(req, sqe); case IORING_OP_ASYNC_CANCEL: return io_async_cancel_prep(req, sqe); case IORING_OP_LINK_TIMEOUT: - return io_timeout_prep(req, sqe, true); + return io_link_timeout_prep(req, sqe); case IORING_OP_ACCEPT: return io_accept_prep(req, sqe); case IORING_OP_FALLOCATE: @@ -8055,7 +8068,7 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) case IORING_OP_CLOSE: return io_close_prep(req, sqe); case IORING_OP_FILES_UPDATE: - return io_rsrc_update_prep(req, sqe); + return io_files_update_prep(req, sqe); case IORING_OP_STATX: return io_statx_prep(req, sqe); case IORING_OP_FADVISE: @@ -8123,9 +8136,9 @@ static int io_req_prep_async(struct io_kiocb *req) switch (req->opcode) { case IORING_OP_READV: - return io_rw_prep_async(req, READ); + return io_readv_prep_async(req); case IORING_OP_WRITEV: - return io_rw_prep_async(req, WRITE); + return io_writev_prep_async(req); case IORING_OP_SENDMSG: return io_sendmsg_prep_async(req); case IORING_OP_RECVMSG: @@ -8264,6 +8277,11 @@ static void io_clean_op(struct io_kiocb *req) kfree(req->apoll); req->apoll = NULL; } + if (req->flags & REQ_F_INFLIGHT) { + struct io_uring_task *tctx = req->task->io_uring; + + atomic_dec(&tctx->inflight_tracked); + } if (req->flags & REQ_F_CREDS) put_cred(req->creds); if (req->flags & REQ_F_ASYNC_DATA) { @@ -8288,6 +8306,7 @@ static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) { + const struct io_op_def *def = &io_op_defs[req->opcode]; const struct cred *creds = NULL; int ret; @@ -8297,7 +8316,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) creds = override_creds(req->creds); - if (!io_op_defs[req->opcode].audit_skip) + if (!def->audit_skip) audit_uring_entry(req->opcode); switch (req->opcode) { @@ -8321,7 +8340,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) ret = io_poll_add(req, issue_flags); break; case IORING_OP_POLL_REMOVE: - ret = io_poll_update(req, issue_flags); + ret = io_poll_remove(req, issue_flags); break; case IORING_OP_SYNC_FILE_RANGE: ret = io_sync_file_range(req, issue_flags); @@ -8436,7 +8455,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) break; } - if (!io_op_defs[req->opcode].audit_skip) + if (!def->audit_skip) audit_uring_exit(!ret, ret); if (creds) @@ -8569,19 +8588,6 @@ out: return file; } -/* - * Drop the file for requeue operations. Only used of req->file is the - * io_uring descriptor itself. 
- */ -static void io_drop_inflight_file(struct io_kiocb *req) -{ - if (unlikely(req->flags & REQ_F_INFLIGHT)) { - fput(req->file); - req->file = NULL; - req->flags &= ~REQ_F_INFLIGHT; - } -} - static struct file *io_file_get_normal(struct io_kiocb *req, int fd) { struct file *file = fget(fd); @@ -8590,7 +8596,7 @@ static struct file *io_file_get_normal(struct io_kiocb *req, int fd) /* we don't allow fixed io_uring files */ if (file && file->f_op == &io_uring_fops) - req->flags |= REQ_F_INFLIGHT; + io_req_track_inflight(req); return file; } @@ -8688,6 +8694,7 @@ static void io_queue_async(struct io_kiocb *req, int ret) * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. */ + io_kbuf_recycle(req, 0); io_queue_iowq(req, NULL); break; case IO_APOLL_OK: @@ -8788,6 +8795,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) __must_hold(&ctx->uring_lock) { + const struct io_op_def *def; unsigned int sqe_flags; int personality; u8 opcode; @@ -8805,12 +8813,13 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->opcode = 0; return -EINVAL; } + def = &io_op_defs[opcode]; if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { /* enforce forwards compatibility on users */ if (sqe_flags & ~SQE_VALID_FLAGS) return -EINVAL; if (sqe_flags & IOSQE_BUFFER_SELECT) { - if (!io_op_defs[opcode].buffer_select) + if (!def->buffer_select) return -EOPNOTSUPP; req->buf_index = READ_ONCE(sqe->buf_group); } @@ -8836,12 +8845,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, } } - if (!io_op_defs[opcode].ioprio && sqe->ioprio) + if (!def->ioprio && sqe->ioprio) return -EINVAL; - if (!io_op_defs[opcode].iopoll && (ctx->flags & IORING_SETUP_IOPOLL)) + if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (io_op_defs[opcode].needs_file) { + if (def->needs_file) { struct io_submit_state *state = &ctx->submit_state; req->cqe.fd = READ_ONCE(sqe->fd); @@ -8850,7 +8859,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, * Plug now if we have more than 2 IO left after this, and the * target is potentially a read/write to block based storage. */ - if (state->need_plug && io_op_defs[opcode].plug) { + if (state->need_plug && def->plug) { state->plug_started = true; state->need_plug = false; blk_start_plug_nr_ios(&state->plug, state->submit_nr); @@ -9658,8 +9667,7 @@ static inline void io_file_bitmap_set(struct io_file_table *table, int bit) { WARN_ON_ONCE(test_bit(bit, table->bitmap)); __set_bit(bit, table->bitmap); - if (bit == table->alloc_hint) - table->alloc_hint++; + table->alloc_hint = bit + 1; } static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) @@ -9702,11 +9710,19 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { + unsigned nr = ctx->nr_user_files; int ret; if (!ctx->file_data) return -ENXIO; + + /* + * Quiesce may unlock ->uring_lock, and while it's not held + * prevent new requests using the table. 
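+ *
+ * (Hedged aside, not patch text: the save/zero/restore of
+ * ctx->nr_user_files below works because fixed-file lookups
+ * bounds-check against that counter first, e.g.
+ *
+ *	if (slot_index >= ctx->nr_user_files)
+ *		return -EINVAL;
+ *	slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
+ *
+ * so a request racing in while ->uring_lock is dropped inside
+ * io_rsrc_ref_quiesce() sees an apparently empty table and fails
+ * cleanly rather than touching a half-torn-down entry.)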
+ */ + ctx->nr_user_files = 0; ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); + ctx->nr_user_files = nr; if (!ret) __io_sqe_files_unregister(ctx); return ret; @@ -10113,21 +10129,19 @@ static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, static int io_install_fixed_file(struct io_kiocb *req, struct file *file, unsigned int issue_flags, u32 slot_index) + __must_hold(&req->ctx->uring_lock) { struct io_ring_ctx *ctx = req->ctx; bool needs_switch = false; struct io_fixed_file *file_slot; - int ret = -EBADF; + int ret; - io_ring_submit_lock(ctx, issue_flags); if (file->f_op == &io_uring_fops) - goto err; - ret = -ENXIO; + return -EBADF; if (!ctx->file_data) - goto err; - ret = -EINVAL; + return -ENXIO; if (slot_index >= ctx->nr_user_files) - goto err; + return -EINVAL; slot_index = array_index_nospec(slot_index, ctx->nr_user_files); file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); @@ -10158,15 +10172,14 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, err: if (needs_switch) io_rsrc_node_switch(ctx, ctx->file_data); - io_ring_submit_unlock(ctx, issue_flags); if (ret) fput(file); return ret; } -static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) +static int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, + unsigned int offset) { - unsigned int offset = req->close.file_slot - 1; struct io_ring_ctx *ctx = req->ctx; struct io_fixed_file *file_slot; struct file *file; @@ -10203,6 +10216,11 @@ out: return ret; } +static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) +{ + return __io_close_fixed(req, issue_flags, req->close.file_slot - 1); +} + static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned nr_args) @@ -10351,6 +10369,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task, xa_init(&tctx->xa); init_waitqueue_head(&tctx->wait); atomic_set(&tctx->in_idle, 0); + atomic_set(&tctx->inflight_tracked, 0); task->io_uring = tctx; spin_lock_init(&tctx->task_lock); INIT_WQ_LIST(&tctx->task_list); @@ -10601,12 +10620,19 @@ static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) { + unsigned nr = ctx->nr_user_bufs; int ret; if (!ctx->buf_data) return -ENXIO; + /* + * Quiesce may unlock ->uring_lock, and while it's not held + * prevent new requests using the table. 
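+ *
+ * (Hedged aside, not patch text: same save/zero/restore trick as for
+ * the file table above, keyed off ctx->nr_user_bufs. io_prep_rw()'s
+ *
+ *	if (unlikely(req->buf_index >= ctx->nr_user_bufs))
+ *		return -EFAULT;
+ *
+ * bounds check then makes fixed-buffer lookups fail fast for any
+ * request that slips in while ->uring_lock is dropped during the
+ * quiesce.)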
+ */ + ctx->nr_user_bufs = 0; ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); + ctx->nr_user_bufs = nr; if (!ret) __io_sqe_buffers_unregister(ctx); return ret; @@ -11046,6 +11072,7 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx) xa_for_each(&ctx->io_bl_xa, index, bl) { xa_erase(&ctx->io_bl_xa, bl->bgid); __io_remove_buffers(ctx, bl, -1U); + kfree(bl); } while (!list_empty(&ctx->io_buffers_pages)) { @@ -11581,7 +11608,7 @@ static __cold void io_uring_clean_tctx(struct io_uring_task *tctx) static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) { if (tracked) - return 0; + return atomic_read(&tctx->inflight_tracked); return percpu_counter_sum(&tctx->inflight); } @@ -11957,14 +11984,14 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, return -EINVAL; fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); f.file = tctx->registered_rings[fd]; - if (unlikely(!f.file)) - return -EBADF; + f.flags = 0; } else { f = fdget(fd); - if (unlikely(!f.file)) - return -EBADF; } + if (unlikely(!f.file)) + return -EBADF; + ret = -EOPNOTSUPP; if (unlikely(f.file->f_op != &io_uring_fops)) goto out_fput; @@ -12062,8 +12089,7 @@ iopoll_locked: out: percpu_ref_put(&ctx->refs); out_fput: - if (!(flags & IORING_ENTER_REGISTERED_RING)) - fdput(f); + fdput(f); return ret; } @@ -12913,6 +12939,10 @@ static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (!is_power_of_2(reg.ring_entries)) return -EINVAL; + /* cannot disambiguate full vs empty due to head/tail size */ + if (reg.ring_entries >= 65536) + return -EINVAL; + if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) { int ret = io_init_bl_list(ctx); if (ret) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e49bb0938376..e9c308ae475f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2114,7 +2114,7 @@ out: /** * jbd2_journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation - * @page: to try and free + * @folio: Folio to detach data from. * * For all the buffers on this page, * if they are fully written out ordered data, move them onto BUF_CLEAN diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index 7e9abdb89712..acd32f05b519 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -43,9 +43,9 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, jffs2_dbg(1, "%s(): erase block %#08x (range %#08x-%#08x)\n", __func__, jeb->offset, jeb->offset, jeb->offset + c->sector_size); - instr = kmalloc(sizeof(struct erase_info), GFP_KERNEL); + instr = kzalloc(sizeof(struct erase_info), GFP_KERNEL); if (!instr) { - pr_warn("kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n"); + pr_warn("kzalloc for struct erase_info in jffs2_erase_block failed. 
Refiling block for later\n"); mutex_lock(&c->erase_free_sem); spin_lock(&c->erase_completion_lock); list_move(&jeb->list, &c->erase_pending_list); @@ -57,8 +57,6 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, return; } - memset(instr, 0, sizeof(*instr)); - instr->addr = jeb->offset; instr->len = c->sector_size; diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 00a110f40e10..39cec28096a7 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -604,6 +604,7 @@ out_root: jffs2_free_raw_node_refs(c); kvfree(c->blocks); jffs2_clear_xattr_subsystem(c); + jffs2_sum_exit(c); out_inohash: kfree(c->inocache_list); out_wbuf: diff --git a/fs/jfs/Makefile b/fs/jfs/Makefile index 285ec189ed5c..7156d2c218c7 100644 --- a/fs/jfs/Makefile +++ b/fs/jfs/Makefile @@ -13,5 +13,3 @@ jfs-y := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \ resize.o xattr.o ioctl.o jfs-$(CONFIG_JFS_POSIX_ACL) += acl.o - -ccflags-y := -D_JFS_4K diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index a5dd7e53754a..259326556ada 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -224,18 +224,9 @@ int jfs_get_block(struct inode *ip, sector_t lblock, * this as a hole */ goto unlock; -#ifdef _JFS_4K XADoffset(&xad, lblock64); XADlength(&xad, xlen); XADaddress(&xad, xaddr); -#else /* _JFS_4K */ - /* - * As long as block size = 4K, this isn't a problem. - * We should mark the whole page not ABNR, but how - * will we know to mark the other blocks BH_New? - */ - BUG(); -#endif /* _JFS_4K */ rc = extRecord(ip, &xad); if (rc) goto unlock; @@ -252,7 +243,6 @@ int jfs_get_block(struct inode *ip, sector_t lblock, /* * Allocate a new block */ -#ifdef _JFS_4K if ((rc = extHint(ip, lblock64 << ip->i_sb->s_blocksize_bits, &xad))) goto unlock; rc = extAlloc(ip, xlen, lblock64, &xad, false); @@ -263,14 +253,6 @@ int jfs_get_block(struct inode *ip, sector_t lblock, map_bh(bh_result, ip->i_sb, addressXAD(&xad)); bh_result->b_size = lengthXAD(&xad) << ip->i_blkbits; -#else /* _JFS_4K */ - /* - * We need to do whatever it takes to keep all but the last buffers - * in 4K pages - see jfs_write.c - */ - BUG(); -#endif /* _JFS_4K */ - unlock: /* * Release lock on inode diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index d8502f4989d9..6b838d3ae7c2 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -385,7 +385,8 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks) } /* write the last buffer. */ - write_metapage(mp); + if (mp) + write_metapage(mp); IREAD_UNLOCK(ipbmap); @@ -868,74 +869,6 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) return (rc); } -#ifdef _NOTYET -/* - * NAME: dbAllocExact() - * - * FUNCTION: try to allocate the requested extent; - * - * PARAMETERS: - * ip - pointer to in-core inode; - * blkno - extent address; - * nblocks - extent length; - * - * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error - */ -int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) -{ - int rc; - struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; - struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; - struct dmap *dp; - s64 lblkno; - struct metapage *mp; - - IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); - - /* - * validate extent request: - * - * note: defragfs policy: - * max 64 blocks will be moved. - * allocation request size must be satisfied from a single dmap. 
- */ - if (nblocks <= 0 || nblocks > BPERDMAP || blkno >= bmp->db_mapsize) { - IREAD_UNLOCK(ipbmap); - return -EINVAL; - } - - if (nblocks > ((s64) 1 << bmp->db_maxfreebud)) { - /* the free space is no longer available */ - IREAD_UNLOCK(ipbmap); - return -ENOSPC; - } - - /* read in the dmap covering the extent */ - lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); - mp = read_metapage(ipbmap, lblkno, PSIZE, 0); - if (mp == NULL) { - IREAD_UNLOCK(ipbmap); - return -EIO; - } - dp = (struct dmap *) mp->data; - - /* try to allocate the requested extent */ - rc = dbAllocNext(bmp, dp, blkno, nblocks); - - IREAD_UNLOCK(ipbmap); - - if (rc == 0) - mark_metapage_dirty(mp); - - release_metapage(mp); - - return (rc); -} -#endif /* _NOTYET */ - /* * NAME: dbReAlloc() * diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 837d42f61464..92b7c533407c 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -2423,304 +2423,6 @@ static int dtDeleteUp(tid_t tid, struct inode *ip, return 0; } -#ifdef _NOTYET -/* - * NAME: dtRelocate() - * - * FUNCTION: relocate dtpage (internal or leaf) of directory; - * This function is mainly used by defragfs utility. - */ -int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, - s64 nxaddr) -{ - int rc = 0; - struct metapage *mp, *pmp, *lmp, *rmp; - dtpage_t *p, *pp, *rp = 0, *lp= 0; - s64 bn; - int index; - struct btstack btstack; - pxd_t *pxd; - s64 oxaddr, nextbn, prevbn; - int xlen, xsize; - struct tlock *tlck; - struct dt_lock *dtlck; - struct pxd_lock *pxdlock; - s8 *stbl; - struct lv *lv; - - oxaddr = addressPXD(opxd); - xlen = lengthPXD(opxd); - - jfs_info("dtRelocate: lmxaddr:%Ld xaddr:%Ld:%Ld xlen:%d", - (long long)lmxaddr, (long long)oxaddr, (long long)nxaddr, - xlen); - - /* - * 1. get the internal parent dtpage covering - * router entry for the tartget page to be relocated; - */ - rc = dtSearchNode(ip, lmxaddr, opxd, &btstack); - if (rc) - return rc; - - /* retrieve search result */ - DT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); - jfs_info("dtRelocate: parent router entry validated."); - - /* - * 2. 
relocate the target dtpage - */ - /* read in the target page from src extent */ - DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); - if (rc) { - /* release the pinned parent page */ - DT_PUTPAGE(pmp); - return rc; - } - - /* - * read in sibling pages if any to update sibling pointers; - */ - rmp = NULL; - if (p->header.next) { - nextbn = le64_to_cpu(p->header.next); - DT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc); - if (rc) { - DT_PUTPAGE(mp); - DT_PUTPAGE(pmp); - return (rc); - } - } - - lmp = NULL; - if (p->header.prev) { - prevbn = le64_to_cpu(p->header.prev); - DT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc); - if (rc) { - DT_PUTPAGE(mp); - DT_PUTPAGE(pmp); - if (rmp) - DT_PUTPAGE(rmp); - return (rc); - } - } - - /* at this point, all xtpages to be updated are in memory */ - - /* - * update sibling pointers of sibling dtpages if any; - */ - if (lmp) { - tlck = txLock(tid, ip, lmp, tlckDTREE | tlckRELINK); - dtlck = (struct dt_lock *) & tlck->lock; - /* linelock header */ - ASSERT(dtlck->index == 0); - lv = & dtlck->lv[0]; - lv->offset = 0; - lv->length = 1; - dtlck->index++; - - lp->header.next = cpu_to_le64(nxaddr); - DT_PUTPAGE(lmp); - } - - if (rmp) { - tlck = txLock(tid, ip, rmp, tlckDTREE | tlckRELINK); - dtlck = (struct dt_lock *) & tlck->lock; - /* linelock header */ - ASSERT(dtlck->index == 0); - lv = & dtlck->lv[0]; - lv->offset = 0; - lv->length = 1; - dtlck->index++; - - rp->header.prev = cpu_to_le64(nxaddr); - DT_PUTPAGE(rmp); - } - - /* - * update the target dtpage to be relocated - * - * write LOG_REDOPAGE of LOG_NEW type for dst page - * for the whole target page (logredo() will apply - * after image and update bmap for allocation of the - * dst extent), and update bmap for allocation of - * the dst extent; - */ - tlck = txLock(tid, ip, mp, tlckDTREE | tlckNEW); - dtlck = (struct dt_lock *) & tlck->lock; - /* linelock header */ - ASSERT(dtlck->index == 0); - lv = & dtlck->lv[0]; - - /* update the self address in the dtpage header */ - pxd = &p->header.self; - PXDaddress(pxd, nxaddr); - - /* the dst page is the same as the src page, i.e., - * linelock for afterimage of the whole page; - */ - lv->offset = 0; - lv->length = p->header.maxslot; - dtlck->index++; - - /* update the buffer extent descriptor of the dtpage */ - xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; - - /* unpin the relocated page */ - DT_PUTPAGE(mp); - jfs_info("dtRelocate: target dtpage relocated."); - - /* the moved extent is dtpage, then a LOG_NOREDOPAGE log rec - * needs to be written (in logredo(), the LOG_NOREDOPAGE log rec - * will also force a bmap update ). - */ - - /* - * 3. acquire maplock for the source extent to be freed; - */ - /* for dtpage relocation, write a LOG_NOREDOPAGE record - * for the source dtpage (logredo() will init NoRedoPage - * filter and will also update bmap for free of the source - * dtpage), and upadte bmap for free of the source dtpage; - */ - tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE); - pxdlock = (struct pxd_lock *) & tlck->lock; - pxdlock->flag = mlckFREEPXD; - PXDaddress(&pxdlock->pxd, oxaddr); - PXDlength(&pxdlock->pxd, xlen); - pxdlock->index = 1; - - /* - * 4. 
update the parent router entry for relocation; - * - * acquire tlck for the parent entry covering the target dtpage; - * write LOG_REDOPAGE to apply after image only; - */ - jfs_info("dtRelocate: update parent router entry."); - tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY); - dtlck = (struct dt_lock *) & tlck->lock; - lv = & dtlck->lv[dtlck->index]; - - /* update the PXD with the new address */ - stbl = DT_GETSTBL(pp); - pxd = (pxd_t *) & pp->slot[stbl[index]]; - PXDaddress(pxd, nxaddr); - lv->offset = stbl[index]; - lv->length = 1; - dtlck->index++; - - /* unpin the parent dtpage */ - DT_PUTPAGE(pmp); - - return rc; -} - -/* - * NAME: dtSearchNode() - * - * FUNCTION: Search for an dtpage containing a specified address - * This function is mainly used by defragfs utility. - * - * NOTE: Search result on stack, the found page is pinned at exit. - * The result page must be an internal dtpage. - * lmxaddr give the address of the left most page of the - * dtree level, in which the required dtpage resides. - */ -static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd, - struct btstack * btstack) -{ - int rc = 0; - s64 bn; - struct metapage *mp; - dtpage_t *p; - int psize = 288; /* initial in-line directory */ - s8 *stbl; - int i; - pxd_t *pxd; - struct btframe *btsp; - - BT_CLR(btstack); /* reset stack */ - - /* - * descend tree to the level with specified leftmost page - * - * by convention, root bn = 0. - */ - for (bn = 0;;) { - /* get/pin the page to search */ - DT_GETPAGE(ip, bn, mp, psize, p, rc); - if (rc) - return rc; - - /* does the xaddr of leftmost page of the levevl - * matches levevl search key ? - */ - if (p->header.flag & BT_ROOT) { - if (lmxaddr == 0) - break; - } else if (addressPXD(&p->header.self) == lmxaddr) - break; - - /* - * descend down to leftmost child page - */ - if (p->header.flag & BT_LEAF) { - DT_PUTPAGE(mp); - return -ESTALE; - } - - /* get the leftmost entry */ - stbl = DT_GETSTBL(p); - pxd = (pxd_t *) & p->slot[stbl[0]]; - - /* get the child page block address */ - bn = addressPXD(pxd); - psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize; - /* unpin the parent page */ - DT_PUTPAGE(mp); - } - - /* - * search each page at the current levevl - */ - loop: - stbl = DT_GETSTBL(p); - for (i = 0; i < p->header.nextindex; i++) { - pxd = (pxd_t *) & p->slot[stbl[i]]; - - /* found the specified router entry */ - if (addressPXD(pxd) == addressPXD(kpxd) && - lengthPXD(pxd) == lengthPXD(kpxd)) { - btsp = btstack->top; - btsp->bn = bn; - btsp->index = i; - btsp->mp = mp; - - return 0; - } - } - - /* get the right sibling page if any */ - if (p->header.next) - bn = le64_to_cpu(p->header.next); - else { - DT_PUTPAGE(mp); - return -ESTALE; - } - - /* unpin current page */ - DT_PUTPAGE(mp); - - /* get the right sibling page */ - DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; - - goto loop; -} -#endif /* _NOTYET */ - /* * dtRelink() * diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index bb4a342a193d..ae99a7e232ee 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -16,9 +16,6 @@ * forward references */ static int extBalloc(struct inode *, s64, s64 *, s64 *); -#ifdef _NOTYET -static int extBrealloc(struct inode *, s64, s64, s64 *, s64 *); -#endif static s64 extRoundDown(s64 nb); #define DPD(a) (printk("(a): %d\n",(a))) @@ -177,162 +174,6 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) return (0); } - -#ifdef _NOTYET -/* - * NAME: extRealloc() - * - * FUNCTION: extend the allocation of a file extent containing 
- *		partially backed last page.
- *
- * PARAMETERS:
- *	ip	 - the inode of the file.
- *	cp	 - cbuf for the partially backed last page.
- *	xlen	 - request size of the resulting extent.
- *	xp	 - pointer to an xad. on successful exit, the xad
- *		   describes the newly allocated extent.
- *	abnr	 - bool indicating whether the newly allocated extent
- *		   should be marked as allocated but not recorded.
- *
- * RETURN VALUES:
- *	0	- success
- *	-EIO	- i/o error.
- *	-ENOSPC	- insufficient disk resources.
- */
-int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
-{
-	struct super_block *sb = ip->i_sb;
-	s64 xaddr, xlen, nxaddr, delta, xoff;
-	s64 ntail, nextend, ninsert;
-	int rc, nbperpage = JFS_SBI(sb)->nbperpage;
-	int xflag;
-
-	/* This blocks if we are low on resources */
-	txBeginAnon(ip->i_sb);
-
-	mutex_lock(&JFS_IP(ip)->commit_mutex);
-	/* validate extent length */
-	if (nxlen > MAXXLEN)
-		nxlen = MAXXLEN;
-
-	/* get the extend (partial) page's disk block address and
-	 * number of blocks.
-	 */
-	xaddr = addressXAD(xp);
-	xlen = lengthXAD(xp);
-	xoff = offsetXAD(xp);
-
-	/* if the extend page is abnr and if the request is for
-	 * the extent to be allocated and recorded,
-	 * make the page allocated and recorded.
-	 */
-	if ((xp->flag & XAD_NOTRECORDED) && !abnr) {
-		xp->flag = 0;
-		if ((rc = xtUpdate(0, ip, xp)))
-			goto exit;
-	}
-
-	/* try to allocate the requested number of blocks for the
-	 * extent.  dbRealloc() first tries to satisfy the request
-	 * by extending the allocation in place.  otherwise, it will
-	 * try to allocate a new set of blocks large enough for the
-	 * request.  in satisfying a request, dbRealloc() may allocate
-	 * less than what was requested but will always allocate enough
-	 * space to satisfy the extend page.
-	 */
-	if ((rc = extBrealloc(ip, xaddr, xlen, &nxlen, &nxaddr)))
-		goto exit;
-
-	/* Allocate blocks to quota. */
-	rc = dquot_alloc_block(ip, nxlen);
-	if (rc) {
-		dbFree(ip, nxaddr, (s64) nxlen);
-		mutex_unlock(&JFS_IP(ip)->commit_mutex);
-		return rc;
-	}
-
-	delta = nxlen - xlen;
-
-	/* check if the extend page is not abnr but the request is abnr
-	 * and the allocated disk space is for more than one page.  if this
-	 * is the case, there is a mismatch of abnr between the extend page
-	 * and the one or more pages following the extend page.  as a result,
-	 * two extents will have to be manipulated.  the first will be that
-	 * of the extent of the extend page and will be manipulated through
-	 * an xtExtend() or an xtTailgate(), depending upon whether the
-	 * disk allocation occurred as an in-place extension.  the second
-	 * extent will be manipulated (created) through an xtInsert() and
-	 * will be for the pages following the extend page.
-	 */
-	if (abnr && (!(xp->flag & XAD_NOTRECORDED)) && (nxlen > nbperpage)) {
-		ntail = nbperpage;
-		nextend = ntail - xlen;
-		ninsert = nxlen - nbperpage;
-
-		xflag = XAD_NOTRECORDED;
-	} else {
-		ntail = nxlen;
-		nextend = delta;
-		ninsert = 0;
-
-		xflag = xp->flag;
-	}
-
-	/* if we were able to extend the disk allocation in place,
-	 * extend the extent.  otherwise, move the extent to a
-	 * new disk location.
- */ - if (xaddr == nxaddr) { - /* extend the extent */ - if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) { - dbFree(ip, xaddr + xlen, delta); - dquot_free_block(ip, nxlen); - goto exit; - } - } else { - /* - * move the extent to a new location: - * - * xtTailgate() accounts for relocated tail extent; - */ - if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) { - dbFree(ip, nxaddr, nxlen); - dquot_free_block(ip, nxlen); - goto exit; - } - } - - - /* check if we need to also insert a new extent */ - if (ninsert) { - /* perform the insert. if it fails, free the blocks - * to be inserted and make it appear that we only did - * the xtExtend() or xtTailgate() above. - */ - xaddr = nxaddr + ntail; - if (xtInsert (0, ip, xflag, xoff + ntail, (int) ninsert, - &xaddr, 0)) { - dbFree(ip, xaddr, (s64) ninsert); - delta = nextend; - nxlen = ntail; - xflag = 0; - } - } - - /* set the return results */ - XADaddress(xp, nxaddr); - XADlength(xp, nxlen); - XADoffset(xp, xoff); - xp->flag = xflag; - - mark_inode_dirty(ip); -exit: - mutex_unlock(&JFS_IP(ip)->commit_mutex); - return (rc); -} -#endif /* _NOTYET */ - - /* * NAME: extHint() * @@ -423,44 +264,6 @@ int extRecord(struct inode *ip, xad_t * xp) return rc; } - -#ifdef _NOTYET -/* - * NAME: extFill() - * - * FUNCTION: allocate disk space for a file page that represents - * a file hole. - * - * PARAMETERS: - * ip - the inode of the file. - * cp - cbuf of the file page represent the hole. - * - * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOSPC - insufficient disk resources. - */ -int extFill(struct inode *ip, xad_t * xp) -{ - int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage; - s64 blkno = offsetXAD(xp) >> ip->i_blkbits; - -// assert(ISSPARSE(ip)); - - /* initialize the extent allocation hint */ - XADaddress(xp, 0); - - /* allocate an extent to fill the hole */ - if ((rc = extAlloc(ip, nbperpage, blkno, xp, false))) - return (rc); - - assert(lengthPXD(xp) == nbperpage); - - return (0); -} -#endif /* _NOTYET */ - - /* * NAME: extBalloc() * @@ -550,64 +353,6 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) return (0); } - -#ifdef _NOTYET -/* - * NAME: extBrealloc() - * - * FUNCTION: attempt to extend an extent's allocation. - * - * Initially, we will try to extend the extent's allocation - * in place. If this fails, we'll try to move the extent - * to a new set of blocks. If moving the extent, we initially - * will try to allocate disk blocks for the requested size - * (newnblks). if this fails (new contiguous free blocks not - * available), we'll try to allocate a smaller number of - * blocks (producing a smaller extent), with this smaller - * number of blocks consisting of the requested number of - * blocks rounded down to the next smaller power of 2 - * number (i.e. 16 -> 8). We'll continue to round down and - * retry the allocation until the number of blocks to allocate - * is smaller than the number of blocks per page. - * - * PARAMETERS: - * ip - the inode of the file. - * blkno - starting block number of the extents current allocation. - * nblks - number of blocks within the extents current allocation. - * newnblks - pointer to a s64 value. on entry, this value is the - * new desired extent size (number of blocks). on - * successful exit, this value is set to the extent's actual - * new size (new number of blocks). - * newblkno - the starting block number of the extents new allocation. - * - * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOSPC - insufficient disk resources. 
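The round-down retry policy described in this comment is worth pinning down precisely. The sketch below is a self-contained model of it, not the kernel code: a hypothetical try_alloc() callback stands in for the real block allocator, and dbExtend()/dbAlloc() are not modeled.

#include <stdint.h>

/* Round v down to the largest power of two <= v. */
static int64_t round_down_pow2(int64_t v)
{
	while (v & (v - 1))
		v &= v - 1;		/* clear low bits until one remains */
	return v;
}

/* Model of the retry loop: try the full request, then successively
 * smaller powers of two, giving up below one page (nbperpage). */
static int64_t alloc_rounddown(int64_t nblks, int64_t nbperpage,
			       int (*try_alloc)(int64_t))
{
	if (try_alloc(nblks))
		return nblks;
	for (nblks = round_down_pow2(nblks - 1); nblks >= nbperpage;
	     nblks >>= 1)
		if (try_alloc(nblks))
			return nblks;
	return 0;			/* treat as -ENOSPC */
}

For a request of 16 pages this tries 16, then 8, 4, 2, ... until an allocation succeeds or the size drops below one page, matching the "16 -> 8" example in the comment.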
- */ -static int -extBrealloc(struct inode *ip, - s64 blkno, s64 nblks, s64 * newnblks, s64 * newblkno) -{ - int rc; - - /* try to extend in place */ - if ((rc = dbExtend(ip, blkno, nblks, *newnblks - nblks)) == 0) { - *newblkno = blkno; - return (0); - } else { - if (rc != -ENOSPC) - return (rc); - } - - /* in place extension not possible. - * try to move the extent to a new set of blocks. - */ - return (extBalloc(ip, blkno, newnblks, newblkno)); -} -#endif /* _NOTYET */ - - /* * NAME: extRoundDown() * diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 997c81fcea34..695415cbfe98 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -388,14 +388,6 @@ lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, p = (caddr_t) &JFS_IP(tlck->ip)->i_xtroot; linelock = (struct linelock *) & tlck->lock; } -#ifdef _JFS_WIP - else if (tlck->flag & tlckINLINELOCK) { - - inlinelock = (struct inlinelock *) & tlck; - p = (caddr_t) & inlinelock->pxd; - linelock = (struct linelock *) & tlck; - } -#endif /* _JFS_WIP */ else { jfs_err("lmWriteRecord: UFO tlck:0x%p", tlck); return 0; /* Probably should trap */ diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index aa4ff7bcaff2..48d1f70f786c 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -307,13 +307,11 @@ static int chkSuper(struct super_block *sb) } bsize = le32_to_cpu(j_sb->s_bsize); -#ifdef _JFS_4K if (bsize != PSIZE) { - jfs_err("Currently only 4K block size supported!"); + jfs_err("Only 4K block size supported!"); rc = -EINVAL; goto out; } -#endif /* _JFS_4K */ jfs_info("superblock: flag:0x%08x state:0x%08x size:0x%Lx", le32_to_cpu(j_sb->s_flag), le32_to_cpu(j_sb->s_state), diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index 042bbe6d8ac2..ffd4feece078 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -1490,40 +1490,6 @@ static void diLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd, tlck->flag |= tlckWRITEPAGE; } else jfs_err("diLog: UFO type tlck:0x%p", tlck); -#ifdef _JFS_WIP - /* - * alloc/free external EA extent - * - * a maplock for txUpdateMap() to update bPWMAP for alloc/free - * of the extent has been formatted at txLock() time; - */ - else { - assert(tlck->type & tlckEA); - - /* log LOG_UPDATEMAP for logredo() to update bmap for - * alloc of new (and free of old) external EA extent; - */ - lrd->type = cpu_to_le16(LOG_UPDATEMAP); - pxdlock = (struct pxd_lock *) & tlck->lock; - nlock = pxdlock->index; - for (i = 0; i < nlock; i++, pxdlock++) { - if (pxdlock->flag & mlckALLOCPXD) - lrd->log.updatemap.type = - cpu_to_le16(LOG_ALLOCPXD); - else - lrd->log.updatemap.type = - cpu_to_le16(LOG_FREEPXD); - lrd->log.updatemap.nxd = cpu_to_le16(1); - lrd->log.updatemap.pxd = pxdlock->pxd; - lrd->backchain = - cpu_to_le32(lmLog(log, tblk, lrd, NULL)); - } - - /* update bmap */ - tlck->flag |= tlckUPDATEMAP; - } -#endif /* _JFS_WIP */ - return; } diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index 3148e9b35f3b..2d304cee884c 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -114,17 +114,6 @@ static int xtSplitPage(tid_t tid, struct inode *ip, struct xtsplit * split, static int xtSplitRoot(tid_t tid, struct inode *ip, struct xtsplit * split, struct metapage ** rmpp); -#ifdef _STILL_TO_PORT -static int xtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, - xtpage_t * fp, struct btstack * btstack); - -static int xtSearchNode(struct inode *ip, - xad_t * xad, - int *cmpp, struct btstack * btstack, int flag); - -static int xtRelink(tid_t tid, struct inode 
*ip, xtpage_t * fp); -#endif /* _STILL_TO_PORT */ - /* * xtLookup() * @@ -1493,189 +1482,6 @@ int xtExtend(tid_t tid, /* transaction id */ return rc; } -#ifdef _NOTYET -/* - * xtTailgate() - * - * function: split existing 'tail' extent - * (split offset >= start offset of tail extent), and - * relocate and extend the split tail half; - * - * note: existing extent may or may not have been committed. - * caller is responsible for pager buffer cache update, and - * working block allocation map update; - * update pmap: free old split tail extent, alloc new extent; - */ -int xtTailgate(tid_t tid, /* transaction id */ - struct inode *ip, s64 xoff, /* split/new extent offset */ - s32 xlen, /* new extent length */ - s64 xaddr, /* new extent address */ - int flag) -{ - int rc = 0; - int cmp; - struct metapage *mp; /* meta-page buffer */ - xtpage_t *p; /* base B+-tree index page */ - s64 bn; - int index, nextindex, llen, rlen; - struct btstack btstack; /* traverse stack */ - struct xtsplit split; /* split information */ - xad_t *xad; - struct tlock *tlck; - struct xtlock *xtlck = 0; - struct tlock *mtlck; - struct maplock *pxdlock; - -/* -printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n", - (ulong)xoff, xlen, (ulong)xaddr); -*/ - - /* there must exist extent to be tailgated */ - if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, XT_INSERT))) - return rc; - - /* retrieve search result */ - XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); - - if (cmp != 0) { - XT_PUTPAGE(mp); - jfs_error(ip->i_sb, "couldn't find extent\n"); - return -EIO; - } - - /* entry found must be last entry */ - nextindex = le16_to_cpu(p->header.nextindex); - if (index != nextindex - 1) { - XT_PUTPAGE(mp); - jfs_error(ip->i_sb, "the entry found is not the last entry\n"); - return -EIO; - } - - BT_MARK_DIRTY(mp, ip); - /* - * acquire tlock of the leaf page containing original entry - */ - if (!test_cflag(COMMIT_Nolink, ip)) { - tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); - xtlck = (struct xtlock *) & tlck->lock; - } - - /* completely replace extent ? */ - xad = &p->xad[index]; -/* -printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", - (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad)); -*/ - if ((llen = xoff - offsetXAD(xad)) == 0) - goto updateOld; - - /* - * partially replace extent: insert entry for new extent - */ -//insertNew: - /* - * if the leaf page is full, insert the new entry and - * propagate up the router entry for the new page from split - * - * The xtSplitUp() will insert the entry and unpin the leaf page. 
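Before the updateOld label below, the interesting part of xtTailgate() is plain offset arithmetic: the existing extent is cut at the split offset into a head (llen) that stays put and a tail (rlen) that is relocated and extended. A toy computation, with plain fields in place of the packed offsetXAD()/lengthXAD() accessors:

#include <stdio.h>

struct xad_toy { long off, len; };	/* stand-in for an on-disk xad */

int main(void)
{
	struct xad_toy xad = { .off = 100, .len = 64 };
	long xoff = 116;			/* split offset (>= xad.off) */

	long llen = xoff - xad.off;		/* head kept in place: 16 */
	long rlen = xad.len - llen;		/* tail to relocate: 48 */

	printf("keep %ld blocks, tailgate %ld blocks\n", llen, rlen);
	return 0;
}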
- */ - if (nextindex == le16_to_cpu(p->header.maxentry)) { - /* xtSpliUp() unpins leaf pages */ - split.mp = mp; - split.index = index + 1; - split.flag = XAD_NEW; - split.off = xoff; /* split offset */ - split.len = xlen; - split.addr = xaddr; - split.pxdlist = NULL; - if ((rc = xtSplitUp(tid, ip, &split, &btstack))) - return rc; - - /* get back old page */ - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; - /* - * if leaf root has been split, original root has been - * copied to new child page, i.e., original entry now - * resides on the new child page; - */ - if (p->header.flag & BT_INTERNAL) { - ASSERT(p->header.nextindex == - cpu_to_le16(XTENTRYSTART + 1)); - xad = &p->xad[XTENTRYSTART]; - bn = addressXAD(xad); - XT_PUTPAGE(mp); - - /* get new child page */ - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; - - BT_MARK_DIRTY(mp, ip); - if (!test_cflag(COMMIT_Nolink, ip)) { - tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); - xtlck = (struct xtlock *) & tlck->lock; - } - } - } - /* - * insert the new entry into the leaf page - */ - else { - /* insert the new entry: mark the entry NEW */ - xad = &p->xad[index + 1]; - XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr); - - /* advance next available entry index */ - le16_add_cpu(&p->header.nextindex, 1); - } - - /* get back old XAD */ - xad = &p->xad[index]; - - /* - * truncate/relocate old extent at split offset - */ - updateOld: - /* update dmap for old/committed/truncated extent */ - rlen = lengthXAD(xad) - llen; - if (!(xad->flag & XAD_NEW)) { - /* free from PWMAP at commit */ - if (!test_cflag(COMMIT_Nolink, ip)) { - mtlck = txMaplock(tid, ip, tlckMAP); - pxdlock = (struct maplock *) & mtlck->lock; - pxdlock->flag = mlckFREEPXD; - PXDaddress(&pxdlock->pxd, addressXAD(xad) + llen); - PXDlength(&pxdlock->pxd, rlen); - pxdlock->index = 1; - } - } else - /* free from WMAP */ - dbFree(ip, addressXAD(xad) + llen, (s64) rlen); - - if (llen) - /* truncate */ - XADlength(xad, llen); - else - /* replace */ - XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr); - - if (!test_cflag(COMMIT_Nolink, ip)) { - xtlck->lwm.offset = (xtlck->lwm.offset) ? - min(index, (int)xtlck->lwm.offset) : index; - xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - - xtlck->lwm.offset; - } - - /* unpin the leaf page */ - XT_PUTPAGE(mp); - - return rc; -} -#endif /* _NOTYET */ - /* * xtUpdate() * @@ -1753,32 +1559,12 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) newindex = index + 1; nextindex = le16_to_cpu(p->header.nextindex); -#ifdef _JFS_WIP_NOCOALESCE - if (xoff < nxoff) - goto updateRight; - - /* - * replace XAD with nXAD - */ - replace: /* (nxoff == xoff) */ - if (nxlen == xlen) { - /* replace XAD with nXAD:recorded */ - *xad = *nxad; - xad->flag = xflag & ~XAD_NOTRECORDED; - - goto out; - } else /* (nxlen < xlen) */ - goto updateLeft; -#endif /* _JFS_WIP_NOCOALESCE */ - -/* #ifdef _JFS_WIP_COALESCE */ if (xoff < nxoff) goto coalesceRight; /* * coalesce with left XAD */ -//coalesceLeft: /* (xoff == nxoff) */ /* is XAD first entry of page ? */ if (index == XTENTRYSTART) goto replace; @@ -1897,7 +1683,6 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) jfs_error(ip->i_sb, "xoff >= nxoff\n"); return -EIO; } -/* #endif _JFS_WIP_COALESCE */ /* * split XAD into (lXAD, nXAD): @@ -2305,752 +2090,6 @@ int xtAppend(tid_t tid, /* transaction id */ return rc; } -#ifdef _STILL_TO_PORT - -/* - TBD for defragmentaion/reorganization - - * - * xtDelete() - * - * function: - * delete the entry with the specified key. 
- * - * N.B.: whole extent of the entry is assumed to be deleted. - * - * parameter: - * - * return: - * ENOENT: if the entry is not found. - * - * exception: - */ -int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag) -{ - int rc = 0; - struct btstack btstack; - int cmp; - s64 bn; - struct metapage *mp; - xtpage_t *p; - int index, nextindex; - struct tlock *tlck; - struct xtlock *xtlck; - - /* - * find the matching entry; xtSearch() pins the page - */ - if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0))) - return rc; - - XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); - if (cmp) { - /* unpin the leaf page */ - XT_PUTPAGE(mp); - return -ENOENT; - } - - /* - * delete the entry from the leaf page - */ - nextindex = le16_to_cpu(p->header.nextindex); - le16_add_cpu(&p->header.nextindex, -1); - - /* - * if the leaf page bocome empty, free the page - */ - if (p->header.nextindex == cpu_to_le16(XTENTRYSTART)) - return (xtDeleteUp(tid, ip, mp, p, &btstack)); - - BT_MARK_DIRTY(mp, ip); - /* - * acquire a transaction lock on the leaf page; - * - * action:xad deletion; - */ - tlck = txLock(tid, ip, mp, tlckXTREE); - xtlck = (struct xtlock *) & tlck->lock; - xtlck->lwm.offset = - (xtlck->lwm.offset) ? min(index, xtlck->lwm.offset) : index; - - /* if delete from middle, shift left/compact the remaining entries */ - if (index < nextindex - 1) - memmove(&p->xad[index], &p->xad[index + 1], - (nextindex - index - 1) * sizeof(xad_t)); - - XT_PUTPAGE(mp); - - return 0; -} - - -/* - TBD for defragmentaion/reorganization - - * - * xtDeleteUp() - * - * function: - * free empty pages as propagating deletion up the tree - * - * parameter: - * - * return: - */ -static int -xtDeleteUp(tid_t tid, struct inode *ip, - struct metapage * fmp, xtpage_t * fp, struct btstack * btstack) -{ - int rc = 0; - struct metapage *mp; - xtpage_t *p; - int index, nextindex; - s64 xaddr; - int xlen; - struct btframe *parent; - struct tlock *tlck; - struct xtlock *xtlck; - - /* - * keep root leaf page which has become empty - */ - if (fp->header.flag & BT_ROOT) { - /* keep the root page */ - fp->header.flag &= ~BT_INTERNAL; - fp->header.flag |= BT_LEAF; - fp->header.nextindex = cpu_to_le16(XTENTRYSTART); - - /* XT_PUTPAGE(fmp); */ - - return 0; - } - - /* - * free non-root leaf page - */ - if ((rc = xtRelink(tid, ip, fp))) { - XT_PUTPAGE(fmp); - return rc; - } - - xaddr = addressPXD(&fp->header.self); - xlen = lengthPXD(&fp->header.self); - /* free the page extent */ - dbFree(ip, xaddr, (s64) xlen); - - /* free the buffer page */ - discard_metapage(fmp); - - /* - * propagate page deletion up the index tree - * - * If the delete from the parent page makes it empty, - * continue all the way up the tree. - * stop if the root page is reached (which is never deleted) or - * if the entry deletion does not empty the page. - */ - while ((parent = BT_POP(btstack)) != NULL) { - /* get/pin the parent page <sp> */ - XT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc); - if (rc) - return rc; - - index = parent->index; - - /* delete the entry for the freed child page from parent. - */ - nextindex = le16_to_cpu(p->header.nextindex); - - /* - * the parent has the single entry being deleted: - * free the parent page which has become empty. 
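The in-page entry removal that both xtDelete() and this propagation step rely on is just an array compaction. A standalone sketch with plain arrays instead of xad slots:

#include <string.h>
#include <stdio.h>

/* Remove entries[index] from a packed array of n entries by shifting
 * the tail left, as the removed code does with memmove on p->xad[]. */
static int delete_entry(int *entries, int n, int index)
{
	if (index < n - 1)
		memmove(&entries[index], &entries[index + 1],
			(n - index - 1) * sizeof(entries[0]));
	return n - 1;	/* caller decrements nextindex the same way */
}

int main(void)
{
	int e[] = { 10, 20, 30, 40 };
	int n = delete_entry(e, 4, 1);
	printf("%d: %d %d %d\n", n, e[0], e[1], e[2]);	/* 3: 10 30 40 */
	return 0;
}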
- */ - if (nextindex == 1) { - if (p->header.flag & BT_ROOT) { - /* keep the root page */ - p->header.flag &= ~BT_INTERNAL; - p->header.flag |= BT_LEAF; - p->header.nextindex = - cpu_to_le16(XTENTRYSTART); - - /* XT_PUTPAGE(mp); */ - - break; - } else { - /* free the parent page */ - if ((rc = xtRelink(tid, ip, p))) - return rc; - - xaddr = addressPXD(&p->header.self); - /* free the page extent */ - dbFree(ip, xaddr, - (s64) JFS_SBI(ip->i_sb)->nbperpage); - - /* unpin/free the buffer page */ - discard_metapage(mp); - - /* propagate up */ - continue; - } - } - /* - * the parent has other entries remaining: - * delete the router entry from the parent page. - */ - else { - BT_MARK_DIRTY(mp, ip); - /* - * acquire a transaction lock on the leaf page; - * - * action:xad deletion; - */ - tlck = txLock(tid, ip, mp, tlckXTREE); - xtlck = (struct xtlock *) & tlck->lock; - xtlck->lwm.offset = - (xtlck->lwm.offset) ? min(index, - xtlck->lwm. - offset) : index; - - /* if delete from middle, - * shift left/compact the remaining entries in the page - */ - if (index < nextindex - 1) - memmove(&p->xad[index], &p->xad[index + 1], - (nextindex - index - - 1) << L2XTSLOTSIZE); - - le16_add_cpu(&p->header.nextindex, -1); - jfs_info("xtDeleteUp(entry): 0x%lx[%d]", - (ulong) parent->bn, index); - } - - /* unpin the parent page */ - XT_PUTPAGE(mp); - - /* exit propagation up */ - break; - } - - return 0; -} - - -/* - * NAME: xtRelocate() - * - * FUNCTION: relocate xtpage or data extent of regular file; - * This function is mainly used by defragfs utility. - * - * NOTE: This routine does not have the logic to handle - * uncommitted allocated extent. The caller should call - * txCommit() to commit all the allocation before call - * this routine. - */ -int -xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ - s64 nxaddr, /* new xaddr */ - int xtype) -{ /* extent type: XTPAGE or DATAEXT */ - int rc = 0; - struct tblock *tblk; - struct tlock *tlck; - struct xtlock *xtlck; - struct metapage *mp, *pmp, *lmp, *rmp; /* meta-page buffer */ - xtpage_t *p, *pp, *rp, *lp; /* base B+-tree index page */ - xad_t *xad; - pxd_t *pxd; - s64 xoff, xsize; - int xlen; - s64 oxaddr, sxaddr, dxaddr, nextbn, prevbn; - cbuf_t *cp; - s64 offset, nbytes, nbrd, pno; - int nb, npages, nblks; - s64 bn; - int cmp; - int index; - struct pxd_lock *pxdlock; - struct btstack btstack; /* traverse stack */ - - xtype = xtype & EXTENT_TYPE; - - xoff = offsetXAD(oxad); - oxaddr = addressXAD(oxad); - xlen = lengthXAD(oxad); - - /* validate extent offset */ - offset = xoff << JFS_SBI(ip->i_sb)->l2bsize; - if (offset >= ip->i_size) - return -ESTALE; /* stale extent */ - - jfs_info("xtRelocate: xtype:%d xoff:0x%lx xlen:0x%x xaddr:0x%lx:0x%lx", - xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr); - - /* - * 1. 
get and validate the parent xtpage/xad entry - * covering the source extent to be relocated; - */ - if (xtype == DATAEXT) { - /* search in leaf entry */ - rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0); - if (rc) - return rc; - - /* retrieve search result */ - XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); - - if (cmp) { - XT_PUTPAGE(pmp); - return -ESTALE; - } - - /* validate for exact match with a single entry */ - xad = &pp->xad[index]; - if (addressXAD(xad) != oxaddr || lengthXAD(xad) != xlen) { - XT_PUTPAGE(pmp); - return -ESTALE; - } - } else { /* (xtype == XTPAGE) */ - - /* search in internal entry */ - rc = xtSearchNode(ip, oxad, &cmp, &btstack, 0); - if (rc) - return rc; - - /* retrieve search result */ - XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); - - if (cmp) { - XT_PUTPAGE(pmp); - return -ESTALE; - } - - /* xtSearchNode() validated for exact match with a single entry - */ - xad = &pp->xad[index]; - } - jfs_info("xtRelocate: parent xad entry validated."); - - /* - * 2. relocate the extent - */ - if (xtype == DATAEXT) { - /* if the extent is allocated-but-not-recorded - * there is no real data to be moved in this extent, - */ - if (xad->flag & XAD_NOTRECORDED) - goto out; - else - /* release xtpage for cmRead()/xtLookup() */ - XT_PUTPAGE(pmp); - - /* - * cmRelocate() - * - * copy target data pages to be relocated; - * - * data extent must start at page boundary and - * multiple of page size (except the last data extent); - * read in each page of the source data extent into cbuf, - * update the cbuf extent descriptor of the page to be - * homeward bound to new dst data extent - * copy the data from the old extent to new extent. - * copy is essential for compressed files to avoid problems - * that can arise if there was a change in compression - * algorithms. - * it is a good strategy because it may disrupt cache - * policy to keep the pages in memory afterwards. 
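The copy loop that follows walks the extent one cache buffer at a time; its buffer-count computation is a shift-based ceiling division. In isolation (CM_L2BSIZE is assumed here to be 12, i.e. 4K cache buffers; the real constant lives in the JFS headers):

#include <stdio.h>

#define CM_L2BSIZE	12			/* assumed: 4K cache buffers */
#define CM_BSIZE	(1L << CM_L2BSIZE)

int main(void)
{
	long nbytes = 10000;
	long npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE;	/* ceiling */

	printf("%ld bytes -> %ld buffers\n", nbytes, npages);	/* 3 */
	return 0;
}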
- */ - offset = xoff << JFS_SBI(ip->i_sb)->l2bsize; - assert((offset & CM_OFFSET) == 0); - nbytes = xlen << JFS_SBI(ip->i_sb)->l2bsize; - pno = offset >> CM_L2BSIZE; - npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE; -/* - npages = ((offset + nbytes - 1) >> CM_L2BSIZE) - - (offset >> CM_L2BSIZE) + 1; -*/ - sxaddr = oxaddr; - dxaddr = nxaddr; - - /* process the request one cache buffer at a time */ - for (nbrd = 0; nbrd < nbytes; nbrd += nb, - offset += nb, pno++, npages--) { - /* compute page size */ - nb = min(nbytes - nbrd, CM_BSIZE); - - /* get the cache buffer of the page */ - if (rc = cmRead(ip, offset, npages, &cp)) - break; - - assert(addressPXD(&cp->cm_pxd) == sxaddr); - assert(!cp->cm_modified); - - /* bind buffer with the new extent address */ - nblks = nb >> JFS_IP(ip->i_sb)->l2bsize; - cmSetXD(ip, cp, pno, dxaddr, nblks); - - /* release the cbuf, mark it as modified */ - cmPut(cp, true); - - dxaddr += nblks; - sxaddr += nblks; - } - - /* get back parent page */ - if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0))) - return rc; - - XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); - jfs_info("xtRelocate: target data extent relocated."); - } else { /* (xtype == XTPAGE) */ - - /* - * read in the target xtpage from the source extent; - */ - XT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); - if (rc) { - XT_PUTPAGE(pmp); - return rc; - } - - /* - * read in sibling pages if any to update sibling pointers; - */ - rmp = NULL; - if (p->header.next) { - nextbn = le64_to_cpu(p->header.next); - XT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc); - if (rc) { - XT_PUTPAGE(pmp); - XT_PUTPAGE(mp); - return (rc); - } - } - - lmp = NULL; - if (p->header.prev) { - prevbn = le64_to_cpu(p->header.prev); - XT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc); - if (rc) { - XT_PUTPAGE(pmp); - XT_PUTPAGE(mp); - if (rmp) - XT_PUTPAGE(rmp); - return (rc); - } - } - - /* at this point, all xtpages to be updated are in memory */ - - /* - * update sibling pointers of sibling xtpages if any; - */ - if (lmp) { - BT_MARK_DIRTY(lmp, ip); - tlck = txLock(tid, ip, lmp, tlckXTREE | tlckRELINK); - lp->header.next = cpu_to_le64(nxaddr); - XT_PUTPAGE(lmp); - } - - if (rmp) { - BT_MARK_DIRTY(rmp, ip); - tlck = txLock(tid, ip, rmp, tlckXTREE | tlckRELINK); - rp->header.prev = cpu_to_le64(nxaddr); - XT_PUTPAGE(rmp); - } - - /* - * update the target xtpage to be relocated - * - * update the self address of the target page - * and write to destination extent; - * redo image covers the whole xtpage since it is new page - * to the destination extent; - * update of bmap for the free of source extent - * of the target xtpage itself: - * update of bmap for the allocation of destination extent - * of the target xtpage itself: - * update of bmap for the extents covered by xad entries in - * the target xtpage is not necessary since they are not - * updated; - * if not committed before this relocation, - * target page may contain XAD_NEW entries which must - * be scanned for bmap update (logredo() always - * scan xtpage REDOPAGE image for bmap update); - * if committed before this relocation (tlckRELOCATE), - * scan may be skipped by commit() and logredo(); - */ - BT_MARK_DIRTY(mp, ip); - /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */ - tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW); - xtlck = (struct xtlock *) & tlck->lock; - - /* update the self address in the xtpage header */ - pxd = &p->header.self; - PXDaddress(pxd, nxaddr); - - /* linelock for the after image of the whole page */ - xtlck->lwm.length = - le16_to_cpu(p->header.nextindex) 
- xtlck->lwm.offset; - - /* update the buffer extent descriptor of target xtpage */ - xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; - bmSetXD(mp, nxaddr, xsize); - - /* unpin the target page to new homeward bound */ - XT_PUTPAGE(mp); - jfs_info("xtRelocate: target xtpage relocated."); - } - - /* - * 3. acquire maplock for the source extent to be freed; - * - * acquire a maplock saving the src relocated extent address; - * to free of the extent at commit time; - */ - out: - /* if DATAEXT relocation, write a LOG_UPDATEMAP record for - * free PXD of the source data extent (logredo() will update - * bmap for free of source data extent), and update bmap for - * free of the source data extent; - */ - if (xtype == DATAEXT) - tlck = txMaplock(tid, ip, tlckMAP); - /* if XTPAGE relocation, write a LOG_NOREDOPAGE record - * for the source xtpage (logredo() will init NoRedoPage - * filter and will also update bmap for free of the source - * xtpage), and update bmap for free of the source xtpage; - * N.B. We use tlckMAP instead of tlkcXTREE because there - * is no buffer associated with this lock since the buffer - * has been redirected to the target location. - */ - else /* (xtype == XTPAGE) */ - tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE); - - pxdlock = (struct pxd_lock *) & tlck->lock; - pxdlock->flag = mlckFREEPXD; - PXDaddress(&pxdlock->pxd, oxaddr); - PXDlength(&pxdlock->pxd, xlen); - pxdlock->index = 1; - - /* - * 4. update the parent xad entry for relocation; - * - * acquire tlck for the parent entry with XAD_NEW as entry - * update which will write LOG_REDOPAGE and update bmap for - * allocation of XAD_NEW destination extent; - */ - jfs_info("xtRelocate: update parent xad entry."); - BT_MARK_DIRTY(pmp, ip); - tlck = txLock(tid, ip, pmp, tlckXTREE | tlckGROW); - xtlck = (struct xtlock *) & tlck->lock; - - /* update the XAD with the new destination extent; */ - xad = &pp->xad[index]; - xad->flag |= XAD_NEW; - XADaddress(xad, nxaddr); - - xtlck->lwm.offset = min(index, xtlck->lwm.offset); - xtlck->lwm.length = le16_to_cpu(pp->header.nextindex) - - xtlck->lwm.offset; - - /* unpin the parent xtpage */ - XT_PUTPAGE(pmp); - - return rc; -} - - -/* - * xtSearchNode() - * - * function: search for the internal xad entry covering specified extent. - * This function is mainly used by defragfs utility. - * - * parameters: - * ip - file object; - * xad - extent to find; - * cmpp - comparison result: - * btstack - traverse stack; - * flag - search process flag; - * - * returns: - * btstack contains (bn, index) of search path traversed to the entry. - * *cmpp is set to result of comparison with the entry returned. - * the page containing the entry is pinned at exit. - */ -static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ - int *cmpp, struct btstack * btstack, int flag) -{ - int rc = 0; - s64 xoff, xaddr; - int xlen; - int cmp = 1; /* init for empty page */ - s64 bn; /* block number */ - struct metapage *mp; /* meta-page buffer */ - xtpage_t *p; /* page */ - int base, index, lim; - struct btframe *btsp; - s64 t64; - - BT_CLR(btstack); - - xoff = offsetXAD(xad); - xlen = lengthXAD(xad); - xaddr = addressXAD(xad); - - /* - * search down tree from root: - * - * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of - * internal page, child page Pi contains entry with k, Ki <= K < Kj. 
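The descent below uses the kernel's base/lim halving search rather than the usual lo/hi bounds. Extracted into a standalone form over a sorted int array (the real code compares packed xad keys via XT_CMP):

#include <stdio.h>

/* Kernel-style binary search: lim is the number of remaining
 * candidates, base the first of them.  Mirrors xtSearchNode(). */
static int bsearch_baselim(const int *key, int n, int want)
{
	int base = 0, index = 0;

	for (int lim = n; lim; lim >>= 1) {
		index = base + (lim >> 1);
		if (key[index] == want)
			return index;		/* search hit */
		if (key[index] < want) {
			base = index + 1;	/* go right */
			--lim;
		}
	}
	return base ? base - 1 : base;	/* parent entry covering 'want' */
}

int main(void)
{
	int k[] = { 0, 16, 64, 256 };
	printf("%d\n", bsearch_baselim(k, 4, 100));	/* 2: entry <64> */
	return 0;
}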
- * - * if entry with search key K is not found - * internal page search find the entry with largest key Ki - * less than K which point to the child page to search; - * leaf page search find the entry with smallest key Kj - * greater than K so that the returned index is the position of - * the entry to be shifted right for insertion of new entry. - * for empty tree, search key is greater than any key of the tree. - * - * by convention, root bn = 0. - */ - for (bn = 0;;) { - /* get/pin the page to search */ - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; - if (p->header.flag & BT_LEAF) { - XT_PUTPAGE(mp); - return -ESTALE; - } - - lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART; - - /* - * binary search with search key K on the current page - */ - for (base = XTENTRYSTART; lim; lim >>= 1) { - index = base + (lim >> 1); - - XT_CMP(cmp, xoff, &p->xad[index], t64); - if (cmp == 0) { - /* - * search hit - * - * verify for exact match; - */ - if (xaddr == addressXAD(&p->xad[index]) && - xoff == offsetXAD(&p->xad[index])) { - *cmpp = cmp; - - /* save search result */ - btsp = btstack->top; - btsp->bn = bn; - btsp->index = index; - btsp->mp = mp; - - return 0; - } - - /* descend/search its child page */ - goto next; - } - - if (cmp > 0) { - base = index + 1; - --lim; - } - } - - /* - * search miss - non-leaf page: - * - * base is the smallest index with key (Kj) greater than - * search key (K) and may be zero or maxentry index. - * if base is non-zero, decrement base by one to get the parent - * entry of the child page to search. - */ - index = base ? base - 1 : base; - - /* - * go down to child page - */ - next: - /* get the child page block number */ - bn = addressXAD(&p->xad[index]); - - /* unpin the parent page */ - XT_PUTPAGE(mp); - } -} - - -/* - * xtRelink() - * - * function: - * link around a freed page. 
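Reduced to an in-memory doubly linked list, "link around a freed page" is the usual unlink; the on-disk version in xtRelink() below must additionally read, transaction-lock and dirty each sibling page. A minimal sketch:

#include <stddef.h>

struct page_hdr {
	struct page_hdr *next, *prev;	/* stands in for header.next/prev */
};

/* Point the neighbours at each other so 'p' can be freed, the same
 * relinking xtRelink() performs with XT_GETPAGE/XT_PUTPAGE on disk. */
static void relink_around(struct page_hdr *p)
{
	if (p->next)
		p->next->prev = p->prev;
	if (p->prev)
		p->prev->next = p->next;
}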
- * - * Parameter: - * int tid, - * struct inode *ip, - * xtpage_t *p) - * - * returns: - */ -static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p) -{ - int rc = 0; - struct metapage *mp; - s64 nextbn, prevbn; - struct tlock *tlck; - - nextbn = le64_to_cpu(p->header.next); - prevbn = le64_to_cpu(p->header.prev); - - /* update prev pointer of the next page */ - if (nextbn != 0) { - XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); - if (rc) - return rc; - - /* - * acquire a transaction lock on the page; - * - * action: update prev pointer; - */ - BT_MARK_DIRTY(mp, ip); - tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); - - /* the page may already have been tlock'd */ - - p->header.prev = cpu_to_le64(prevbn); - - XT_PUTPAGE(mp); - } - - /* update next pointer of the previous page */ - if (prevbn != 0) { - XT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc); - if (rc) - return rc; - - /* - * acquire a transaction lock on the page; - * - * action: update next pointer; - */ - BT_MARK_DIRTY(mp, ip); - tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); - - /* the page may already have been tlock'd */ - - p->header.next = le64_to_cpu(nextbn); - - XT_PUTPAGE(mp); - } - - return 0; -} -#endif /* _STILL_TO_PORT */ - /* * xtInitRoot() diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h index 5f51be8596b3..142caafc73b1 100644 --- a/fs/jfs/jfs_xtree.h +++ b/fs/jfs/jfs_xtree.h @@ -95,10 +95,6 @@ extern int xtInsert(tid_t tid, struct inode *ip, int xflag, s64 xoff, int xlen, s64 * xaddrp, int flag); extern int xtExtend(tid_t tid, struct inode *ip, s64 xoff, int xlen, int flag); -#ifdef _NOTYET -extern int xtTailgate(tid_t tid, struct inode *ip, - s64 xoff, int xlen, s64 xaddr, int flag); -#endif extern int xtUpdate(tid_t tid, struct inode *ip, struct xad *nxad); extern int xtDelete(tid_t tid, struct inode *ip, s64 xoff, int xlen, int flag); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index e205fde7163a..6eca72cfa1f2 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -18,7 +18,15 @@ #include "kernfs-internal.h" static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */ -static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */ +/* + * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to + * call pr_cont() while holding rename_lock. Because sometimes pr_cont() + * will perform wakeups when releasing console_sem. Holding rename_lock + * will introduce deadlock if the scheduler reads the kernfs_name in the + * wakeup path. 
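The pattern behind this fix generalizes: give the shared scratch buffer its own lock, and make sure nothing reachable from a wakeup ever takes that lock. A userspace miniature, with pthread mutexes standing in for the spinlocks and printf() for pr_cont():

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t buf_lock = PTHREAD_MUTEX_INITIALIZER;
static char shared_buf[256];		/* like kernfs_pr_cont_buf */

/* Printing may perform wakeups; that is safe under buf_lock because
 * no wakeup path takes buf_lock, whereas rename_lock may be needed
 * by the scheduler reading kernfs names in the wakeup path. */
static void emit_name(const char *name)
{
	pthread_mutex_lock(&buf_lock);
	snprintf(shared_buf, sizeof(shared_buf), "%s", name);
	printf("%s", shared_buf);	/* stand-in for pr_cont() */
	pthread_mutex_unlock(&buf_lock);
}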
+ */ +static DEFINE_SPINLOCK(kernfs_pr_cont_lock); +static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by pr_cont_lock */ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) @@ -229,12 +237,12 @@ void pr_cont_kernfs_name(struct kernfs_node *kn) { unsigned long flags; - spin_lock_irqsave(&kernfs_rename_lock, flags); + spin_lock_irqsave(&kernfs_pr_cont_lock, flags); - kernfs_name_locked(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); + kernfs_name(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); pr_cont("%s", kernfs_pr_cont_buf); - spin_unlock_irqrestore(&kernfs_rename_lock, flags); + spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); } /** @@ -248,10 +256,10 @@ void pr_cont_kernfs_path(struct kernfs_node *kn) unsigned long flags; int sz; - spin_lock_irqsave(&kernfs_rename_lock, flags); + spin_lock_irqsave(&kernfs_pr_cont_lock, flags); - sz = kernfs_path_from_node_locked(kn, NULL, kernfs_pr_cont_buf, - sizeof(kernfs_pr_cont_buf)); + sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf, + sizeof(kernfs_pr_cont_buf)); if (sz < 0) { pr_cont("(error)"); goto out; @@ -265,7 +273,7 @@ void pr_cont_kernfs_path(struct kernfs_node *kn) pr_cont("%s", kernfs_pr_cont_buf); out: - spin_unlock_irqrestore(&kernfs_rename_lock, flags); + spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); } /** @@ -823,13 +831,12 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem); - /* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */ - spin_lock_irq(&kernfs_rename_lock); + spin_lock_irq(&kernfs_pr_cont_lock); len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf)); if (len >= sizeof(kernfs_pr_cont_buf)) { - spin_unlock_irq(&kernfs_rename_lock); + spin_unlock_irq(&kernfs_pr_cont_lock); return NULL; } @@ -841,7 +848,7 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, parent = kernfs_find_ns(parent, name, ns); } - spin_unlock_irq(&kernfs_rename_lock); + spin_unlock_irq(&kernfs_pr_cont_lock); return parent; } diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 88423069407c..e3abfa843879 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -33,7 +33,6 @@ static DEFINE_SPINLOCK(kernfs_open_node_lock); static DEFINE_MUTEX(kernfs_open_file_mutex); struct kernfs_open_node { - atomic_t refcnt; atomic_t event; wait_queue_head_t poll; struct list_head files; /* goes through kernfs_open_file.list */ @@ -530,10 +529,8 @@ static int kernfs_get_open_node(struct kernfs_node *kn, } on = kn->attr.open; - if (on) { - atomic_inc(&on->refcnt); + if (on) list_add_tail(&of->list, &on->files); - } spin_unlock_irq(&kernfs_open_node_lock); mutex_unlock(&kernfs_open_file_mutex); @@ -548,7 +545,6 @@ static int kernfs_get_open_node(struct kernfs_node *kn, if (!new_on) return -ENOMEM; - atomic_set(&new_on->refcnt, 0); atomic_set(&new_on->event, 1); init_waitqueue_head(&new_on->poll); INIT_LIST_HEAD(&new_on->files); @@ -556,17 +552,19 @@ static int kernfs_get_open_node(struct kernfs_node *kn, } /** - * kernfs_put_open_node - put kernfs_open_node - * @kn: target kernfs_nodet + * kernfs_unlink_open_file - Unlink @of from @kn. + * + * @kn: target kernfs_node * @of: associated kernfs_open_file * - * Put @kn->attr.open and unlink @of from the files list. If - * reference count reaches zero, disassociate and free it. + * Unlink @of from list of @kn's associated open files. 
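The rename reflects a real semantic change: the open node's lifetime is now keyed to list membership rather than a counter. The shape of it, with a stripped-down intrusive list instead of list.h:

#include <stdlib.h>

struct list_node { struct list_node *next, *prev; };

struct open_node {			/* like kernfs_open_node */
	struct list_node files;		/* list head of open files */
};

static int list_is_empty(const struct list_node *h) { return h->next == h; }

static void list_del_node(struct list_node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

/* Analogue of kernfs_unlink_open_file(): drop one file, and free the
 * node once the list drains, with no separate refcount to keep. */
static void unlink_open_file(struct open_node **attr_open,
			     struct list_node *of)
{
	struct open_node *on = *attr_open;

	list_del_node(of);
	if (list_is_empty(&on->files)) {
		*attr_open = NULL;	/* disassociate */
		free(on);
	}
}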
If list of + * associated open files becomes empty, disassociate and free + * kernfs_open_node. * * LOCKING: * None. */ -static void kernfs_put_open_node(struct kernfs_node *kn, +static void kernfs_unlink_open_file(struct kernfs_node *kn, struct kernfs_open_file *of) { struct kernfs_open_node *on = kn->attr.open; @@ -578,7 +576,7 @@ static void kernfs_put_open_node(struct kernfs_node *kn, if (of) list_del(&of->list); - if (atomic_dec_and_test(&on->refcnt)) + if (list_empty(&on->files)) kn->attr.open = NULL; else on = NULL; @@ -706,7 +704,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) return 0; err_put_node: - kernfs_put_open_node(kn, of); + kernfs_unlink_open_file(kn, of); err_seq_release: seq_release(inode, file); err_free: @@ -752,7 +750,7 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp) mutex_unlock(&kernfs_open_file_mutex); } - kernfs_put_open_node(kn, of); + kernfs_unlink_open_file(kn, of); seq_release(inode, filp); kfree(of->prealloc_buf); kfree(of); @@ -768,15 +766,24 @@ void kernfs_drain_open_files(struct kernfs_node *kn) if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE))) return; - spin_lock_irq(&kernfs_open_node_lock); - on = kn->attr.open; - if (on) - atomic_inc(&on->refcnt); - spin_unlock_irq(&kernfs_open_node_lock); - if (!on) + /* + * lockless opportunistic check is safe below because no one is adding to + * ->attr.open at this point of time. This check allows early bail out + * if ->attr.open is already NULL. kernfs_unlink_open_file makes + * ->attr.open NULL only while holding kernfs_open_file_mutex so below + * check under kernfs_open_file_mutex will ensure bailing out if + * ->attr.open became NULL while waiting for the mutex. + */ + if (!kn->attr.open) return; mutex_lock(&kernfs_open_file_mutex); + if (!kn->attr.open) { + mutex_unlock(&kernfs_open_file_mutex); + return; + } + + on = kn->attr.open; list_for_each_entry(of, &on->files, list) { struct inode *inode = file_inode(of->file); @@ -789,8 +796,6 @@ void kernfs_drain_open_files(struct kernfs_node *kn) } mutex_unlock(&kernfs_open_file_mutex); - - kernfs_put_open_node(kn, NULL); } /* diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c index 208d2cff7bd3..e8f476c5f189 100644 --- a/fs/ksmbd/connection.c +++ b/fs/ksmbd/connection.c @@ -62,7 +62,7 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) atomic_set(&conn->req_running, 0); atomic_set(&conn->r_count, 0); conn->total_credits = 1; - conn->outstanding_credits = 1; + conn->outstanding_credits = 0; init_waitqueue_head(&conn->req_running_q); INIT_LIST_HEAD(&conn->conns_list); @@ -205,31 +205,31 @@ int ksmbd_conn_write(struct ksmbd_work *work) return 0; } -int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf, - unsigned int buflen, u32 remote_key, u64 remote_offset, - u32 remote_len) +int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, + void *buf, unsigned int buflen, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len) { int ret = -EINVAL; if (conn->transport->ops->rdma_read) ret = conn->transport->ops->rdma_read(conn->transport, buf, buflen, - remote_key, remote_offset, - remote_len); + desc, desc_len); return ret; } -int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf, - unsigned int buflen, u32 remote_key, - u64 remote_offset, u32 remote_len) +int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, + void *buf, unsigned int buflen, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len) { int ret = -EINVAL; if (conn->transport->ops->rdma_write) ret = 
conn->transport->ops->rdma_write(conn->transport, buf, buflen, - remote_key, remote_offset, - remote_len); + desc, desc_len); return ret; } diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h index 7a59aacb5daa..98c1cbe45ec9 100644 --- a/fs/ksmbd/connection.h +++ b/fs/ksmbd/connection.h @@ -122,11 +122,14 @@ struct ksmbd_transport_ops { int (*writev)(struct ksmbd_transport *t, struct kvec *iovs, int niov, int size, bool need_invalidate_rkey, unsigned int remote_key); - int (*rdma_read)(struct ksmbd_transport *t, void *buf, unsigned int len, - u32 remote_key, u64 remote_offset, u32 remote_len); - int (*rdma_write)(struct ksmbd_transport *t, void *buf, - unsigned int len, u32 remote_key, u64 remote_offset, - u32 remote_len); + int (*rdma_read)(struct ksmbd_transport *t, + void *buf, unsigned int len, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len); + int (*rdma_write)(struct ksmbd_transport *t, + void *buf, unsigned int len, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len); }; struct ksmbd_transport { @@ -148,12 +151,14 @@ struct ksmbd_conn *ksmbd_conn_alloc(void); void ksmbd_conn_free(struct ksmbd_conn *conn); bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c); int ksmbd_conn_write(struct ksmbd_work *work); -int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf, - unsigned int buflen, u32 remote_key, u64 remote_offset, - u32 remote_len); -int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf, - unsigned int buflen, u32 remote_key, u64 remote_offset, - u32 remote_len); +int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, + void *buf, unsigned int buflen, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len); +int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, + void *buf, unsigned int buflen, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len); void ksmbd_conn_enqueue_request(struct ksmbd_work *work); int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work); void ksmbd_conn_init_server_callbacks(struct ksmbd_conn_ops *ops); diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h index ebe6ca08467a..52aa0adeb951 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -104,7 +104,8 @@ struct ksmbd_startup_request { */ __u32 sub_auth[3]; /* Subauth value for Security ID */ __u32 smb2_max_credits; /* MAX credits */ - __u32 reserved[128]; /* Reserved room */ + __u32 smbd_max_io_size; /* smbd read write size */ + __u32 reserved[127]; /* Reserved room */ __u32 ifc_list_sz; /* interfaces list size */ __s8 ____payload[]; }; diff --git a/fs/ksmbd/misc.c b/fs/ksmbd/misc.c index 1e2076a53bed..df991107ad2c 100644 --- a/fs/ksmbd/misc.c +++ b/fs/ksmbd/misc.c @@ -20,7 +20,7 @@ * wildcard '*' and '?' * TODO : implement consideration about DOS_DOT, DOS_QM and DOS_STAR * - * @string: string to compare with a pattern + * @str: string to compare with a pattern * @len: string length * @pattern: pattern string which might include wildcard '*' and '?' 
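A reference matcher for just these two wildcards (greedy '*', single-character '?') is small enough to state outright. This is a generic recursive version, not ksmbd's implementation, and it ignores the DOS_DOT/DOS_QM/DOS_STAR cases the TODO mentions:

#include <stdbool.h>

/* '*' matches any run (including empty), '?' exactly one character. */
static bool wild_match(const char *str, const char *pattern)
{
	if (*pattern == '\0')
		return *str == '\0';
	if (*pattern == '*')
		return wild_match(str, pattern + 1) ||
		       (*str && wild_match(str + 1, pattern));
	if (*str && (*pattern == '?' || *pattern == *str))
		return wild_match(str + 1, pattern + 1);
	return false;
}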
* @@ -152,8 +152,8 @@ out: /** * convert_to_nt_pathname() - extract and return windows path string * whose share directory prefix was removed from file path - * @filename : unix filename - * @sharepath: share path string + * @share: ksmbd_share_config pointer + * @path: path to report * * Return : windows path string or error */ @@ -250,8 +250,8 @@ char *ksmbd_extract_sharename(char *treename) /** * convert_to_unix_name() - convert windows name to unix format - * @path: name to be converted - * @tid: tree id of mathing share + * @share: ksmbd_share_config pointer + * @name: file name that is relative to share * * Return: converted name on success, otherwise NULL */ diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c index 4a9460153b59..f8f456377a51 100644 --- a/fs/ksmbd/smb2misc.c +++ b/fs/ksmbd/smb2misc.c @@ -338,7 +338,7 @@ static int smb2_validate_credit_charge(struct ksmbd_conn *conn, ret = 1; } - if ((u64)conn->outstanding_credits + credit_charge > conn->vals->max_credits) { + if ((u64)conn->outstanding_credits + credit_charge > conn->total_credits) { ksmbd_debug(SMB, "Limits exceeding the maximum allowable outstanding requests, given : %u, pending : %u\n", credit_charge, conn->outstanding_credits); ret = 1; diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 16c803a9d996..e6f4ccc12f49 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -3938,6 +3938,12 @@ int smb2_query_dir(struct ksmbd_work *work) set_ctx_actor(&dir_fp->readdir_data.ctx, __query_dir); rc = iterate_dir(dir_fp->filp, &dir_fp->readdir_data.ctx); + /* + * req->OutputBufferLength is too small to contain even one entry. + * In this case, it immediately returns OutputBufferLength 0 to client. + */ + if (!d_info.out_buf_len && !d_info.num_entry) + goto no_buf_len; if (rc == 0) restart_ctx(&dir_fp->readdir_data.ctx); if (rc == -ENOSPC) @@ -3964,10 +3970,12 @@ int smb2_query_dir(struct ksmbd_work *work) rsp->Buffer[0] = 0; inc_rfc1001_len(work->response_buf, 9); } else { +no_buf_len: ((struct file_directory_info *) ((char *)rsp->Buffer + d_info.last_entry_offset)) ->NextEntryOffset = 0; - d_info.data_count -= d_info.last_entry_off_align; + if (d_info.data_count >= d_info.last_entry_off_align) + d_info.data_count -= d_info.last_entry_off_align; rsp->StructureSize = cpu_to_le16(9); rsp->OutputBufferOffset = cpu_to_le16(72); @@ -6116,7 +6124,6 @@ out: static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work, struct smb2_buffer_desc_v1 *desc, __le32 Channel, - __le16 ChannelInfoOffset, __le16 ChannelInfoLength) { unsigned int i, ch_count; @@ -6134,15 +6141,13 @@ static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work, le32_to_cpu(desc[i].length)); } } - if (ch_count != 1) { - ksmbd_debug(RDMA, "RDMA multiple buffer descriptors %d are not supported yet\n", - ch_count); + if (!ch_count) return -EINVAL; - } work->need_invalidate_rkey = (Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE); - work->remote_key = le32_to_cpu(desc->token); + if (Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) + work->remote_key = le32_to_cpu(desc->token); return 0; } @@ -6150,14 +6155,12 @@ static ssize_t smb2_read_rdma_channel(struct ksmbd_work *work, struct smb2_read_req *req, void *data_buf, size_t length) { - struct smb2_buffer_desc_v1 *desc = - (struct smb2_buffer_desc_v1 *)&req->Buffer[0]; int err; err = ksmbd_conn_rdma_write(work->conn, data_buf, length, - le32_to_cpu(desc->token), - le64_to_cpu(desc->offset), - le32_to_cpu(desc->length)); + (struct smb2_buffer_desc_v1 *) + ((char *)req + 
le16_to_cpu(req->ReadChannelInfoOffset)), + le16_to_cpu(req->ReadChannelInfoLength)); if (err) return err; @@ -6180,6 +6183,8 @@ int smb2_read(struct ksmbd_work *work) size_t length, mincount; ssize_t nbytes = 0, remain_bytes = 0; int err = 0; + bool is_rdma_channel = false; + unsigned int max_read_size = conn->vals->max_read_size; WORK_BUFFERS(work, req, rsp); @@ -6191,6 +6196,11 @@ int smb2_read(struct ksmbd_work *work) if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE || req->Channel == SMB2_CHANNEL_RDMA_V1) { + is_rdma_channel = true; + max_read_size = get_smbd_max_read_write_size(); + } + + if (is_rdma_channel == true) { unsigned int ch_offset = le16_to_cpu(req->ReadChannelInfoOffset); if (ch_offset < offsetof(struct smb2_read_req, Buffer)) { @@ -6201,7 +6211,6 @@ int smb2_read(struct ksmbd_work *work) (struct smb2_buffer_desc_v1 *) ((char *)req + ch_offset), req->Channel, - req->ReadChannelInfoOffset, req->ReadChannelInfoLength); if (err) goto out; @@ -6223,9 +6232,9 @@ int smb2_read(struct ksmbd_work *work) length = le32_to_cpu(req->Length); mincount = le32_to_cpu(req->MinimumCount); - if (length > conn->vals->max_read_size) { + if (length > max_read_size) { ksmbd_debug(SMB, "limiting read size to max size(%u)\n", - conn->vals->max_read_size); + max_read_size); err = -EINVAL; goto out; } @@ -6257,8 +6266,7 @@ int smb2_read(struct ksmbd_work *work) ksmbd_debug(SMB, "nbytes %zu, offset %lld mincount %zu\n", nbytes, offset, mincount); - if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE || - req->Channel == SMB2_CHANNEL_RDMA_V1) { + if (is_rdma_channel == true) { /* write data to the client using rdma channel */ remain_bytes = smb2_read_rdma_channel(work, req, work->aux_payload_buf, @@ -6328,23 +6336,18 @@ static noinline int smb2_write_pipe(struct ksmbd_work *work) length = le32_to_cpu(req->Length); id = req->VolatileFileId; - if (le16_to_cpu(req->DataOffset) == - offsetof(struct smb2_write_req, Buffer)) { - data_buf = (char *)&req->Buffer[0]; - } else { - if ((u64)le16_to_cpu(req->DataOffset) + length > - get_rfc1002_len(work->request_buf)) { - pr_err("invalid write data offset %u, smb_len %u\n", - le16_to_cpu(req->DataOffset), - get_rfc1002_len(work->request_buf)); - err = -EINVAL; - goto out; - } - - data_buf = (char *)(((char *)&req->hdr.ProtocolId) + - le16_to_cpu(req->DataOffset)); + if ((u64)le16_to_cpu(req->DataOffset) + length > + get_rfc1002_len(work->request_buf)) { + pr_err("invalid write data offset %u, smb_len %u\n", + le16_to_cpu(req->DataOffset), + get_rfc1002_len(work->request_buf)); + err = -EINVAL; + goto out; } + data_buf = (char *)(((char *)&req->hdr.ProtocolId) + + le16_to_cpu(req->DataOffset)); + rpc_resp = ksmbd_rpc_write(work->sess, id, data_buf, length); if (rpc_resp) { if (rpc_resp->flags == KSMBD_RPC_ENOTIMPLEMENTED) { @@ -6384,21 +6387,18 @@ static ssize_t smb2_write_rdma_channel(struct ksmbd_work *work, struct ksmbd_file *fp, loff_t offset, size_t length, bool sync) { - struct smb2_buffer_desc_v1 *desc; char *data_buf; int ret; ssize_t nbytes; - desc = (struct smb2_buffer_desc_v1 *)&req->Buffer[0]; - data_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO); if (!data_buf) return -ENOMEM; ret = ksmbd_conn_rdma_read(work->conn, data_buf, length, - le32_to_cpu(desc->token), - le64_to_cpu(desc->offset), - le32_to_cpu(desc->length)); + (struct smb2_buffer_desc_v1 *) + ((char *)req + le16_to_cpu(req->WriteChannelInfoOffset)), + le16_to_cpu(req->WriteChannelInfoLength)); if (ret < 0) { kvfree(data_buf); return ret; @@ -6427,8 +6427,9 @@ int smb2_write(struct 
ksmbd_work *work) size_t length; ssize_t nbytes; char *data_buf; - bool writethrough = false; + bool writethrough = false, is_rdma_channel = false; int err = 0; + unsigned int max_write_size = work->conn->vals->max_write_size; WORK_BUFFERS(work, req, rsp); @@ -6437,8 +6438,17 @@ int smb2_write(struct ksmbd_work *work) return smb2_write_pipe(work); } + offset = le64_to_cpu(req->Offset); + length = le32_to_cpu(req->Length); + if (req->Channel == SMB2_CHANNEL_RDMA_V1 || req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) { + is_rdma_channel = true; + max_write_size = get_smbd_max_read_write_size(); + length = le32_to_cpu(req->RemainingBytes); + } + + if (is_rdma_channel == true) { unsigned int ch_offset = le16_to_cpu(req->WriteChannelInfoOffset); if (req->Length != 0 || req->DataOffset != 0 || @@ -6450,7 +6460,6 @@ int smb2_write(struct ksmbd_work *work) (struct smb2_buffer_desc_v1 *) ((char *)req + ch_offset), req->Channel, - req->WriteChannelInfoOffset, req->WriteChannelInfoLength); if (err) goto out; @@ -6474,12 +6483,9 @@ int smb2_write(struct ksmbd_work *work) goto out; } - offset = le64_to_cpu(req->Offset); - length = le32_to_cpu(req->Length); - - if (length > work->conn->vals->max_write_size) { + if (length > max_write_size) { ksmbd_debug(SMB, "limiting write size to max size(%u)\n", - work->conn->vals->max_write_size); + max_write_size); err = -EINVAL; goto out; } @@ -6487,24 +6493,17 @@ int smb2_write(struct ksmbd_work *work) if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH) writethrough = true; - if (req->Channel != SMB2_CHANNEL_RDMA_V1 && - req->Channel != SMB2_CHANNEL_RDMA_V1_INVALIDATE) { - if (le16_to_cpu(req->DataOffset) == - offsetof(struct smb2_write_req, Buffer)) { - data_buf = (char *)&req->Buffer[0]; - } else { - if ((u64)le16_to_cpu(req->DataOffset) + length > - get_rfc1002_len(work->request_buf)) { - pr_err("invalid write data offset %u, smb_len %u\n", - le16_to_cpu(req->DataOffset), - get_rfc1002_len(work->request_buf)); - err = -EINVAL; - goto out; - } - - data_buf = (char *)(((char *)&req->hdr.ProtocolId) + - le16_to_cpu(req->DataOffset)); + if (is_rdma_channel == false) { + if ((u64)le16_to_cpu(req->DataOffset) + length > + get_rfc1002_len(work->request_buf)) { + pr_err("invalid write data offset %u, smb_len %u\n", + le16_to_cpu(req->DataOffset), + get_rfc1002_len(work->request_buf)); + err = -EINVAL; + goto out; } + data_buf = (char *)(((char *)&req->hdr.ProtocolId) + + le16_to_cpu(req->DataOffset)); ksmbd_debug(SMB, "flags %u\n", le32_to_cpu(req->Flags)); if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH) @@ -6520,8 +6519,7 @@ int smb2_write(struct ksmbd_work *work) /* read data from the client using rdma channel, and * write the data. 
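The length handling changed by this hunk is subtle: for RDMA channels the payload size comes from RemainingBytes and is bounded by the SMB Direct limit, while inline writes use Length bounded by the connection's max_write_size. A condensed sketch of just that selection, with toy field names mirroring the request struct and both limits passed in as parameters:

#include <stdbool.h>
#include <stdint.h>

struct write_req_toy {
	uint32_t length;		/* inline payload length */
	uint32_t remaining_bytes;	/* RDMA payload length */
	bool rdma_channel;		/* SMB2_CHANNEL_RDMA_V1[_INVALIDATE] */
};

static int pick_write_length(const struct write_req_toy *req,
			     uint32_t max_write_size,	/* conn limit */
			     uint32_t smbd_max,		/* RDMA limit */
			     uint32_t *len_out)
{
	uint32_t len = req->rdma_channel ? req->remaining_bytes : req->length;
	uint32_t limit = req->rdma_channel ? smbd_max : max_write_size;

	if (len > limit)
		return -1;	/* -EINVAL in the real code */
	*len_out = len;
	return 0;
}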
*/ - nbytes = smb2_write_rdma_channel(work, req, fp, offset, - le32_to_cpu(req->RemainingBytes), + nbytes = smb2_write_rdma_channel(work, req, fp, offset, length, writethrough); if (nbytes < 0) { err = (int)nbytes; diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c index 9a7e211dbf4f..7f8ab14fb8ec 100644 --- a/fs/ksmbd/smb_common.c +++ b/fs/ksmbd/smb_common.c @@ -140,8 +140,10 @@ int ksmbd_verify_smb_message(struct ksmbd_work *work) hdr = work->request_buf; if (*(__le32 *)hdr->Protocol == SMB1_PROTO_NUMBER && - hdr->Command == SMB_COM_NEGOTIATE) + hdr->Command == SMB_COM_NEGOTIATE) { + work->conn->outstanding_credits++; return 0; + } return -EINVAL; } diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c index 6ecf55ea1fed..38f23bf981ac 100644 --- a/fs/ksmbd/smbacl.c +++ b/fs/ksmbd/smbacl.c @@ -1261,6 +1261,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path, if (!access_bits) access_bits = SET_MINIMUM_RIGHTS; + posix_acl_release(posix_acls); goto check_access_bits; } } diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c index 3ad6881e0f7e..7cb0eeb07c80 100644 --- a/fs/ksmbd/transport_ipc.c +++ b/fs/ksmbd/transport_ipc.c @@ -26,6 +26,7 @@ #include "mgmt/ksmbd_ida.h" #include "connection.h" #include "transport_tcp.h" +#include "transport_rdma.h" #define IPC_WAIT_TIMEOUT (2 * HZ) @@ -303,6 +304,8 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) init_smb2_max_trans_size(req->smb2_max_trans); if (req->smb2_max_credits) init_smb2_max_credits(req->smb2_max_credits); + if (req->smbd_max_io_size) + init_smbd_max_io_size(req->smbd_max_io_size); ret = ksmbd_set_netbios_name(req->netbios_name); ret |= ksmbd_set_server_string(req->server_string); diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index e646d79554b8..d035e060c2f0 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -80,9 +80,7 @@ static int smb_direct_max_fragmented_recv_size = 1024 * 1024; /* The maximum single-message size which can be received */ static int smb_direct_max_receive_size = 8192; -static int smb_direct_max_read_write_size = 524224; - -static int smb_direct_max_outstanding_rw_ops = 8; +static int smb_direct_max_read_write_size = SMBD_DEFAULT_IOSIZE; static LIST_HEAD(smb_direct_device_list); static DEFINE_RWLOCK(smb_direct_device_lock); @@ -147,18 +145,18 @@ struct smb_direct_transport { atomic_t send_credits; spinlock_t lock_new_recv_credits; int new_recv_credits; - atomic_t rw_avail_ops; + int max_rw_credits; + int pages_per_rw_credit; + atomic_t rw_credits; wait_queue_head_t wait_send_credits; - wait_queue_head_t wait_rw_avail_ops; + wait_queue_head_t wait_rw_credits; mempool_t *sendmsg_mempool; struct kmem_cache *sendmsg_cache; mempool_t *recvmsg_mempool; struct kmem_cache *recvmsg_cache; - wait_queue_head_t wait_send_payload_pending; - atomic_t send_payload_pending; wait_queue_head_t wait_send_pending; atomic_t send_pending; @@ -208,12 +206,25 @@ struct smb_direct_recvmsg { struct smb_direct_rdma_rw_msg { struct smb_direct_transport *t; struct ib_cqe cqe; + int status; struct completion *completion; + struct list_head list; struct rdma_rw_ctx rw_ctx; struct sg_table sgt; struct scatterlist sg_list[]; }; +void init_smbd_max_io_size(unsigned int sz) +{ + sz = clamp_val(sz, SMBD_MIN_IOSIZE, SMBD_MAX_IOSIZE); + smb_direct_max_read_write_size = sz; +} + +unsigned int get_smbd_max_read_write_size(void) +{ + return smb_direct_max_read_write_size; +} + static inline int get_buf_page_count(void *buf, int size) { return 
DIV_ROUND_UP((uintptr_t)buf + size, PAGE_SIZE) - @@ -377,7 +388,7 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) t->reassembly_queue_length = 0; init_waitqueue_head(&t->wait_reassembly_queue); init_waitqueue_head(&t->wait_send_credits); - init_waitqueue_head(&t->wait_rw_avail_ops); + init_waitqueue_head(&t->wait_rw_credits); spin_lock_init(&t->receive_credit_lock); spin_lock_init(&t->recvmsg_queue_lock); @@ -386,8 +397,6 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) spin_lock_init(&t->empty_recvmsg_queue_lock); INIT_LIST_HEAD(&t->empty_recvmsg_queue); - init_waitqueue_head(&t->wait_send_payload_pending); - atomic_set(&t->send_payload_pending, 0); init_waitqueue_head(&t->wait_send_pending); atomic_set(&t->send_pending, 0); @@ -417,8 +426,6 @@ static void free_transport(struct smb_direct_transport *t) wake_up_interruptible(&t->wait_send_credits); ksmbd_debug(RDMA, "wait for all send posted to IB to finish\n"); - wait_event(t->wait_send_payload_pending, - atomic_read(&t->send_payload_pending) == 0); wait_event(t->wait_send_pending, atomic_read(&t->send_pending) == 0); @@ -569,6 +576,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) } t->negotiation_requested = true; t->full_packet_received = true; + t->status = SMB_DIRECT_CS_CONNECTED; enqueue_reassembly(t, recvmsg, 0); wake_up_interruptible(&t->wait_status); break; @@ -873,13 +881,8 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) smb_direct_disconnect_rdma_connection(t); } - if (sendmsg->num_sge > 1) { - if (atomic_dec_and_test(&t->send_payload_pending)) - wake_up(&t->wait_send_payload_pending); - } else { - if (atomic_dec_and_test(&t->send_pending)) - wake_up(&t->wait_send_pending); - } + if (atomic_dec_and_test(&t->send_pending)) + wake_up(&t->wait_send_pending); /* iterate and free the list of messages in reverse. the list's head * is invalid. 
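The send-completion hunks above collapse the separate payload/non-payload counters into a single send_pending count with one wake-up. A user-space approximation of that dec-and-test/wake pattern, with pthread primitives standing in for the kernel's atomic_t and wait queue (compile with -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int send_pending;

static void post_send(void)
{
	pthread_mutex_lock(&lk);
	send_pending++;                  /* atomic_inc(&t->send_pending) */
	pthread_mutex_unlock(&lk);
}

static void send_done(void)
{
	pthread_mutex_lock(&lk);
	if (--send_pending == 0)         /* atomic_dec_and_test(...) */
		pthread_cond_broadcast(&cv); /* wake_up(&t->wait_send_pending) */
	pthread_mutex_unlock(&lk);
}

static void wait_send_pending(void)
{
	pthread_mutex_lock(&lk);
	while (send_pending)
		pthread_cond_wait(&cv, &lk);
	pthread_mutex_unlock(&lk);
}

int main(void)
{
	post_send(); post_send();
	send_done(); send_done();
	wait_send_pending();
	puts("all sends completed");
	return 0;
}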
@@ -911,21 +914,12 @@ static int smb_direct_post_send(struct smb_direct_transport *t, { int ret; - if (wr->num_sge > 1) - atomic_inc(&t->send_payload_pending); - else - atomic_inc(&t->send_pending); - + atomic_inc(&t->send_pending); ret = ib_post_send(t->qp, wr, NULL); if (ret) { pr_err("failed to post send: %d\n", ret); - if (wr->num_sge > 1) { - if (atomic_dec_and_test(&t->send_payload_pending)) - wake_up(&t->wait_send_payload_pending); - } else { - if (atomic_dec_and_test(&t->send_pending)) - wake_up(&t->wait_send_pending); - } + if (atomic_dec_and_test(&t->send_pending)) + wake_up(&t->wait_send_pending); smb_direct_disconnect_rdma_connection(t); } return ret; @@ -983,18 +977,19 @@ static int smb_direct_flush_send_list(struct smb_direct_transport *t, } static int wait_for_credits(struct smb_direct_transport *t, - wait_queue_head_t *waitq, atomic_t *credits) + wait_queue_head_t *waitq, atomic_t *total_credits, + int needed) { int ret; do { - if (atomic_dec_return(credits) >= 0) + if (atomic_sub_return(needed, total_credits) >= 0) return 0; - atomic_inc(credits); + atomic_add(needed, total_credits); ret = wait_event_interruptible(*waitq, - atomic_read(credits) > 0 || - t->status != SMB_DIRECT_CS_CONNECTED); + atomic_read(total_credits) >= needed || + t->status != SMB_DIRECT_CS_CONNECTED); if (t->status != SMB_DIRECT_CS_CONNECTED) return -ENOTCONN; @@ -1015,7 +1010,19 @@ static int wait_for_send_credits(struct smb_direct_transport *t, return ret; } - return wait_for_credits(t, &t->wait_send_credits, &t->send_credits); + return wait_for_credits(t, &t->wait_send_credits, &t->send_credits, 1); +} + +static int wait_for_rw_credits(struct smb_direct_transport *t, int credits) +{ + return wait_for_credits(t, &t->wait_rw_credits, &t->rw_credits, credits); +} + +static int calc_rw_credits(struct smb_direct_transport *t, + char *buf, unsigned int len) +{ + return DIV_ROUND_UP(get_buf_page_count(buf, len), + t->pages_per_rw_credit); } static int smb_direct_create_header(struct smb_direct_transport *t, @@ -1086,7 +1093,7 @@ static int get_sg_list(void *buf, int size, struct scatterlist *sg_list, int nen int offset, len; int i = 0; - if (nentries < get_buf_page_count(buf, size)) + if (size <= 0 || nentries < get_buf_page_count(buf, size)) return -EINVAL; offset = offset_in_page(buf); @@ -1118,7 +1125,7 @@ static int get_mapped_sg_list(struct ib_device *device, void *buf, int size, int npages; npages = get_sg_list(buf, size, sg_list, nentries); - if (npages <= 0) + if (npages < 0) return -EINVAL; return ib_dma_map_sg(device, sg_list, npages, dir); } @@ -1313,11 +1320,21 @@ done: * that means all the I/Os have been out and we are good to return */ - wait_event(st->wait_send_payload_pending, - atomic_read(&st->send_payload_pending) == 0); + wait_event(st->wait_send_pending, + atomic_read(&st->send_pending) == 0); return ret; } +static void smb_direct_free_rdma_rw_msg(struct smb_direct_transport *t, + struct smb_direct_rdma_rw_msg *msg, + enum dma_data_direction dir) +{ + rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port, + msg->sgt.sgl, msg->sgt.nents, dir); + sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); + kfree(msg); +} + static void read_write_done(struct ib_cq *cq, struct ib_wc *wc, enum dma_data_direction dir) { @@ -1326,19 +1343,14 @@ static void read_write_done(struct ib_cq *cq, struct ib_wc *wc, struct smb_direct_transport *t = msg->t; if (wc->status != IB_WC_SUCCESS) { + msg->status = -EIO; pr_err("read/write error. 
opcode = %d, status = %s(%d)\n", wc->opcode, ib_wc_status_msg(wc->status), wc->status); - smb_direct_disconnect_rdma_connection(t); + if (wc->status != IB_WC_WR_FLUSH_ERR) + smb_direct_disconnect_rdma_connection(t); } - if (atomic_inc_return(&t->rw_avail_ops) > 0) - wake_up(&t->wait_rw_avail_ops); - - rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port, - msg->sg_list, msg->sgt.nents, dir); - sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); complete(msg->completion); - kfree(msg); } static void read_done(struct ib_cq *cq, struct ib_wc *wc) @@ -1351,94 +1363,141 @@ static void write_done(struct ib_cq *cq, struct ib_wc *wc) read_write_done(cq, wc, DMA_TO_DEVICE); } -static int smb_direct_rdma_xmit(struct smb_direct_transport *t, void *buf, - int buf_len, u32 remote_key, u64 remote_offset, - u32 remote_len, bool is_read) +static int smb_direct_rdma_xmit(struct smb_direct_transport *t, + void *buf, int buf_len, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len, + bool is_read) { - struct smb_direct_rdma_rw_msg *msg; - int ret; + struct smb_direct_rdma_rw_msg *msg, *next_msg; + int i, ret; DECLARE_COMPLETION_ONSTACK(completion); - struct ib_send_wr *first_wr = NULL; + struct ib_send_wr *first_wr; + LIST_HEAD(msg_list); + char *desc_buf; + int credits_needed; + unsigned int desc_buf_len; + size_t total_length = 0; + + if (t->status != SMB_DIRECT_CS_CONNECTED) + return -ENOTCONN; - ret = wait_for_credits(t, &t->wait_rw_avail_ops, &t->rw_avail_ops); + /* calculate needed credits */ + credits_needed = 0; + desc_buf = buf; + for (i = 0; i < desc_len / sizeof(*desc); i++) { + desc_buf_len = le32_to_cpu(desc[i].length); + + credits_needed += calc_rw_credits(t, desc_buf, desc_buf_len); + desc_buf += desc_buf_len; + total_length += desc_buf_len; + if (desc_buf_len == 0 || total_length > buf_len || + total_length > t->max_rdma_rw_size) + return -EINVAL; + } + + ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n", + is_read ? "read" : "write", buf_len, credits_needed); + + ret = wait_for_rw_credits(t, credits_needed); if (ret < 0) return ret; - /* TODO: mempool */ - msg = kmalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) + - sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL); - if (!msg) { - atomic_inc(&t->rw_avail_ops); - return -ENOMEM; - } + /* build rdma_rw_ctx for each descriptor */ + desc_buf = buf; + for (i = 0; i < desc_len / sizeof(*desc); i++) { + msg = kzalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) + + sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto out; + } - msg->sgt.sgl = &msg->sg_list[0]; - ret = sg_alloc_table_chained(&msg->sgt, - get_buf_page_count(buf, buf_len), - msg->sg_list, SG_CHUNK_SIZE); - if (ret) { - atomic_inc(&t->rw_avail_ops); - kfree(msg); - return -ENOMEM; - } + desc_buf_len = le32_to_cpu(desc[i].length); - ret = get_sg_list(buf, buf_len, msg->sgt.sgl, msg->sgt.orig_nents); - if (ret <= 0) { - pr_err("failed to get pages\n"); - goto err; - } + msg->t = t; + msg->cqe.done = is_read ? read_done : write_done; + msg->completion = &completion; - ret = rdma_rw_ctx_init(&msg->rw_ctx, t->qp, t->qp->port, - msg->sg_list, get_buf_page_count(buf, buf_len), - 0, remote_offset, remote_key, - is_read ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE); - if (ret < 0) { - pr_err("failed to init rdma_rw_ctx: %d\n", ret); - goto err; + msg->sgt.sgl = &msg->sg_list[0]; + ret = sg_alloc_table_chained(&msg->sgt, + get_buf_page_count(desc_buf, desc_buf_len), + msg->sg_list, SG_CHUNK_SIZE); + if (ret) { + kfree(msg); + ret = -ENOMEM; + goto out; + } + + ret = get_sg_list(desc_buf, desc_buf_len, + msg->sgt.sgl, msg->sgt.orig_nents); + if (ret < 0) { + sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); + kfree(msg); + goto out; + } + + ret = rdma_rw_ctx_init(&msg->rw_ctx, t->qp, t->qp->port, + msg->sgt.sgl, + get_buf_page_count(desc_buf, desc_buf_len), + 0, + le64_to_cpu(desc[i].offset), + le32_to_cpu(desc[i].token), + is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE); + if (ret < 0) { + pr_err("failed to init rdma_rw_ctx: %d\n", ret); + sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); + kfree(msg); + goto out; + } + + list_add_tail(&msg->list, &msg_list); + desc_buf += desc_buf_len; } - msg->t = t; - msg->cqe.done = is_read ? read_done : write_done; - msg->completion = &completion; - first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, t->qp, t->qp->port, - &msg->cqe, NULL); + /* concatenate work requests of rdma_rw_ctxs */ + first_wr = NULL; + list_for_each_entry_reverse(msg, &msg_list, list) { + first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, t->qp, t->qp->port, + &msg->cqe, first_wr); + } ret = ib_post_send(t->qp, first_wr, NULL); if (ret) { - pr_err("failed to post send wr: %d\n", ret); - goto err; + pr_err("failed to post send wr for RDMA R/W: %d\n", ret); + goto out; } + msg = list_last_entry(&msg_list, struct smb_direct_rdma_rw_msg, list); wait_for_completion(&completion); - return 0; - -err: - atomic_inc(&t->rw_avail_ops); - if (first_wr) - rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port, - msg->sg_list, msg->sgt.nents, - is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE); - sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); - kfree(msg); + ret = msg->status; +out: + list_for_each_entry_safe(msg, next_msg, &msg_list, list) { + list_del(&msg->list); + smb_direct_free_rdma_rw_msg(t, msg, + is_read ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE); + } + atomic_add(credits_needed, &t->rw_credits); + wake_up(&t->wait_rw_credits); return ret; } -static int smb_direct_rdma_write(struct ksmbd_transport *t, void *buf, - unsigned int buflen, u32 remote_key, - u64 remote_offset, u32 remote_len) +static int smb_direct_rdma_write(struct ksmbd_transport *t, + void *buf, unsigned int buflen, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len) { return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen, - remote_key, remote_offset, - remote_len, false); + desc, desc_len, false); } -static int smb_direct_rdma_read(struct ksmbd_transport *t, void *buf, - unsigned int buflen, u32 remote_key, - u64 remote_offset, u32 remote_len) +static int smb_direct_rdma_read(struct ksmbd_transport *t, + void *buf, unsigned int buflen, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len) { return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen, - remote_key, remote_offset, - remote_len, true); + desc, desc_len, true); } static void smb_direct_disconnect(struct ksmbd_transport *t) @@ -1638,41 +1697,57 @@ out_err: return ret; } +static unsigned int smb_direct_get_max_fr_pages(struct smb_direct_transport *t) +{ + return min_t(unsigned int, + t->cm_id->device->attrs.max_fast_reg_page_list_len, + 256); +} + static int smb_direct_init_params(struct smb_direct_transport *t, struct ib_qp_cap *cap) { struct ib_device *device = t->cm_id->device; - int max_send_sges, max_pages, max_rw_wrs, max_send_wrs; + int max_send_sges, max_rw_wrs, max_send_wrs; + unsigned int max_sge_per_wr, wrs_per_credit; - /* need 2 more sge. because a SMB_DIRECT header will be mapped, - * and maybe a send buffer could be not page aligned. + /* need 3 more sge. because a SMB_DIRECT header, SMB2 header, + * SMB2 response could be mapped. */ t->max_send_size = smb_direct_max_send_size; - max_send_sges = DIV_ROUND_UP(t->max_send_size, PAGE_SIZE) + 2; + max_send_sges = DIV_ROUND_UP(t->max_send_size, PAGE_SIZE) + 3; if (max_send_sges > SMB_DIRECT_MAX_SEND_SGES) { pr_err("max_send_size %d is too large\n", t->max_send_size); return -EINVAL; } - /* - * allow smb_direct_max_outstanding_rw_ops of in-flight RDMA - * read/writes. HCA guarantees at least max_send_sge of sges for - * a RDMA read/write work request, and if memory registration is used, - * we need reg_mr, local_inv wrs for each read/write. + /* Calculate the number of work requests for RDMA R/W. + * The maximum number of pages which can be registered + * with one Memory region can be transferred with one + * R/W credit. And at least 4 work requests for each credit + * are needed for MR registration, RDMA R/W, local & remote + * MR invalidation. 
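The comment above summarizes the credit sizing that smb_direct_init_params() performs further down. A minimal sketch of the same arithmetic; the device limits (pages_per_rw_credit, max_sge_per_wr) are made up for illustration, not read from real attrs:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define PAGE_SIZE 4096u

int main(void)
{
	unsigned int max_rdma_rw_size = 8u * 1024 * 1024; /* SMBD_DEFAULT_IOSIZE */
	unsigned int pages_per_rw_credit = 256; /* min(max_fast_reg_page_list_len, 256) */
	unsigned int max_sge_per_wr = 30;       /* min(max_send_sge, max_sge_rd), say */

	unsigned int max_rw_credits = DIV_ROUND_UP(max_rdma_rw_size,
				(pages_per_rw_credit - 1) * PAGE_SIZE);
	unsigned int wrs_per_credit = DIV_ROUND_UP(pages_per_rw_credit,
				max_sge_per_wr) + 1;

	/* at least 4 WRs per credit: MR reg, RDMA R/W, local & remote invalidate */
	if (wrs_per_credit < 4)
		wrs_per_credit = 4;

	printf("max_rw_credits=%u wrs_per_credit=%u max_rw_wrs=%u\n",
	       max_rw_credits, wrs_per_credit, max_rw_credits * wrs_per_credit);
	return 0;
}

The "- 1" in the credit divisor presumably allows an unaligned buffer to spill into one extra page per credit.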
*/ t->max_rdma_rw_size = smb_direct_max_read_write_size; - max_pages = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1; - max_rw_wrs = DIV_ROUND_UP(max_pages, SMB_DIRECT_MAX_SEND_SGES); - max_rw_wrs += rdma_rw_mr_factor(device, t->cm_id->port_num, - max_pages) * 2; - max_rw_wrs *= smb_direct_max_outstanding_rw_ops; + t->pages_per_rw_credit = smb_direct_get_max_fr_pages(t); + t->max_rw_credits = DIV_ROUND_UP(t->max_rdma_rw_size, + (t->pages_per_rw_credit - 1) * + PAGE_SIZE); + + max_sge_per_wr = min_t(unsigned int, device->attrs.max_send_sge, + device->attrs.max_sge_rd); + max_sge_per_wr = max_t(unsigned int, max_sge_per_wr, + max_send_sges); + wrs_per_credit = max_t(unsigned int, 4, + DIV_ROUND_UP(t->pages_per_rw_credit, + max_sge_per_wr) + 1); + max_rw_wrs = t->max_rw_credits * wrs_per_credit; max_send_wrs = smb_direct_send_credit_target + max_rw_wrs; if (max_send_wrs > device->attrs.max_cqe || max_send_wrs > device->attrs.max_qp_wr) { - pr_err("consider lowering send_credit_target = %d, or max_outstanding_rw_ops = %d\n", - smb_direct_send_credit_target, - smb_direct_max_outstanding_rw_ops); + pr_err("consider lowering send_credit_target = %d\n", + smb_direct_send_credit_target); pr_err("Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", device->attrs.max_cqe, device->attrs.max_qp_wr); return -EINVAL; @@ -1687,11 +1762,6 @@ static int smb_direct_init_params(struct smb_direct_transport *t, return -EINVAL; } - if (device->attrs.max_send_sge < SMB_DIRECT_MAX_SEND_SGES) { - pr_err("warning: device max_send_sge = %d too small\n", - device->attrs.max_send_sge); - return -EINVAL; - } if (device->attrs.max_recv_sge < SMB_DIRECT_MAX_RECV_SGES) { pr_err("warning: device max_recv_sge = %d too small\n", device->attrs.max_recv_sge); @@ -1707,7 +1777,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t, t->send_credit_target = smb_direct_send_credit_target; atomic_set(&t->send_credits, 0); - atomic_set(&t->rw_avail_ops, smb_direct_max_outstanding_rw_ops); + atomic_set(&t->rw_credits, t->max_rw_credits); t->max_send_size = smb_direct_max_send_size; t->max_recv_size = smb_direct_max_receive_size; @@ -1715,12 +1785,10 @@ static int smb_direct_init_params(struct smb_direct_transport *t, cap->max_send_wr = max_send_wrs; cap->max_recv_wr = t->recv_credit_max; - cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES; + cap->max_send_sge = max_sge_per_wr; cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES; cap->max_inline_data = 0; - cap->max_rdma_ctxs = - rdma_rw_mr_factor(device, t->cm_id->port_num, max_pages) * - smb_direct_max_outstanding_rw_ops; + cap->max_rdma_ctxs = t->max_rw_credits; return 0; } @@ -1813,7 +1881,8 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, } t->send_cq = ib_alloc_cq(t->cm_id->device, t, - t->send_credit_target, 0, IB_POLL_WORKQUEUE); + smb_direct_send_credit_target + cap->max_rdma_ctxs, + 0, IB_POLL_WORKQUEUE); if (IS_ERR(t->send_cq)) { pr_err("Can't create RDMA send CQ\n"); ret = PTR_ERR(t->send_cq); @@ -1822,8 +1891,7 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, } t->recv_cq = ib_alloc_cq(t->cm_id->device, t, - cap->max_send_wr + cap->max_rdma_ctxs, - 0, IB_POLL_WORKQUEUE); + t->recv_credit_max, 0, IB_POLL_WORKQUEUE); if (IS_ERR(t->recv_cq)) { pr_err("Can't create RDMA recv CQ\n"); ret = PTR_ERR(t->recv_cq); @@ -1852,17 +1920,12 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, pages_per_rw = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1; if (pages_per_rw > t->cm_id->device->attrs.max_sgl_rd) 
{ - int pages_per_mr, mr_count; - - pages_per_mr = min_t(int, pages_per_rw, - t->cm_id->device->attrs.max_fast_reg_page_list_len); - mr_count = DIV_ROUND_UP(pages_per_rw, pages_per_mr) * - atomic_read(&t->rw_avail_ops); - ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs, mr_count, - IB_MR_TYPE_MEM_REG, pages_per_mr, 0); + ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs, + t->max_rw_credits, IB_MR_TYPE_MEM_REG, + t->pages_per_rw_credit, 0); if (ret) { pr_err("failed to init mr pool count %d pages %d\n", - mr_count, pages_per_mr); + t->max_rw_credits, t->pages_per_rw_credit); goto err; } } diff --git a/fs/ksmbd/transport_rdma.h b/fs/ksmbd/transport_rdma.h index 5567d93a6f96..77aee4e5c9dc 100644 --- a/fs/ksmbd/transport_rdma.h +++ b/fs/ksmbd/transport_rdma.h @@ -7,6 +7,10 @@ #ifndef __KSMBD_TRANSPORT_RDMA_H__ #define __KSMBD_TRANSPORT_RDMA_H__ +#define SMBD_DEFAULT_IOSIZE (8 * 1024 * 1024) +#define SMBD_MIN_IOSIZE (512 * 1024) +#define SMBD_MAX_IOSIZE (16 * 1024 * 1024) + /* SMB DIRECT negotiation request packet [MS-SMBD] 2.2.1 */ struct smb_direct_negotiate_req { __le16 min_version; @@ -52,10 +56,14 @@ struct smb_direct_data_transfer { int ksmbd_rdma_init(void); void ksmbd_rdma_destroy(void); bool ksmbd_rdma_capable_netdev(struct net_device *netdev); +void init_smbd_max_io_size(unsigned int sz); +unsigned int get_smbd_max_read_write_size(void); #else static inline int ksmbd_rdma_init(void) { return 0; } static inline int ksmbd_rdma_destroy(void) { return 0; } static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { return false; } +static inline void init_smbd_max_io_size(unsigned int sz) { } +static inline unsigned int get_smbd_max_read_write_size(void) { return 0; } #endif #endif /* __KSMBD_TRANSPORT_RDMA_H__ */ diff --git a/fs/locks.c b/fs/locks.c index 8c6df10cd9ed..ca28e0e50e56 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -300,6 +300,34 @@ void locks_release_private(struct file_lock *fl) } EXPORT_SYMBOL_GPL(locks_release_private); +/** + * locks_owner_has_blockers - Check for blocking lock requests + * @flctx: file lock context + * @owner: lock owner + * + * Return values: + * %true: @owner has at least one blocker + * %false: @owner has no blockers + */ +bool locks_owner_has_blockers(struct file_lock_context *flctx, + fl_owner_t owner) +{ + struct file_lock *fl; + + spin_lock(&flctx->flc_lock); + list_for_each_entry(fl, &flctx->flc_posix, fl_list) { + if (fl->fl_owner != owner) + continue; + if (!list_empty(&fl->fl_blocked_requests)) { + spin_unlock(&flctx->flc_lock); + return true; + } + } + spin_unlock(&flctx->flc_lock); + return false; +} +EXPORT_SYMBOL_GPL(locks_owner_has_blockers); + /* Free a lock which is not in use. 
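locks_owner_has_blockers(), added above, answers one question: does any POSIX lock held by @owner still have waiters queued behind it? A simplified user-space model, with the flc_posix list and spinlock reduced to a plain singly linked list:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct file_lock {
	const void *owner;       /* stand-in for fl_owner */
	int nr_blocked;          /* stand-in for fl_blocked_requests */
	struct file_lock *next;  /* stand-in for the flc_posix list */
};

static bool owner_has_blockers(const struct file_lock *head, const void *owner)
{
	for (const struct file_lock *fl = head; fl; fl = fl->next) {
		if (fl->owner != owner)
			continue;
		if (fl->nr_blocked)
			return true;
	}
	return false;
}

int main(void)
{
	int a, b;
	struct file_lock l2 = { &b, 0, NULL };
	struct file_lock l1 = { &a, 2, &l2 };

	printf("a has blockers: %d, b: %d\n",
	       owner_has_blockers(&l1, &a), owner_has_blockers(&l1, &b));
	return 0;
}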
*/ void locks_free_lock(struct file_lock *fl) { @@ -874,6 +902,8 @@ posix_test_lock(struct file *filp, struct file_lock *fl) struct file_lock *cfl; struct file_lock_context *ctx; struct inode *inode = locks_inode(filp); + void *owner; + void (*func)(void); ctx = smp_load_acquire(&inode->i_flctx); if (!ctx || list_empty_careful(&ctx->flc_posix)) { @@ -881,12 +911,23 @@ posix_test_lock(struct file *filp, struct file_lock *fl) return; } +retry: spin_lock(&ctx->flc_lock); list_for_each_entry(cfl, &ctx->flc_posix, fl_list) { - if (posix_locks_conflict(fl, cfl)) { - locks_copy_conflock(fl, cfl); - goto out; + if (!posix_locks_conflict(fl, cfl)) + continue; + if (cfl->fl_lmops && cfl->fl_lmops->lm_lock_expirable + && (*cfl->fl_lmops->lm_lock_expirable)(cfl)) { + owner = cfl->fl_lmops->lm_mod_owner; + func = cfl->fl_lmops->lm_expire_lock; + __module_get(owner); + spin_unlock(&ctx->flc_lock); + (*func)(); + module_put(owner); + goto retry; } + locks_copy_conflock(fl, cfl); + goto out; } fl->fl_type = F_UNLCK; out: @@ -1060,6 +1101,8 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, int error; bool added = false; LIST_HEAD(dispose); + void *owner; + void (*func)(void); ctx = locks_get_lock_context(inode, request->fl_type); if (!ctx) @@ -1078,6 +1121,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, new_fl2 = locks_alloc_lock(); } +retry: percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); /* @@ -1089,6 +1133,17 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, list_for_each_entry(fl, &ctx->flc_posix, fl_list) { if (!posix_locks_conflict(request, fl)) continue; + if (fl->fl_lmops && fl->fl_lmops->lm_lock_expirable + && (*fl->fl_lmops->lm_lock_expirable)(fl)) { + owner = fl->fl_lmops->lm_mod_owner; + func = fl->fl_lmops->lm_expire_lock; + __module_get(owner); + spin_unlock(&ctx->flc_lock); + percpu_up_read(&file_rwsem); + (*func)(); + module_put(owner); + goto retry; + } if (conflock) locks_copy_conflock(conflock, fl); error = -EAGAIN; diff --git a/fs/namei.c b/fs/namei.c index 896ade8b7400..1f28d3f463c3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -730,13 +730,6 @@ static bool legitimize_links(struct nameidata *nd) static bool legitimize_root(struct nameidata *nd) { - /* - * For scoped-lookups (where nd->root has been zeroed), we need to - * restart the whole lookup from scratch -- because set_root() is wrong - * for these lookups (nd->dfd is the root, not the filesystem root). - */ - if (!nd->root.mnt && (nd->flags & LOOKUP_IS_SCOPED)) - return false; /* Nothing to do if nd->root is zero or is managed by the VFS user. */ if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET)) return true; @@ -798,7 +791,7 @@ out: * @seq: seq number to check @dentry against * Returns: true on success, false on failure * - * Similar to to try_to_unlazy(), but here we have the next dentry already + * Similar to try_to_unlazy(), but here we have the next dentry already * picked by rcu-walk and want to legitimize that in addition to the current * nd->path and nd->root for ref-walk mode. Must be called from rcu-walk context. 
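Both locks.c call sites above share one retry shape: on meeting an expirable conflict, pin the lock manager's module, drop flc_lock, run lm_expire_lock outside the lock, then rescan. A stripped-down sketch of that control flow, with the module and spinlock handling reduced to comments and the callbacks replaced by local stand-ins:

#include <stdbool.h>
#include <stdio.h>

static int expired_once;

/* stand-ins for lm_lock_expirable / lm_expire_lock */
static bool lock_expirable(void) { return !expired_once; }
static void expire_lock(void)    { expired_once = 1; puts("expired stale lock"); }

static void scan_for_conflicts(void)
{
retry:
	/* spin_lock(&ctx->flc_lock); walk flc_posix ... */
	if (lock_expirable()) {
		/* __module_get(lm_mod_owner); spin_unlock(&ctx->flc_lock); */
		expire_lock();  /* (*func)() runs without the spinlock held */
		/* module_put(lm_mod_owner); */
		goto retry;     /* the list may have changed: rescan */
	}
	/* spin_unlock(&ctx->flc_lock); */
	puts("no expirable conflicts left");
}

int main(void)
{
	scan_for_conflicts();
	return 0;
}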
* Nothing should touch nameidata between try_to_unlazy_next() failure and @@ -1032,7 +1025,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_symlinks", .data = &sysctl_protected_symlinks, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, @@ -1041,7 +1034,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_hardlinks", .data = &sysctl_protected_hardlinks, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, @@ -1050,7 +1043,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_fifos", .data = &sysctl_protected_fifos, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, @@ -1059,7 +1052,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_regular", .data = &sysctl_protected_regular, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, @@ -1755,7 +1748,7 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) // unlazy even if we fail to grab the link - cleanup needs it bool grabbed_link = legitimize_path(nd, link, seq); - if (!try_to_unlazy(nd) != 0 || !grabbed_link) + if (!try_to_unlazy(nd) || !grabbed_link) return -ECHILD; if (nd_alloc_stack(nd)) @@ -2769,7 +2762,8 @@ struct dentry *lookup_one(struct user_namespace *mnt_userns, const char *name, EXPORT_SYMBOL(lookup_one); /** - * lookup_one_len_unlocked - filesystem helper to lookup single pathname component + * lookup_one_unlocked - filesystem helper to lookup single pathname component + * @mnt_userns: idmapping of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to @@ -2780,14 +2774,15 @@ EXPORT_SYMBOL(lookup_one); * Unlike lookup_one_len, it should be called without the parent * i_mutex held, and will take the i_mutex itself if necessary. */ -struct dentry *lookup_one_len_unlocked(const char *name, - struct dentry *base, int len) +struct dentry *lookup_one_unlocked(struct user_namespace *mnt_userns, + const char *name, struct dentry *base, + int len) { struct qstr this; int err; struct dentry *ret; - err = lookup_one_common(&init_user_ns, name, base, len, &this); + err = lookup_one_common(mnt_userns, name, base, len, &this); if (err) return ERR_PTR(err); @@ -2796,6 +2791,59 @@ struct dentry *lookup_one_len_unlocked(const char *name, ret = lookup_slow(&this, base, 0); return ret; } +EXPORT_SYMBOL(lookup_one_unlocked); + +/** + * lookup_one_positive_unlocked - filesystem helper to lookup single + * pathname component + * @mnt_userns: idmapping of the mount the lookup is performed from + * @name: pathname component to lookup + * @base: base directory to lookup from + * @len: maximum length @len should be interpreted to + * + * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns + * known positive or ERR_PTR(). This is what most of the users want. + * + * Note that pinned negative with unlocked parent _can_ become positive at any + * time, so callers of lookup_one_unlocked() need to be very careful; pinned + * positives have >d_inode stable, so this one avoids such problems. 
+ * + * Note that this routine is purely a helper for filesystem usage and should + * not be called by generic code. + * + * The helper should be called without i_mutex held. + */ +struct dentry *lookup_one_positive_unlocked(struct user_namespace *mnt_userns, + const char *name, + struct dentry *base, int len) +{ + struct dentry *ret = lookup_one_unlocked(mnt_userns, name, base, len); + + if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { + dput(ret); + ret = ERR_PTR(-ENOENT); + } + return ret; +} +EXPORT_SYMBOL(lookup_one_positive_unlocked); + +/** + * lookup_one_len_unlocked - filesystem helper to lookup single pathname component + * @name: pathname component to lookup + * @base: base directory to lookup from + * @len: maximum length @len should be interpreted to + * + * Note that this routine is purely a helper for filesystem usage and should + * not be called by generic code. + * + * Unlike lookup_one_len, it should be called without the parent + * i_mutex held, and will take the i_mutex itself if necessary. + */ +struct dentry *lookup_one_len_unlocked(const char *name, + struct dentry *base, int len) +{ + return lookup_one_unlocked(&init_user_ns, name, base, len); +} EXPORT_SYMBOL(lookup_one_len_unlocked); /* @@ -2809,12 +2857,7 @@ EXPORT_SYMBOL(lookup_one_len_unlocked); struct dentry *lookup_positive_unlocked(const char *name, struct dentry *base, int len) { - struct dentry *ret = lookup_one_len_unlocked(name, base, len); - if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { - dput(ret); - ret = ERR_PTR(-ENOENT); - } - return ret; + return lookup_one_positive_unlocked(&init_user_ns, name, base, len); } EXPORT_SYMBOL(lookup_positive_unlocked); diff --git a/fs/namespace.c b/fs/namespace.c index 41461f55c039..e6a7e769d25d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1760,7 +1760,7 @@ out_unlock: /* * Is the caller allowed to modify his namespace? */ -static inline bool may_mount(void) +bool may_mount(void) { return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); } diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 8742d22dfd2b..42f892c5712e 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -155,7 +155,7 @@ static void netfs_rreq_expand(struct netfs_io_request *rreq, void netfs_readahead(struct readahead_control *ractl) { struct netfs_io_request *rreq; - struct netfs_i_context *ctx = netfs_i_context(ractl->mapping->host); + struct netfs_inode *ctx = netfs_inode(ractl->mapping->host); int ret; _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl)); @@ -215,7 +215,7 @@ int netfs_read_folio(struct file *file, struct folio *folio) { struct address_space *mapping = folio_file_mapping(folio); struct netfs_io_request *rreq; - struct netfs_i_context *ctx = netfs_i_context(mapping->host); + struct netfs_inode *ctx = netfs_inode(mapping->host); int ret; _enter("%lx", folio_index(folio)); @@ -297,6 +297,7 @@ zero_out: /** * netfs_write_begin - Helper to prepare for writing + * @ctx: The netfs context * @file: The file to read from * @mapping: The mapping to read from * @pos: File position at which the write will begin @@ -326,12 +327,12 @@ zero_out: * * This is usable whether or not caching is enabled. 
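lookup_one_positive_unlocked(), introduced above, is lookup_one_unlocked() plus a negative-dentry filter. A user-space analogue of the wrapper; struct dentry, lookup_one() and the NULL-for-error convention are stand-ins for the kernel's dentry and ERR_PTR machinery:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct dentry { int positive; };

/* stand-in for lookup_one_unlocked(): may hand back a negative dentry */
static struct dentry *lookup_one(const char *name)
{
	struct dentry *d = malloc(sizeof(*d));

	if (d)
		d->positive = (strcmp(name, "exists") == 0);
	return d;
}

/* stand-in for lookup_one_positive_unlocked() */
static struct dentry *lookup_one_positive(const char *name, int *err)
{
	struct dentry *d = lookup_one(name);

	if (d && !d->positive) {  /* d_flags_negative(...) check */
		free(d);          /* dput(ret) */
		*err = -ENOENT;
		d = NULL;         /* ERR_PTR(-ENOENT) in the kernel */
	}
	return d;
}

int main(void)
{
	int err = 0;
	struct dentry *d = lookup_one_positive("missing", &err);

	printf("missing -> d=%p err=%d\n", (void *)d, err);
	free(lookup_one_positive("exists", &err));
	return 0;
}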
*/ -int netfs_write_begin(struct file *file, struct address_space *mapping, +int netfs_write_begin(struct netfs_inode *ctx, + struct file *file, struct address_space *mapping, loff_t pos, unsigned int len, struct folio **_folio, void **_fsdata) { struct netfs_io_request *rreq; - struct netfs_i_context *ctx = netfs_i_context(file_inode(file )); struct folio *folio; unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; pgoff_t index = pos >> PAGE_SHIFT; diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index b7b0e3d18d9e..43fac1b14e40 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -91,7 +91,7 @@ static inline void netfs_stat_d(atomic_t *stat) /* * Miscellaneous functions. */ -static inline bool netfs_is_cache_enabled(struct netfs_i_context *ctx) +static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx) { #if IS_ENABLED(CONFIG_FSCACHE) struct fscache_cookie *cookie = ctx->cache; diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index e86107b30ba4..e17cdf53f6a7 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -18,7 +18,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, { static atomic_t debug_ids; struct inode *inode = file ? file_inode(file) : mapping->host; - struct netfs_i_context *ctx = netfs_i_context(inode); + struct netfs_inode *ctx = netfs_inode(inode); struct netfs_io_request *rreq; int ret; @@ -75,10 +75,10 @@ static void netfs_free_request(struct work_struct *work) struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work); - netfs_clear_subrequests(rreq, false); - if (rreq->netfs_priv) - rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv); trace_netfs_rreq(rreq, netfs_rreq_trace_free); + netfs_clear_subrequests(rreq, false); + if (rreq->netfs_ops->free_request) + rreq->netfs_ops->free_request(rreq); if (rreq->cache_resources.ops) rreq->cache_resources.ops->end_operation(&rreq->cache_resources); kfree(rreq); diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index c8520284dda7..c1eda73254e1 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -288,6 +288,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, rv = NFS4_OK; break; case -ENOENT: + set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags); /* Embrace your forgetfulness! */ rv = NFS4ERR_NOMATCHING_LAYOUT; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a8ecdd527662..0c4e8dd6aa96 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2124,6 +2124,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, } goto out; } + file->f_mode |= FMODE_CAN_ODIRECT; err = nfs_finish_open(ctx, ctx->dentry, file, open_flags); trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 11c566d8769f..4eb2a8380a28 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -153,28 +153,25 @@ nfs_direct_count_bytes(struct nfs_direct_req *dreq, } /** - * nfs_direct_IO - NFS address space operation for direct I/O + * nfs_swap_rw - NFS address space operation for swap I/O * @iocb: target I/O control block * @iter: I/O buffer * - * The presence of this routine in the address space ops vector means - * the NFS client supports direct I/O. However, for most direct IO, we - * shunt off direct read and write requests before the VFS gets them, - * so this method is only ever called for swap. + * Perform IO to the swap-file. This is much like direct IO. 
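The rewritten helper below returns 0 on success rather than a byte count, which is what the swap path expects. A toy model of that contract; direct_read()/direct_write() are placeholders for nfs_file_direct_read()/nfs_file_direct_write():

#include <stdio.h>

/* placeholders: return a byte count on success or -errno on failure */
static long direct_read(void)  { return 4096; }
static long direct_write(void) { return -5; /* -EIO, say */ }

static int swap_rw(int is_read)
{
	long ret = is_read ? direct_read() : direct_write();

	if (ret < 0)
		return (int)ret; /* errors still propagate */
	return 0;                /* success: the byte count is dropped */
}

int main(void)
{
	printf("read: %d, write: %d\n", swap_rw(1), swap_rw(0));
	return 0;
}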
*/ -ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter) { - struct inode *inode = iocb->ki_filp->f_mapping->host; - - /* we only support swap file calling nfs_direct_IO */ - if (!IS_SWAPFILE(inode)) - return 0; + ssize_t ret; VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); if (iov_iter_rw(iter) == READ) - return nfs_file_direct_read(iocb, iter, true); - return nfs_file_direct_write(iocb, iter, true); + ret = nfs_file_direct_read(iocb, iter, true); + else + ret = nfs_file_direct_write(iocb, iter, true); + if (ret < 0) + return ret; + return 0; } static void nfs_direct_release_pages(struct page **pages, unsigned int npages) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d764b3ce7905..2d72b1b7ed74 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -69,6 +69,8 @@ nfs_file_open(struct inode *inode, struct file *filp) return res; res = nfs_open(inode, filp); + if (res == 0) + filp->f_mode |= FMODE_CAN_ODIRECT; return res; } @@ -204,15 +206,16 @@ static int nfs_file_fsync_commit(struct file *file, int datasync) { struct inode *inode = file_inode(file); - int ret; + int ret, ret2; dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); ret = nfs_commit_inode(inode, FLUSH_SYNC); - if (ret < 0) - return ret; - return file_check_and_advance_wb_err(file); + ret2 = file_check_and_advance_wb_err(file); + if (ret2 < 0) + return ret2; + return ret; } int @@ -385,11 +388,8 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, return status; NFS_I(mapping->host)->write_io += copied; - if (nfs_ctx_key_to_expire(ctx, mapping->host)) { - status = nfs_wb_all(mapping->host); - if (status < 0) - return status; - } + if (nfs_ctx_key_to_expire(ctx, mapping->host)) + nfs_wb_all(mapping->host); return copied; } @@ -480,6 +480,7 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, { unsigned long blocks; long long isize; + int ret; struct inode *inode = file_inode(file); struct rpc_clnt *clnt = NFS_CLIENT(inode); struct nfs_client *cl = NFS_SERVER(inode)->nfs_client; @@ -493,13 +494,22 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, return -EINVAL; } - *span = sis->pages; + ret = rpc_clnt_swap_activate(clnt); + if (ret) + return ret; + ret = add_swap_extent(sis, 0, sis->max, 0); + if (ret < 0) { + rpc_clnt_swap_deactivate(clnt); + return ret; + } + *span = sis->pages; if (cl->rpc_ops->enable_swap) cl->rpc_ops->enable_swap(inode); - return rpc_clnt_swap_activate(clnt); + sis->flags |= SWP_FS_OPS; + return ret; } static void nfs_swap_deactivate(struct file *file) @@ -523,7 +533,6 @@ const struct address_space_operations nfs_file_aops = { .write_end = nfs_write_end, .invalidate_folio = nfs_invalidate_folio, .release_folio = nfs_release_folio, - .direct_IO = nfs_direct_IO, #ifdef CONFIG_MIGRATION .migratepage = nfs_migrate_page, #endif @@ -532,6 +541,7 @@ const struct address_space_operations nfs_file_aops = { .error_remove_page = generic_error_remove_page, .swap_activate = nfs_swap_activate, .swap_deactivate = nfs_swap_deactivate, + .swap_rw = nfs_swap_rw, }; /* @@ -594,18 +604,6 @@ static const struct vm_operations_struct nfs_file_vm_ops = { .page_mkwrite = nfs_vm_page_mkwrite, }; -static int nfs_need_check_write(struct file *filp, struct inode *inode, - int error) -{ - struct nfs_open_context *ctx; - - ctx = nfs_file_open_context(filp); - if (nfs_error_is_fatal_on_server(error) || - nfs_ctx_key_to_expire(ctx, inode)) - 
return 1; - return 0; -} - ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; @@ -633,7 +631,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_APPEND || iocb->ki_pos > i_size_read(inode)) { result = nfs_revalidate_file_size(inode, file); if (result) - goto out; + return result; } nfs_clear_invalid_mapping(file->f_mapping); @@ -652,6 +650,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) written = result; iocb->ki_pos += written; + nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); if (mntflags & NFS_MOUNT_WRITE_EAGER) { result = filemap_fdatawrite_range(file->f_mapping, @@ -669,17 +668,22 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) } result = generic_write_sync(iocb, written); if (result < 0) - goto out; + return result; +out: /* Return error values */ error = filemap_check_wb_err(file->f_mapping, since); - if (nfs_need_check_write(file, inode, error)) { - int err = nfs_wb_all(inode); - if (err < 0) - result = err; + switch (error) { + default: + break; + case -EDQUOT: + case -EFBIG: + case -ENOSPC: + nfs_wb_all(inode); + error = file_check_and_advance_wb_err(file); + if (error < 0) + result = error; } - nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); -out: return result; out_swapfile: diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 76deddab0a8f..2b2661582bbe 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -839,7 +839,12 @@ fl_pnfs_update_layout(struct inode *ino, lseg = pnfs_update_layout(ino, ctx, pos, count, iomode, strict_iomode, gfp_flags); - if (IS_ERR_OR_NULL(lseg)) + if (IS_ERR(lseg)) { + /* Fall back to MDS on recoverable errors */ + if (!nfs_error_is_fatal_on_server(PTR_ERR(lseg))) + lseg = NULL; + goto out; + } else if (!lseg) goto out; lo = NFS_I(ino)->layout; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index f73c09a9cf0a..e861d7bae305 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -231,11 +231,10 @@ void nfs_fscache_release_file(struct inode *inode, struct file *filp) { struct nfs_fscache_inode_auxdata auxdata; struct fscache_cookie *cookie = nfs_i_fscache(inode); + loff_t i_size = i_size_read(inode); - if (fscache_cookie_valid(cookie)) { - nfs_fscache_update_auxdata(&auxdata, inode); - fscache_unuse_cookie(cookie, &auxdata, NULL); - } + nfs_fscache_update_auxdata(&auxdata, inode); + fscache_unuse_cookie(cookie, &auxdata, &i_size); } /* diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7eefa16ed381..8f8cd6e2d4db 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -841,6 +841,7 @@ static inline bool nfs_error_is_fatal_on_server(int err) case 0: case -ERESTARTSYS: case -EINTR: + case -ENOMEM: return false; } return nfs_error_is_fatal(err); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 7b861e4f0533..e88f6b18445e 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -93,6 +93,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) nfs_file_set_open_context(filp, ctx); nfs_fscache_open_file(inode, filp); err = 0; + filp->f_mode |= FMODE_CAN_ODIRECT; out_put_ctx: put_nfs_open_context(ctx); @@ -328,7 +329,7 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt, char *read_name = NULL; int len, status = 0; - server = NFS_SERVER(ss_mnt->mnt_root->d_inode); + server = NFS_SB(ss_mnt->mnt_sb); if (!fattr) return ERR_PTR(-ENOMEM); @@ -346,7 +347,7 @@ static struct file *__nfs42_ssc_open(struct vfsmount 
*ss_mnt, goto out; snprintf(read_name, len, SSC_READ_NAME_BODY, read_name_gen++); - r_ino = nfs_fhget(ss_mnt->mnt_root->d_inode->i_sb, src_fh, fattr); + r_ino = nfs_fhget(ss_mnt->mnt_sb, src_fh, fattr); if (IS_ERR(r_ino)) { res = ERR_CAST(r_ino); goto out_free_name; diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 3680c8da510c..f2dbf904c598 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -417,6 +417,9 @@ static int nfs_do_refmount(struct fs_context *fc, struct rpc_clnt *client) fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); if (!fs_locations) goto out_free; + fs_locations->fattr = nfs_alloc_fattr(); + if (!fs_locations->fattr) + goto out_free_2; /* Get locations */ dentry = ctx->clone_data.dentry; @@ -427,14 +430,16 @@ static int nfs_do_refmount(struct fs_context *fc, struct rpc_clnt *client) err = nfs4_proc_fs_locations(client, d_inode(parent), &dentry->d_name, fs_locations, page); dput(parent); if (err != 0) - goto out_free_2; + goto out_free_3; err = -ENOENT; if (fs_locations->nlocations <= 0 || fs_locations->fs_path.ncomponents <= 0) - goto out_free_2; + goto out_free_3; err = nfs_follow_referral(fc, fs_locations); +out_free_3: + kfree(fs_locations->fattr); out_free_2: kfree(fs_locations); out_free: diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a79f66432bd3..c0fdcf8c0032 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1162,7 +1162,7 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, { unsigned short task_flags = 0; - if (server->nfs_client->cl_minorversion) + if (server->caps & NFS_CAP_MOVEABLE) task_flags = RPC_TASK_MOVEABLE; return nfs4_do_call_sync(clnt, server, msg, args, res, task_flags); } @@ -2568,7 +2568,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, }; int status; - if (server->nfs_client->cl_minorversion) + if (nfs_server_capable(dir, NFS_CAP_MOVEABLE)) task_setup_data.flags |= RPC_TASK_MOVEABLE; kref_get(&data->kref); @@ -3098,6 +3098,10 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, } out: + if (opendata->lgp) { + nfs4_lgopen_release(opendata->lgp); + opendata->lgp = NULL; + } if (!opendata->cancelled) nfs4_sequence_free_slot(&opendata->o_res.seq_res); return ret; @@ -3733,7 +3737,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) }; int status = -ENOMEM; - if (server->nfs_client->cl_minorversion) + if (nfs_server_capable(state->inode, NFS_CAP_MOVEABLE)) task_setup_data.flags |= RPC_TASK_MOVEABLE; nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP, @@ -4243,6 +4247,8 @@ static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir, if (locations == NULL) goto out; + locations->fattr = fattr; + status = nfs4_proc_fs_locations(client, dir, name, locations, page); if (status != 0) goto out; @@ -4252,17 +4258,14 @@ static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir, * referral. Cause us to drop into the exception handler, which * will kick off migration recovery. 
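nfs_do_refmount() above now allocates fs_locations->fattr separately (the embedded struct became a pointer), which adds one unwind level to the error path. The goto ordering, reduced to a user-space sketch with malloc() standing in for nfs_alloc_fattr():

#include <stdio.h>
#include <stdlib.h>

struct fattr { int v; };
struct fs_locations { struct fattr *fattr; };

static int do_refmount_like(void)
{
	int err = -12; /* -ENOMEM */
	struct fs_locations *loc = malloc(sizeof(*loc));

	if (!loc)
		goto out;
	loc->fattr = malloc(sizeof(*loc->fattr)); /* nfs_alloc_fattr() */
	if (!loc->fattr)
		goto out_free_2;

	err = 0; /* ... fetch and follow the referral here ... */

	free(loc->fattr); /* out_free_3 */
out_free_2:
	free(loc);
out:
	return err;
}

int main(void)
{
	printf("%d\n", do_refmount_like());
	return 0;
}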
*/ - if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) { + if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &fattr->fsid)) { dprintk("%s: server did not return a different fsid for" " a referral at %s\n", __func__, name->name); status = -NFS4ERR_MOVED; goto out; } /* Fixup attributes for the nfs_lookup() call to nfs_fhget() */ - nfs_fixup_referral_attributes(&locations->fattr); - - /* replace the lookup nfs_fattr with the locations nfs_fattr */ - memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr)); + nfs_fixup_referral_attributes(fattr); memset(fhandle, 0, sizeof(struct nfs_fh)); out: if (page) @@ -4404,7 +4407,7 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, }; unsigned short task_flags = 0; - if (server->nfs_client->cl_minorversion) + if (nfs_server_capable(dir, NFS_CAP_MOVEABLE)) task_flags = RPC_TASK_MOVEABLE; /* Is this is an attribute revalidation, subject to softreval? */ @@ -5768,9 +5771,17 @@ static int nfs4_proc_renew(struct nfs_client *clp, const struct cred *cred) return 0; } -static inline int nfs4_server_supports_acls(struct nfs_server *server) +static bool nfs4_server_supports_acls(const struct nfs_server *server, + enum nfs4_acl_type type) { - return server->caps & NFS_CAP_ACLS; + switch (type) { + default: + return server->attr_bitmask[0] & FATTR4_WORD0_ACL; + case NFS4ACL_DACL: + return server->attr_bitmask[1] & FATTR4_WORD1_DACL; + case NFS4ACL_SACL: + return server->attr_bitmask[1] & FATTR4_WORD1_SACL; + } } /* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_SIZE, and that @@ -5809,6 +5820,7 @@ unwind: } struct nfs4_cached_acl { + enum nfs4_acl_type type; int cached; size_t len; char data[]; @@ -5829,7 +5841,8 @@ static void nfs4_zap_acl_attr(struct inode *inode) nfs4_set_cached_acl(inode, NULL); } -static inline ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, size_t buflen) +static ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, + size_t buflen, enum nfs4_acl_type type) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs4_cached_acl *acl; @@ -5839,6 +5852,8 @@ static inline ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, size_ acl = nfsi->nfs4_acl; if (acl == NULL) goto out; + if (acl->type != type) + goto out; if (buf == NULL) /* user is just asking for length */ goto out_len; if (acl->cached == 0) @@ -5854,7 +5869,9 @@ out: return ret; } -static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, size_t pgbase, size_t acl_len) +static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, + size_t pgbase, size_t acl_len, + enum nfs4_acl_type type) { struct nfs4_cached_acl *acl; size_t buflen = sizeof(*acl) + acl_len; @@ -5871,6 +5888,7 @@ static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, size goto out; acl->cached = 0; } + acl->type = type; acl->len = acl_len; out: nfs4_set_cached_acl(inode, acl); @@ -5886,14 +5904,17 @@ out: * length. The next getxattr call will then produce another round trip to * the server, this time with the input buf of the required size. 
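nfs4_server_supports_acls() above picks a different attr_bitmask word per ACL type, and nfs4_acltype_to_bitmap() in the xdr changes further down does the matching encode. A compact sketch of that type-to-bitmap mapping; the bit values are invented here, not the real FATTR4 constants:

#include <stdio.h>

enum acl_type { ACL, DACL, SACL };

#define WORD0_ACL  (1u << 0) /* illustrative, not FATTR4_WORD0_ACL */
#define WORD1_DACL (1u << 1)
#define WORD1_SACL (1u << 2)

static void acltype_to_bitmap(enum acl_type t, unsigned int bm[2])
{
	switch (t) {
	default:   bm[0] = WORD0_ACL; bm[1] = 0;          break;
	case DACL: bm[0] = 0;         bm[1] = WORD1_DACL; break;
	case SACL: bm[0] = 0;         bm[1] = WORD1_SACL; break;
	}
}

int main(void)
{
	unsigned int bm[2];

	for (int t = ACL; t <= SACL; t++) {
		acltype_to_bitmap((enum acl_type)t, bm);
		printf("type %d -> [%#x,%#x]\n", t, bm[0], bm[1]);
	}
	return 0;
}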
*/ -static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) +static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, + size_t buflen, enum nfs4_acl_type type) { struct page **pages; struct nfs_getaclargs args = { .fh = NFS_FH(inode), + .acl_type = type, .acl_len = buflen, }; struct nfs_getaclres res = { + .acl_type = type, .acl_len = buflen, }; struct rpc_message msg = { @@ -5943,7 +5964,8 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu ret = -ERANGE; goto out_free; } - nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len); + nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len, + type); if (buf) { if (res.acl_len > buflen) { ret = -ERANGE; @@ -5963,14 +5985,15 @@ out_free: return ret; } -static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) +static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, + size_t buflen, enum nfs4_acl_type type) { struct nfs4_exception exception = { .interruptible = true, }; ssize_t ret; do { - ret = __nfs4_get_acl_uncached(inode, buf, buflen); + ret = __nfs4_get_acl_uncached(inode, buf, buflen, type); trace_nfs4_get_acl(inode, ret); if (ret >= 0) break; @@ -5979,34 +6002,37 @@ static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bufl return ret; } -static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) +static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen, + enum nfs4_acl_type type) { struct nfs_server *server = NFS_SERVER(inode); int ret; - if (!nfs4_server_supports_acls(server)) + if (!nfs4_server_supports_acls(server, type)) return -EOPNOTSUPP; ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); if (ret < 0) return ret; if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) nfs_zap_acl_cache(inode); - ret = nfs4_read_cached_acl(inode, buf, buflen); + ret = nfs4_read_cached_acl(inode, buf, buflen, type); if (ret != -ENOENT) /* -ENOENT is returned if there is no ACL or if there is an ACL * but no cached acl data, just the acl length */ return ret; - return nfs4_get_acl_uncached(inode, buf, buflen); + return nfs4_get_acl_uncached(inode, buf, buflen, type); } -static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) +static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, + size_t buflen, enum nfs4_acl_type type) { struct nfs_server *server = NFS_SERVER(inode); struct page *pages[NFS4ACL_MAXPAGES]; struct nfs_setaclargs arg = { - .fh = NFS_FH(inode), - .acl_pages = pages, - .acl_len = buflen, + .fh = NFS_FH(inode), + .acl_type = type, + .acl_len = buflen, + .acl_pages = pages, }; struct nfs_setaclres res; struct rpc_message msg = { @@ -6020,7 +6046,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl /* You can't remove system.nfs4_acl: */ if (buflen == 0) return -EINVAL; - if (!nfs4_server_supports_acls(server)) + if (!nfs4_server_supports_acls(server, type)) return -EOPNOTSUPP; if (npages > ARRAY_SIZE(pages)) return -ERANGE; @@ -6051,12 +6077,13 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl return ret; } -static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) +static int nfs4_proc_set_acl(struct inode *inode, const void *buf, + size_t buflen, enum nfs4_acl_type type) { struct nfs4_exception exception = { }; int err; do { - err = __nfs4_proc_set_acl(inode, buf, buflen); + err = 
__nfs4_proc_set_acl(inode, buf, buflen, type); trace_nfs4_set_acl(inode, err); if (err == -NFS4ERR_BADOWNER || err == -NFS4ERR_BADNAME) { /* @@ -6612,10 +6639,13 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, .rpc_client = server->client, .rpc_message = &msg, .callback_ops = &nfs4_delegreturn_ops, - .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT | RPC_TASK_MOVEABLE, + .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, }; int status = 0; + if (nfs_server_capable(inode, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + data = kzalloc(sizeof(*data), GFP_KERNEL); if (data == NULL) return -ENOMEM; @@ -6929,10 +6959,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; - struct nfs_client *client = - NFS_SERVER(lsp->ls_state->inode)->nfs_client; - if (client->cl_minorversion) + if (nfs_server_capable(lsp->ls_state->inode, NFS_CAP_MOVEABLE)) task_setup_data.flags |= RPC_TASK_MOVEABLE; nfs4_state_protect(NFS_SERVER(lsp->ls_state->inode)->nfs_client, @@ -7203,9 +7231,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; int ret; - struct nfs_client *client = NFS_SERVER(state->inode)->nfs_client; - if (client->cl_minorversion) + if (nfs_server_capable(state->inode, NFS_CAP_MOVEABLE)) task_setup_data.flags |= RPC_TASK_MOVEABLE; data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), @@ -7655,21 +7682,70 @@ static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler, const char *key, const void *buf, size_t buflen, int flags) { - return nfs4_proc_set_acl(inode, buf, buflen); + return nfs4_proc_set_acl(inode, buf, buflen, NFS4ACL_ACL); } static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *key, void *buf, size_t buflen) { - return nfs4_proc_get_acl(inode, buf, buflen); + return nfs4_proc_get_acl(inode, buf, buflen, NFS4ACL_ACL); } static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry) { - return nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry))); + return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_ACL); +} + +#if defined(CONFIG_NFS_V4_1) +#define XATTR_NAME_NFSV4_DACL "system.nfs4_dacl" + +static int nfs4_xattr_set_nfs4_dacl(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) +{ + return nfs4_proc_set_acl(inode, buf, buflen, NFS4ACL_DACL); +} + +static int nfs4_xattr_get_nfs4_dacl(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *key, void *buf, size_t buflen) +{ + return nfs4_proc_get_acl(inode, buf, buflen, NFS4ACL_DACL); +} + +static bool nfs4_xattr_list_nfs4_dacl(struct dentry *dentry) +{ + return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_DACL); +} + +#define XATTR_NAME_NFSV4_SACL "system.nfs4_sacl" + +static int nfs4_xattr_set_nfs4_sacl(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) +{ + return nfs4_proc_set_acl(inode, buf, buflen, NFS4ACL_SACL); +} + +static int nfs4_xattr_get_nfs4_sacl(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *key, void *buf, size_t buflen) +{ + return nfs4_proc_get_acl(inode, buf, buflen, 
NFS4ACL_SACL); } +static bool nfs4_xattr_list_nfs4_sacl(struct dentry *dentry) +{ + return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_SACL); +} + +#endif + #ifdef CONFIG_NFS_V4_SECURITY_LABEL static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler, @@ -7902,7 +7978,7 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, else bitmask[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; - nfs_fattr_init(&fs_locations->fattr); + nfs_fattr_init(fs_locations->fattr); fs_locations->server = server; fs_locations->nlocations = 0; status = nfs4_call_sync(client, server, &msg, &args.seq_args, &res.seq_res, 0); @@ -7967,7 +8043,7 @@ static int _nfs40_proc_get_locations(struct nfs_server *server, unsigned long now = jiffies; int status; - nfs_fattr_init(&locations->fattr); + nfs_fattr_init(locations->fattr); locations->server = server; locations->nlocations = 0; @@ -8032,7 +8108,7 @@ static int _nfs41_proc_get_locations(struct nfs_server *server, }; int status; - nfs_fattr_init(&locations->fattr); + nfs_fattr_init(locations->fattr); locations->server = server; locations->nlocations = 0; @@ -10391,7 +10467,8 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1 - | NFS_CAP_LGOPEN, + | NFS_CAP_LGOPEN + | NFS_CAP_MOVEABLE, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, @@ -10426,7 +10503,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_LAYOUTSTATS | NFS_CAP_CLONE | NFS_CAP_LAYOUTERROR - | NFS_CAP_READ_PLUS, + | NFS_CAP_READ_PLUS + | NFS_CAP_MOVEABLE, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, @@ -10587,6 +10665,22 @@ static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { .set = nfs4_xattr_set_nfs4_acl, }; +#if defined(CONFIG_NFS_V4_1) +static const struct xattr_handler nfs4_xattr_nfs4_dacl_handler = { + .name = XATTR_NAME_NFSV4_DACL, + .list = nfs4_xattr_list_nfs4_dacl, + .get = nfs4_xattr_get_nfs4_dacl, + .set = nfs4_xattr_set_nfs4_dacl, +}; + +static const struct xattr_handler nfs4_xattr_nfs4_sacl_handler = { + .name = XATTR_NAME_NFSV4_SACL, + .list = nfs4_xattr_list_nfs4_sacl, + .get = nfs4_xattr_get_nfs4_sacl, + .set = nfs4_xattr_set_nfs4_sacl, +}; +#endif + #ifdef CONFIG_NFS_V4_2 static const struct xattr_handler nfs4_xattr_nfs4_user_handler = { .prefix = XATTR_USER_PREFIX, @@ -10597,6 +10691,10 @@ static const struct xattr_handler nfs4_xattr_nfs4_user_handler = { const struct xattr_handler *nfs4_xattr_handlers[] = { &nfs4_xattr_nfs4_acl_handler, +#if defined(CONFIG_NFS_V4_1) + &nfs4_xattr_nfs4_dacl_handler, + &nfs4_xattr_nfs4_sacl_handler, +#endif #ifdef CONFIG_NFS_V4_SECURITY_LABEL &nfs4_xattr_nfs4_label_handler, #endif diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 9e1c987c81e7..2540b35ec187 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1602,7 +1602,8 @@ static inline void nfs42_complete_copies(struct nfs4_state_owner *sp, #endif /* CONFIG_NFS_V4_2 */ static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_state *state, - const struct nfs4_state_recovery_ops *ops) + const struct nfs4_state_recovery_ops *ops, + int *lost_locks) { struct nfs4_lock_state *lock; int status; @@ -1620,7 +1621,7 @@ static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_st list_for_each_entry(lock, &state->lock_states, ls_locks) { 
trace_nfs4_state_lock_reclaim(state, lock); if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) - pr_warn_ratelimited("NFS: %s: Lock reclaim failed!\n", __func__); + *lost_locks += 1; } spin_unlock(&state->state_lock); } @@ -1630,7 +1631,9 @@ static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_st return status; } -static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs4_state_recovery_ops *ops) +static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, + const struct nfs4_state_recovery_ops *ops, + int *lost_locks) { struct nfs4_state *state; unsigned int loop = 0; @@ -1666,7 +1669,7 @@ restart: #endif /* CONFIG_NFS_V4_2 */ refcount_inc(&state->count); spin_unlock(&sp->so_lock); - status = __nfs4_reclaim_open_state(sp, state, ops); + status = __nfs4_reclaim_open_state(sp, state, ops, lost_locks); switch (status) { default: @@ -1909,6 +1912,7 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov struct rb_node *pos; LIST_HEAD(freeme); int status = 0; + int lost_locks = 0; restart: rcu_read_lock(); @@ -1928,8 +1932,11 @@ restart: spin_unlock(&clp->cl_lock); rcu_read_unlock(); - status = nfs4_reclaim_open_state(sp, ops); + status = nfs4_reclaim_open_state(sp, ops, &lost_locks); if (status < 0) { + if (lost_locks) + pr_warn("NFS: %s: lost %d locks\n", + clp->cl_hostname, lost_locks); set_bit(ops->owner_flag_bit, &sp->so_flags); nfs4_put_state_owner(sp); status = nfs4_recovery_handle_error(clp, status); @@ -1943,6 +1950,9 @@ restart: } rcu_read_unlock(); nfs4_free_state_owners(&freeme); + if (lost_locks) + pr_warn("NFS: %s: lost %d locks\n", + clp->cl_hostname, lost_locks); return 0; } @@ -2106,6 +2116,11 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred dprintk("<-- %s: no memory\n", __func__); goto out; } + locations->fattr = nfs_alloc_fattr(); + if (locations->fattr == NULL) { + dprintk("<-- %s: no memory\n", __func__); + goto out; + } inode = d_inode(server->super->s_root); result = nfs4_proc_get_locations(server, NFS_FH(inode), locations, @@ -2120,7 +2135,7 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred if (!locations->nlocations) goto out; - if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) { + if (!(locations->fattr->valid & NFS_ATTR_FATTR_V4_LOCATIONS)) { dprintk("<-- %s: No fs_locations data, migration skipped\n", __func__); goto out; @@ -2145,6 +2160,8 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred out: if (page != NULL) __free_page(page); + if (locations != NULL) + kfree(locations->fattr); kfree(locations); if (result) { pr_err("NFS: migration recovery failed (server %s)\n", diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 86a5f6516928..acfe5f4bda48 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1680,19 +1680,35 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr) encode_op_hdr(xdr, OP_RESTOREFH, decode_restorefh_maxsz, hdr); } -static void -encode_setacl(struct xdr_stream *xdr, const struct nfs_setaclargs *arg, - struct compound_hdr *hdr) +static void nfs4_acltype_to_bitmap(enum nfs4_acl_type type, __u32 bitmap[2]) { - __be32 *p; + switch (type) { + default: + bitmap[0] = FATTR4_WORD0_ACL; + bitmap[1] = 0; + break; + case NFS4ACL_DACL: + bitmap[0] = 0; + bitmap[1] = FATTR4_WORD1_DACL; + break; + case NFS4ACL_SACL: + bitmap[0] = 0; + bitmap[1] = FATTR4_WORD1_SACL; + } +} + +static void encode_setacl(struct xdr_stream *xdr, + const struct 
nfs_setaclargs *arg, + struct compound_hdr *hdr) +{ + __u32 bitmap[2]; + + nfs4_acltype_to_bitmap(arg->acl_type, bitmap); encode_op_hdr(xdr, OP_SETATTR, decode_setacl_maxsz, hdr); encode_nfs4_stateid(xdr, &zero_stateid); - p = reserve_space(xdr, 2*4); - *p++ = cpu_to_be32(1); - *p = cpu_to_be32(FATTR4_WORD0_ACL); - p = reserve_space(xdr, 4); - *p = cpu_to_be32(arg->acl_len); + xdr_encode_bitmap4(xdr, bitmap, ARRAY_SIZE(bitmap)); + encode_uint32(xdr, arg->acl_len); xdr_write_pages(xdr, arg->acl_pages, 0, arg->acl_len); } @@ -2587,11 +2603,11 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - const __u32 nfs4_acl_bitmap[1] = { - [0] = FATTR4_WORD0_ACL, - }; + __u32 nfs4_acl_bitmap[2]; uint32_t replen; + nfs4_acltype_to_bitmap(args->acl_type, nfs4_acl_bitmap); + encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); @@ -5386,7 +5402,7 @@ decode_restorefh(struct xdr_stream *xdr) } static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, - struct nfs_getaclres *res) + struct nfs_getaclres *res, enum nfs4_acl_type type) { unsigned int savep; uint32_t attrlen, @@ -5404,26 +5420,39 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) goto out; - if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U))) - return -EIO; - if (likely(bitmap[0] & FATTR4_WORD0_ACL)) { - - /* The bitmap (xdr len + bitmaps) and the attr xdr len words - * are stored with the acl data to handle the problem of - * variable length bitmaps.*/ - res->acl_data_offset = xdr_page_pos(xdr); - res->acl_len = attrlen; - - /* Check for receive buffer overflow */ - if (res->acl_len > xdr_stream_remaining(xdr) || - res->acl_len + res->acl_data_offset > xdr->buf->page_len) { - res->acl_flags |= NFS4_ACL_TRUNC; - dprintk("NFS: acl reply: attrlen %u > page_len %zu\n", - attrlen, xdr_stream_remaining(xdr)); - } - } else - status = -EOPNOTSUPP; + switch (type) { + default: + if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U))) + return -EIO; + if (!(bitmap[0] & FATTR4_WORD0_ACL)) + return -EOPNOTSUPP; + break; + case NFS4ACL_DACL: + if (unlikely(bitmap[0] || bitmap[1] & (FATTR4_WORD1_DACL - 1U))) + return -EIO; + if (!(bitmap[1] & FATTR4_WORD1_DACL)) + return -EOPNOTSUPP; + break; + case NFS4ACL_SACL: + if (unlikely(bitmap[0] || bitmap[1] & (FATTR4_WORD1_SACL - 1U))) + return -EIO; + if (!(bitmap[1] & FATTR4_WORD1_SACL)) + return -EOPNOTSUPP; + } + /* The bitmap (xdr len + bitmaps) and the attr xdr len words + * are stored with the acl data to handle the problem of + * variable length bitmaps.*/ + res->acl_data_offset = xdr_page_pos(xdr); + res->acl_len = attrlen; + + /* Check for receive buffer overflow */ + if (res->acl_len > xdr_stream_remaining(xdr) || + res->acl_len + res->acl_data_offset > xdr->buf->page_len) { + res->acl_flags |= NFS4_ACL_TRUNC; + dprintk("NFS: acl reply: attrlen %u > page_len %zu\n", + attrlen, xdr_stream_remaining(xdr)); + } out: return status; } @@ -6486,7 +6515,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_putfh(xdr); if (status) goto out; - status = decode_getacl(xdr, rqstp, res); + status = decode_getacl(xdr, rqstp, res, res->acl_type); out: return status; @@ -7051,7 +7080,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, if (res->migration) { xdr_enter_page(xdr, PAGE_SIZE); status = 
decode_getfattr_generic(xdr, - &res->fs_locations->fattr, + res->fs_locations->fattr, NULL, res->fs_locations, res->fs_locations->server); if (status) @@ -7064,7 +7093,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, goto out; xdr_enter_page(xdr, PAGE_SIZE); status = decode_getfattr_generic(xdr, - &res->fs_locations->fattr, + res->fs_locations->fattr, NULL, res->fs_locations, res->fs_locations->server); } diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 9157dd19b8b4..317cedfa52bf 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -767,6 +767,9 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, .flags = RPC_TASK_ASYNC | flags, }; + if (nfs_server_capable(hdr->inode, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how); dprintk("NFS: initiated pgio call " diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 856c962273c7..41a9b6b58fb9 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -469,6 +469,7 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, pnfs_clear_lseg_state(lseg, lseg_list); pnfs_clear_layoutreturn_info(lo); pnfs_free_returned_lsegs(lo, lseg_list, &range, 0); + set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags); if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) && !test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) pnfs_clear_layoutreturn_waitbit(lo); @@ -1917,8 +1918,9 @@ static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo) static void nfs_layoutget_end(struct pnfs_layout_hdr *lo) { - if (atomic_dec_and_test(&lo->plh_outstanding)) - wake_up_var(&lo->plh_outstanding); + if (atomic_dec_and_test(&lo->plh_outstanding) && + test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) + wake_up_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN); } static bool pnfs_is_first_layoutget(struct pnfs_layout_hdr *lo) @@ -2000,6 +2002,7 @@ lookup_again: lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); if (lo == NULL) { spin_unlock(&ino->i_lock); + lseg = ERR_PTR(-ENOMEM); trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_NOMEM); goto out; @@ -2024,11 +2027,11 @@ lookup_again: * If the layout segment list is empty, but there are outstanding * layoutget calls, then they might be subject to a layoutrecall. 
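A note on the pnfs hunks around this point: the wake side above (nfs_layoutget_end) and the wait side just below stop treating plh_outstanding as an ad-hoc wait variable and instead key both ends on the new NFS_LAYOUT_DRAIN flag. A minimal illustrative sketch of that wait/wake pairing, not part of the patch; demo_hdr and DEMO_DRAIN are hypothetical stand-ins for pnfs_layout_hdr and NFS_LAYOUT_DRAIN, and kernel context is assumed:

    #include <linux/atomic.h>
    #include <linux/bitops.h>
    #include <linux/sched.h>
    #include <linux/wait_bit.h>

    struct demo_hdr {
            unsigned long flags;       /* DEMO_DRAIN lives here */
            atomic_t outstanding;      /* in-flight operations */
    };

    #define DEMO_DRAIN 0

    static void demo_op_end(struct demo_hdr *h)
    {
            /* Wake waiters only when the last operation completes
             * and a drain was actually requested. */
            if (atomic_dec_and_test(&h->outstanding) &&
                test_and_clear_bit(DEMO_DRAIN, &h->flags))
                    wake_up_bit(&h->flags, DEMO_DRAIN);
    }

    static int demo_wait_drain(struct demo_hdr *h)
    {
            /* Sleeps until DEMO_DRAIN clears; -EINTR on fatal signal. */
            return wait_on_bit(&h->flags, DEMO_DRAIN, TASK_KILLABLE);
    }

Clearing the bit with test_and_clear_bit() before wake_up_bit() is what lets wait_on_bit() observe the bit as clear and return, matching the nfs_layoutget_end() hunk above.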
*/ - if ((list_empty(&lo->plh_segs) || !pnfs_layout_is_valid(lo)) && + if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) && atomic_read(&lo->plh_outstanding) != 0) { spin_unlock(&ino->i_lock); - lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding, - !atomic_read(&lo->plh_outstanding))); + lseg = ERR_PTR(wait_on_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN, + TASK_KILLABLE)); if (IS_ERR(lseg)) goto out_put_layout_hdr; pnfs_put_layout_hdr(lo); @@ -2128,6 +2131,7 @@ lookup_again: lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &stateid, &arg, gfp_flags); if (!lgp) { + lseg = ERR_PTR(-ENOMEM); trace_pnfs_update_layout(ino, pos, count, iomode, lo, NULL, PNFS_UPDATE_LAYOUT_NOMEM); nfs_layoutget_end(lo); @@ -2150,6 +2154,12 @@ lookup_again: case -ERECALLCONFLICT: case -EAGAIN: break; + case -ENODATA: + /* The server returned NFS4ERR_LAYOUTUNAVAILABLE */ + pnfs_layout_set_fail_bit( + lo, pnfs_iomode_to_fail_bit(iomode)); + lseg = NULL; + goto out_put_layout_hdr; default: if (!nfs_error_is_fatal(PTR_ERR(lseg))) { pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); @@ -2405,7 +2415,8 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out_forget; } - if (!pnfs_layout_is_valid(lo) && !pnfs_is_first_layoutget(lo)) + if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) && + !pnfs_is_first_layoutget(lo)) goto out_forget; if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 07f11489e4e9..f331f067691b 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -105,6 +105,7 @@ enum { NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */ NFS_LAYOUT_INODE_FREEING, /* The inode is being freed */ NFS_LAYOUT_HASHED, /* The layout visible */ + NFS_LAYOUT_DRAIN, }; enum layoutdriver_policy_flags { diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 6f325e10056c..9697cd5d2561 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -102,6 +102,10 @@ static void nfs_do_call_unlink(struct inode *inode, struct nfs_unlinkdata *data) }; struct rpc_task *task; struct inode *dir = d_inode(data->dentry->d_parent); + + if (nfs_server_capable(inode, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + nfs_sb_active(dir->i_sb); data->args.fh = NFS_FH(dir); nfs_fattr_init(data->res.dir_attr); @@ -344,6 +348,10 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; + if (nfs_server_capable(old_dir, NFS_CAP_MOVEABLE) && + nfs_server_capable(new_dir, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + data = kzalloc(sizeof(*data), GFP_KERNEL); if (data == NULL) return ERR_PTR(-ENOMEM); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f00d45cf80ef..1c706465d090 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -603,8 +603,9 @@ static void nfs_write_error(struct nfs_page *req, int error) * Find an associated nfs write request, and prepare to flush it out * May return an error if the user signalled nfs_wait_on_request(). 
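The writeback rework that follows passes the writeback_control into nfs_page_async_flush(), so the redirty decision can sit next to the error handling instead of in the caller. A condensed sketch of the new non-fatal error path, using the real identifiers from the hunk below and assuming the surrounding function context:

    if (nfs_error_is_fatal_on_server(ret))
            goto out_launder;
    if (wbc->sync_mode == WB_SYNC_NONE)
            ret = AOP_WRITEPAGE_ACTIVATE;   /* tell the VM to skip the page */
    redirty_page_for_writepage(wbc, page);

Only errors that are fatal on the server launder the request; everything else redirties the page, and background writeback additionally reports AOP_WRITEPAGE_ACTIVATE so the VM leaves the page alone for now.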
*/ -static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page) +static int nfs_page_async_flush(struct page *page, + struct writeback_control *wbc, + struct nfs_pageio_descriptor *pgio) { struct nfs_page *req; int ret = 0; @@ -630,11 +631,11 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, /* * Remove the problematic req upon fatal errors on the server */ - if (nfs_error_is_fatal(ret)) { - if (nfs_error_is_fatal_on_server(ret)) - goto out_launder; - } else - ret = -EAGAIN; + if (nfs_error_is_fatal_on_server(ret)) + goto out_launder; + if (wbc->sync_mode == WB_SYNC_NONE) + ret = AOP_WRITEPAGE_ACTIVATE; + redirty_page_for_writepage(wbc, page); nfs_redirty_request(req); pgio->pg_error = 0; } else @@ -650,15 +651,8 @@ out_launder: static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) { - int ret; - nfs_pageio_cond_complete(pgio, page_index(page)); - ret = nfs_page_async_flush(pgio, page); - if (ret == -EAGAIN) { - redirty_page_for_writepage(wbc, page); - ret = AOP_WRITEPAGE_ACTIVATE; - } - return ret; + return nfs_page_async_flush(page, wbc, pgio); } /* @@ -681,11 +675,7 @@ static int nfs_writepage_locked(struct page *page, err = nfs_do_writepage(page, wbc, &pgio); pgio.pg_error = 0; nfs_pageio_complete(&pgio); - if (err < 0) - return err; - if (nfs_error_is_fatal(pgio.pg_error)) - return pgio.pg_error; - return 0; + return err; } int nfs_writepage(struct page *page, struct writeback_control *wbc) @@ -737,19 +727,19 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) priority = wb_priority(wbc); } - nfs_pageio_init_write(&pgio, inode, priority, false, - &nfs_async_write_completion_ops); - pgio.pg_io_completion = ioc; - err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); - pgio.pg_error = 0; - nfs_pageio_complete(&pgio); + do { + nfs_pageio_init_write(&pgio, inode, priority, false, + &nfs_async_write_completion_ops); + pgio.pg_io_completion = ioc; + err = write_cache_pages(mapping, wbc, nfs_writepages_callback, + &pgio); + pgio.pg_error = 0; + nfs_pageio_complete(&pgio); + } while (err < 0 && !nfs_error_is_fatal(err)); nfs_io_completion_put(ioc); if (err < 0) goto out_err; - err = pgio.pg_error; - if (nfs_error_is_fatal(err)) - goto out_err; return 0; out_err: return err; @@ -1444,7 +1434,7 @@ static void nfs_async_write_error(struct list_head *head, int error) while (!list_empty(head)) { req = nfs_list_entry(head->next); nfs_list_remove_request(req); - if (nfs_error_is_fatal(error)) + if (nfs_error_is_fatal_on_server(error)) nfs_write_error(req, error); else nfs_redirty_request(req); @@ -1719,6 +1709,10 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, .flags = RPC_TASK_ASYNC | flags, .priority = priority, }; + + if (nfs_server_capable(data->inode, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + /* Set up the initial task struct. 
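Several hunks in this series (pagelist.c, unlink.c, write.c) gate RPC_TASK_MOVEABLE on the new NFS_CAP_MOVEABLE capability, so RPC tasks may migrate between transports only on mounts that advertise it. The call sites open-code the same two lines; a hypothetical helper showing the pattern (demo_rpc_task_flags is not in the patch, kernel context assumed):

    #include <linux/nfs_fs.h>
    #include <linux/sunrpc/sched.h>

    static unsigned short demo_rpc_task_flags(const struct inode *inode,
                                              unsigned short flags)
    {
            /* NFS_CAP_MOVEABLE is set per-minor-version above. */
            if (nfs_server_capable(inode, NFS_CAP_MOVEABLE))
                    flags |= RPC_TASK_MOVEABLE;
            return flags;
    }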
*/ nfs_ops->commit_setup(data, &msg, &task_setup_data.rpc_client); trace_nfs_initiate_commit(data); diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 2c1b027774d4..9cb2d590c036 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -119,14 +119,14 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf) struct inode *inode = nf->nf_inode; do { - mutex_lock(&nfsd_file_fsnotify_group->mark_mutex); + fsnotify_group_lock(nfsd_file_fsnotify_group); mark = fsnotify_find_mark(&inode->i_fsnotify_marks, - nfsd_file_fsnotify_group); + nfsd_file_fsnotify_group); if (mark) { nfm = nfsd_file_mark_get(container_of(mark, struct nfsd_file_mark, nfm_mark)); - mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex); + fsnotify_group_unlock(nfsd_file_fsnotify_group); if (nfm) { fsnotify_put_mark(mark); break; @@ -134,8 +134,9 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf) /* Avoid soft lockup race with nfsd_file_mark_put() */ fsnotify_destroy_mark(mark, nfsd_file_fsnotify_group); fsnotify_put_mark(mark); - } else - mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex); + } else { + fsnotify_group_unlock(nfsd_file_fsnotify_group); + } /* allocate a new nfm */ new = kmem_cache_alloc(nfsd_file_mark_slab, GFP_KERNEL); @@ -302,15 +303,18 @@ nfsd_file_put_noref(struct nfsd_file *nf) void nfsd_file_put(struct nfsd_file *nf) { + might_sleep(); + set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) { nfsd_file_flush(nf); nfsd_file_put_noref(nf); - } else { + } else if (nf->nf_file) { nfsd_file_put_noref(nf); - if (nf->nf_file) - nfsd_file_schedule_laundrette(); - } + nfsd_file_schedule_laundrette(); + } else + nfsd_file_put_noref(nf); + if (atomic_long_read(&nfsd_filecache_count) >= NFSD_FILE_LRU_LIMIT) nfsd_file_gc(); } @@ -678,7 +682,8 @@ nfsd_file_cache_init(void) goto out_shrinker; } - nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops); + nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops, + FSNOTIFY_GROUP_NOFS); if (IS_ERR(nfsd_file_fsnotify_group)) { pr_err("nfsd: unable to create fsnotify group: %ld\n", PTR_ERR(nfsd_file_fsnotify_group)); @@ -897,9 +902,9 @@ nfsd_file_is_cached(struct inode *inode) return ret; } -__be32 -nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, - unsigned int may_flags, struct nfsd_file **pnf) +static __be32 +nfsd_do_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **pnf, bool open) { __be32 status; struct net *net = SVC_NET(rqstp); @@ -994,10 +999,14 @@ open_file: nfsd_file_gc(); nf->nf_mark = nfsd_file_mark_find_or_create(nf); - if (nf->nf_mark) - status = nfsd_open_verified(rqstp, fhp, S_IFREG, - may_flags, &nf->nf_file); - else + if (nf->nf_mark) { + if (open) { + status = nfsd_open_verified(rqstp, fhp, may_flags, + &nf->nf_file); + trace_nfsd_file_open(nf, status); + } else + status = nfs_ok; + } else status = nfserr_jukebox; /* * If construction failed, or we raced with a call to unlink() @@ -1017,6 +1026,40 @@ open_file: goto out; } +/** + * nfsd_file_acquire - Get a struct nfsd_file with an open file + * @rqstp: the RPC transaction being executed + * @fhp: the NFS filehandle of the file to be opened + * @may_flags: NFSD_MAY_ settings for the file + * @pnf: OUT: new or found "struct nfsd_file" object + * + * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in + * network byte order is returned. 
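The filecache.c hunk above stops taking nfsd_file_fsnotify_group->mark_mutex directly and uses the fsnotify_group_lock()/fsnotify_group_unlock() helpers instead, allocating the group with FSNOTIFY_GROUP_NOFS so mark handling runs under a memalloc_nofs scope. A condensed sketch of the lookup pattern; demo_find_mark is hypothetical, the fsnotify calls are as used above:

    #include <linux/fsnotify_backend.h>

    static void demo_find_mark(struct inode *inode,
                               struct fsnotify_group *group)
    {
            struct fsnotify_mark *mark;

            fsnotify_group_lock(group);     /* replaces mark_mutex locking */
            mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
            fsnotify_group_unlock(group);
            if (mark)
                    fsnotify_put_mark(mark);
    }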
+ */ +__be32 +nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **pnf) +{ + return nfsd_do_file_acquire(rqstp, fhp, may_flags, pnf, true); +} + +/** + * nfsd_file_create - Get a struct nfsd_file, do not open + * @rqstp: the RPC transaction being executed + * @fhp: the NFS filehandle of the file just created + * @may_flags: NFSD_MAY_ settings for the file + * @pnf: OUT: new or found "struct nfsd_file" object + * + * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in + * network byte order is returned. + */ +__be32 +nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **pnf) +{ + return nfsd_do_file_acquire(rqstp, fhp, may_flags, pnf, false); +} + /* * Note that fields may be added, removed or reordered in the future. Programs * scraping this file for info should test the labels to ensure they're diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 435ceab27897..1da0c79a5580 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -59,5 +59,7 @@ void nfsd_file_close_inode_sync(struct inode *inode); bool nfsd_file_is_cached(struct inode *inode); __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **nfp); +__be32 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **nfp); int nfsd_file_cache_stats_open(struct inode *, struct file *); #endif /* _FS_NFSD_FILECACHE_H */ diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 936eebd4c56d..981a3a7a6e16 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -8,6 +8,7 @@ #include <linux/fs.h> #include <linux/ext2_fs.h> #include <linux/magic.h> +#include <linux/namei.h> #include "cache.h" #include "xdr3.h" @@ -220,17 +221,132 @@ nfsd3_proc_write(struct svc_rqst *rqstp) } /* - * With NFSv3, CREATE processing is a lot easier than with NFSv2. - * At least in theory; we'll see how it fares in practice when the - * first reports about SunOS compatibility problems start to pour in... + * Implement NFSv3's unchecked, guarded, and exclusive CREATE + * semantics for regular files. Except for the created file, + * this operation is stateless on the server. + * + * Upon return, caller must release @fhp and @resfhp. 
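nfsd_file_acquire() and the new nfsd_file_create() above are thin wrappers around one worker, nfsd_do_file_acquire(), distinguished only by the bool open argument. The crux inside the worker, condensed from the open_file: hunk earlier (real identifiers): when open is false, nf->nf_file is deliberately left NULL so the NFSv4 OPEN path can install the struct file it created itself:

    if (open) {
            status = nfsd_open_verified(rqstp, fhp, may_flags,
                                        &nf->nf_file);
            trace_nfsd_file_open(nf, status);
    } else
            status = nfs_ok;        /* caller installs nf->nf_file */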
*/ static __be32 +nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct svc_fh *resfhp, struct nfsd3_createargs *argp) +{ + struct iattr *iap = &argp->attrs; + struct dentry *parent, *child; + __u32 v_mtime, v_atime; + struct inode *inode; + __be32 status; + int host_err; + + if (isdotent(argp->name, argp->len)) + return nfserr_exist; + if (!(iap->ia_valid & ATTR_MODE)) + iap->ia_mode = 0; + + status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); + if (status != nfs_ok) + return status; + + parent = fhp->fh_dentry; + inode = d_inode(parent); + + host_err = fh_want_write(fhp); + if (host_err) + return nfserrno(host_err); + + fh_lock_nested(fhp, I_MUTEX_PARENT); + + child = lookup_one_len(argp->name, parent, argp->len); + if (IS_ERR(child)) { + status = nfserrno(PTR_ERR(child)); + goto out; + } + + if (d_really_is_negative(child)) { + status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); + if (status != nfs_ok) + goto out; + } + + status = fh_compose(resfhp, fhp->fh_export, child, fhp); + if (status != nfs_ok) + goto out; + + v_mtime = 0; + v_atime = 0; + if (argp->createmode == NFS3_CREATE_EXCLUSIVE) { + u32 *verifier = (u32 *)argp->verf; + + /* + * Solaris 7 gets confused (bugid 4218508) if these have + * the high bit set, as do xfs filesystems without the + * "bigtime" feature. So just clear the high bits. + */ + v_mtime = verifier[0] & 0x7fffffff; + v_atime = verifier[1] & 0x7fffffff; + } + + if (d_really_is_positive(child)) { + status = nfs_ok; + + switch (argp->createmode) { + case NFS3_CREATE_UNCHECKED: + if (!d_is_reg(child)) + break; + iap->ia_valid &= ATTR_SIZE; + goto set_attr; + case NFS3_CREATE_GUARDED: + status = nfserr_exist; + break; + case NFS3_CREATE_EXCLUSIVE: + if (d_inode(child)->i_mtime.tv_sec == v_mtime && + d_inode(child)->i_atime.tv_sec == v_atime && + d_inode(child)->i_size == 0) { + break; + } + status = nfserr_exist; + } + goto out; + } + + if (!IS_POSIXACL(inode)) + iap->ia_mode &= ~current_umask(); + + host_err = vfs_create(&init_user_ns, inode, child, iap->ia_mode, true); + if (host_err < 0) { + status = nfserrno(host_err); + goto out; + } + + /* A newly created file already has a file size of zero. 
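On NFS3_CREATE_EXCLUSIVE the 8-byte client verifier is cached in the new file's atime/mtime seconds with the high bit cleared, per the Solaris 7 / XFS "bigtime" note above; a replayed CREATE then matches when both timestamps agree and the size is zero. A standalone sketch of the packing — demo_unpack_create_verifier is hypothetical, the masking mirrors the code above:

    #include <linux/types.h>

    static void demo_unpack_create_verifier(const u32 verf[2],
                                            u32 *v_mtime, u32 *v_atime)
    {
            *v_mtime = verf[0] & 0x7fffffff;  /* clear high bit, see  */
            *v_atime = verf[1] & 0x7fffffff;  /* Solaris bugid 4218508 */
    }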
*/ + if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0)) + iap->ia_valid &= ~ATTR_SIZE; + if (argp->createmode == NFS3_CREATE_EXCLUSIVE) { + iap->ia_valid = ATTR_MTIME | ATTR_ATIME | + ATTR_MTIME_SET | ATTR_ATIME_SET; + iap->ia_mtime.tv_sec = v_mtime; + iap->ia_atime.tv_sec = v_atime; + iap->ia_mtime.tv_nsec = 0; + iap->ia_atime.tv_nsec = 0; + } + +set_attr: + status = nfsd_create_setattr(rqstp, fhp, resfhp, iap); + +out: + fh_unlock(fhp); + if (child && !IS_ERR(child)) + dput(child); + fh_drop_write(fhp); + return status; +} + +static __be32 nfsd3_proc_create(struct svc_rqst *rqstp) { struct nfsd3_createargs *argp = rqstp->rq_argp; struct nfsd3_diropres *resp = rqstp->rq_resp; - svc_fh *dirfhp, *newfhp = NULL; - struct iattr *attr; + svc_fh *dirfhp, *newfhp; dprintk("nfsd: CREATE(3) %s %.*s\n", SVCFH_fmt(&argp->fh), @@ -239,21 +355,8 @@ nfsd3_proc_create(struct svc_rqst *rqstp) dirfhp = fh_copy(&resp->dirfh, &argp->fh); newfhp = fh_init(&resp->fh, NFS3_FHSIZE); - attr = &argp->attrs; - - /* Unfudge the mode bits */ - attr->ia_mode &= ~S_IFMT; - if (!(attr->ia_valid & ATTR_MODE)) { - attr->ia_valid |= ATTR_MODE; - attr->ia_mode = S_IFREG; - } else { - attr->ia_mode = (attr->ia_mode & ~S_IFMT) | S_IFREG; - } - /* Now create the file and set attributes */ - resp->status = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len, - attr, newfhp, argp->createmode, - (u32 *)argp->verf, NULL, NULL); + resp->status = nfsd3_create_file(rqstp, dirfhp, newfhp, argp); return rpc_success; } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index b207c76a873f..3895eb52d2b1 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -37,6 +37,8 @@ #include <linux/falloc.h> #include <linux/slab.h> #include <linux/kthread.h> +#include <linux/namei.h> + #include <linux/sunrpc/addr.h> #include <linux/nfs_ssc.h> @@ -235,6 +237,183 @@ static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate &resfh->fh_handle); } +static inline bool nfsd4_create_is_exclusive(int createmode) +{ + return createmode == NFS4_CREATE_EXCLUSIVE || + createmode == NFS4_CREATE_EXCLUSIVE4_1; +} + +static __be32 +nfsd4_vfs_create(struct svc_fh *fhp, struct dentry *child, + struct nfsd4_open *open) +{ + struct file *filp; + struct path path; + int oflags; + + oflags = O_CREAT | O_LARGEFILE; + switch (open->op_share_access & NFS4_SHARE_ACCESS_BOTH) { + case NFS4_SHARE_ACCESS_WRITE: + oflags |= O_WRONLY; + break; + case NFS4_SHARE_ACCESS_BOTH: + oflags |= O_RDWR; + break; + default: + oflags |= O_RDONLY; + } + + path.mnt = fhp->fh_export->ex_path.mnt; + path.dentry = child; + filp = dentry_create(&path, oflags, open->op_iattr.ia_mode, + current_cred()); + if (IS_ERR(filp)) + return nfserrno(PTR_ERR(filp)); + + open->op_filp = filp; + return nfs_ok; +} + +/* + * Implement NFSv4's unchecked, guarded, and exclusive create + * semantics for regular files. Open state for this new file is + * subsequently fabricated in nfsd4_process_open2(). + * + * Upon return, caller must release @fhp and @resfhp. 
+ */ +static __be32 +nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct svc_fh *resfhp, struct nfsd4_open *open) +{ + struct iattr *iap = &open->op_iattr; + struct dentry *parent, *child; + __u32 v_mtime, v_atime; + struct inode *inode; + __be32 status; + int host_err; + + if (isdotent(open->op_fname, open->op_fnamelen)) + return nfserr_exist; + if (!(iap->ia_valid & ATTR_MODE)) + iap->ia_mode = 0; + + status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); + if (status != nfs_ok) + return status; + parent = fhp->fh_dentry; + inode = d_inode(parent); + + host_err = fh_want_write(fhp); + if (host_err) + return nfserrno(host_err); + + fh_lock_nested(fhp, I_MUTEX_PARENT); + + child = lookup_one_len(open->op_fname, parent, open->op_fnamelen); + if (IS_ERR(child)) { + status = nfserrno(PTR_ERR(child)); + goto out; + } + + if (d_really_is_negative(child)) { + status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); + if (status != nfs_ok) + goto out; + } + + status = fh_compose(resfhp, fhp->fh_export, child, fhp); + if (status != nfs_ok) + goto out; + + v_mtime = 0; + v_atime = 0; + if (nfsd4_create_is_exclusive(open->op_createmode)) { + u32 *verifier = (u32 *)open->op_verf.data; + + /* + * Solaris 7 gets confused (bugid 4218508) if these have + * the high bit set, as do xfs filesystems without the + * "bigtime" feature. So just clear the high bits. If this + * is ever changed to use different attrs for storing the + * verifier, then do_open_lookup() will also need to be + * fixed accordingly. + */ + v_mtime = verifier[0] & 0x7fffffff; + v_atime = verifier[1] & 0x7fffffff; + } + + if (d_really_is_positive(child)) { + status = nfs_ok; + + switch (open->op_createmode) { + case NFS4_CREATE_UNCHECKED: + if (!d_is_reg(child)) + break; + + /* + * In NFSv4, we don't want to truncate the file + * now. This would be wrong if the OPEN fails for + * some other reason. Furthermore, if the size is + * nonzero, we should ignore it according to spec! + */ + open->op_truncate = (iap->ia_valid & ATTR_SIZE) && + !iap->ia_size; + break; + case NFS4_CREATE_GUARDED: + status = nfserr_exist; + break; + case NFS4_CREATE_EXCLUSIVE: + if (d_inode(child)->i_mtime.tv_sec == v_mtime && + d_inode(child)->i_atime.tv_sec == v_atime && + d_inode(child)->i_size == 0) { + open->op_created = true; + break; /* subtle */ + } + status = nfserr_exist; + break; + case NFS4_CREATE_EXCLUSIVE4_1: + if (d_inode(child)->i_mtime.tv_sec == v_mtime && + d_inode(child)->i_atime.tv_sec == v_atime && + d_inode(child)->i_size == 0) { + open->op_created = true; + goto set_attr; /* subtle */ + } + status = nfserr_exist; + } + goto out; + } + + if (!IS_POSIXACL(inode)) + iap->ia_mode &= ~current_umask(); + + status = nfsd4_vfs_create(fhp, child, open); + if (status != nfs_ok) + goto out; + open->op_created = true; + + /* A newly created file already has a file size of zero. 
*/ + if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0)) + iap->ia_valid &= ~ATTR_SIZE; + if (nfsd4_create_is_exclusive(open->op_createmode)) { + iap->ia_valid = ATTR_MTIME | ATTR_ATIME | + ATTR_MTIME_SET|ATTR_ATIME_SET; + iap->ia_mtime.tv_sec = v_mtime; + iap->ia_atime.tv_sec = v_atime; + iap->ia_mtime.tv_nsec = 0; + iap->ia_atime.tv_nsec = 0; + } + +set_attr: + status = nfsd_create_setattr(rqstp, fhp, resfhp, iap); + +out: + fh_unlock(fhp); + if (child && !IS_ERR(child)) + dput(child); + fh_drop_write(fhp); + return status; +} + static __be32 do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh) { @@ -264,16 +443,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru * yes | yes | GUARDED4 | GUARDED4 */ - /* - * Note: create modes (UNCHECKED,GUARDED...) are the same - * in NFSv4 as in v3 except EXCLUSIVE4_1. - */ current->fs->umask = open->op_umask; - status = do_nfsd_create(rqstp, current_fh, open->op_fname, - open->op_fnamelen, &open->op_iattr, - *resfh, open->op_createmode, - (u32 *)open->op_verf.data, - &open->op_truncate, &open->op_created); + status = nfsd4_create_file(rqstp, current_fh, *resfh, open); current->fs->umask = 0; if (!status && open->op_label.len) @@ -284,7 +455,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru * use the returned bitmask to indicate which attributes * we used to store the verifier: */ - if (nfsd_create_is_exclusive(open->op_createmode) && status == 0) + if (nfsd4_create_is_exclusive(open->op_createmode) && status == 0) open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_MODIFY); } else @@ -375,6 +546,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, (int)open->op_fnamelen, open->op_fname, open->op_openowner); + open->op_filp = NULL; + /* This check required by spec. 
*/ if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) return nfserr_inval; @@ -427,43 +600,35 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; switch (open->op_claim_type) { - case NFS4_OPEN_CLAIM_DELEGATE_CUR: - case NFS4_OPEN_CLAIM_NULL: - status = do_open_lookup(rqstp, cstate, open, &resfh); - if (status) - goto out; - break; - case NFS4_OPEN_CLAIM_PREVIOUS: - status = nfs4_check_open_reclaim(cstate->clp); - if (status) - goto out; - open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; - reclaim = true; - fallthrough; - case NFS4_OPEN_CLAIM_FH: - case NFS4_OPEN_CLAIM_DELEG_CUR_FH: - status = do_open_fhandle(rqstp, cstate, open); - if (status) - goto out; - resfh = &cstate->current_fh; - break; - case NFS4_OPEN_CLAIM_DELEG_PREV_FH: - case NFS4_OPEN_CLAIM_DELEGATE_PREV: - dprintk("NFSD: unsupported OPEN claim type %d\n", - open->op_claim_type); - status = nfserr_notsupp; + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + case NFS4_OPEN_CLAIM_NULL: + status = do_open_lookup(rqstp, cstate, open, &resfh); + if (status) goto out; - default: - dprintk("NFSD: Invalid OPEN claim type %d\n", - open->op_claim_type); - status = nfserr_inval; + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + status = nfs4_check_open_reclaim(cstate->clp); + if (status) + goto out; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; + reclaim = true; + fallthrough; + case NFS4_OPEN_CLAIM_FH: + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + status = do_open_fhandle(rqstp, cstate, open); + if (status) goto out; + resfh = &cstate->current_fh; + break; + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + case NFS4_OPEN_CLAIM_DELEGATE_PREV: + status = nfserr_notsupp; + goto out; + default: + status = nfserr_inval; + goto out; } - /* - * nfsd4_process_open2() does the actual opening of the file. If - * successful, it (1) truncates the file if open->op_truncate was - * set, (2) sets open->op_stateid, (3) sets open->op_delegation. - */ + status = nfsd4_process_open2(rqstp, resfh, open); WARN(status && open->op_created, "nfsd4_process_open2 failed to open newly-created file! status=%u\n", @@ -471,6 +636,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (reclaim && !status) nn->somebody_reclaimed = true; out: + if (open->op_filp) { + fput(open->op_filp); + open->op_filp = NULL; + } if (resfh && resfh != &cstate->current_fh) { fh_dup2(&cstate->current_fh, resfh); fh_put(resfh); @@ -801,7 +970,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * the client wants us to do more in this compound: */ if (!nfsd4_last_compound_op(rqstp)) - clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); + __clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); /* check stateid */ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, @@ -2481,11 +2650,12 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) cstate->minorversion = args->minorversion; fh_init(current_fh, NFS4_FHSIZE); fh_init(save_fh, NFS4_FHSIZE); + /* * Don't use the deferral mechanism for NFSv4; compounds make it * too hard to avoid non-idempotency problems. */ - clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); + __clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); /* * According to RFC3010, this takes precedence over all other errors. 
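The clear_bit()/set_bit() to __clear_bit()/__set_bit() conversions here and in nfs4xdr.c drop the atomic read-modify-write, presumably safe because rq_flags is only touched by the thread servicing the request at these points. A minimal illustration of the non-atomic variants, single-threaded by construction:

    unsigned long flags = 0;

    __set_bit(0, &flags);           /* plain load/store, no LOCK prefix */
    if (test_bit(0, &flags))
            __clear_bit(0, &flags);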
@@ -2600,7 +2770,7 @@ encode_op: out: cstate->status = status; /* Reset deferral mechanism for RPC deferrals */ - set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); + __set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); return rpc_success; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 234e852fcdfa..9409a0dc1b76 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -125,6 +125,23 @@ static void free_session(struct nfsd4_session *); static const struct nfsd4_callback_ops nfsd4_cb_recall_ops; static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops; +static struct workqueue_struct *laundry_wq; + +int nfsd4_create_laundry_wq(void) +{ + int rc = 0; + + laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4"); + if (laundry_wq == NULL) + rc = -ENOMEM; + return rc; +} + +void nfsd4_destroy_laundry_wq(void) +{ + destroy_workqueue(laundry_wq); +} + static bool is_session_dead(struct nfsd4_session *ses) { return ses->se_flags & NFS4_SESSION_DEAD; @@ -152,6 +169,7 @@ static __be32 get_client_locked(struct nfs4_client *clp) if (is_client_expired(clp)) return nfserr_expired; atomic_inc(&clp->cl_rpc_users); + clp->cl_state = NFSD4_ACTIVE; return nfs_ok; } @@ -172,6 +190,7 @@ renew_client_locked(struct nfs4_client *clp) list_move_tail(&clp->cl_lru, &nn->client_lru); clp->cl_time = ktime_get_boottime_seconds(); + clp->cl_state = NFSD4_ACTIVE; } static void put_client_renew_locked(struct nfs4_client *clp) @@ -690,6 +709,57 @@ static unsigned int file_hashval(struct svc_fh *fh) static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; +/* + * Check if courtesy clients have conflicting access and resolve it if possible + * + * access: is op_share_access if share_access is true. + * Check if access mode, op_share_access, would conflict with + * the current deny mode of the file 'fp'. + * access: is op_share_deny if share_access is false. + * Check if the deny mode, op_share_deny, would conflict with + * current access of the file 'fp'. + * stp: skip checking this entry. + * new_stp: normal open, not open upgrade. + * + * Function returns: + * false - access/deny mode conflict with normal client. + * true - no conflict or conflict with courtesy client(s) is resolved. + */ +static bool +nfs4_resolve_deny_conflicts_locked(struct nfs4_file *fp, bool new_stp, + struct nfs4_ol_stateid *stp, u32 access, bool share_access) +{ + struct nfs4_ol_stateid *st; + bool resolvable = true; + unsigned char bmap; + struct nfsd_net *nn; + struct nfs4_client *clp; + + lockdep_assert_held(&fp->fi_lock); + list_for_each_entry(st, &fp->fi_stateids, st_perfile) { + /* ignore lock stateid */ + if (st->st_openstp) + continue; + if (st == stp && new_stp) + continue; + /* check file access against deny mode or vice versa */ + bmap = share_access ? 
st->st_deny_bmap : st->st_access_bmap; + if (!(access & bmap_to_share_mode(bmap))) + continue; + clp = st->st_stid.sc_client; + if (try_to_expire_client(clp)) + continue; + resolvable = false; + break; + } + if (resolvable) { + clp = stp->st_stid.sc_client; + nn = net_generic(clp->net, nfsd_net_id); + mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); + } + return resolvable; +} + static void __nfs4_file_get_access(struct nfs4_file *fp, u32 access) { @@ -1090,6 +1160,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, get_clnt_odstate(odstate); dp->dl_type = NFS4_OPEN_DELEGATE_READ; dp->dl_retries = 1; + dp->dl_recalled = false; nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client, &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL); get_nfs4_file(fp); @@ -2004,6 +2075,8 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) idr_init(&clp->cl_stateids); atomic_set(&clp->cl_rpc_users, 0); clp->cl_cb_state = NFSD4_CB_UNKNOWN; + clp->cl_state = NFSD4_ACTIVE; + atomic_set(&clp->cl_delegs_in_recall, 0); INIT_LIST_HEAD(&clp->cl_idhash); INIT_LIST_HEAD(&clp->cl_openowners); INIT_LIST_HEAD(&clp->cl_delegations); @@ -2408,10 +2481,17 @@ static int client_info_show(struct seq_file *m, void *v) memcpy(&clid, &clp->cl_clientid, sizeof(clid)); seq_printf(m, "clientid: 0x%llx\n", clid); seq_printf(m, "address: \"%pISpc\"\n", (struct sockaddr *)&clp->cl_addr); - if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags)) + + if (clp->cl_state == NFSD4_COURTESY) + seq_puts(m, "status: courtesy\n"); + else if (clp->cl_state == NFSD4_EXPIRABLE) + seq_puts(m, "status: expirable\n"); + else if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags)) seq_puts(m, "status: confirmed\n"); else seq_puts(m, "status: unconfirmed\n"); + seq_printf(m, "seconds from last renew: %lld\n", + ktime_get_boottime_seconds() - clp->cl_time); seq_printf(m, "name: "); seq_quote_mem(m, clp->cl_name.data, clp->cl_name.len); seq_printf(m, "\nminor version: %d\n", clp->cl_minorversion); @@ -4694,9 +4774,18 @@ nfsd_break_deleg_cb(struct file_lock *fl) bool ret = false; struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner; struct nfs4_file *fp = dp->dl_stid.sc_file; + struct nfs4_client *clp = dp->dl_stid.sc_client; + struct nfsd_net *nn; trace_nfsd_cb_recall(&dp->dl_stid); + dp->dl_recalled = true; + atomic_inc(&clp->cl_delegs_in_recall); + if (try_to_expire_client(clp)) { + nn = net_generic(clp->net, nfsd_net_id); + mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); + } + /* * We don't want the locks code to timeout the lease for us; * we'll remove it ourself if a delegation isn't returned @@ -4739,9 +4828,14 @@ static int nfsd_change_deleg_cb(struct file_lock *onlist, int arg, struct list_head *dispose) { - if (arg & F_UNLCK) + struct nfs4_delegation *dp = (struct nfs4_delegation *)onlist->fl_owner; + struct nfs4_client *clp = dp->dl_stid.sc_client; + + if (arg & F_UNLCK) { + if (dp->dl_recalled) + atomic_dec(&clp->cl_delegs_in_recall); return lease_modify(onlist, arg, dispose); - else + } else return -EAGAIN; } @@ -4947,7 +5041,7 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, - struct nfsd4_open *open) + struct nfsd4_open *open, bool new_stp) { struct nfsd_file *nf = NULL; __be32 status; @@ -4963,6 +5057,13 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, */ status = nfs4_file_check_deny(fp, open->op_share_deny); if (status 
!= nfs_ok) { + if (status != nfserr_share_denied) { + spin_unlock(&fp->fi_lock); + goto out; + } + if (nfs4_resolve_deny_conflicts_locked(fp, new_stp, + stp, open->op_share_deny, false)) + status = nfserr_jukebox; spin_unlock(&fp->fi_lock); goto out; } @@ -4970,6 +5071,13 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, /* set access to the file */ status = nfs4_file_get_access(fp, open->op_share_access); if (status != nfs_ok) { + if (status != nfserr_share_denied) { + spin_unlock(&fp->fi_lock); + goto out; + } + if (nfs4_resolve_deny_conflicts_locked(fp, new_stp, + stp, open->op_share_access, true)) + status = nfserr_jukebox; spin_unlock(&fp->fi_lock); goto out; } @@ -4985,9 +5093,19 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, if (!fp->fi_fds[oflag]) { spin_unlock(&fp->fi_lock); - status = nfsd_file_acquire(rqstp, cur_fh, access, &nf); - if (status) - goto out_put_access; + + if (!open->op_filp) { + status = nfsd_file_acquire(rqstp, cur_fh, access, &nf); + if (status != nfs_ok) + goto out_put_access; + } else { + status = nfsd_file_create(rqstp, cur_fh, access, &nf); + if (status != nfs_ok) + goto out_put_access; + nf->nf_file = open->op_filp; + open->op_filp = NULL; + } + spin_lock(&fp->fi_lock); if (!fp->fi_fds[oflag]) { fp->fi_fds[oflag] = nf; @@ -5016,21 +5134,29 @@ out_put_access: } static __be32 -nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) +nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, + struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, + struct nfsd4_open *open) { __be32 status; unsigned char old_deny_bmap = stp->st_deny_bmap; if (!test_access(open->op_share_access, stp)) - return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open); + return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open, false); /* test and set deny mode */ spin_lock(&fp->fi_lock); status = nfs4_file_check_deny(fp, open->op_share_deny); if (status == nfs_ok) { - set_deny(open->op_share_deny, stp); - fp->fi_share_deny |= + if (status != nfserr_share_denied) { + set_deny(open->op_share_deny, stp); + fp->fi_share_deny |= (open->op_share_deny & NFS4_SHARE_DENY_BOTH); + } else { + if (nfs4_resolve_deny_conflicts_locked(fp, false, + stp, open->op_share_deny, false)) + status = nfserr_jukebox; + } } spin_unlock(&fp->fi_lock); @@ -5322,6 +5448,18 @@ static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open, */ } +/** + * nfsd4_process_open2 - finish open processing + * @rqstp: the RPC transaction being executed + * @current_fh: NFSv4 COMPOUND's current filehandle + * @open: OPEN arguments + * + * If successful, (1) truncate the file if open->op_truncate was + * set, (2) set open->op_stateid, (3) set open->op_delegation. + * + * Returns %nfs_ok on success; otherwise an nfs4stat value in + * network byte order is returned. 
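The share/deny checks above follow one pattern for courtesy-client conflicts: if every conflicting open belongs to an expirable courtesy client, kick the laundromat and answer NFS4ERR_DELAY so the client retries after those clients are gone. Condensed from nfs4_get_vfs_file() with the real identifiers:

    if (status == nfserr_share_denied &&
        nfs4_resolve_deny_conflicts_locked(fp, new_stp, stp,
                                           open->op_share_deny, false))
            status = nfserr_jukebox;    /* NFS4ERR_DELAY: retry later */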
+ */ __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) { @@ -5371,7 +5509,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf goto out; } } else { - status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open); + status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open, true); if (status) { stp->st_stid.sc_type = NFS4_CLOSED_STID; release_open_stateid(stp); @@ -5605,6 +5743,81 @@ static void nfsd4_ssc_expire_umount(struct nfsd_net *nn) } #endif +/* Check if any lock belonging to this lockowner has any blockers */ +static bool +nfs4_lockowner_has_blockers(struct nfs4_lockowner *lo) +{ + struct file_lock_context *ctx; + struct nfs4_ol_stateid *stp; + struct nfs4_file *nf; + + list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) { + nf = stp->st_stid.sc_file; + ctx = nf->fi_inode->i_flctx; + if (!ctx) + continue; + if (locks_owner_has_blockers(ctx, lo)) + return true; + } + return false; +} + +static bool +nfs4_anylock_blockers(struct nfs4_client *clp) +{ + int i; + struct nfs4_stateowner *so; + struct nfs4_lockowner *lo; + + if (atomic_read(&clp->cl_delegs_in_recall)) + return true; + spin_lock(&clp->cl_lock); + for (i = 0; i < OWNER_HASH_SIZE; i++) { + list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[i], + so_strhash) { + if (so->so_is_open_owner) + continue; + lo = lockowner(so); + if (nfs4_lockowner_has_blockers(lo)) { + spin_unlock(&clp->cl_lock); + return true; + } + } + } + spin_unlock(&clp->cl_lock); + return false; +} + +static void +nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist, + struct laundry_time *lt) +{ + struct list_head *pos, *next; + struct nfs4_client *clp; + + INIT_LIST_HEAD(reaplist); + spin_lock(&nn->client_lock); + list_for_each_safe(pos, next, &nn->client_lru) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + if (clp->cl_state == NFSD4_EXPIRABLE) + goto exp_client; + if (!state_expired(lt, clp->cl_time)) + break; + if (!atomic_read(&clp->cl_rpc_users)) + clp->cl_state = NFSD4_COURTESY; + if (!client_has_state(clp) || + ktime_get_boottime_seconds() >= + (clp->cl_time + NFSD_COURTESY_CLIENT_TIMEOUT)) + goto exp_client; + if (nfs4_anylock_blockers(clp)) { +exp_client: + if (!mark_client_expired_locked(clp)) + list_add(&clp->cl_lru, reaplist); + } + } + spin_unlock(&nn->client_lock); +} + static time64_t nfs4_laundromat(struct nfsd_net *nn) { @@ -5627,7 +5840,6 @@ nfs4_laundromat(struct nfsd_net *nn) goto out; } nfsd4_end_grace(nn); - INIT_LIST_HEAD(&reaplist); spin_lock(&nn->s2s_cp_lock); idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) { @@ -5637,17 +5849,7 @@ nfs4_laundromat(struct nfsd_net *nn) _free_cpntf_state_locked(nn, cps); } spin_unlock(&nn->s2s_cp_lock); - - spin_lock(&nn->client_lock); - list_for_each_safe(pos, next, &nn->client_lru) { - clp = list_entry(pos, struct nfs4_client, cl_lru); - if (!state_expired(<, clp->cl_time)) - break; - if (mark_client_expired_locked(clp)) - continue; - list_add(&clp->cl_lru, &reaplist); - } - spin_unlock(&nn->client_lock); + nfs4_get_client_reaplist(nn, &reaplist, <); list_for_each_safe(pos, next, &reaplist) { clp = list_entry(pos, struct nfs4_client, cl_lru); trace_nfsd_clid_purged(&clp->cl_clientid); @@ -5722,7 +5924,6 @@ out: return max_t(time64_t, lt.new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); } -static struct workqueue_struct *laundry_wq; static void laundromat_main(struct work_struct *); static void @@ -6551,6 +6752,29 @@ nfsd4_lm_put_owner(fl_owner_t owner) 
nfs4_put_stateowner(&lo->lo_owner); } +/* return pointer to struct nfs4_client if client is expirable */ +static bool +nfsd4_lm_lock_expirable(struct file_lock *cfl) +{ + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)cfl->fl_owner; + struct nfs4_client *clp = lo->lo_owner.so_client; + struct nfsd_net *nn; + + if (try_to_expire_client(clp)) { + nn = net_generic(clp->net, nfsd_net_id); + mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); + return true; + } + return false; +} + +/* schedule laundromat to run immediately and wait for it to complete */ +static void +nfsd4_lm_expire_lock(void) +{ + flush_workqueue(laundry_wq); +} + static void nfsd4_lm_notify(struct file_lock *fl) { @@ -6577,9 +6801,12 @@ nfsd4_lm_notify(struct file_lock *fl) } static const struct lock_manager_operations nfsd_posix_mng_ops = { + .lm_mod_owner = THIS_MODULE, .lm_notify = nfsd4_lm_notify, .lm_get_owner = nfsd4_lm_get_owner, .lm_put_owner = nfsd4_lm_put_owner, + .lm_lock_expirable = nfsd4_lm_lock_expirable, + .lm_expire_lock = nfsd4_lm_expire_lock, }; static inline void @@ -7297,22 +7524,36 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) return status; } +/** + * nfsd4_release_lockowner - process NFSv4.0 RELEASE_LOCKOWNER operations + * @rqstp: RPC transaction + * @cstate: NFSv4 COMPOUND state + * @u: RELEASE_LOCKOWNER arguments + * + * The lockowner's so_count is bumped when a lock record is added + * or when copying a conflicting lock. The latter case is brief, + * but can lead to fleeting false positives when looking for + * locks-in-use. + * + * Return values: + * %nfs_ok: lockowner released or not found + * %nfserr_locks_held: lockowner still in use + * %nfserr_stale_clientid: clientid no longer active + * %nfserr_expired: clientid not recognized + */ __be32 nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_release_lockowner *rlockowner = &u->release_lockowner; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); clientid_t *clid = &rlockowner->rl_clientid; - struct nfs4_stateowner *sop; - struct nfs4_lockowner *lo = NULL; struct nfs4_ol_stateid *stp; - struct xdr_netobj *owner = &rlockowner->rl_owner; - unsigned int hashval = ownerstr_hashval(owner); - __be32 status; - struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct nfs4_lockowner *lo; struct nfs4_client *clp; - LIST_HEAD (reaplist); + LIST_HEAD(reaplist); + __be32 status; dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", clid->cl_boot, clid->cl_id); @@ -7320,34 +7561,19 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, status = set_client(clid, cstate, nn); if (status) return status; - clp = cstate->clp; - /* Find the matching lock stateowner */ - spin_lock(&clp->cl_lock); - list_for_each_entry(sop, &clp->cl_ownerstr_hashtbl[hashval], - so_strhash) { - - if (sop->so_is_open_owner || !same_owner_str(sop, owner)) - continue; - /* see if there are still any locks associated with it */ - lo = lockowner(sop); - list_for_each_entry(stp, &sop->so_stateids, st_perstateowner) { - if (check_for_locks(stp->st_stid.sc_file, lo)) { - status = nfserr_locks_held; - spin_unlock(&clp->cl_lock); - return status; - } - } - - nfs4_get_stateowner(sop); - break; - } + spin_lock(&clp->cl_lock); + lo = find_lockowner_str_locked(clp, &rlockowner->rl_owner); if (!lo) { spin_unlock(&clp->cl_lock); - return status; + return nfs_ok; + } + if (atomic_read(&lo->lo_owner.so_count) != 2) { + spin_unlock(&clp->cl_lock); + 
nfs4_put_stateowner(&lo->lo_owner); + return nfserr_locks_held; } - unhash_lockowner_locked(lo); while (!list_empty(&lo->lo_owner.so_stateids)) { stp = list_first_entry(&lo->lo_owner.so_stateids, @@ -7357,11 +7583,11 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, put_ol_stateid_locked(stp, &reaplist); } spin_unlock(&clp->cl_lock); + free_ol_stateid_reaplist(&reaplist); remove_blocked_locks(lo); nfs4_put_stateowner(&lo->lo_owner); - - return status; + return nfs_ok; } static inline struct nfs4_client_reclaim * @@ -7602,22 +7828,12 @@ nfs4_state_start(void) { int ret; - laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4"); - if (laundry_wq == NULL) { - ret = -ENOMEM; - goto out; - } ret = nfsd4_create_callback_queue(); if (ret) - goto out_free_laundry; + return ret; set_max_delegations(); return 0; - -out_free_laundry: - destroy_workqueue(laundry_wq); -out: - return ret; } void @@ -7654,7 +7870,6 @@ nfs4_state_shutdown_net(struct net *net) void nfs4_state_shutdown(void) { - destroy_workqueue(laundry_wq); nfsd4_destroy_callback_queue(); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index da92e7d2ab6a..61b2aae81abb 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2411,7 +2411,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE; if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack) - clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags); + __clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags); return true; } diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 0b3f12aa37ff..7da88bdc0d6c 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -206,7 +206,6 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn) struct svc_cacherep *rp; unsigned int i; - nfsd_reply_cache_stats_destroy(nn); unregister_shrinker(&nn->nfsd_reply_cache_shrinker); for (i = 0; i < nn->drc_hashsize; i++) { @@ -217,6 +216,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn) rp, nn); } } + nfsd_reply_cache_stats_destroy(nn); kvfree(nn->drc_hashtbl); nn->drc_hashtbl = NULL; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 16920e4512bd..0621c2faf242 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1535,20 +1535,25 @@ static int __init init_nfsd(void) retval = create_proc_exports_entry(); if (retval) goto out_free_lockd; - retval = register_filesystem(&nfsd_fs_type); - if (retval) - goto out_free_exports; retval = register_pernet_subsys(&nfsd_net_ops); if (retval < 0) - goto out_free_filesystem; + goto out_free_exports; retval = register_cld_notifier(); if (retval) + goto out_free_subsys; + retval = nfsd4_create_laundry_wq(); + if (retval) + goto out_free_cld; + retval = register_filesystem(&nfsd_fs_type); + if (retval) goto out_free_all; return 0; out_free_all: + nfsd4_destroy_laundry_wq(); +out_free_cld: + unregister_cld_notifier(); +out_free_subsys: unregister_pernet_subsys(&nfsd_net_ops); -out_free_filesystem: - unregister_filesystem(&nfsd_fs_type); out_free_exports: remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); @@ -1566,6 +1571,8 @@ out_free_slabs: static void __exit exit_nfsd(void) { + unregister_filesystem(&nfsd_fs_type); + nfsd4_destroy_laundry_wq(); unregister_cld_notifier(); unregister_pernet_subsys(&nfsd_net_ops); nfsd_drc_slab_free(); @@ -1575,7 +1582,6 @@ static void __exit exit_nfsd(void) nfsd_lockd_shutdown(); nfsd4_free_slabs(); nfsd4_exit_pnfs(); - unregister_filesystem(&nfsd_fs_type); } MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); diff --git 
a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 4fc1fd639527..847b482155ae 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -162,6 +162,8 @@ void nfs4_state_shutdown_net(struct net *net); int nfs4_reset_recoverydir(char *recdir); char * nfs4_recoverydir(void); bool nfsd4_spo_must_allow(struct svc_rqst *rqstp); +int nfsd4_create_laundry_wq(void); +void nfsd4_destroy_laundry_wq(void); #else static inline int nfsd4_init_slabs(void) { return 0; } static inline void nfsd4_free_slabs(void) { } @@ -175,6 +177,8 @@ static inline bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) { return false; } +static inline int nfsd4_create_laundry_wq(void) { return 0; }; +static inline void nfsd4_destroy_laundry_wq(void) {}; #endif /* @@ -336,6 +340,7 @@ void nfsd_lockd_shutdown(void); #define COMPOUND_ERR_SLACK_SPACE 16 /* OP_SETATTR */ #define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */ +#define NFSD_COURTESY_CLIENT_TIMEOUT (24 * 60 * 60) /* seconds */ /* * The following attributes are currently not supported by the NFSv4 server: diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 95457cfd37fc..f3d6313914ed 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -149,6 +149,7 @@ struct nfs4_delegation { /* For recall: */ int dl_retries; struct nfsd4_callback dl_recall; + bool dl_recalled; }; #define cb_to_delegation(cb) \ @@ -283,6 +284,28 @@ struct nfsd4_sessionid { #define HEXDIR_LEN 33 /* hex version of 16 byte md5 of cl_name plus '\0' */ /* + * State Meaning Where set + * -------------------------------------------------------------------------- + * | NFSD4_ACTIVE | Confirmed, active | Default | + * |------------------- ----------------------------------------------------| + * | NFSD4_COURTESY | Courtesy state. | nfs4_get_client_reaplist | + * | | Lease/lock/share | | + * | | reservation conflict | | + * | | can cause Courtesy | | + * | | client to be expired | | + * |------------------------------------------------------------------------| + * | NFSD4_EXPIRABLE | Courtesy client to be| nfs4_laundromat | + * | | expired by Laundromat| try_to_expire_client | + * | | due to conflict | | + * |------------------------------------------------------------------------| + */ +enum { + NFSD4_ACTIVE = 0, + NFSD4_COURTESY, + NFSD4_EXPIRABLE, +}; + +/* * struct nfs4_client - one per client. Clientids live here. 
* * The initial object created by an NFS client using SETCLIENTID (for NFSv4.0) @@ -385,6 +408,9 @@ struct nfs4_client { struct list_head async_copies; /* list of async copies */ spinlock_t async_lock; /* lock for async copies */ atomic_t cl_cb_inflight; /* Outstanding callbacks */ + + unsigned int cl_state; + atomic_t cl_delegs_in_recall; }; /* struct nfs4_client_reset @@ -702,4 +728,9 @@ extern void nfsd4_client_record_remove(struct nfs4_client *clp); extern int nfsd4_client_record_check(struct nfs4_client *clp); extern void nfsd4_record_grace_done(struct nfsd_net *nn); +static inline bool try_to_expire_client(struct nfs4_client *clp) +{ + cmpxchg(&clp->cl_state, NFSD4_COURTESY, NFSD4_EXPIRABLE); + return clp->cl_state == NFSD4_EXPIRABLE; +} #endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 242fa123e0e9..a60ead3b227a 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -692,12 +692,6 @@ DEFINE_CLID_EVENT(confirmed_r); /* * from fs/nfsd/filecache.h */ -TRACE_DEFINE_ENUM(NFSD_FILE_HASHED); -TRACE_DEFINE_ENUM(NFSD_FILE_PENDING); -TRACE_DEFINE_ENUM(NFSD_FILE_BREAK_READ); -TRACE_DEFINE_ENUM(NFSD_FILE_BREAK_WRITE); -TRACE_DEFINE_ENUM(NFSD_FILE_REFERENCED); - #define show_nf_flags(val) \ __print_flags(val, "|", \ { 1 << NFSD_FILE_HASHED, "HASHED" }, \ @@ -784,6 +778,34 @@ TRACE_EVENT(nfsd_file_acquire, __entry->nf_file, __entry->status) ); +TRACE_EVENT(nfsd_file_open, + TP_PROTO(struct nfsd_file *nf, __be32 status), + TP_ARGS(nf, status), + TP_STRUCT__entry( + __field(unsigned int, nf_hashval) + __field(void *, nf_inode) /* cannot be dereferenced */ + __field(int, nf_ref) + __field(unsigned long, nf_flags) + __field(unsigned long, nf_may) + __field(void *, nf_file) /* cannot be dereferenced */ + ), + TP_fast_assign( + __entry->nf_hashval = nf->nf_hashval; + __entry->nf_inode = nf->nf_inode; + __entry->nf_ref = refcount_read(&nf->nf_ref); + __entry->nf_flags = nf->nf_flags; + __entry->nf_may = nf->nf_may; + __entry->nf_file = nf->nf_file; + ), + TP_printk("hash=0x%x inode=%p ref=%d flags=%s may=%s file=%p", + __entry->nf_hashval, + __entry->nf_inode, + __entry->nf_ref, + show_nf_flags(__entry->nf_flags), + show_nfsd_may_flags(__entry->nf_may), + __entry->nf_file) +) + DECLARE_EVENT_CLASS(nfsd_file_search_class, TP_PROTO(struct inode *inode, unsigned int hash, int found), TP_ARGS(inode, hash, found), diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c22ad0532e8e..840e3af63a6f 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -827,14 +827,23 @@ retry: return err; } +/** + * nfsd_open_verified - Open a regular file for the filecache + * @rqstp: RPC request + * @fhp: NFS filehandle of the file to open + * @may_flags: internal permission flags + * @filp: OUT: open "struct file *" + * + * Returns an nfsstat value in network byte order. 
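try_to_expire_client() above relies on a lock-free nudge: cmpxchg() promotes COURTESY to EXPIRABLE but never disturbs an ACTIVE client, and the caller simply re-reads the result. A self-contained sketch with hypothetical demo_* names, kernel context assumed:

    #include <linux/atomic.h>

    enum { DEMO_ACTIVE = 0, DEMO_COURTESY, DEMO_EXPIRABLE };

    static bool demo_try_expire(unsigned int *state)
    {
            /* Promote COURTESY -> EXPIRABLE; leave ACTIVE alone. */
            cmpxchg(state, DEMO_COURTESY, DEMO_EXPIRABLE);
            return *state == DEMO_EXPIRABLE;
    }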
+ */ __be32 -nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, - int may_flags, struct file **filp) +nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags, + struct file **filp) { __be32 err; validate_process_creds(); - err = __nfsd_open(rqstp, fhp, type, may_flags, filp); + err = __nfsd_open(rqstp, fhp, S_IFREG, may_flags, filp); validate_process_creds(); return err; } @@ -849,17 +858,11 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { struct svc_rqst *rqstp = sd->u.data; - struct page **pp = rqstp->rq_next_page; - struct page *page = buf->page; - if (rqstp->rq_res.page_len == 0) { - svc_rqst_replace_page(rqstp, page); + svc_rqst_replace_page(rqstp, buf->page); + if (rqstp->rq_res.page_len == 0) rqstp->rq_res.page_base = buf->offset; - } else if (page != pp[-1]) { - svc_rqst_replace_page(rqstp, page); - } rqstp->rq_res.page_len += sd->len; - return sd->len; } @@ -1187,14 +1190,26 @@ out: return err; } -static __be32 -nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, - struct iattr *iap) +/** + * nfsd_create_setattr - Set a created file's attributes + * @rqstp: RPC transaction being executed + * @fhp: NFS filehandle of parent directory + * @resfhp: NFS filehandle of new object + * @iap: requested attributes of new object + * + * Returns nfs_ok on success, or an nfsstat in network byte order. + */ +__be32 +nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct svc_fh *resfhp, struct iattr *iap) { + __be32 status; + /* - * Mode has already been set earlier in create: + * Mode has already been set by file creation. */ iap->ia_valid &= ~ATTR_MODE; + /* * Setting uid/gid works only for root. Irix appears to * send along the gid on create when it tries to implement @@ -1202,10 +1217,31 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, */ if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) iap->ia_valid &= ~(ATTR_UID|ATTR_GID); + + /* + * Callers expect new file metadata to be committed even + * if the attributes have not changed. + */ if (iap->ia_valid) - return nfsd_setattr(rqstp, resfhp, iap, 0, (time64_t)0); - /* Callers expect file metadata to be committed here */ - return nfserrno(commit_metadata(resfhp)); + status = nfsd_setattr(rqstp, resfhp, iap, 0, (time64_t)0); + else + status = nfserrno(commit_metadata(resfhp)); + + /* + * Transactional filesystems had a chance to commit changes + * for both parent and child simultaneously making the + * following commit_metadata a noop in many cases. + */ + if (!status) + status = nfserrno(commit_metadata(fhp)); + + /* + * Update the new filehandle to pick up the new attributes. + */ + if (!status) + status = fh_update(resfhp); + + return status; } /* HPUX client sometimes creates a file in mode 000, and sets size to 0. @@ -1232,7 +1268,6 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, struct dentry *dentry, *dchild; struct inode *dirp; __be32 err; - __be32 err2; int host_err; dentry = fhp->fh_dentry; @@ -1305,22 +1340,8 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, if (host_err < 0) goto out_nfserr; - err = nfsd_create_setattr(rqstp, resfhp, iap); + err = nfsd_create_setattr(rqstp, fhp, resfhp, iap); - /* - * nfsd_create_setattr already committed the child. Transactional - * filesystems had a chance to commit changes for both parent and - * child simultaneously making the following commit_metadata a - * noop. 
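The reworked nfsd_create_setattr() above folds the post-create commits into one place, letting both the v3 and v4 callers drop their copies (see the nfsd_create_locked and do_nfsd_create hunks below). Its ordering, condensed with the real identifiers: child first, then parent, then refresh the new filehandle:

    status = iap->ia_valid
            ? nfsd_setattr(rqstp, resfhp, iap, 0, (time64_t)0)
            : nfserrno(commit_metadata(resfhp));        /* child  */
    if (!status)
            status = nfserrno(commit_metadata(fhp));    /* parent */
    if (!status)
            status = fh_update(resfhp);     /* pick up new attrs */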
- */ - err2 = nfserrno(commit_metadata(fhp)); - if (err2) - err = err2; - /* - * Update the file handle to get the new inode info. - */ - if (!err) - err = fh_update(resfhp); out: dput(dchild); return err; @@ -1376,172 +1397,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, } /* - * NFSv3 and NFSv4 version of nfsd_create - */ -__be32 -do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, - char *fname, int flen, struct iattr *iap, - struct svc_fh *resfhp, int createmode, u32 *verifier, - bool *truncp, bool *created) -{ - struct dentry *dentry, *dchild = NULL; - struct inode *dirp; - __be32 err; - int host_err; - __u32 v_mtime=0, v_atime=0; - - err = nfserr_perm; - if (!flen) - goto out; - err = nfserr_exist; - if (isdotent(fname, flen)) - goto out; - if (!(iap->ia_valid & ATTR_MODE)) - iap->ia_mode = 0; - err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); - if (err) - goto out; - - dentry = fhp->fh_dentry; - dirp = d_inode(dentry); - - host_err = fh_want_write(fhp); - if (host_err) - goto out_nfserr; - - fh_lock_nested(fhp, I_MUTEX_PARENT); - - /* - * Compose the response file handle. - */ - dchild = lookup_one_len(fname, dentry, flen); - host_err = PTR_ERR(dchild); - if (IS_ERR(dchild)) - goto out_nfserr; - - /* If file doesn't exist, check for permissions to create one */ - if (d_really_is_negative(dchild)) { - err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); - if (err) - goto out; - } - - err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); - if (err) - goto out; - - if (nfsd_create_is_exclusive(createmode)) { - /* solaris7 gets confused (bugid 4218508) if these have - * the high bit set, as do xfs filesystems without the - * "bigtime" feature. So just clear the high bits. If this is - * ever changed to use different attrs for storing the - * verifier, then do_open_lookup() will also need to be fixed - * accordingly. - */ - v_mtime = verifier[0]&0x7fffffff; - v_atime = verifier[1]&0x7fffffff; - } - - if (d_really_is_positive(dchild)) { - err = 0; - - switch (createmode) { - case NFS3_CREATE_UNCHECKED: - if (! d_is_reg(dchild)) - goto out; - else if (truncp) { - /* in nfsv4, we need to treat this case a little - * differently. we don't want to truncate the - * file now; this would be wrong if the OPEN - * fails for some other reason. furthermore, - * if the size is nonzero, we should ignore it - * according to spec! 
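Note: for the exclusive-create path the removed do_nfsd_create() handled (now moved to the NFSv3/NFSv4 callers), the verifier words are stored in the inode timestamps with the high bit cleared, as the comment above explains. A small stand-alone demo of that masking and the later retry comparison:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t verifier[2] = { 0xdeadbeef, 0xcafef00d };

	/* Clear the high bits so old clients and non-bigtime xfs cope */
	uint32_t v_mtime = verifier[0] & 0x7fffffff;
	uint32_t v_atime = verifier[1] & 0x7fffffff;

	/* What a retried EXCLUSIVE create compares against the inode */
	uint32_t stored_mtime = v_mtime, stored_atime = v_atime;
	int same = (stored_mtime == v_mtime && stored_atime == v_atime);

	printf("mtime=%#x atime=%#x match=%d\n", v_mtime, v_atime, same);
	return 0;
}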
- */ - *truncp = (iap->ia_valid & ATTR_SIZE) && !iap->ia_size; - } - else { - iap->ia_valid &= ATTR_SIZE; - goto set_attr; - } - break; - case NFS3_CREATE_EXCLUSIVE: - if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime - && d_inode(dchild)->i_atime.tv_sec == v_atime - && d_inode(dchild)->i_size == 0 ) { - if (created) - *created = true; - break; - } - fallthrough; - case NFS4_CREATE_EXCLUSIVE4_1: - if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime - && d_inode(dchild)->i_atime.tv_sec == v_atime - && d_inode(dchild)->i_size == 0 ) { - if (created) - *created = true; - goto set_attr; - } - fallthrough; - case NFS3_CREATE_GUARDED: - err = nfserr_exist; - } - fh_drop_write(fhp); - goto out; - } - - if (!IS_POSIXACL(dirp)) - iap->ia_mode &= ~current_umask(); - - host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true); - if (host_err < 0) { - fh_drop_write(fhp); - goto out_nfserr; - } - if (created) - *created = true; - - nfsd_check_ignore_resizing(iap); - - if (nfsd_create_is_exclusive(createmode)) { - /* Cram the verifier into atime/mtime */ - iap->ia_valid = ATTR_MTIME|ATTR_ATIME - | ATTR_MTIME_SET|ATTR_ATIME_SET; - /* XXX someone who knows this better please fix it for nsec */ - iap->ia_mtime.tv_sec = v_mtime; - iap->ia_atime.tv_sec = v_atime; - iap->ia_mtime.tv_nsec = 0; - iap->ia_atime.tv_nsec = 0; - } - - set_attr: - err = nfsd_create_setattr(rqstp, resfhp, iap); - - /* - * nfsd_create_setattr already committed the child - * (and possibly also the parent). - */ - if (!err) - err = nfserrno(commit_metadata(fhp)); - - /* - * Update the filehandle to get the new inode info. - */ - if (!err) - err = fh_update(resfhp); - - out: - fh_unlock(fhp); - if (dchild && !IS_ERR(dchild)) - dput(dchild); - fh_drop_write(fhp); - return err; - - out_nfserr: - err = nfserrno(host_err); - goto out; -} - -/* * Read a symlink. On entry, *lenp must contain the maximum path length that * fits into the buffer. On return, it contains the true length. * N.B. 
After this call fhp needs an fh_put diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index ccb87b2864f6..26347d76f44a 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -69,10 +69,8 @@ __be32 nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, int type, dev_t rdev, struct svc_fh *res); __be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); -__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, - char *name, int len, struct iattr *attrs, - struct svc_fh *res, int createmode, - u32 *verifier, bool *truncp, bool *created); +__be32 nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct svc_fh *resfhp, struct iattr *iap); __be32 nfsd_commit(struct svc_rqst *rqst, struct svc_fh *fhp, u64 offset, u32 count, __be32 *verf); #ifdef CONFIG_NFSD_V4 @@ -88,7 +86,7 @@ __be32 nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, int nfsd_open_break_lease(struct inode *, int); __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); -__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, umode_t, +__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, int, struct file **); __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, @@ -159,10 +157,4 @@ static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat) AT_STATX_SYNC_AS_STAT)); } -static inline int nfsd_create_is_exclusive(int createmode) -{ - return createmode == NFS3_CREATE_EXCLUSIVE - || createmode == NFS4_CREATE_EXCLUSIVE4_1; -} - #endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 846ab6df9d48..7b744011f2d3 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -273,6 +273,7 @@ struct nfsd4_open { bool op_truncate; /* used during processing */ bool op_created; /* used during processing */ struct nfs4_openowner *op_openowner; /* used during processing */ + struct file *op_filp; /* used during processing */ struct nfs4_file *op_file; /* used during processing */ struct nfs4_ol_stateid *op_stp; /* used during processing */ struct nfs4_clnt_odstate *op_odstate; /* used during processing */ diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 829dd4a61b66..190aa717fa32 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -168,7 +168,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id) return; dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); - mutex_lock(&dnotify_group->mark_mutex); + fsnotify_group_lock(dnotify_group); spin_lock(&fsn_mark->lock); prev = &dn_mark->dn; @@ -191,7 +191,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id) free = true; } - mutex_unlock(&dnotify_group->mark_mutex); + fsnotify_group_unlock(dnotify_group); if (free) fsnotify_free_mark(fsn_mark); @@ -324,7 +324,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) new_dn_mark->dn = NULL; /* this is needed to prevent the fcntl/close race described below */ - mutex_lock(&dnotify_group->mark_mutex); + fsnotify_group_lock(dnotify_group); /* add the new_fsn_mark or find an old one. 
*/ fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group); @@ -334,7 +334,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) } else { error = fsnotify_add_inode_mark_locked(new_fsn_mark, inode, 0); if (error) { - mutex_unlock(&dnotify_group->mark_mutex); + fsnotify_group_unlock(dnotify_group); goto out_err; } spin_lock(&new_fsn_mark->lock); @@ -383,7 +383,7 @@ out: if (destroy) fsnotify_detach_mark(fsn_mark); - mutex_unlock(&dnotify_group->mark_mutex); + fsnotify_group_unlock(dnotify_group); if (destroy) fsnotify_free_mark(fsn_mark); fsnotify_put_mark(fsn_mark); @@ -401,7 +401,8 @@ static int __init dnotify_init(void) SLAB_PANIC|SLAB_ACCOUNT); dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); - dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops); + dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops, + FSNOTIFY_GROUP_NOFS); if (IS_ERR(dnotify_group)) panic("unable to allocate fsnotify group for dnotify\n"); dnotify_sysctl_init(); diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 985e995d2a39..4f897e109547 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -319,12 +319,8 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, return 0; } - fsnotify_foreach_iter_type(type) { - if (!fsnotify_iter_should_report_type(iter_info, type)) - continue; - mark = iter_info->marks[type]; - - /* Apply ignore mask regardless of ISDIR and ON_CHILD flags */ + fsnotify_foreach_iter_mark_type(iter_info, mark, type) { + /* Apply ignore mask regardless of mark's ISDIR flag */ marks_ignored_mask |= mark->ignored_mask; /* @@ -334,14 +330,6 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, if (event_mask & FS_ISDIR && !(mark->mask & FS_ISDIR)) continue; - /* - * If the event is on a child and this mark is on a parent not - * watching children, don't send it! - */ - if (type == FSNOTIFY_ITER_TYPE_PARENT && - !(mark->mask & FS_EVENT_ON_CHILD)) - continue; - marks_mask |= mark->mask; /* Record the mark types of this group that matched the event */ @@ -849,16 +837,14 @@ out: */ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info) { + struct fsnotify_mark *mark; int type; __kernel_fsid_t fsid = {}; - fsnotify_foreach_iter_type(type) { + fsnotify_foreach_iter_mark_type(iter_info, mark, type) { struct fsnotify_mark_connector *conn; - if (!fsnotify_iter_should_report_type(iter_info, type)) - continue; - - conn = READ_ONCE(iter_info->marks[type]->connector); + conn = READ_ONCE(mark->connector); /* Mark is just getting destroyed or created? 
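Note: fanotify_group_event_mask() and fanotify_get_fsid() above now use fsnotify_foreach_iter_mark_type() instead of open-coding the should-report check. Roughly, the macro visits only the iterator slots selected in report_mask; a user-space sketch of the idea, where the types, struct layout, and macro body are illustrative assumptions rather than the kernel definitions:

#include <stdio.h>

enum { ITER_TYPE_INODE, ITER_TYPE_PARENT, ITER_TYPE_VFSMOUNT, ITER_TYPE_COUNT };

struct mark { unsigned int mask; };
struct iter_info {
	struct mark *marks[ITER_TYPE_COUNT];
	unsigned int report_mask;
};

#define foreach_iter_mark_type(iter, m, t)                      \
	for ((t) = 0; (t) < ITER_TYPE_COUNT; (t)++)             \
		if (((iter)->report_mask & (1U << (t))) &&      \
		    ((m) = (iter)->marks[(t)]))

int main(void)
{
	struct mark inode_mark = { 0x1 }, parent_mark = { 0x2 };
	struct iter_info iter = {
		.marks = { [ITER_TYPE_INODE]  = &inode_mark,
			   [ITER_TYPE_PARENT] = &parent_mark },
		.report_mask = 1U << ITER_TYPE_INODE, /* parent not selected */
	};
	struct mark *mark;
	int type;
	unsigned int marks_mask = 0;

	foreach_iter_mark_type(&iter, mark, type)
		marks_mask |= mark->mask;

	printf("marks_mask=%#x\n", marks_mask); /* 0x1: parent slot skipped */
	return 0;
}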
*/ if (!conn) continue; diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index a3d5b751cac5..80e0ec95b113 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -490,3 +490,15 @@ static inline unsigned int fanotify_event_hash_bucket( { return event->hash & FANOTIFY_HTABLE_MASK; } + +static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark) +{ + unsigned int mflags = 0; + + if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) + mflags |= FAN_MARK_IGNORED_SURV_MODIFY; + if (mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF) + mflags |= FAN_MARK_EVICTABLE; + + return mflags; +} diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index a792e21c5309..c2255b440df9 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -264,7 +264,7 @@ static int create_fd(struct fsnotify_group *group, struct path *path, * originally opened O_WRONLY. */ new_file = dentry_open(path, - group->fanotify_data.f_flags | FMODE_NONOTIFY, + group->fanotify_data.f_flags | __FMODE_NONOTIFY, current_cred()); if (IS_ERR(new_file)) { /* @@ -1035,10 +1035,10 @@ static int fanotify_remove_mark(struct fsnotify_group *group, __u32 removed; int destroy_mark; - mutex_lock(&group->mark_mutex); + fsnotify_group_lock(group); fsn_mark = fsnotify_find_mark(connp, group); if (!fsn_mark) { - mutex_unlock(&group->mark_mutex); + fsnotify_group_unlock(group); return -ENOENT; } @@ -1048,7 +1048,7 @@ static int fanotify_remove_mark(struct fsnotify_group *group, fsnotify_recalc_mask(fsn_mark->connector); if (destroy_mark) fsnotify_detach_mark(fsn_mark); - mutex_unlock(&group->mark_mutex); + fsnotify_group_unlock(group); if (destroy_mark) fsnotify_free_mark(fsn_mark); @@ -1081,47 +1081,63 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group, flags, umask); } -static void fanotify_mark_add_ignored_mask(struct fsnotify_mark *fsn_mark, - __u32 mask, unsigned int flags, - __u32 *removed) +static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark, + unsigned int fan_flags) { - fsn_mark->ignored_mask |= mask; + bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE); + bool recalc = false; /* * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to * the removal of the FS_MODIFY bit in calculated mask if it was set * because of an ignored mask that is now going to survive FS_MODIFY. */ - if ((flags & FAN_MARK_IGNORED_SURV_MODIFY) && + if ((fan_flags & FAN_MARK_IGNORED_MASK) && + (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) { fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; if (!(fsn_mark->mask & FS_MODIFY)) - *removed = FS_MODIFY; + recalc = true; } + + if (fsn_mark->connector->type != FSNOTIFY_OBJ_TYPE_INODE || + want_iref == !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) + return recalc; + + /* + * NO_IREF may be removed from a mark, but not added. + * When removed, fsnotify_recalc_mask() will take the inode ref. 
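Note: the new fanotify_mark_user_flags() helper above translates internal mark flags into the FAN_MARK_* bits shown in fdinfo. A stand-alone rendering that compiles anywhere; the kernel-internal flag values are local stand-ins, and the FAN_MARK_* constants are believed to match the uapi header:

#include <stdio.h>

#define MARK_FLAG_IGNORED_SURV_MODIFY 0x01       /* stand-in value */
#define MARK_FLAG_NO_IREF             0x02       /* stand-in value */
#define FAN_MARK_IGNORED_SURV_MODIFY  0x00000040 /* uapi value */
#define FAN_MARK_EVICTABLE            0x00000200 /* uapi value */

static unsigned int mark_user_flags(unsigned int flags)
{
	unsigned int mflags = 0;

	if (flags & MARK_FLAG_IGNORED_SURV_MODIFY)
		mflags |= FAN_MARK_IGNORED_SURV_MODIFY;
	if (flags & MARK_FLAG_NO_IREF)
		mflags |= FAN_MARK_EVICTABLE; /* NO_IREF is shown as EVICTABLE */

	return mflags;
}

int main(void)
{
	printf("fdinfo flags: %#x\n", mark_user_flags(MARK_FLAG_NO_IREF));
	return 0;
}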
+ */ + WARN_ON_ONCE(!want_iref); + fsn_mark->flags &= ~FSNOTIFY_MARK_FLAG_NO_IREF; + + return true; } -static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, - __u32 mask, unsigned int flags, - __u32 *removed) +static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, + __u32 mask, unsigned int fan_flags) { - __u32 oldmask, newmask; + bool recalc; spin_lock(&fsn_mark->lock); - oldmask = fsnotify_calc_mask(fsn_mark); - if (!(flags & FAN_MARK_IGNORED_MASK)) { + if (!(fan_flags & FAN_MARK_IGNORED_MASK)) fsn_mark->mask |= mask; - } else { - fanotify_mark_add_ignored_mask(fsn_mark, mask, flags, removed); - } - newmask = fsnotify_calc_mask(fsn_mark); + else + fsn_mark->ignored_mask |= mask; + + recalc = fsnotify_calc_mask(fsn_mark) & + ~fsnotify_conn_mask(fsn_mark->connector); + + recalc |= fanotify_mark_update_flags(fsn_mark, fan_flags); spin_unlock(&fsn_mark->lock); - return newmask & ~oldmask; + return recalc; } static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, unsigned int obj_type, + unsigned int fan_flags, __kernel_fsid_t *fsid) { struct ucounts *ucounts = group->fanotify_data.ucounts; @@ -1144,6 +1160,9 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, } fsnotify_init_mark(mark, group); + if (fan_flags & FAN_MARK_EVICTABLE) + mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF; + ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0, fsid); if (ret) { fsnotify_put_mark(mark); @@ -1170,39 +1189,49 @@ static int fanotify_group_init_error_pool(struct fsnotify_group *group) static int fanotify_add_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, unsigned int obj_type, - __u32 mask, unsigned int flags, + __u32 mask, unsigned int fan_flags, __kernel_fsid_t *fsid) { struct fsnotify_mark *fsn_mark; - __u32 added, removed = 0; + bool recalc; int ret = 0; - mutex_lock(&group->mark_mutex); + fsnotify_group_lock(group); fsn_mark = fsnotify_find_mark(connp, group); if (!fsn_mark) { - fsn_mark = fanotify_add_new_mark(group, connp, obj_type, fsid); + fsn_mark = fanotify_add_new_mark(group, connp, obj_type, + fan_flags, fsid); if (IS_ERR(fsn_mark)) { - mutex_unlock(&group->mark_mutex); + fsnotify_group_unlock(group); return PTR_ERR(fsn_mark); } } /* + * Non evictable mark cannot be downgraded to evictable mark. + */ + if (fan_flags & FAN_MARK_EVICTABLE && + !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) { + ret = -EEXIST; + goto out; + } + + /* * Error events are pre-allocated per group, only if strictly * needed (i.e. FAN_FS_ERROR was requested). 
*/ - if (!(flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) { + if (!(fan_flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) { ret = fanotify_group_init_error_pool(group); if (ret) goto out; } - added = fanotify_mark_add_to_mask(fsn_mark, mask, flags, &removed); - if (removed || (added & ~fsnotify_conn_mask(fsn_mark->connector))) + recalc = fanotify_mark_add_to_mask(fsn_mark, mask, fan_flags); + if (recalc) fsnotify_recalc_mask(fsn_mark->connector); out: - mutex_unlock(&group->mark_mutex); + fsnotify_group_unlock(group); fsnotify_put_mark(fsn_mark); return ret; @@ -1348,14 +1377,15 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID))) return -EINVAL; - f_flags = O_RDWR | FMODE_NONOTIFY; + f_flags = O_RDWR | __FMODE_NONOTIFY; if (flags & FAN_CLOEXEC) f_flags |= O_CLOEXEC; if (flags & FAN_NONBLOCK) f_flags |= O_NONBLOCK; /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ - group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops); + group = fsnotify_alloc_group(&fanotify_fsnotify_ops, + FSNOTIFY_GROUP_USER | FSNOTIFY_GROUP_NOFS); if (IS_ERR(group)) { return PTR_ERR(group); } @@ -1598,6 +1628,14 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, goto fput_and_out; /* + * Evictable is only relevant for inode marks, because only inode object + * can be evicted on memory pressure. + */ + if (flags & FAN_MARK_EVICTABLE && + mark_type != FAN_MARK_INODE) + goto fput_and_out; + + /* * Events that do not carry enough information to report * event->fd require a group that supports reporting fid. Those * events are not supported on a mount mark, because they do not @@ -1762,7 +1800,7 @@ static int __init fanotify_user_setup(void) BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12); - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 10); fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 57f0d5d9f934..59fb40abe33d 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -14,6 +14,7 @@ #include <linux/exportfs.h> #include "inotify/inotify.h" +#include "fanotify/fanotify.h" #include "fdinfo.h" #include "fsnotify.h" @@ -28,13 +29,13 @@ static void show_fdinfo(struct seq_file *m, struct file *f, struct fsnotify_group *group = f->private_data; struct fsnotify_mark *mark; - mutex_lock(&group->mark_mutex); + fsnotify_group_lock(group); list_for_each_entry(mark, &group->marks_list, g_list) { show(m, mark); if (seq_has_overflowed(m)) break; } - mutex_unlock(&group->mark_mutex); + fsnotify_group_unlock(group); } #if defined(CONFIG_EXPORTFS) @@ -83,16 +84,9 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); inode = igrab(fsnotify_conn_inode(mark->connector)); if (inode) { - /* - * IN_ALL_EVENTS represents all of the mask bits - * that we expose to userspace. There is at - * least one bit (FS_EVENT_ON_CHILD) which is - * used only internally to the kernel. 
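Note: from user space, the FAN_MARK_EVICTABLE flag validated above is accepted only for inode (path) marks; combining it with FAN_MARK_MOUNT or FAN_MARK_FILESYSTEM now fails per the check. A possible caller, assuming Linux 5.19-era headers and sufficient privilege:

#include <fcntl.h>
#include <stdio.h>
#include <sys/fanotify.h>

#ifndef FAN_MARK_EVICTABLE
#define FAN_MARK_EVICTABLE 0x00000200 /* assumption for older headers */
#endif

int main(void)
{
	int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID, 0);

	if (fd < 0) {
		perror("fanotify_init");
		return 1;
	}
	/* Inode mark that may be reclaimed with the inode under memory pressure */
	if (fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_EVICTABLE,
			  FAN_OPEN | FAN_CLOSE_WRITE, AT_FDCWD, "/tmp") < 0)
		perror("fanotify_mark");
	return 0;
}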
- */ - u32 mask = mark->mask & IN_ALL_EVENTS; - seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ", + seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:0 ", inode_mark->wd, inode->i_ino, inode->i_sb->s_dev, - mask, mark->ignored_mask); + inotify_mark_user_mask(mark)); show_mark_fhandle(m, inode); seq_putc(m, '\n'); iput(inode); @@ -110,12 +104,9 @@ void inotify_show_fdinfo(struct seq_file *m, struct file *f) static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) { - unsigned int mflags = 0; + unsigned int mflags = fanotify_mark_user_flags(mark); struct inode *inode; - if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) - mflags |= FAN_MARK_IGNORED_SURV_MODIFY; - if (mark->connector->type == FSNOTIFY_OBJ_TYPE_INODE) { inode = igrab(fsnotify_conn_inode(mark->connector)); if (!inode) diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 70a8516b78bc..0b3e74935cb4 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -253,7 +253,7 @@ static int fsnotify_handle_inode_event(struct fsnotify_group *group, if (WARN_ON_ONCE(!inode && !dir)) return 0; - if ((inode_mark->mask & FS_EXCL_UNLINK) && + if ((inode_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK) && path && d_unlinked(path->dentry)) return 0; @@ -290,22 +290,15 @@ static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, } if (parent_mark) { - /* - * parent_mark indicates that the parent inode is watching - * children and interested in this event, which is an event - * possible on child. But is *this mark* watching children and - * interested in this event? - */ - if (parent_mark->mask & FS_EVENT_ON_CHILD) { - ret = fsnotify_handle_inode_event(group, parent_mark, mask, - data, data_type, dir, name, 0); - if (ret) - return ret; - } - if (!inode_mark) - return 0; + ret = fsnotify_handle_inode_event(group, parent_mark, mask, + data, data_type, dir, name, 0); + if (ret) + return ret; } + if (!inode_mark) + return 0; + if (mask & FS_EVENT_ON_CHILD) { /* * Some events can be sent on both parent dir and child marks @@ -335,31 +328,23 @@ static int send_to_group(__u32 mask, const void *data, int data_type, struct fsnotify_mark *mark; int type; - if (WARN_ON(!iter_info->report_mask)) + if (!iter_info->report_mask) return 0; /* clear ignored on inode modification */ if (mask & FS_MODIFY) { - fsnotify_foreach_iter_type(type) { - if (!fsnotify_iter_should_report_type(iter_info, type)) - continue; - mark = iter_info->marks[type]; - if (mark && - !(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) + fsnotify_foreach_iter_mark_type(iter_info, mark, type) { + if (!(mark->flags & + FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) mark->ignored_mask = 0; } } - fsnotify_foreach_iter_type(type) { - if (!fsnotify_iter_should_report_type(iter_info, type)) - continue; - mark = iter_info->marks[type]; - /* does the object mark tell us to do something? */ - if (mark) { - group = mark->group; - marks_mask |= mark->mask; - marks_ignored_mask |= mark->ignored_mask; - } + /* Are any of the group marks interested in this event? */ + fsnotify_foreach_iter_mark_type(iter_info, mark, type) { + group = mark->group; + marks_mask |= mark->mask; + marks_ignored_mask |= mark->ignored_mask; } pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignored_mask=%x data=%p data_type=%d dir=%p cookie=%d\n", @@ -403,11 +388,11 @@ static struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark) /* * iter_info is a multi head priority queue of marks. 
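Note: the accumulation loop in send_to_group() above reduces to simple mask arithmetic: an event is delivered only if some mark wants it and no surviving ignore mask covers it. A model with stand-in flag values (the real FS_* bits differ):

#include <stdio.h>

#define FS_OPEN_DEMO   0x1 /* stand-in */
#define FS_MODIFY_DEMO 0x2 /* stand-in */

int main(void)
{
	unsigned int event_mask = FS_OPEN_DEMO | FS_MODIFY_DEMO;
	unsigned int marks_mask = FS_OPEN_DEMO | FS_MODIFY_DEMO; /* union of mark->mask */
	unsigned int marks_ignored_mask = FS_MODIFY_DEMO;        /* union of ignored */

	/* Only events some mark wants and no mark ignores are delivered */
	unsigned int deliver = event_mask & marks_mask & ~marks_ignored_mask;

	printf("deliver=%#x\n", deliver); /* the open bit only */
	return 0;
}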
- * Pick a subset of marks from queue heads, all with the - * same group and set the report_mask for selected subset. - * Returns the report_mask of the selected subset. + * Pick a subset of marks from queue heads, all with the same group + * and set the report_mask to a subset of the selected marks. + * Returns false if there are no more groups to iterate. */ -static unsigned int fsnotify_iter_select_report_types( +static bool fsnotify_iter_select_report_types( struct fsnotify_iter_info *iter_info) { struct fsnotify_group *max_prio_group = NULL; @@ -423,30 +408,48 @@ static unsigned int fsnotify_iter_select_report_types( } if (!max_prio_group) - return 0; + return false; /* Set the report mask for marks from same group as max prio group */ + iter_info->current_group = max_prio_group; iter_info->report_mask = 0; fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; - if (mark && - fsnotify_compare_groups(max_prio_group, mark->group) == 0) + if (mark && mark->group == iter_info->current_group) { + /* + * FSNOTIFY_ITER_TYPE_PARENT indicates that this inode + * is watching children and interested in this event, + * which is an event possible on child. + * But is *this mark* watching children? + */ + if (type == FSNOTIFY_ITER_TYPE_PARENT && + !(mark->mask & FS_EVENT_ON_CHILD)) + continue; + fsnotify_iter_set_report_type(iter_info, type); + } } - return iter_info->report_mask; + return true; } /* - * Pop from iter_info multi head queue, the marks that were iterated in the + * Pop from iter_info multi head queue, the marks that belong to the group of * current iteration step. */ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info) { + struct fsnotify_mark *mark; int type; + /* + * We cannot use fsnotify_foreach_iter_mark_type() here because we + * may need to advance a mark of type X that belongs to current_group + * but was not selected for reporting. + */ fsnotify_foreach_iter_type(type) { - if (fsnotify_iter_should_report_type(iter_info, type)) + mark = iter_info->marks[type]; + if (mark && mark->group == iter_info->current_group) iter_info->marks[type] = fsnotify_next_mark(iter_info->marks[type]); } @@ -581,7 +584,7 @@ static __init int fsnotify_init(void) { int ret; - BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 25); + BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 23); ret = init_srcu_struct(&fsnotify_mark_srcu); if (ret) diff --git a/fs/notify/group.c b/fs/notify/group.c index b7d4d64f87c2..1de6631a3925 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -112,8 +112,10 @@ void fsnotify_put_group(struct fsnotify_group *group) EXPORT_SYMBOL_GPL(fsnotify_put_group); static struct fsnotify_group *__fsnotify_alloc_group( - const struct fsnotify_ops *ops, gfp_t gfp) + const struct fsnotify_ops *ops, + int flags, gfp_t gfp) { + static struct lock_class_key nofs_marks_lock; struct fsnotify_group *group; group = kzalloc(sizeof(struct fsnotify_group), gfp); @@ -133,6 +135,17 @@ static struct fsnotify_group *__fsnotify_alloc_group( INIT_LIST_HEAD(&group->marks_list); group->ops = ops; + group->flags = flags; + /* + * For most backends, eviction of inode with a mark is not expected, + * because marks hold a refcount on the inode against eviction. + * + * Use a different lockdep class for groups that support evictable + * inode marks, because with evictable marks, mark_mutex is NOT + * fs-reclaim safe - the mutex is taken when evicting inodes. 
+ */ + if (flags & FSNOTIFY_GROUP_NOFS) + lockdep_set_class(&group->mark_mutex, &nofs_marks_lock); return group; } @@ -140,20 +153,15 @@ static struct fsnotify_group *__fsnotify_alloc_group( /* * Create a new fsnotify_group and hold a reference for the group returned. */ -struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) +struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops, + int flags) { - return __fsnotify_alloc_group(ops, GFP_KERNEL); -} -EXPORT_SYMBOL_GPL(fsnotify_alloc_group); + gfp_t gfp = (flags & FSNOTIFY_GROUP_USER) ? GFP_KERNEL_ACCOUNT : + GFP_KERNEL; -/* - * Create a new fsnotify_group and hold a reference for the group returned. - */ -struct fsnotify_group *fsnotify_alloc_user_group(const struct fsnotify_ops *ops) -{ - return __fsnotify_alloc_group(ops, GFP_KERNEL_ACCOUNT); + return __fsnotify_alloc_group(ops, flags, gfp); } -EXPORT_SYMBOL_GPL(fsnotify_alloc_user_group); +EXPORT_SYMBOL_GPL(fsnotify_alloc_group); int fsnotify_fasync(int fd, struct file *file, int on) { diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index 2007e3711916..7d5df7a21539 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -22,6 +22,25 @@ static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse) return container_of(fse, struct inotify_event_info, fse); } +/* + * INOTIFY_USER_FLAGS represents all of the mask bits that we expose to + * userspace. There is at least one bit (FS_EVENT_ON_CHILD) which is + * used only internally to the kernel. + */ +#define INOTIFY_USER_MASK (IN_ALL_EVENTS) + +static inline __u32 inotify_mark_user_mask(struct fsnotify_mark *fsn_mark) +{ + __u32 mask = fsn_mark->mask & INOTIFY_USER_MASK; + + if (fsn_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK) + mask |= IN_EXCL_UNLINK; + if (fsn_mark->flags & FSNOTIFY_MARK_FLAG_IN_ONESHOT) + mask |= IN_ONESHOT; + + return mask; +} + extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group); extern int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index d92d7b0adc9a..49cfe2ae6d23 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -122,7 +122,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask, fsnotify_destroy_event(group, fsn_event); } - if (inode_mark->mask & IN_ONESHOT) + if (inode_mark->flags & FSNOTIFY_MARK_FLAG_IN_ONESHOT) fsnotify_destroy_mark(inode_mark, group); return 0; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 54583f62dc44..ed42a189faa2 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -110,11 +110,26 @@ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg) mask |= FS_EVENT_ON_CHILD; /* mask off the flags used to open the fd */ - mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK)); + mask |= (arg & INOTIFY_USER_MASK); return mask; } +#define INOTIFY_MARK_FLAGS \ + (FSNOTIFY_MARK_FLAG_EXCL_UNLINK | FSNOTIFY_MARK_FLAG_IN_ONESHOT) + +static inline unsigned int inotify_arg_to_flags(u32 arg) +{ + unsigned int flags = 0; + + if (arg & IN_EXCL_UNLINK) + flags |= FSNOTIFY_MARK_FLAG_EXCL_UNLINK; + if (arg & IN_ONESHOT) + flags |= FSNOTIFY_MARK_FLAG_IN_ONESHOT; + + return flags; +} + static inline u32 inotify_mask_to_arg(__u32 mask) { return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED 
| @@ -526,13 +541,10 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, struct fsnotify_mark *fsn_mark; struct inotify_inode_mark *i_mark; __u32 old_mask, new_mask; - __u32 mask; - int add = (arg & IN_MASK_ADD); + int replace = !(arg & IN_MASK_ADD); int create = (arg & IN_MASK_CREATE); int ret; - mask = inotify_arg_to_mask(inode, arg); - fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); if (!fsn_mark) return -ENOENT; @@ -545,10 +557,12 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, spin_lock(&fsn_mark->lock); old_mask = fsn_mark->mask; - if (add) - fsn_mark->mask |= mask; - else - fsn_mark->mask = mask; + if (replace) { + fsn_mark->mask = 0; + fsn_mark->flags &= ~INOTIFY_MARK_FLAGS; + } + fsn_mark->mask |= inotify_arg_to_mask(inode, arg); + fsn_mark->flags |= inotify_arg_to_flags(arg); new_mask = fsn_mark->mask; spin_unlock(&fsn_mark->lock); @@ -579,19 +593,17 @@ static int inotify_new_watch(struct fsnotify_group *group, u32 arg) { struct inotify_inode_mark *tmp_i_mark; - __u32 mask; int ret; struct idr *idr = &group->inotify_data.idr; spinlock_t *idr_lock = &group->inotify_data.idr_lock; - mask = inotify_arg_to_mask(inode, arg); - tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); if (unlikely(!tmp_i_mark)) return -ENOMEM; fsnotify_init_mark(&tmp_i_mark->fsn_mark, group); - tmp_i_mark->fsn_mark.mask = mask; + tmp_i_mark->fsn_mark.mask = inotify_arg_to_mask(inode, arg); + tmp_i_mark->fsn_mark.flags = inotify_arg_to_flags(arg); tmp_i_mark->wd = -1; ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark); @@ -628,13 +640,13 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod { int ret = 0; - mutex_lock(&group->mark_mutex); + fsnotify_group_lock(group); /* try to update an existing watch with the new arg */ ret = inotify_update_existing_watch(group, inode, arg); /* no mark present, try to add a new one */ if (ret == -ENOENT) ret = inotify_new_watch(group, inode, arg); - mutex_unlock(&group->mark_mutex); + fsnotify_group_unlock(group); return ret; } @@ -644,7 +656,8 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) struct fsnotify_group *group; struct inotify_event_info *oevent; - group = fsnotify_alloc_user_group(&inotify_fsnotify_ops); + group = fsnotify_alloc_group(&inotify_fsnotify_ops, + FSNOTIFY_GROUP_USER); if (IS_ERR(group)) return group; @@ -845,9 +858,7 @@ static int __init inotify_user_setup(void) BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT); BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW); BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED); - BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK); BUILD_BUG_ON(IN_ISDIR != FS_ISDIR); - BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT); BUILD_BUG_ON(HWEIGHT32(ALL_INOTIFY_BITS) != 22); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 4853184f7dde..c74ef947447d 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -116,20 +116,64 @@ __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn) return *fsnotify_conn_mask_p(conn); } -static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) +static void fsnotify_get_inode_ref(struct inode *inode) +{ + ihold(inode); + atomic_long_inc(&inode->i_sb->s_fsnotify_connectors); +} + +/* + * Grab or drop inode reference for the connector if needed. + * + * When it's time to drop the reference, we only clear the HAS_IREF flag and + * return the inode object. fsnotify_drop_object() will be responsible for doing + * iput() outside of spinlocks. 
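Note: nothing changes for inotify callers from the mask/flags split above: IN_EXCL_UNLINK and IN_ONESHOT are requested exactly as before, they are just stored as mark flags internally now. For example:

#include <stdio.h>
#include <sys/inotify.h>

int main(void)
{
	int fd = inotify_init1(IN_NONBLOCK);
	int wd;

	if (fd < 0) {
		perror("inotify_init1");
		return 1;
	}
	/* One-shot watch that ignores events on unlinked children */
	wd = inotify_add_watch(fd, "/tmp",
			       IN_CREATE | IN_ONESHOT | IN_EXCL_UNLINK);
	if (wd < 0)
		perror("inotify_add_watch");
	return 0;
}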
This happens when last mark that wanted iref is + * detached. + */ +static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn, + bool want_iref) +{ + bool has_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF; + struct inode *inode = NULL; + + if (conn->type != FSNOTIFY_OBJ_TYPE_INODE || + want_iref == has_iref) + return NULL; + + if (want_iref) { + /* Pin inode if any mark wants inode refcount held */ + fsnotify_get_inode_ref(fsnotify_conn_inode(conn)); + conn->flags |= FSNOTIFY_CONN_FLAG_HAS_IREF; + } else { + /* Unpin inode after detach of last mark that wanted iref */ + inode = fsnotify_conn_inode(conn); + conn->flags &= ~FSNOTIFY_CONN_FLAG_HAS_IREF; + } + + return inode; +} + +static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { u32 new_mask = 0; + bool want_iref = false; struct fsnotify_mark *mark; assert_spin_locked(&conn->lock); /* We can get detached connector here when inode is getting unlinked. */ if (!fsnotify_valid_obj_type(conn->type)) - return; + return NULL; hlist_for_each_entry(mark, &conn->list, obj_list) { - if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) - new_mask |= fsnotify_calc_mask(mark); + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) + continue; + new_mask |= fsnotify_calc_mask(mark); + if (conn->type == FSNOTIFY_OBJ_TYPE_INODE && + !(mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) + want_iref = true; } *fsnotify_conn_mask_p(conn) = new_mask; + + return fsnotify_update_iref(conn, want_iref); } /* @@ -169,12 +213,6 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work) } } -static void fsnotify_get_inode_ref(struct inode *inode) -{ - ihold(inode); - atomic_long_inc(&inode->i_sb->s_fsnotify_connectors); -} - static void fsnotify_put_inode_ref(struct inode *inode) { struct super_block *sb = inode->i_sb; @@ -213,6 +251,10 @@ static void *fsnotify_detach_connector_from_object( if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) { inode = fsnotify_conn_inode(conn); inode->i_fsnotify_mask = 0; + + /* Unpin inode when detaching from connector */ + if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF)) + inode = NULL; } else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0; } else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) { @@ -274,7 +316,8 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) objp = fsnotify_detach_connector_from_object(conn, &type); free_conn = true; } else { - __fsnotify_recalc_mask(conn); + objp = __fsnotify_recalc_mask(conn); + type = conn->type; } WRITE_ONCE(mark->connector, NULL); spin_unlock(&conn->lock); @@ -398,9 +441,7 @@ void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info) */ void fsnotify_detach_mark(struct fsnotify_mark *mark) { - struct fsnotify_group *group = mark->group; - - WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex)); + fsnotify_group_assert_locked(mark->group); WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) && refcount_read(&mark->refcnt) < 1 + !!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)); @@ -452,9 +493,9 @@ void fsnotify_free_mark(struct fsnotify_mark *mark) void fsnotify_destroy_mark(struct fsnotify_mark *mark, struct fsnotify_group *group) { - mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); + fsnotify_group_lock(group); fsnotify_detach_mark(mark); - mutex_unlock(&group->mark_mutex); + fsnotify_group_unlock(group); fsnotify_free_mark(mark); } EXPORT_SYMBOL_GPL(fsnotify_destroy_mark); @@ -499,7 +540,6 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, unsigned int obj_type, 
__kernel_fsid_t *fsid) { - struct inode *inode = NULL; struct fsnotify_mark_connector *conn; conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL); @@ -507,6 +547,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, return -ENOMEM; spin_lock_init(&conn->lock); INIT_HLIST_HEAD(&conn->list); + conn->flags = 0; conn->type = obj_type; conn->obj = connp; /* Cache fsid of filesystem containing the object */ @@ -517,10 +558,6 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, conn->fsid.val[0] = conn->fsid.val[1] = 0; conn->flags = 0; } - if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) { - inode = fsnotify_conn_inode(conn); - fsnotify_get_inode_ref(inode); - } fsnotify_get_sb_connectors(conn); /* @@ -529,8 +566,6 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, */ if (cmpxchg(connp, NULL, conn)) { /* Someone else created list structure for us */ - if (inode) - fsnotify_put_inode_ref(inode); fsnotify_put_sb_connectors(conn); kmem_cache_free(fsnotify_mark_connector_cachep, conn); } @@ -574,7 +609,7 @@ out: static int fsnotify_add_mark_list(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int obj_type, - int allow_dups, __kernel_fsid_t *fsid) + int add_flags, __kernel_fsid_t *fsid) { struct fsnotify_mark *lmark, *last = NULL; struct fsnotify_mark_connector *conn; @@ -633,7 +668,7 @@ restart: if ((lmark->group == mark->group) && (lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) && - !allow_dups) { + !(mark->group->flags & FSNOTIFY_GROUP_DUPS)) { err = -EEXIST; goto out_err; } @@ -668,12 +703,12 @@ out_err: */ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int obj_type, - int allow_dups, __kernel_fsid_t *fsid) + int add_flags, __kernel_fsid_t *fsid) { struct fsnotify_group *group = mark->group; int ret = 0; - BUG_ON(!mutex_is_locked(&group->mark_mutex)); + fsnotify_group_assert_locked(group); /* * LOCKING ORDER!!!! @@ -688,12 +723,11 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); - ret = fsnotify_add_mark_list(mark, connp, obj_type, allow_dups, fsid); + ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags, fsid); if (ret) goto err; - if (mark->mask || mark->ignored_mask) - fsnotify_recalc_mask(mark->connector); + fsnotify_recalc_mask(mark->connector); return ret; err: @@ -708,15 +742,15 @@ err: } int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, - unsigned int obj_type, int allow_dups, + unsigned int obj_type, int add_flags, __kernel_fsid_t *fsid) { int ret; struct fsnotify_group *group = mark->group; - mutex_lock(&group->mark_mutex); - ret = fsnotify_add_mark_locked(mark, connp, obj_type, allow_dups, fsid); - mutex_unlock(&group->mark_mutex); + fsnotify_group_lock(group); + ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags, fsid); + fsnotify_group_unlock(group); return ret; } EXPORT_SYMBOL_GPL(fsnotify_add_mark); @@ -770,24 +804,24 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group, * move marks to free to to_free list in one go and then free marks in * to_free list one by one. 
*/ - mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); + fsnotify_group_lock(group); list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { if (mark->connector->type == obj_type) list_move(&mark->g_list, &to_free); } - mutex_unlock(&group->mark_mutex); + fsnotify_group_unlock(group); clear: while (1) { - mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); + fsnotify_group_lock(group); if (list_empty(head)) { - mutex_unlock(&group->mark_mutex); + fsnotify_group_unlock(group); break; } mark = list_first_entry(head, struct fsnotify_mark, g_list); fsnotify_get_mark(mark); fsnotify_detach_mark(mark); - mutex_unlock(&group->mark_mutex); + fsnotify_group_unlock(group); fsnotify_free_mark(mark); fsnotify_put_mark(mark); } diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index e1392a9b8ceb..a8abe2296514 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1772,11 +1772,11 @@ static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i, last_vcn = -1; do { VCN vcn; - pgoff_t idx, start_idx; + pgoff_t start_idx; unsigned ofs, do_pages, u; size_t copied; - start_idx = idx = pos >> PAGE_SHIFT; + start_idx = pos >> PAGE_SHIFT; ofs = pos & ~PAGE_MASK; bytes = PAGE_SIZE - ofs; do_pages = 1; diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index a4fcdc7927ca..8e9d2b35175f 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -492,7 +492,7 @@ static int ntfs_truncate(struct inode *inode, loff_t new_size) down_write(&ni->file.run_lock); err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, new_size, - &new_valid, true, NULL); + &new_valid, ni->mi.sbi->options->prealloc, NULL); up_write(&ni->file.run_lock); if (new_valid < ni->i_valid) @@ -659,7 +659,13 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) /* * Normal file: Allocate clusters, do not change 'valid' size. 
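Note: the ntfs3 fallocate fix above validates the would-be size with inode_newsize_ok() before growing the file, instead of failing partway through a resize. A sketch of that check-then-resize ordering; the limit and return values here are stand-ins, not the kernel helper:

#include <stdio.h>
#include <errno.h>

#define S_MAXBYTES_DEMO (1LL << 40) /* stand-in per-fs limit */

static int inode_newsize_ok_model(long long new_size, long long limit)
{
	if (new_size < 0 || new_size > limit)
		return -EFBIG;
	return 0;
}

int main(void)
{
	long long end = 1LL << 41, i_size = 4096;
	long long new_size = end > i_size ? end : i_size; /* max(end, i_size) */
	int err = inode_newsize_ok_model(new_size, S_MAXBYTES_DEMO);

	if (err)
		printf("fallocate rejected before resizing: %d\n", err);
	return 0;
}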
*/ - err = ntfs_set_size(inode, max(end, i_size)); + loff_t new_size = max(end, i_size); + + err = inode_newsize_ok(inode, new_size); + if (err) + goto out; + + err = ntfs_set_size(inode, new_size); if (err) goto out; @@ -759,7 +765,7 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } inode_dio_wait(inode); - if (attr->ia_size < oldsize) + if (attr->ia_size <= oldsize) err = ntfs_truncate(inode, attr->ia_size); else if (attr->ia_size > oldsize) err = ntfs_extend(inode, attr->ia_size, 0, NULL); diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 6f47a9c17f89..18842998c8fa 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -1964,10 +1964,8 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, vcn += clen; - if (vbo + bytes >= end) { + if (vbo + bytes >= end) bytes = end - vbo; - flags |= FIEMAP_EXTENT_LAST; - } if (vbo + bytes <= valid) { ; @@ -1977,6 +1975,9 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, /* vbo < valid && valid < vbo + bytes */ u64 dlen = valid - vbo; + if (vbo + dlen >= end) + flags |= FIEMAP_EXTENT_LAST; + err = fiemap_fill_next_extent(fieinfo, vbo, lbo, dlen, flags); if (err < 0) @@ -1995,6 +1996,9 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, flags |= FIEMAP_EXTENT_UNWRITTEN; } + if (vbo + bytes >= end) + flags |= FIEMAP_EXTENT_LAST; + err = fiemap_fill_next_extent(fieinfo, vbo, lbo, bytes, flags); if (err < 0) break; diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index 06492f088d60..49b7df616778 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -1185,8 +1185,6 @@ static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first, if (!r_page) return -ENOMEM; - memset(info, 0, sizeof(struct restart_info)); - /* Determine which restart area we are looking for. */ if (first) { vbo = 0; @@ -3791,10 +3789,11 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) if (!log) return -ENOMEM; + memset(&rst_info, 0, sizeof(struct restart_info)); + log->ni = ni; log->l_size = l_size; log->one_page_buf = kmalloc(page_size, GFP_NOFS); - if (!log->one_page_buf) { err = -ENOMEM; goto out; @@ -3842,6 +3841,7 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) if (rst_info.vbo) goto check_restart_area; + memset(&rst_info2, 0, sizeof(struct restart_info)); err = log_read_rst(log, l_size, false, &rst_info2); /* Determine which restart area to use. */ @@ -4085,8 +4085,10 @@ process_log: if (client == LFS_NO_CLIENT_LE) { /* Insert "NTFS" client LogFile. */ client = ra->client_idx[0]; - if (client == LFS_NO_CLIENT_LE) - return -EINVAL; + if (client == LFS_NO_CLIENT_LE) { + err = -EINVAL; + goto out; + } t16 = le16_to_cpu(client); cr = ca + t16; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 74f60c457f28..be4ebdd8048b 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -758,6 +758,7 @@ static ssize_t ntfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) loff_t vbo = iocb->ki_pos; loff_t end; int wr = iov_iter_rw(iter) & WRITE; + size_t iter_count = iov_iter_count(iter); loff_t valid; ssize_t ret; @@ -771,10 +772,13 @@ static ssize_t ntfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) wr ? 
ntfs_get_block_direct_IO_W : ntfs_get_block_direct_IO_R); - if (ret <= 0) + if (ret > 0) + end = vbo + ret; + else if (wr && ret == -EIOCBQUEUED) + end = vbo + iter_count; + else goto out; - end = vbo + ret; valid = ni->i_valid; if (wr) { if (end > valid && !S_ISBLK(inode->i_mode)) { @@ -1950,6 +1954,7 @@ const struct address_space_operations ntfs_aops = { .direct_IO = ntfs_direct_IO, .bmap = ntfs_bmap, .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, }; const struct address_space_operations ntfs_aops_cmpr = { diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 5781b9e8e3d8..0c6de6287737 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -668,9 +668,11 @@ static u32 format_size_gb(const u64 bytes, u32 *mb) static u32 true_sectors_per_clst(const struct NTFS_BOOT *boot) { - return boot->sectors_per_clusters <= 0x80 - ? boot->sectors_per_clusters - : (1u << (0 - boot->sectors_per_clusters)); + if (boot->sectors_per_clusters <= 0x80) + return boot->sectors_per_clusters; + if (boot->sectors_per_clusters >= 0xf4) /* limit shift to 2MB max */ + return 1U << (0 - boot->sectors_per_clusters); + return -EINVAL; } /* @@ -713,6 +715,8 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, /* cluster size: 512, 1K, 2K, 4K, ... 2M */ sct_per_clst = true_sectors_per_clst(boot); + if ((int)sct_per_clst < 0) + goto out; if (!is_power_of_2(sct_per_clst)) goto out; diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index afd0ddad826f..5e0e0280e70d 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -112,7 +112,7 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea, return -ENOMEM; if (!size) { - ; + /* EA info persists, but xattr is empty. Looks like EA problem. */ } else if (attr_ea->non_res) { struct runs_tree run; @@ -259,7 +259,7 @@ out: static noinline int ntfs_set_ea(struct inode *inode, const char *name, size_t name_len, const void *value, - size_t val_size, int flags) + size_t val_size, int flags, bool locked) { struct ntfs_inode *ni = ntfs_i(inode); struct ntfs_sb_info *sbi = ni->mi.sbi; @@ -278,7 +278,8 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name, u64 new_sz; void *p; - ni_lock(ni); + if (!locked) + ni_lock(ni); run_init(&ea_run); @@ -467,7 +468,8 @@ update_ea: mark_inode_dirty(&ni->vfs_inode); out: - ni_unlock(ni); + if (!locked) + ni_unlock(ni); run_close(&ea_run); kfree(ea_all); @@ -541,7 +543,7 @@ struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu) static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, - int type) + int type, bool init_acl) { const char *name; size_t size, name_len; @@ -554,8 +556,9 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, switch (type) { case ACL_TYPE_ACCESS: - if (acl) { - umode_t mode = inode->i_mode; + /* Do not change i_mode if we are in init_acl */ + if (acl && !init_acl) { + umode_t mode; err = posix_acl_update_mode(mnt_userns, inode, &mode, &acl); @@ -598,7 +601,7 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, flags = 0; } - err = ntfs_set_ea(inode, name, name_len, value, size, flags); + err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0); if (err == -ENODATA && !size) err = 0; /* Removing non existed xattr. 
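Note: the hardened true_sectors_per_clst() above is easy to exercise stand-alone: the boot-sector byte encodes small counts directly and large clusters as a negative power-of-two shift, and out-of-range shifts are now rejected. A user-space version, using an explicit signed cast to keep the shift well-defined:

#include <stdio.h>
#include <stdint.h>

static int true_sectors_per_clst(uint8_t v)
{
	if (v <= 0x80)
		return v;                  /* plain sector count */
	if (v >= 0xf4)                     /* shift of at most 12 -> 2M max */
		return 1u << (0 - (int8_t)v);
	return -1;                         /* -EINVAL in the kernel */
}

int main(void)
{
	printf("%d %d %d\n",
	       true_sectors_per_clst(8),     /* 8 sectors/cluster */
	       true_sectors_per_clst(0xf8),  /* 1 << 8 = 256 */
	       true_sectors_per_clst(0x90)); /* rejected */
	return 0;
}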
*/ if (!err) @@ -616,7 +619,68 @@ out: int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type) { - return ntfs_set_acl_ex(mnt_userns, inode, acl, type); + return ntfs_set_acl_ex(mnt_userns, inode, acl, type, false); +} + +static int ntfs_xattr_get_acl(struct user_namespace *mnt_userns, + struct inode *inode, int type, void *buffer, + size_t size) +{ + struct posix_acl *acl; + int err; + + if (!(inode->i_sb->s_flags & SB_POSIXACL)) { + ntfs_inode_warn(inode, "add mount option \"acl\" to use acl"); + return -EOPNOTSUPP; + } + + acl = ntfs_get_acl(inode, type, false); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (!acl) + return -ENODATA; + + err = posix_acl_to_xattr(mnt_userns, acl, buffer, size); + posix_acl_release(acl); + + return err; +} + +static int ntfs_xattr_set_acl(struct user_namespace *mnt_userns, + struct inode *inode, int type, const void *value, + size_t size) +{ + struct posix_acl *acl; + int err; + + if (!(inode->i_sb->s_flags & SB_POSIXACL)) { + ntfs_inode_warn(inode, "add mount option \"acl\" to use acl"); + return -EOPNOTSUPP; + } + + if (!inode_owner_or_capable(mnt_userns, inode)) + return -EPERM; + + if (!value) { + acl = NULL; + } else { + acl = posix_acl_from_xattr(mnt_userns, value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (acl) { + err = posix_acl_valid(mnt_userns, acl); + if (err) + goto release_and_out; + } + } + + err = ntfs_set_acl(mnt_userns, inode, acl, type); + +release_and_out: + posix_acl_release(acl); + return err; } /* @@ -636,7 +700,7 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, if (default_acl) { err = ntfs_set_acl_ex(mnt_userns, inode, default_acl, - ACL_TYPE_DEFAULT); + ACL_TYPE_DEFAULT, true); posix_acl_release(default_acl); } else { inode->i_default_acl = NULL; @@ -647,7 +711,7 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, else { if (!err) err = ntfs_set_acl_ex(mnt_userns, inode, acl, - ACL_TYPE_ACCESS); + ACL_TYPE_ACCESS, true); posix_acl_release(acl); } @@ -785,6 +849,23 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de, goto out; } +#ifdef CONFIG_NTFS3_FS_POSIX_ACL + if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 && + !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS, + sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) || + (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 && + !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, + sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) { + /* TODO: init_user_ns? */ + err = ntfs_xattr_get_acl( + &init_user_ns, inode, + name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 + ? ACL_TYPE_ACCESS + : ACL_TYPE_DEFAULT, + buffer, size); + goto out; + } +#endif /* Deal with NTFS extended attribute. */ err = ntfs_get_ea(inode, name, name_len, buffer, size, NULL); @@ -897,10 +978,29 @@ set_new_fa: goto out; } +#ifdef CONFIG_NTFS3_FS_POSIX_ACL + if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 && + !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS, + sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) || + (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 && + !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, + sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) { + err = ntfs_xattr_set_acl( + mnt_userns, inode, + name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 + ? ACL_TYPE_ACCESS + : ACL_TYPE_DEFAULT, + value, size); + goto out; + } +#endif /* Deal with NTFS extended attribute. 
*/ - err = ntfs_set_ea(inode, name, name_len, value, size, flags); + err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0); out: + inode->i_ctime = current_time(inode); + mark_inode_dirty(inode); + return err; } @@ -913,35 +1013,37 @@ int ntfs_save_wsl_perm(struct inode *inode) { int err; __le32 value; + struct ntfs_inode *ni = ntfs_i(inode); - /* TODO: refactor this, so we don't lock 4 times in ntfs_set_ea */ + ni_lock(ni); value = cpu_to_le32(i_uid_read(inode)); err = ntfs_set_ea(inode, "$LXUID", sizeof("$LXUID") - 1, &value, - sizeof(value), 0); + sizeof(value), 0, true); /* true == already locked. */ if (err) goto out; value = cpu_to_le32(i_gid_read(inode)); err = ntfs_set_ea(inode, "$LXGID", sizeof("$LXGID") - 1, &value, - sizeof(value), 0); + sizeof(value), 0, true); if (err) goto out; value = cpu_to_le32(inode->i_mode); err = ntfs_set_ea(inode, "$LXMOD", sizeof("$LXMOD") - 1, &value, - sizeof(value), 0); + sizeof(value), 0, true); if (err) goto out; if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { value = cpu_to_le32(inode->i_rdev); err = ntfs_set_ea(inode, "$LXDEV", sizeof("$LXDEV") - 1, &value, - sizeof(value), 0); + sizeof(value), 0, true); if (err) goto out; } out: + ni_unlock(ni); /* In case of error should we delete all WSL xattr? */ return err; } diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index d442cf5dda8a..be5e9ed7da8d 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -541,7 +541,7 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos) struct debug_lockres *dl = m->private; struct dlm_ctxt *dlm = dl->dl_ctxt; struct dlm_lock_resource *oldres = dl->dl_res; - struct dlm_lock_resource *res = NULL; + struct dlm_lock_resource *res = NULL, *iter; struct list_head *track_list; spin_lock(&dlm->track_lock); @@ -556,11 +556,11 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos) } } - list_for_each_entry(res, track_list, tracking) { - if (&res->tracking == &dlm->tracking_list) - res = NULL; - else - dlm_lockres_get(res); + list_for_each_entry(iter, track_list, tracking) { + if (&iter->tracking != &dlm->tracking_list) { + dlm_lockres_get(iter); + res = iter; + } break; } spin_unlock(&dlm->track_lock); diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 61103b2d69fb..7318e4794ef9 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -392,9 +392,9 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, struct dlm_ctxt *dlm = data; struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf; struct dlm_lock_resource *res = NULL; - struct dlm_lock *lock = NULL; + struct dlm_lock *lock = NULL, *iter; enum dlm_status status = DLM_NORMAL; - int found = 0, i; + int i; struct dlm_lockstatus *lksb = NULL; int ignore; u32 flags; @@ -437,7 +437,6 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, } queue=&res->granted; - found = 0; spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_RECOVERING) { spin_unlock(&res->spinlock); @@ -461,21 +460,21 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, } for (i=0; i<3; i++) { - list_for_each_entry(lock, queue, list) { - if (lock->ml.cookie == unlock->cookie && - lock->ml.node == unlock->node_idx) { - dlm_lock_get(lock); - found = 1; + list_for_each_entry(iter, queue, list) { + if (iter->ml.cookie == unlock->cookie && + iter->ml.node == unlock->node_idx) { + dlm_lock_get(iter); + lock = iter; break; } } - if (found) + if (lock) break; /* scan granted -> 
converting -> blocked queues */ queue++; } spin_unlock(&res->spinlock); - if (!found) { + if (!lock) { status = DLM_IVLOCKID; goto not_found; } @@ -505,7 +504,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, dlm_kick_thread(dlm, res); not_found: - if (!found) + if (!lock) mlog(ML_ERROR, "failed to find lock to unlock! " "cookie=%u:%llu\n", dlm_get_lock_cookie_node(be64_to_cpu(unlock->cookie)), diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c index 29f183a15798..617c92e7b925 100644 --- a/fs/ocfs2/dlmfs/userdlm.c +++ b/fs/ocfs2/dlmfs/userdlm.c @@ -433,6 +433,11 @@ again: } spin_lock(&lockres->l_lock); + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { + spin_unlock(&lockres->l_lock); + status = -EAGAIN; + goto bail; + } /* We only compare against the currently granted level * here. If the lock is blocked waiting on a downconvert, @@ -595,7 +600,7 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) spin_lock(&lockres->l_lock); if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { spin_unlock(&lockres->l_lock); - return 0; + goto bail; } lockres->l_flags |= USER_LOCK_IN_TEARDOWN; @@ -609,22 +614,30 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) } if (lockres->l_ro_holders || lockres->l_ex_holders) { + lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN; spin_unlock(&lockres->l_lock); goto bail; } status = 0; if (!(lockres->l_flags & USER_LOCK_ATTACHED)) { + /* + * lock is never requested, leave USER_LOCK_IN_TEARDOWN set + * to avoid new lock request coming in. + */ spin_unlock(&lockres->l_lock); goto bail; } - lockres->l_flags &= ~USER_LOCK_ATTACHED; lockres->l_flags |= USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK); if (status) { + spin_lock(&lockres->l_lock); + lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN; + lockres->l_flags &= ~USER_LOCK_BUSY; + spin_unlock(&lockres->l_lock); user_log_dlm_error("ocfs2_dlm_unlock", status, lockres); goto bail; } diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 5739dc301569..bb116c39b581 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -125,6 +125,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, struct inode *inode = NULL; struct super_block *sb = osb->sb; struct ocfs2_find_inode_args args; + journal_t *journal = osb->journal->j_journal; trace_ocfs2_iget_begin((unsigned long long)blkno, flags, sysfile_type); @@ -171,11 +172,10 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, * part of the transaction - the inode could have been reclaimed and * now it is reread from disk. */ - if (osb->journal) { + if (journal) { transaction_t *transaction; tid_t tid; struct ocfs2_inode_info *oi = OCFS2_I(inode); - journal_t *journal = osb->journal->j_journal; read_lock(&journal->j_state_lock); if (journal->j_running_transaction) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 1887a2708709..fa87d89cf754 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -810,22 +810,20 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb) write_unlock(&journal->j_state_lock); } -int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) +/* + * alloc & initialize skeleton for journal structure. + * ocfs2_journal_init() will make fs have journal ability. 
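Note: the split introduced above separates a cheap early allocation (ocfs2_journal_alloc(), so osb->journal is valid before any journal I/O) from the later on-disk initialization in ocfs2_journal_init(). A minimal model of that two-phase pattern; the types are stand-ins, not ocfs2 structures:

#include <stdio.h>
#include <stdlib.h>

struct journal { int state; };
struct super   { struct journal *journal; };

static int journal_alloc(struct super *osb)
{
	osb->journal = calloc(1, sizeof(*osb->journal));
	return osb->journal ? 0 : -12; /* -ENOMEM */
}

static int journal_init(struct super *osb, int *dirty)
{
	/* the real code would lock and load the journal inode here */
	osb->journal->state = 1;
	*dirty = 0;
	return 0;
}

int main(void)
{
	struct super osb = { 0 };
	int dirty;

	if (journal_alloc(&osb))        /* early, cheap, touches no disk */
		return 1;
	if (journal_init(&osb, &dirty)) /* later, after other setup */
		return 1;
	printf("journal ready, dirty=%d\n", dirty);
	free(osb.journal);
	return 0;
}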
+ */ +int ocfs2_journal_alloc(struct ocfs2_super *osb) { - int status = -1; - struct inode *inode = NULL; /* the journal inode */ - journal_t *j_journal = NULL; - struct ocfs2_journal *journal = NULL; - struct ocfs2_dinode *di = NULL; - struct buffer_head *bh = NULL; - int inode_lock = 0; + int status = 0; + struct ocfs2_journal *journal; - /* initialize our journal structure */ journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL); if (!journal) { mlog(ML_ERROR, "unable to alloc journal\n"); status = -ENOMEM; - goto done; + goto bail; } osb->journal = journal; journal->j_osb = osb; @@ -839,6 +837,21 @@ int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); journal->j_state = OCFS2_JOURNAL_FREE; +bail: + return status; +} + +int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) +{ + int status = -1; + struct inode *inode = NULL; /* the journal inode */ + journal_t *j_journal = NULL; + struct ocfs2_journal *journal = osb->journal; + struct ocfs2_dinode *di = NULL; + struct buffer_head *bh = NULL; + int inode_lock = 0; + + BUG_ON(!journal); /* already have the inode for our journal */ inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, osb->slot_num); diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 8dcb2f2cadbc..969d0aa28718 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -154,6 +154,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb); * Journal Control: * Initialize, Load, Shutdown, Wipe a journal. * + * ocfs2_journal_alloc - Initialize skeleton for journal structure. * ocfs2_journal_init - Initialize journal structures in the OSB. * ocfs2_journal_load - Load the given journal off disk. Replay it if * there's transactions still in there. @@ -167,6 +168,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb); * ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint. 
*/ void ocfs2_set_journal_params(struct ocfs2_super *osb); +int ocfs2_journal_alloc(struct ocfs2_super *osb); int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty); void ocfs2_journal_shutdown(struct ocfs2_super *osb); int ocfs2_journal_wipe(struct ocfs2_journal *journal, diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index b1a8b046f4c2..5022b3e9bfcd 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -921,19 +921,19 @@ static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb, { struct mem_dqinfo *info = sb_dqinfo(sb, type); struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; - struct ocfs2_quota_chunk *chunk; + struct ocfs2_quota_chunk *chunk = NULL, *iter; struct ocfs2_local_disk_chunk *dchunk; int found = 0, len; - list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) { + list_for_each_entry(iter, &oinfo->dqi_chunk, qc_chunk) { dchunk = (struct ocfs2_local_disk_chunk *) - chunk->qc_headerbh->b_data; + iter->qc_headerbh->b_data; if (le32_to_cpu(dchunk->dqc_free) > 0) { - found = 1; + chunk = iter; break; } } - if (!found) + if (!chunk) return NULL; if (chunk->qc_num < oinfo->dqi_chunks - 1) { diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c index 769e466887b0..a9d1296d736d 100644 --- a/fs/ocfs2/reservations.c +++ b/fs/ocfs2/reservations.c @@ -198,7 +198,7 @@ void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv, resv->r_flags |= flags; } -int ocfs2_resmap_init(struct ocfs2_super *osb, +void ocfs2_resmap_init(struct ocfs2_super *osb, struct ocfs2_reservation_map *resmap) { memset(resmap, 0, sizeof(*resmap)); @@ -207,8 +207,6 @@ int ocfs2_resmap_init(struct ocfs2_super *osb, resmap->m_reservations = RB_ROOT; /* m_bitmap_len is initialized to zero by the above memset. */ INIT_LIST_HEAD(&resmap->m_lru); - - return 0; } static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap, diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h index 677c50663595..ec8101ef5717 100644 --- a/fs/ocfs2/reservations.h +++ b/fs/ocfs2/reservations.h @@ -73,15 +73,10 @@ void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, /** * ocfs2_resmap_init() - Initialize fields of a reservations bitmap + * @osb: struct ocfs2_super to be saved in resmap * @resmap: struct ocfs2_reservation_map to initialize - * @obj: unused for now - * @ops: unused for now - * @max_bitmap_bytes: Maximum size of the bitmap (typically blocksize) - * - * Only possible return value other than '0' is -ENOMEM for failure to - * allocation mirror bitmap. 
*/ -int ocfs2_resmap_init(struct ocfs2_super *osb, +void ocfs2_resmap_init(struct ocfs2_super *osb, struct ocfs2_reservation_map *resmap); /** diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 477cdf94122e..f7298816d8d9 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -989,28 +989,27 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) { status = -EINVAL; - goto read_super_error; + goto out; } /* probe for superblock */ status = ocfs2_sb_probe(sb, &bh, §or_size, &stats); if (status < 0) { mlog(ML_ERROR, "superblock probe failed!\n"); - goto read_super_error; + goto out; } status = ocfs2_initialize_super(sb, bh, sector_size, &stats); - osb = OCFS2_SB(sb); - if (status < 0) { - mlog_errno(status); - goto read_super_error; - } brelse(bh); bh = NULL; + if (status < 0) + goto out; + + osb = OCFS2_SB(sb); if (!ocfs2_check_set_options(sb, &parsed_options)) { status = -EINVAL; - goto read_super_error; + goto out_super; } osb->s_mount_opt = parsed_options.mount_opt; osb->s_atime_quantum = parsed_options.atime_quantum; @@ -1027,7 +1026,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) status = ocfs2_verify_userspace_stack(osb, &parsed_options); if (status) - goto read_super_error; + goto out_super; sb->s_magic = OCFS2_SUPER_MAGIC; @@ -1041,7 +1040,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) status = -EACCES; mlog(ML_ERROR, "Readonly device detected but readonly " "mount was not specified.\n"); - goto read_super_error; + goto out_super; } /* You should not be able to start a local heartbeat @@ -1050,7 +1049,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) status = -EROFS; mlog(ML_ERROR, "Local heartbeat specified on readonly " "device.\n"); - goto read_super_error; + goto out_super; } status = ocfs2_check_journals_nolocks(osb); @@ -1059,9 +1058,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) mlog(ML_ERROR, "Recovery required on readonly " "file system, but write access is " "unavailable.\n"); - else - mlog_errno(status); - goto read_super_error; + goto out_super; } ocfs2_set_ro_flag(osb, 1); @@ -1077,10 +1074,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) } status = ocfs2_verify_heartbeat(osb); - if (status < 0) { - mlog_errno(status); - goto read_super_error; - } + if (status < 0) + goto out_super; osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, ocfs2_debugfs_root); @@ -1094,15 +1089,14 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) status = ocfs2_mount_volume(sb); if (status < 0) - goto read_super_error; + goto out_debugfs; if (osb->root_inode) inode = igrab(osb->root_inode); if (!inode) { status = -EIO; - mlog_errno(status); - goto read_super_error; + goto out_dismount; } osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL, @@ -1110,7 +1104,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) if (!osb->osb_dev_kset) { status = -ENOMEM; mlog(ML_ERROR, "Unable to create device kset %s.\n", sb->s_id); - goto read_super_error; + goto out_dismount; } /* Create filecheck sysfs related directories/files at @@ -1119,14 +1113,13 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) status = -ENOMEM; mlog(ML_ERROR, "Unable to create filecheck sysfs directory at " "/sys/fs/ocfs2/%s/filecheck.\n", sb->s_id); - goto read_super_error; + goto out_dismount; } 
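The ocfs2_fill_super() and ocfs2_mount_volume() rework running through these hunks replaces the old catch-all read_super_error label with ordered unwind labels (out_dismount, out_debugfs, out_super, out). As a generic sketch of that unwind idiom, using hypothetical acquire/release helpers rather than the real ocfs2 teardown calls:

/* Hypothetical resources, stubbed out so the sketch stands alone. */
static int  acquire_a(void) { return 0; }
static void release_a(void) { }
static int  acquire_b(void) { return 0; }
static void release_b(void) { }
static int  acquire_c(void) { return 0; }

static int setup_in_order(void)
{
        int err;

        err = acquire_a();
        if (err)
                goto out;       /* nothing acquired yet, plain return */

        err = acquire_b();
        if (err)
                goto out_a;     /* undo A only */

        err = acquire_c();
        if (err)
                goto out_b;     /* undo B, then A */

        return 0;       /* success: ownership of A, B and C passes on */

out_b:
        release_b();    /* labels run in reverse acquisition order */
out_a:
        release_a();
out:
        return err;
}

Each label undoes only the steps that had already succeeded, so adding a new setup step costs exactly one new label instead of another conditional in a shared error path.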
root = d_make_root(inode); if (!root) { status = -ENOMEM; - mlog_errno(status); - goto read_super_error; + goto out_dismount; } sb->s_root = root; @@ -1178,17 +1171,21 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) return status; -read_super_error: - brelse(bh); - - if (status) - mlog_errno(status); +out_dismount: + atomic_set(&osb->vol_state, VOLUME_DISABLED); + wake_up(&osb->osb_mount_event); + ocfs2_dismount_volume(sb, 1); + goto out; - if (osb) { - atomic_set(&osb->vol_state, VOLUME_DISABLED); - wake_up(&osb->osb_mount_event); - ocfs2_dismount_volume(sb, 1); - } +out_debugfs: + debugfs_remove_recursive(osb->osb_debug_root); +out_super: + ocfs2_release_system_inodes(osb); + kfree(osb->recovery_map); + ocfs2_delete_osb(osb); + kfree(osb); +out: + mlog_errno(status); return status; } @@ -1803,11 +1800,10 @@ static int ocfs2_get_sector(struct super_block *sb, static int ocfs2_mount_volume(struct super_block *sb) { int status = 0; - int unlock_super = 0; struct ocfs2_super *osb = OCFS2_SB(sb); if (ocfs2_is_hard_readonly(osb)) - goto leave; + goto out; mutex_init(&osb->obs_trim_fs_mutex); @@ -1817,44 +1813,56 @@ static int ocfs2_mount_volume(struct super_block *sb) if (status == -EBADR && ocfs2_userspace_stack(osb)) mlog(ML_ERROR, "couldn't mount because cluster name on" " disk does not match the running cluster name.\n"); - goto leave; + goto out; } status = ocfs2_super_lock(osb, 1); if (status < 0) { mlog_errno(status); - goto leave; + goto out_dlm; } - unlock_super = 1; /* This will load up the node map and add ourselves to it. */ status = ocfs2_find_slot(osb); if (status < 0) { mlog_errno(status); - goto leave; + goto out_super_lock; } /* load all node-local system inodes */ status = ocfs2_init_local_system_inodes(osb); if (status < 0) { mlog_errno(status); - goto leave; + goto out_super_lock; } status = ocfs2_check_volume(osb); if (status < 0) { mlog_errno(status); - goto leave; + goto out_system_inodes; } status = ocfs2_truncate_log_init(osb); - if (status < 0) + if (status < 0) { mlog_errno(status); + goto out_system_inodes; + } -leave: - if (unlock_super) - ocfs2_super_unlock(osb, 1); + ocfs2_super_unlock(osb, 1); + return 0; +out_system_inodes: + if (osb->local_alloc_state == OCFS2_LA_ENABLED) + ocfs2_shutdown_local_alloc(osb); + ocfs2_release_system_inodes(osb); + /* before journal shutdown, we should release slot_info */ + ocfs2_free_slot_info(osb); + ocfs2_journal_shutdown(osb); +out_super_lock: + ocfs2_super_unlock(osb, 1); +out_dlm: + ocfs2_dlm_shutdown(osb, 0); +out: return status; } @@ -2022,7 +2030,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!osb) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto out; } sb->s_fs_info = osb; @@ -2083,7 +2091,7 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog(ML_ERROR, "Invalid number of node slots (%u)\n", osb->max_slots); status = -EINVAL; - goto bail; + goto out; } ocfs2_orphan_scan_init(osb); @@ -2092,7 +2100,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (status) { mlog(ML_ERROR, "Unable to initialize recovery state\n"); mlog_errno(status); - goto bail; + goto out; } init_waitqueue_head(&osb->checkpoint_event); @@ -2110,17 +2118,13 @@ static int ocfs2_initialize_super(struct super_block *sb, init_waitqueue_head(&osb->osb_mount_event); - status = ocfs2_resmap_init(osb, &osb->osb_la_resmap); - if (status) { - mlog_errno(status); - goto bail; - } + ocfs2_resmap_init(osb, &osb->osb_la_resmap); osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, 
GFP_KERNEL); if (!osb->vol_label) { mlog(ML_ERROR, "unable to alloc vol label\n"); status = -ENOMEM; - goto bail; + goto out_recovery_map; } osb->slot_recovery_generations = @@ -2129,7 +2133,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!osb->slot_recovery_generations) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto out_vol_label; } init_waitqueue_head(&osb->osb_wipe_event); @@ -2139,7 +2143,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!osb->osb_orphan_wipes) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto out_slot_recovery_gen; } osb->osb_rf_lock_tree = RB_ROOT; @@ -2155,13 +2159,13 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog(ML_ERROR, "couldn't mount because of unsupported " "optional features (%x).\n", i); status = -EINVAL; - goto bail; + goto out_orphan_wipes; } if (!sb_rdonly(osb->sb) && (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) { mlog(ML_ERROR, "couldn't mount RDWR because of " "unsupported optional features (%x).\n", i); status = -EINVAL; - goto bail; + goto out_orphan_wipes; } if (ocfs2_clusterinfo_valid(osb)) { @@ -2182,7 +2186,7 @@ static int ocfs2_initialize_super(struct super_block *sb, "cluster stack label (%s) \n", osb->osb_cluster_stack); status = -EINVAL; - goto bail; + goto out_orphan_wipes; } memcpy(osb->osb_cluster_name, OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster, @@ -2195,6 +2199,15 @@ static int ocfs2_initialize_super(struct super_block *sb, get_random_bytes(&osb->s_next_generation, sizeof(u32)); + /* + * FIXME + * This should be done in ocfs2_journal_init(), but any inode + * writes back operation will cause the filesystem to crash. + */ + status = ocfs2_journal_alloc(osb); + if (status < 0) + goto out_orphan_wipes; + INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs); init_llist_head(&osb->dquot_drop_list); @@ -2208,7 +2221,7 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n", osb->s_clustersize); status = -EINVAL; - goto bail; + goto out_journal; } total_blocks = ocfs2_clusters_to_blocks(osb->sb, @@ -2220,14 +2233,14 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog(ML_ERROR, "Volume too large " "to mount safely on this system"); status = -EFBIG; - goto bail; + goto out_journal; } if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid, sizeof(di->id2.i_super.s_uuid))) { mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n"); status = -ENOMEM; - goto bail; + goto out_journal; } strlcpy(osb->vol_label, di->id2.i_super.s_label, @@ -2247,7 +2260,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!osb->osb_dlm_debug) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto out_uuid_str; } atomic_set(&osb->vol_state, VOLUME_INIT); @@ -2256,7 +2269,7 @@ static int ocfs2_initialize_super(struct super_block *sb, status = ocfs2_init_global_system_inodes(osb); if (status < 0) { mlog_errno(status); - goto bail; + goto out_dlm_out; } /* @@ -2267,7 +2280,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!inode) { status = -EINVAL; mlog_errno(status); - goto bail; + goto out_system_inodes; } osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; @@ -2280,16 +2293,39 @@ static int ocfs2_initialize_super(struct super_block *sb, status = ocfs2_init_slot_info(osb); if (status < 0) { mlog_errno(status); - goto bail; + goto out_system_inodes; } osb->ocfs2_wq = alloc_ordered_workqueue("ocfs2_wq", WQ_MEM_RECLAIM); if (!osb->ocfs2_wq) { status = -ENOMEM; 
mlog_errno(status); + goto out_slot_info; } -bail: + return status; + +out_slot_info: + ocfs2_free_slot_info(osb); +out_system_inodes: + ocfs2_release_system_inodes(osb); +out_dlm_out: + ocfs2_put_dlm_debug(osb->osb_dlm_debug); +out_uuid_str: + kfree(osb->uuid_str); +out_journal: + kfree(osb->journal); +out_orphan_wipes: + kfree(osb->osb_orphan_wipes); +out_slot_recovery_gen: + kfree(osb->slot_recovery_generations); +out_vol_label: + kfree(osb->vol_label); +out_recovery_map: + kfree(osb->recovery_map); +out: + kfree(osb); + sb->s_fs_info = NULL; return status; } @@ -2483,6 +2519,12 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) kfree(osb->osb_orphan_wipes); kfree(osb->slot_recovery_generations); + /* FIXME + * This belongs in journal shutdown, but because we have to + * allocate osb->journal at the middle of ocfs2_initialize_super(), + * we free it here. + */ + kfree(osb->journal); kfree(osb->local_alloc_copy); kfree(osb->uuid_str); kfree(osb->vol_label); diff --git a/fs/open.c b/fs/open.c index 1315253e0247..1d57fbde2feb 100644 --- a/fs/open.c +++ b/fs/open.c @@ -224,6 +224,21 @@ SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length) } #endif /* BITS_PER_LONG == 32 */ +#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_TRUNCATE64) +COMPAT_SYSCALL_DEFINE3(truncate64, const char __user *, pathname, + compat_arg_u64_dual(length)) +{ + return ksys_truncate(pathname, compat_arg_u64_glue(length)); +} +#endif + +#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FTRUNCATE64) +COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd, + compat_arg_u64_dual(length)) +{ + return ksys_ftruncate(fd, compat_arg_u64_glue(length)); +} +#endif int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { @@ -339,6 +354,15 @@ SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) return ksys_fallocate(fd, mode, offset, len); } +#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FALLOCATE) +COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset), + compat_arg_u64_dual(len)) +{ + return ksys_fallocate(fd, mode, compat_arg_u64_glue(offset), + compat_arg_u64_glue(len)); +} +#endif + /* * access() needs to use the real uid/gid, not the effective uid/gid. * We do this by temporarily clearing all FS-related capabilities and @@ -834,16 +858,15 @@ static int do_dentry_open(struct file *f, if ((f->f_mode & FMODE_WRITE) && likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; + if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO) + f->f_mode |= FMODE_CAN_ODIRECT; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); - /* NB: we're sure to have correct a_ops only after f_op->open */ - if (f->f_flags & O_DIRECT) { - if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) - return -EINVAL; - } + if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; /* * XXX: Huge page cache doesn't support writing yet. Drop all page @@ -981,6 +1004,48 @@ struct file *dentry_open(const struct path *path, int flags, } EXPORT_SYMBOL(dentry_open); +/** + * dentry_create - Create and open a file + * @path: path to create + * @flags: O_ flags + * @mode: mode bits for new file + * @cred: credentials to use + * + * Caller must hold the parent directory's lock, and have prepared + * a negative dentry, placed in @path->dentry, for the new file. 
+ *
+ * Caller sets @path->mnt to the vfsmount of the filesystem where
+ * the new file is to be created. The parent directory and the
+ * negative dentry must reside on the same filesystem instance.
+ *
+ * On success, returns a "struct file *". Otherwise an ERR_PTR
+ * is returned.
+ */
+struct file *dentry_create(const struct path *path, int flags, umode_t mode,
+                           const struct cred *cred)
+{
+        struct file *f;
+        int error;
+
+        validate_creds(cred);
+        f = alloc_empty_file(flags, cred);
+        if (IS_ERR(f))
+                return f;
+
+        error = vfs_create(mnt_user_ns(path->mnt),
+                           d_inode(path->dentry->d_parent),
+                           path->dentry, mode, true);
+        if (!error)
+                error = vfs_open(path, f);
+
+        if (unlikely(error)) {
+                fput(f);
+                return ERR_PTR(error);
+        }
+        return f;
+}
+EXPORT_SYMBOL(dentry_create);
+
 struct file *open_with_fake_path(const struct path *path, int flags,
                                  struct inode *inode, const struct cred *cred)
 {
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index e040970408d4..714ec569d25b 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -44,9 +44,9 @@ static bool ovl_must_copy_xattr(const char *name)
                !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
 }
 
-int ovl_copy_xattr(struct super_block *sb, struct dentry *old,
-                   struct dentry *new)
+int ovl_copy_xattr(struct super_block *sb, struct path *oldpath, struct dentry *new)
 {
+        struct dentry *old = oldpath->dentry;
         ssize_t list_size, size, value_size = 0;
         char *buf, *name, *value = NULL;
         int error = 0;
@@ -94,9 +94,9 @@ int ovl_copy_xattr(struct super_block *sb, struct dentry *old,
                         continue; /* Discard */
                 }
retry:
-                size = vfs_getxattr(&init_user_ns, old, name, value, value_size);
+                size = ovl_do_getxattr(oldpath, name, value, value_size);
                 if (size == -ERANGE)
-                        size = vfs_getxattr(&init_user_ns, old, name, NULL, 0);
+                        size = ovl_do_getxattr(oldpath, name, NULL, 0);
 
                 if (size < 0) {
                         error = size;
@@ -117,7 +117,7 @@ retry:
                         goto retry;
                 }
 
-                error = vfs_setxattr(&init_user_ns, new, name, value, size, 0);
+                error = ovl_do_setxattr(OVL_FS(sb), new, name, value, size, 0);
                 if (error) {
                         if (error != -EOPNOTSUPP || ovl_must_copy_xattr(name))
                                 break;
@@ -292,17 +292,19 @@ out_fput:
         return error;
 }
 
-static int ovl_set_size(struct dentry *upperdentry, struct kstat *stat)
+static int ovl_set_size(struct ovl_fs *ofs,
+                        struct dentry *upperdentry, struct kstat *stat)
 {
         struct iattr attr = {
                 .ia_valid = ATTR_SIZE,
                 .ia_size = stat->size,
         };
 
-        return notify_change(&init_user_ns, upperdentry, &attr, NULL);
+        return ovl_do_notify_change(ofs, upperdentry, &attr);
 }
 
-static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
+static int ovl_set_timestamps(struct ovl_fs *ofs, struct dentry *upperdentry,
+                              struct kstat *stat)
 {
         struct iattr attr = {
                 .ia_valid =
@@ -311,10 +313,11 @@ static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
                 .ia_mtime = stat->mtime,
         };
 
-        return notify_change(&init_user_ns, upperdentry, &attr, NULL);
+        return ovl_do_notify_change(ofs, upperdentry, &attr);
 }
 
-int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
+int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upperdentry,
+                 struct kstat *stat)
 {
         int err = 0;
 
@@ -323,7 +326,7 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
                         .ia_valid = ATTR_MODE,
                         .ia_mode = stat->mode,
                 };
-                err = notify_change(&init_user_ns, upperdentry, &attr, NULL);
+                err = ovl_do_notify_change(ofs, upperdentry, &attr);
         }
         if (!err) {
                 struct iattr attr = {
@@ -331,10 +334,10 @@ static int ovl_set_attr(struct dentry
*upperdentry, struct kstat *stat) .ia_uid = stat->uid, .ia_gid = stat->gid, }; - err = notify_change(&init_user_ns, upperdentry, &attr, NULL); + err = ovl_do_notify_change(ofs, upperdentry, &attr); } if (!err) - ovl_set_timestamps(upperdentry, stat); + ovl_set_timestamps(ofs, upperdentry, stat); return err; } @@ -433,7 +436,7 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper, if (IS_ERR(fh)) return PTR_ERR(fh); - err = ovl_do_setxattr(ofs, index, OVL_XATTR_UPPER, fh->buf, fh->fb.len); + err = ovl_setxattr(ofs, index, OVL_XATTR_UPPER, fh->buf, fh->fb.len); kfree(fh); return err; @@ -474,7 +477,7 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin, if (err) return err; - temp = ovl_create_temp(indexdir, OVL_CATTR(S_IFDIR | 0)); + temp = ovl_create_temp(ofs, indexdir, OVL_CATTR(S_IFDIR | 0)); err = PTR_ERR(temp); if (IS_ERR(temp)) goto free_name; @@ -483,16 +486,16 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin, if (err) goto out; - index = lookup_one_len(name.name, indexdir, name.len); + index = ovl_lookup_upper(ofs, name.name, indexdir, name.len); if (IS_ERR(index)) { err = PTR_ERR(index); } else { - err = ovl_do_rename(dir, temp, dir, index, 0); + err = ovl_do_rename(ofs, dir, temp, dir, index, 0); dput(index); } out: if (err) - ovl_cleanup(dir, temp); + ovl_cleanup(ofs, dir, temp); dput(temp); free_name: kfree(name.name); @@ -519,6 +522,7 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) int err; struct dentry *upper; struct dentry *upperdir = ovl_dentry_upper(c->parent); + struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); struct inode *udir = d_inode(upperdir); /* Mark parent "impure" because it may now contain non-pure upper */ @@ -531,16 +535,16 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) return err; inode_lock_nested(udir, I_MUTEX_PARENT); - upper = lookup_one_len(c->dentry->d_name.name, upperdir, - c->dentry->d_name.len); + upper = ovl_lookup_upper(ofs, c->dentry->d_name.name, upperdir, + c->dentry->d_name.len); err = PTR_ERR(upper); if (!IS_ERR(upper)) { - err = ovl_do_link(ovl_dentry_upper(c->dentry), udir, upper); + err = ovl_do_link(ofs, ovl_dentry_upper(c->dentry), udir, upper); dput(upper); if (!err) { /* Restore timestamps on parent (best effort) */ - ovl_set_timestamps(upperdir, &c->pstat); + ovl_set_timestamps(ofs, upperdir, &c->pstat); ovl_dentry_set_upper_alias(c->dentry); } } @@ -578,7 +582,7 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) return err; } - err = ovl_copy_xattr(c->dentry->d_sb, c->lowerpath.dentry, temp); + err = ovl_copy_xattr(c->dentry->d_sb, &c->lowerpath, temp); if (err) return err; @@ -614,9 +618,9 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) inode_lock(temp->d_inode); if (S_ISREG(c->stat.mode)) - err = ovl_set_size(temp, &c->stat); + err = ovl_set_size(ofs, temp, &c->stat); if (!err) - err = ovl_set_attr(temp, &c->stat); + err = ovl_set_attr(ofs, temp, &c->stat); inode_unlock(temp->d_inode); return err; @@ -656,6 +660,7 @@ static void ovl_revert_cu_creds(struct ovl_cu_creds *cc) */ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) { + struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); struct inode *inode; struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir); struct dentry *temp, *upper; @@ -677,7 +682,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) if (err) goto unlock; - temp = ovl_create_temp(c->workdir, &cattr); + temp = ovl_create_temp(ofs, c->workdir, &cattr); 
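An aside on the dentry_create() helper added in fs/open.c above: it packages vfs_create() plus vfs_open() for callers that manage the parent directory lock themselves. Below is a minimal sketch of a hypothetical in-kernel caller; sketch_create_and_open() is invented for illustration, lookup_one_len() and inode_lock_nested() are existing VFS helpers, and error handling is abbreviated:

static struct file *sketch_create_and_open(const struct path *parent,
                                           const char *name, umode_t mode)
{
        struct path path;
        struct file *file;

        /* dentry_create() relies on the caller holding the parent lock. */
        inode_lock_nested(d_inode(parent->dentry), I_MUTEX_PARENT);

        /* Prepare the negative dentry that dentry_create() expects. */
        path.dentry = lookup_one_len(name, parent->dentry, strlen(name));
        if (IS_ERR(path.dentry)) {
                file = ERR_CAST(path.dentry);
                goto out_unlock;
        }
        if (d_is_positive(path.dentry)) {
                file = ERR_PTR(-EEXIST);        /* name already exists */
                goto out_dput;
        }

        /* Same filesystem instance as the parent, per the kerneldoc above. */
        path.mnt = parent->mnt;
        file = dentry_create(&path, O_WRONLY, mode, current_cred());

out_dput:
        dput(path.dentry);
out_unlock:
        inode_unlock(d_inode(parent->dentry));
        return file;
}

On success the returned file holds its own path reference (do_dentry_open() takes one), so the caller's dput() and unlock above are safe.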
ovl_revert_cu_creds(&cc); err = PTR_ERR(temp); @@ -694,12 +699,13 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) goto cleanup; } - upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len); + upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir, + c->destname.len); err = PTR_ERR(upper); if (IS_ERR(upper)) goto cleanup; - err = ovl_do_rename(wdir, temp, udir, upper, 0); + err = ovl_do_rename(ofs, wdir, temp, udir, upper, 0); dput(upper); if (err) goto cleanup; @@ -716,7 +722,7 @@ unlock: return err; cleanup: - ovl_cleanup(wdir, temp); + ovl_cleanup(ofs, wdir, temp); dput(temp); goto unlock; } @@ -724,6 +730,7 @@ cleanup: /* Copyup using O_TMPFILE which does not require cross dir locking */ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) { + struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); struct inode *udir = d_inode(c->destdir); struct dentry *temp, *upper; struct ovl_cu_creds cc; @@ -733,7 +740,7 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) if (err) return err; - temp = ovl_do_tmpfile(c->workdir, c->stat.mode); + temp = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode); ovl_revert_cu_creds(&cc); if (IS_ERR(temp)) @@ -745,10 +752,11 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) inode_lock_nested(udir, I_MUTEX_PARENT); - upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len); + upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir, + c->destname.len); err = PTR_ERR(upper); if (!IS_ERR(upper)) { - err = ovl_do_link(temp, udir, upper); + err = ovl_do_link(ofs, temp, udir, upper); dput(upper); } inode_unlock(udir); @@ -836,7 +844,7 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) /* Restore timestamps on parent (best effort) */ inode_lock(udir); - ovl_set_timestamps(c->destdir, &c->pstat); + ovl_set_timestamps(ofs, c->destdir, &c->pstat); inode_unlock(udir); ovl_dentry_set_upper_alias(c->dentry); @@ -865,12 +873,12 @@ static bool ovl_need_meta_copy_up(struct dentry *dentry, umode_t mode, return true; } -static ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value) +static ssize_t ovl_getxattr_value(struct path *path, char *name, char **value) { ssize_t res; char *buf; - res = vfs_getxattr(&init_user_ns, dentry, name, NULL, 0); + res = ovl_do_getxattr(path, name, NULL, 0); if (res == -ENODATA || res == -EOPNOTSUPP) res = 0; @@ -879,7 +887,7 @@ static ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value) if (!buf) return -ENOMEM; - res = vfs_getxattr(&init_user_ns, dentry, name, buf, res); + res = ovl_do_getxattr(path, name, buf, res); if (res < 0) kfree(buf); else @@ -906,8 +914,8 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) return -EIO; if (c->stat.size) { - err = cap_size = ovl_getxattr(upperpath.dentry, XATTR_NAME_CAPS, - &capability); + err = cap_size = ovl_getxattr_value(&upperpath, XATTR_NAME_CAPS, + &capability); if (cap_size < 0) goto out; } @@ -921,14 +929,14 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) * don't want that to happen for normal copy-up operation. 
*/ if (capability) { - err = vfs_setxattr(&init_user_ns, upperpath.dentry, - XATTR_NAME_CAPS, capability, cap_size, 0); + err = ovl_do_setxattr(ofs, upperpath.dentry, XATTR_NAME_CAPS, + capability, cap_size, 0); if (err) goto out_free; } - err = ovl_do_removexattr(ofs, upperpath.dentry, OVL_XATTR_METACOPY); + err = ovl_removexattr(ofs, upperpath.dentry, OVL_XATTR_METACOPY); if (err) goto out_free; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index f18490813170..6b03457f72bb 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -23,15 +23,15 @@ MODULE_PARM_DESC(redirect_max, static int ovl_set_redirect(struct dentry *dentry, bool samedir); -int ovl_cleanup(struct inode *wdir, struct dentry *wdentry) +int ovl_cleanup(struct ovl_fs *ofs, struct inode *wdir, struct dentry *wdentry) { int err; dget(wdentry); if (d_is_dir(wdentry)) - err = ovl_do_rmdir(wdir, wdentry); + err = ovl_do_rmdir(ofs, wdir, wdentry); else - err = ovl_do_unlink(wdir, wdentry); + err = ovl_do_unlink(ofs, wdir, wdentry); dput(wdentry); if (err) { @@ -42,7 +42,7 @@ int ovl_cleanup(struct inode *wdir, struct dentry *wdentry) return err; } -struct dentry *ovl_lookup_temp(struct dentry *workdir) +struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir) { struct dentry *temp; char name[20]; @@ -51,7 +51,7 @@ struct dentry *ovl_lookup_temp(struct dentry *workdir) /* counter is allowed to wrap, since temp dentries are ephemeral */ snprintf(name, sizeof(name), "#%x", atomic_inc_return(&temp_id)); - temp = lookup_one_len(name, workdir, strlen(name)); + temp = ovl_lookup_upper(ofs, name, workdir, strlen(name)); if (!IS_ERR(temp) && temp->d_inode) { pr_err("workdir/%s already exists\n", name); dput(temp); @@ -70,11 +70,11 @@ static struct dentry *ovl_whiteout(struct ovl_fs *ofs) struct inode *wdir = workdir->d_inode; if (!ofs->whiteout) { - whiteout = ovl_lookup_temp(workdir); + whiteout = ovl_lookup_temp(ofs, workdir); if (IS_ERR(whiteout)) goto out; - err = ovl_do_whiteout(wdir, whiteout); + err = ovl_do_whiteout(ofs, wdir, whiteout); if (err) { dput(whiteout); whiteout = ERR_PTR(err); @@ -84,11 +84,11 @@ static struct dentry *ovl_whiteout(struct ovl_fs *ofs) } if (ofs->share_whiteout) { - whiteout = ovl_lookup_temp(workdir); + whiteout = ovl_lookup_temp(ofs, workdir); if (IS_ERR(whiteout)) goto out; - err = ovl_do_link(ofs->whiteout, wdir, whiteout); + err = ovl_do_link(ofs, ofs->whiteout, wdir, whiteout); if (!err) goto out; @@ -122,27 +122,28 @@ int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir, if (d_is_dir(dentry)) flags = RENAME_EXCHANGE; - err = ovl_do_rename(wdir, whiteout, dir, dentry, flags); + err = ovl_do_rename(ofs, wdir, whiteout, dir, dentry, flags); if (err) goto kill_whiteout; if (flags) - ovl_cleanup(wdir, dentry); + ovl_cleanup(ofs, wdir, dentry); out: dput(whiteout); return err; kill_whiteout: - ovl_cleanup(wdir, whiteout); + ovl_cleanup(ofs, wdir, whiteout); goto out; } -int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode) +int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir, + struct dentry **newdentry, umode_t mode) { int err; struct dentry *d, *dentry = *newdentry; - err = ovl_do_mkdir(dir, dentry, mode); + err = ovl_do_mkdir(ofs, dir, dentry, mode); if (err) return err; @@ -154,8 +155,8 @@ int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode) * to it unhashed and negative. If that happens, try to * lookup a new hashed and positive dentry. 
*/ - d = lookup_one_len(dentry->d_name.name, dentry->d_parent, - dentry->d_name.len); + d = ovl_lookup_upper(ofs, dentry->d_name.name, dentry->d_parent, + dentry->d_name.len); if (IS_ERR(d)) { pr_warn("failed lookup after mkdir (%pd2, err=%i).\n", dentry, err); @@ -167,8 +168,8 @@ int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode) return 0; } -struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry, - struct ovl_cattr *attr) +struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir, + struct dentry *newdentry, struct ovl_cattr *attr) { int err; @@ -180,28 +181,28 @@ struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry, goto out; if (attr->hardlink) { - err = ovl_do_link(attr->hardlink, dir, newdentry); + err = ovl_do_link(ofs, attr->hardlink, dir, newdentry); } else { switch (attr->mode & S_IFMT) { case S_IFREG: - err = ovl_do_create(dir, newdentry, attr->mode); + err = ovl_do_create(ofs, dir, newdentry, attr->mode); break; case S_IFDIR: /* mkdir is special... */ - err = ovl_mkdir_real(dir, &newdentry, attr->mode); + err = ovl_mkdir_real(ofs, dir, &newdentry, attr->mode); break; case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: - err = ovl_do_mknod(dir, newdentry, attr->mode, + err = ovl_do_mknod(ofs, dir, newdentry, attr->mode, attr->rdev); break; case S_IFLNK: - err = ovl_do_symlink(dir, newdentry, attr->link); + err = ovl_do_symlink(ofs, dir, newdentry, attr->link); break; default: @@ -223,10 +224,11 @@ out: return newdentry; } -struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr) +struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, + struct ovl_cattr *attr) { - return ovl_create_real(d_inode(workdir), ovl_lookup_temp(workdir), - attr); + return ovl_create_real(ofs, d_inode(workdir), + ovl_lookup_temp(ofs, workdir), attr); } static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper, @@ -330,10 +332,9 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, attr->mode &= ~current_umask(); inode_lock_nested(udir, I_MUTEX_PARENT); - newdentry = ovl_create_real(udir, - lookup_one_len(dentry->d_name.name, - upperdir, - dentry->d_name.len), + newdentry = ovl_create_real(ofs, udir, + ovl_lookup_upper(ofs, dentry->d_name.name, + upperdir, dentry->d_name.len), attr); err = PTR_ERR(newdentry); if (IS_ERR(newdentry)) @@ -353,7 +354,7 @@ out_unlock: return err; out_cleanup: - ovl_cleanup(udir, newdentry); + ovl_cleanup(ofs, udir, newdentry); dput(newdentry); goto out_unlock; } @@ -361,6 +362,7 @@ out_cleanup: static struct dentry *ovl_clear_empty(struct dentry *dentry, struct list_head *list) { + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *workdir = ovl_workdir(dentry); struct inode *wdir = workdir->d_inode; struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); @@ -391,12 +393,12 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (upper->d_parent->d_inode != udir) goto out_unlock; - opaquedir = ovl_create_temp(workdir, OVL_CATTR(stat.mode)); + opaquedir = ovl_create_temp(ofs, workdir, OVL_CATTR(stat.mode)); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) goto out_unlock; - err = ovl_copy_xattr(dentry->d_sb, upper, opaquedir); + err = ovl_copy_xattr(dentry->d_sb, &upperpath, opaquedir); if (err) goto out_cleanup; @@ -405,17 +407,17 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, goto out_cleanup; inode_lock(opaquedir->d_inode); - err = ovl_set_attr(opaquedir, &stat); + err = 
ovl_set_attr(ofs, opaquedir, &stat); inode_unlock(opaquedir->d_inode); if (err) goto out_cleanup; - err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE); + err = ovl_do_rename(ofs, wdir, opaquedir, udir, upper, RENAME_EXCHANGE); if (err) goto out_cleanup; - ovl_cleanup_whiteouts(upper, list); - ovl_cleanup(wdir, upper); + ovl_cleanup_whiteouts(ofs, upper, list); + ovl_cleanup(ofs, wdir, upper); unlock_rename(workdir, upperdir); /* dentry's upper doesn't match now, get rid of it */ @@ -424,7 +426,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, return opaquedir; out_cleanup: - ovl_cleanup(wdir, opaquedir); + ovl_cleanup(ofs, wdir, opaquedir); dput(opaquedir); out_unlock: unlock_rename(workdir, upperdir); @@ -432,8 +434,8 @@ out: return ERR_PTR(err); } -static int ovl_set_upper_acl(struct dentry *upperdentry, const char *name, - const struct posix_acl *acl) +static int ovl_set_upper_acl(struct ovl_fs *ofs, struct dentry *upperdentry, + const char *name, const struct posix_acl *acl) { void *buffer; size_t size; @@ -451,7 +453,7 @@ static int ovl_set_upper_acl(struct dentry *upperdentry, const char *name, if (err < 0) goto out_free; - err = vfs_setxattr(&init_user_ns, upperdentry, name, buffer, size, XATTR_CREATE); + err = ovl_do_setxattr(ofs, upperdentry, name, buffer, size, XATTR_CREATE); out_free: kfree(buffer); return err; @@ -460,6 +462,7 @@ out_free: static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, struct ovl_cattr *cattr) { + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *workdir = ovl_workdir(dentry); struct inode *wdir = workdir->d_inode; struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); @@ -484,8 +487,8 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (err) goto out; - upper = lookup_one_len(dentry->d_name.name, upperdir, - dentry->d_name.len); + upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir, + dentry->d_name.len); err = PTR_ERR(upper); if (IS_ERR(upper)) goto out_unlock; @@ -494,7 +497,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (d_is_negative(upper) || !IS_WHITEOUT(d_inode(upper))) goto out_dput; - newdentry = ovl_create_temp(workdir, cattr); + newdentry = ovl_create_temp(ofs, workdir, cattr); err = PTR_ERR(newdentry); if (IS_ERR(newdentry)) goto out_dput; @@ -510,19 +513,19 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, .ia_mode = cattr->mode, }; inode_lock(newdentry->d_inode); - err = notify_change(&init_user_ns, newdentry, &attr, NULL); + err = ovl_do_notify_change(ofs, newdentry, &attr); inode_unlock(newdentry->d_inode); if (err) goto out_cleanup; } if (!hardlink) { - err = ovl_set_upper_acl(newdentry, XATTR_NAME_POSIX_ACL_ACCESS, - acl); + err = ovl_set_upper_acl(ofs, newdentry, + XATTR_NAME_POSIX_ACL_ACCESS, acl); if (err) goto out_cleanup; - err = ovl_set_upper_acl(newdentry, XATTR_NAME_POSIX_ACL_DEFAULT, - default_acl); + err = ovl_set_upper_acl(ofs, newdentry, + XATTR_NAME_POSIX_ACL_DEFAULT, default_acl); if (err) goto out_cleanup; } @@ -532,20 +535,20 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (err) goto out_cleanup; - err = ovl_do_rename(wdir, newdentry, udir, upper, + err = ovl_do_rename(ofs, wdir, newdentry, udir, upper, RENAME_EXCHANGE); if (err) goto out_cleanup; - ovl_cleanup(wdir, upper); + ovl_cleanup(ofs, wdir, upper); } else { - err = ovl_do_rename(wdir, newdentry, udir, upper, 0); + err = ovl_do_rename(ofs, 
wdir, newdentry, udir, upper, 0); if (err) goto out_cleanup; } err = ovl_instantiate(dentry, inode, newdentry, hardlink); if (err) { - ovl_cleanup(udir, newdentry); + ovl_cleanup(ofs, udir, newdentry); dput(newdentry); } out_dput: @@ -560,7 +563,7 @@ out: return err; out_cleanup: - ovl_cleanup(wdir, newdentry); + ovl_cleanup(ofs, wdir, newdentry); dput(newdentry); goto out_dput; } @@ -767,8 +770,8 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, if (err) goto out_dput; - upper = lookup_one_len(dentry->d_name.name, upperdir, - dentry->d_name.len); + upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir, + dentry->d_name.len); err = PTR_ERR(upper); if (IS_ERR(upper)) goto out_unlock; @@ -800,6 +803,7 @@ out: static int ovl_remove_upper(struct dentry *dentry, bool is_dir, struct list_head *list) { + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); struct inode *dir = upperdir->d_inode; struct dentry *upper; @@ -814,8 +818,8 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir, } inode_lock_nested(dir, I_MUTEX_PARENT); - upper = lookup_one_len(dentry->d_name.name, upperdir, - dentry->d_name.len); + upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir, + dentry->d_name.len); err = PTR_ERR(upper); if (IS_ERR(upper)) goto out_unlock; @@ -826,9 +830,9 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir, goto out_dput_upper; if (is_dir) - err = vfs_rmdir(&init_user_ns, dir, upper); + err = ovl_do_rmdir(ofs, dir, upper); else - err = vfs_unlink(&init_user_ns, dir, upper, NULL); + err = ovl_do_unlink(ofs, dir, upper); ovl_dir_modified(dentry->d_parent, ovl_type_origin(dentry)); /* @@ -880,7 +884,6 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) { int err; const struct cred *old_cred; - struct dentry *upperdentry; bool lower_positive = ovl_lower_positive(dentry); LIST_HEAD(list); @@ -923,9 +926,8 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) * Note: we fail to update ctime if there was no copy-up, only a * whiteout */ - upperdentry = ovl_dentry_upper(dentry); - if (upperdentry) - ovl_copyattr(d_inode(upperdentry), d_inode(dentry)); + if (ovl_dentry_upper(dentry)) + ovl_copyattr(d_inode(dentry)); out_drop_write: ovl_drop_write(dentry); @@ -1095,6 +1097,7 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir, bool samedir = olddir == newdir; struct dentry *opaquedir = NULL; const struct cred *old_cred = NULL; + struct ovl_fs *ofs = OVL_FS(old->d_sb); LIST_HEAD(list); err = -EINVAL; @@ -1189,8 +1192,8 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir, trap = lock_rename(new_upperdir, old_upperdir); - olddentry = lookup_one_len(old->d_name.name, old_upperdir, - old->d_name.len); + olddentry = ovl_lookup_upper(ofs, old->d_name.name, old_upperdir, + old->d_name.len); err = PTR_ERR(olddentry); if (IS_ERR(olddentry)) goto out_unlock; @@ -1199,8 +1202,8 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir, if (!ovl_matches_upper(old, olddentry)) goto out_dput_old; - newdentry = lookup_one_len(new->d_name.name, new_upperdir, - new->d_name.len); + newdentry = ovl_lookup_upper(ofs, new->d_name.name, new_upperdir, + new->d_name.len); err = PTR_ERR(newdentry); if (IS_ERR(newdentry)) goto out_dput_old; @@ -1251,13 +1254,13 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir, if (err) goto out_dput; - err = ovl_do_rename(old_upperdir->d_inode, olddentry, + err = 
ovl_do_rename(ofs, old_upperdir->d_inode, olddentry, new_upperdir->d_inode, newdentry, flags); if (err) goto out_dput; if (cleanup_whiteout) - ovl_cleanup(old_upperdir->d_inode, newdentry); + ovl_cleanup(ofs, old_upperdir->d_inode, newdentry); if (overwrite && d_inode(new)) { if (new_is_dir) @@ -1272,9 +1275,9 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir, (d_inode(new) && ovl_type_origin(new))); /* copy ctime: */ - ovl_copyattr(d_inode(olddentry), d_inode(old)); + ovl_copyattr(d_inode(old)); if (d_inode(new) && ovl_dentry_upper(new)) - ovl_copyattr(d_inode(newdentry), d_inode(new)); + ovl_copyattr(d_inode(new)); out_dput: dput(newdentry); diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index ebde05c9cf62..2eada97bbd23 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -391,6 +391,11 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected, * pointer because we hold no lock on the real dentry. */ take_dentry_name_snapshot(&name, real); + /* + * No mnt_userns handling here: it's an internal lookup. Could skip + * permission checking altogether, but for now just use non-mnt_userns + * transformed ids. + */ this = lookup_one_len(name.name.name, connected, name.name.len); release_dentry_name_snapshot(&name); err = PTR_ERR(this); diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index fa125feed0ff..daff601b5c41 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -38,9 +38,11 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode) #define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY) static struct file *ovl_open_realfile(const struct file *file, - struct inode *realinode) + struct path *realpath) { + struct inode *realinode = d_inode(realpath->dentry); struct inode *inode = file_inode(file); + struct user_namespace *real_mnt_userns; struct file *realfile; const struct cred *old_cred; int flags = file->f_flags | OVL_OPEN_FLAGS; @@ -51,11 +53,12 @@ static struct file *ovl_open_realfile(const struct file *file, acc_mode |= MAY_APPEND; old_cred = ovl_override_creds(inode->i_sb); - err = inode_permission(&init_user_ns, realinode, MAY_OPEN | acc_mode); + real_mnt_userns = mnt_user_ns(realpath->mnt); + err = inode_permission(real_mnt_userns, realinode, MAY_OPEN | acc_mode); if (err) { realfile = ERR_PTR(err); } else { - if (!inode_owner_or_capable(&init_user_ns, realinode)) + if (!inode_owner_or_capable(real_mnt_userns, realinode)) flags &= ~O_NOATIME; realfile = open_with_fake_path(&file->f_path, flags, realinode, @@ -82,11 +85,8 @@ static int ovl_change_flags(struct file *file, unsigned int flags) if (((flags ^ file->f_flags) & O_APPEND) && IS_APPEND(inode)) return -EPERM; - if (flags & O_DIRECT) { - if (!file->f_mapping->a_ops || - !file->f_mapping->a_ops->direct_IO) - return -EINVAL; - } + if ((flags & O_DIRECT) && !(file->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; if (file->f_op->check_flags) { err = file->f_op->check_flags(flags); @@ -104,21 +104,21 @@ static int ovl_change_flags(struct file *file, unsigned int flags) static int ovl_real_fdget_meta(const struct file *file, struct fd *real, bool allow_meta) { - struct inode *inode = file_inode(file); - struct inode *realinode; + struct dentry *dentry = file_dentry(file); + struct path realpath; real->flags = 0; real->file = file->private_data; if (allow_meta) - realinode = ovl_inode_real(inode); + ovl_path_real(dentry, &realpath); else - realinode = ovl_inode_realdata(inode); + ovl_path_realdata(dentry, &realpath); /* Has it been copied up 
since we'd opened it? */ - if (unlikely(file_inode(real->file) != realinode)) { + if (unlikely(file_inode(real->file) != d_inode(realpath.dentry))) { real->flags = FDPUT_FPUT; - real->file = ovl_open_realfile(file, realinode); + real->file = ovl_open_realfile(file, &realpath); return PTR_ERR_OR_ZERO(real->file); } @@ -144,17 +144,20 @@ static int ovl_real_fdget(const struct file *file, struct fd *real) static int ovl_open(struct inode *inode, struct file *file) { + struct dentry *dentry = file_dentry(file); struct file *realfile; + struct path realpath; int err; - err = ovl_maybe_copy_up(file_dentry(file), file->f_flags); + err = ovl_maybe_copy_up(dentry, file->f_flags); if (err) return err; /* No longer need these flags, so don't pass them on to underlying fs */ file->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); - realfile = ovl_open_realfile(file, ovl_inode_realdata(inode)); + ovl_path_realdata(dentry, &realpath); + realfile = ovl_open_realfile(file, &realpath); if (IS_ERR(realfile)) return PTR_ERR(realfile); @@ -273,7 +276,7 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) __sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb, SB_FREEZE_WRITE); file_end_write(iocb->ki_filp); - ovl_copyattr(ovl_inode_real(inode), inode); + ovl_copyattr(inode); } orig_iocb->ki_pos = iocb->ki_pos; @@ -306,8 +309,7 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) ret = -EINVAL; if (iocb->ki_flags & IOCB_DIRECT && - (!real.file->f_mapping->a_ops || - !real.file->f_mapping->a_ops->direct_IO)) + !(real.file->f_mode & FMODE_CAN_ODIRECT)) goto out_fdput; old_cred = ovl_override_creds(file_inode(file)->i_sb); @@ -356,7 +358,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) inode_lock(inode); /* Update mode */ - ovl_copyattr(ovl_inode_real(inode), inode); + ovl_copyattr(inode); ret = file_remove_privs(file); if (ret) goto out_unlock; @@ -367,8 +369,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) ret = -EINVAL; if (iocb->ki_flags & IOCB_DIRECT && - (!real.file->f_mapping->a_ops || - !real.file->f_mapping->a_ops->direct_IO)) + !(real.file->f_mode & FMODE_CAN_ODIRECT)) goto out_fdput; if (!ovl_should_sync(OVL_FS(inode->i_sb))) @@ -381,7 +382,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) ovl_iocb_to_rwf(ifl)); file_end_write(real.file); /* Update size */ - ovl_copyattr(ovl_inode_real(inode), inode); + ovl_copyattr(inode); } else { struct ovl_aio_req *aio_req; @@ -431,12 +432,11 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, struct fd real; const struct cred *old_cred; struct inode *inode = file_inode(out); - struct inode *realinode = ovl_inode_real(inode); ssize_t ret; inode_lock(inode); /* Update mode */ - ovl_copyattr(realinode, inode); + ovl_copyattr(inode); ret = file_remove_privs(out); if (ret) goto out_unlock; @@ -452,7 +452,7 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, file_end_write(real.file); /* Update size */ - ovl_copyattr(realinode, inode); + ovl_copyattr(inode); revert_creds(old_cred); fdput(real); @@ -526,7 +526,7 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len revert_creds(old_cred); /* Update size */ - ovl_copyattr(ovl_inode_real(inode), inode); + ovl_copyattr(inode); fdput(real); @@ -598,7 +598,7 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, revert_creds(old_cred); /* Update size */ - ovl_copyattr(ovl_inode_real(inode_out), 
inode_out); + ovl_copyattr(inode_out); fdput(real_in); fdput(real_out); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 1f36158c7dbe..492eddeb481f 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -21,6 +21,7 @@ int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *attr) { int err; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); bool full_copy_up = false; struct dentry *upperdentry; const struct cred *old_cred; @@ -77,10 +78,10 @@ int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, inode_lock(upperdentry->d_inode); old_cred = ovl_override_creds(dentry->d_sb); - err = notify_change(&init_user_ns, upperdentry, attr, NULL); + err = ovl_do_notify_change(ofs, upperdentry, attr); revert_creds(old_cred); if (!err) - ovl_copyattr(upperdentry->d_inode, dentry->d_inode); + ovl_copyattr(dentry->d_inode); inode_unlock(upperdentry->d_inode); if (winode) @@ -279,12 +280,14 @@ int ovl_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask) { struct inode *upperinode = ovl_inode_upper(inode); - struct inode *realinode = upperinode ?: ovl_inode_lower(inode); + struct inode *realinode; + struct path realpath; const struct cred *old_cred; int err; /* Careful in RCU walk mode */ - if (!realinode) { + ovl_i_path_real(inode, &realpath); + if (!realpath.dentry) { WARN_ON(!(mask & MAY_NOT_BLOCK)); return -ECHILD; } @@ -297,6 +300,7 @@ int ovl_permission(struct user_namespace *mnt_userns, if (err) return err; + realinode = d_inode(realpath.dentry); old_cred = ovl_override_creds(inode->i_sb); if (!upperinode && !special_file(realinode->i_mode) && mask & MAY_WRITE) { @@ -304,7 +308,7 @@ int ovl_permission(struct user_namespace *mnt_userns, /* Make sure mounter can read file for copy up later */ mask |= MAY_READ; } - err = inode_permission(&init_user_ns, realinode, mask); + err = inode_permission(mnt_user_ns(realpath.mnt), realinode, mask); revert_creds(old_cred); return err; @@ -342,8 +346,10 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { int err; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *upperdentry = ovl_i_dentry_upper(inode); struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry); + struct path realpath; const struct cred *old_cred; err = ovl_want_write(dentry); @@ -351,8 +357,9 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, goto out; if (!value && !upperdentry) { + ovl_path_lower(dentry, &realpath); old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_getxattr(&init_user_ns, realdentry, name, NULL, 0); + err = vfs_getxattr(mnt_user_ns(realpath.mnt), realdentry, name, NULL, 0); revert_creds(old_cred); if (err < 0) goto out_drop_write; @@ -367,17 +374,17 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, } old_cred = ovl_override_creds(dentry->d_sb); - if (value) - err = vfs_setxattr(&init_user_ns, realdentry, name, value, size, - flags); - else { + if (value) { + err = ovl_do_setxattr(ofs, realdentry, name, value, size, + flags); + } else { WARN_ON(flags != XATTR_REPLACE); - err = vfs_removexattr(&init_user_ns, realdentry, name); + err = ovl_do_removexattr(ofs, realdentry, name); } revert_creds(old_cred); /* copy c/mtime */ - ovl_copyattr(d_inode(realdentry), inode); + ovl_copyattr(inode); out_drop_write: ovl_drop_write(dentry); @@ -390,11 +397,11 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, { 
ssize_t res; const struct cred *old_cred; - struct dentry *realdentry = - ovl_i_dentry_upper(inode) ?: ovl_dentry_lower(dentry); + struct path realpath; + ovl_i_path_real(inode, &realpath); old_cred = ovl_override_creds(dentry->d_sb); - res = vfs_getxattr(&init_user_ns, realdentry, name, value, size); + res = vfs_getxattr(mnt_user_ns(realpath.mnt), realpath.dentry, name, value, size); revert_creds(old_cred); return res; } @@ -535,7 +542,7 @@ int ovl_real_fileattr_set(struct path *realpath, struct fileattr *fa) if (err) return err; - return vfs_fileattr_set(&init_user_ns, realpath->dentry, fa); + return vfs_fileattr_set(mnt_user_ns(realpath->mnt), realpath->dentry, fa); } int ovl_fileattr_set(struct user_namespace *mnt_userns, @@ -579,7 +586,7 @@ int ovl_fileattr_set(struct user_namespace *mnt_userns, inode_set_flags(inode, flags, OVL_COPY_I_FLAGS_MASK); /* Update ctime */ - ovl_copyattr(ovl_inode_real(inode), inode); + ovl_copyattr(inode); } ovl_drop_write(dentry); out: @@ -777,16 +784,19 @@ void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip, unsigned long ino, int fsid) { struct inode *realinode; + struct ovl_inode *oi = OVL_I(inode); if (oip->upperdentry) - OVL_I(inode)->__upperdentry = oip->upperdentry; - if (oip->lowerpath && oip->lowerpath->dentry) - OVL_I(inode)->lower = igrab(d_inode(oip->lowerpath->dentry)); + oi->__upperdentry = oip->upperdentry; + if (oip->lowerpath && oip->lowerpath->dentry) { + oi->lowerpath.dentry = dget(oip->lowerpath->dentry); + oi->lowerpath.layer = oip->lowerpath->layer; + } if (oip->lowerdata) - OVL_I(inode)->lowerdata = igrab(d_inode(oip->lowerdata)); + oi->lowerdata = igrab(d_inode(oip->lowerdata)); realinode = ovl_inode_real(inode); - ovl_copyattr(realinode, inode); + ovl_copyattr(inode); ovl_copyflags(realinode, inode); ovl_map_ino(inode, ino, fsid); } @@ -871,8 +881,8 @@ static int ovl_set_nlink_common(struct dentry *dentry, if (WARN_ON(len >= sizeof(buf))) return -EIO; - return ovl_do_setxattr(OVL_FS(inode->i_sb), ovl_dentry_upper(dentry), - OVL_XATTR_NLINK, buf, len); + return ovl_setxattr(OVL_FS(inode->i_sb), ovl_dentry_upper(dentry), + OVL_XATTR_NLINK, buf, len); } int ovl_set_nlink_upper(struct dentry *dentry) @@ -897,8 +907,8 @@ unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry, if (!lowerdentry || !upperdentry || d_inode(lowerdentry)->i_nlink == 1) return fallback; - err = ovl_do_getxattr(ofs, upperdentry, OVL_XATTR_NLINK, - &buf, sizeof(buf) - 1); + err = ovl_getxattr_upper(ofs, upperdentry, OVL_XATTR_NLINK, + &buf, sizeof(buf) - 1); if (err < 0) goto fail; @@ -1102,6 +1112,10 @@ struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL; struct inode *inode; struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL; + struct path realpath = { + .dentry = upperdentry ?: lowerdentry, + .mnt = upperdentry ? ovl_upper_mnt(ofs) : lowerpath->layer->mnt, + }; bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, oip->index); int fsid = bylower ? 
lowerpath->layer->fsid : 0; @@ -1175,7 +1189,7 @@ struct inode *ovl_get_inode(struct super_block *sb, /* Check for non-merge dir that may have whiteouts */ if (is_dir) { if (((upperdentry && lowerdentry) || oip->numlower > 1) || - ovl_check_origin_xattr(ofs, upperdentry ?: lowerdentry)) { + ovl_path_check_origin_xattr(ofs, &realpath)) { ovl_set_flag(OVL_WHITEOUTS, inode); } } diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 1a9b515fc45d..65c4346a5b43 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -16,6 +16,7 @@ struct ovl_lookup_data { struct super_block *sb; + struct vfsmount *mnt; struct qstr name; bool is_dir; bool opaque; @@ -25,14 +26,14 @@ struct ovl_lookup_data { bool metacopy; }; -static int ovl_check_redirect(struct dentry *dentry, struct ovl_lookup_data *d, +static int ovl_check_redirect(struct path *path, struct ovl_lookup_data *d, size_t prelen, const char *post) { int res; char *buf; struct ovl_fs *ofs = OVL_FS(d->sb); - buf = ovl_get_redirect_xattr(ofs, dentry, prelen + strlen(post)); + buf = ovl_get_redirect_xattr(ofs, path, prelen + strlen(post)); if (IS_ERR_OR_NULL(buf)) return PTR_ERR(buf); @@ -105,13 +106,13 @@ int ovl_check_fb_len(struct ovl_fb *fb, int fb_len) return 0; } -static struct ovl_fh *ovl_get_fh(struct ovl_fs *ofs, struct dentry *dentry, +static struct ovl_fh *ovl_get_fh(struct ovl_fs *ofs, struct dentry *upperdentry, enum ovl_xattr ox) { int res, err; struct ovl_fh *fh = NULL; - res = ovl_do_getxattr(ofs, dentry, ox, NULL, 0); + res = ovl_getxattr_upper(ofs, upperdentry, ox, NULL, 0); if (res < 0) { if (res == -ENODATA || res == -EOPNOTSUPP) return NULL; @@ -125,7 +126,7 @@ static struct ovl_fh *ovl_get_fh(struct ovl_fs *ofs, struct dentry *dentry, if (!fh) return ERR_PTR(-ENOMEM); - res = ovl_do_getxattr(ofs, dentry, ox, fh->buf, res); + res = ovl_getxattr_upper(ofs, upperdentry, ox, fh->buf, res); if (res < 0) goto fail; @@ -193,16 +194,17 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh, return real; } -static bool ovl_is_opaquedir(struct super_block *sb, struct dentry *dentry) +static bool ovl_is_opaquedir(struct ovl_fs *ofs, struct path *path) { - return ovl_check_dir_xattr(sb, dentry, OVL_XATTR_OPAQUE); + return ovl_path_check_dir_xattr(ofs, path, OVL_XATTR_OPAQUE); } -static struct dentry *ovl_lookup_positive_unlocked(const char *name, +static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d, + const char *name, struct dentry *base, int len, bool drop_negative) { - struct dentry *ret = lookup_one_len_unlocked(name, base, len); + struct dentry *ret = lookup_one_unlocked(mnt_user_ns(d->mnt), name, base, len); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { if (drop_negative && ret->d_lockref.count == 1) { @@ -224,10 +226,11 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, struct dentry **ret, bool drop_negative) { struct dentry *this; + struct path path; int err; bool last_element = !post[0]; - this = ovl_lookup_positive_unlocked(name, base, namelen, drop_negative); + this = ovl_lookup_positive_unlocked(d, name, base, namelen, drop_negative); if (IS_ERR(this)) { err = PTR_ERR(this); this = NULL; @@ -253,12 +256,15 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, d->stop = true; goto put_and_out; } + + path.dentry = this; + path.mnt = d->mnt; if (!d_can_lookup(this)) { if (d->is_dir || !last_element) { d->stop = true; goto put_and_out; } - err = ovl_check_metacopy_xattr(OVL_FS(d->sb), this); 
+ err = ovl_check_metacopy_xattr(OVL_FS(d->sb), &path); if (err < 0) goto out_err; @@ -278,14 +284,14 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, if (d->last) goto out; - if (ovl_is_opaquedir(d->sb, this)) { + if (ovl_is_opaquedir(OVL_FS(d->sb), &path)) { d->stop = true; if (last_element) d->opaque = true; goto out; } } - err = ovl_check_redirect(this, d, prelen, post); + err = ovl_check_redirect(&path, d, prelen, post); if (err) goto out_err; out: @@ -464,7 +470,7 @@ int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, err = ovl_verify_fh(ofs, dentry, ox, fh); if (set && err == -ENODATA) - err = ovl_do_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len); + err = ovl_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len); if (err) goto fail; @@ -704,7 +710,8 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, if (err) return ERR_PTR(err); - index = lookup_positive_unlocked(name.name, ofs->indexdir, name.len); + index = lookup_one_positive_unlocked(ovl_upper_mnt_userns(ofs), name.name, + ofs->indexdir, name.len); if (IS_ERR(index)) { err = PTR_ERR(index); if (err == -ENOENT) { @@ -856,6 +863,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, old_cred = ovl_override_creds(dentry->d_sb); upperdir = ovl_dentry_upper(dentry->d_parent); if (upperdir) { + d.mnt = ovl_upper_mnt(ofs); err = ovl_lookup_layer(upperdir, &d, &upperdentry, true); if (err) goto out; @@ -911,6 +919,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, else d.last = lower.layer->idx == roe->numlower; + d.mnt = lower.layer->mnt; err = ovl_lookup_layer(lower.dentry, &d, &this, false); if (err) goto out_put; @@ -1071,14 +1080,18 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (upperdentry) ovl_dentry_set_upper_alias(dentry); else if (index) { - upperdentry = dget(index); - upperredirect = ovl_get_redirect_xattr(ofs, upperdentry, 0); + struct path upperpath = { + .dentry = upperdentry = dget(index), + .mnt = ovl_upper_mnt(ofs), + }; + + upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0); if (IS_ERR(upperredirect)) { err = PTR_ERR(upperredirect); upperredirect = NULL; goto out_free_oe; } - err = ovl_check_metacopy_xattr(ofs, upperdentry); + err = ovl_check_metacopy_xattr(ofs, &upperpath); if (err < 0) goto out_free_oe; uppermetacopy = err; @@ -1163,8 +1176,8 @@ bool ovl_lower_positive(struct dentry *dentry) struct dentry *this; struct dentry *lowerdir = poe->lowerstack[i].dentry; - this = lookup_positive_unlocked(name->name, lowerdir, - name->len); + this = lookup_one_positive_unlocked(mnt_user_ns(poe->lowerstack[i].layer->mnt), + name->name, lowerdir, name->len); if (IS_ERR(this)) { switch (PTR_ERR(this)) { case -ENOENT: diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 2cd5741c873b..4f34b7e02eee 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -7,6 +7,7 @@ #include <linux/kernel.h> #include <linux/uuid.h> #include <linux/fs.h> +#include <linux/namei.h> #include "ovl_entry.h" #undef pr_fmt @@ -122,109 +123,180 @@ static inline const char *ovl_xattr(struct ovl_fs *ofs, enum ovl_xattr ox) return ovl_xattr_table[ox][ofs->config.userxattr]; } -static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) +/* + * When changing ownership of an upper object map the intended ownership + * according to the upper layer's idmapping. 
When an upper mount idmaps files + * that are stored on-disk as owned by id 1001 to id 1000, this means stat on + * this object will report it as being owned by id 1000 when calling stat via + * the upper mount. + * In order to change ownership of an object so stat reports id 1000 when + * called on an idmapped upper mount, the value written to disk - i.e., the + * value stored in ia_*id - must be 1001. The mount mapping helper will thus take + * care to map 1000 to 1001. + * The mnt idmapping helpers are nops if the upper layer isn't idmapped. + */ +static inline int ovl_do_notify_change(struct ovl_fs *ofs, + struct dentry *upperdentry, + struct iattr *attr) +{ + struct user_namespace *upper_mnt_userns = ovl_upper_mnt_userns(ofs); + struct user_namespace *fs_userns = i_user_ns(d_inode(upperdentry)); + + if (attr->ia_valid & ATTR_UID) + attr->ia_uid = mapped_kuid_user(upper_mnt_userns, + fs_userns, attr->ia_uid); + if (attr->ia_valid & ATTR_GID) + attr->ia_gid = mapped_kgid_user(upper_mnt_userns, + fs_userns, attr->ia_gid); + + return notify_change(upper_mnt_userns, upperdentry, attr, NULL); +} + +static inline int ovl_do_rmdir(struct ovl_fs *ofs, + struct inode *dir, struct dentry *dentry) { - int err = vfs_rmdir(&init_user_ns, dir, dentry); + int err = vfs_rmdir(ovl_upper_mnt_userns(ofs), dir, dentry); pr_debug("rmdir(%pd2) = %i\n", dentry, err); return err; } -static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry) +static inline int ovl_do_unlink(struct ovl_fs *ofs, struct inode *dir, + struct dentry *dentry) { - int err = vfs_unlink(&init_user_ns, dir, dentry, NULL); + int err = vfs_unlink(ovl_upper_mnt_userns(ofs), dir, dentry, NULL); pr_debug("unlink(%pd2) = %i\n", dentry, err); return err; } -static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir, - struct dentry *new_dentry) +static inline int ovl_do_link(struct ovl_fs *ofs, struct dentry *old_dentry, + struct inode *dir, struct dentry *new_dentry) { - int err = vfs_link(old_dentry, &init_user_ns, dir, new_dentry, NULL); + int err = vfs_link(old_dentry, ovl_upper_mnt_userns(ofs), dir, new_dentry, NULL); pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, err); return err; } -static inline int ovl_do_create(struct inode *dir, struct dentry *dentry, +static inline int ovl_do_create(struct ovl_fs *ofs, + struct inode *dir, struct dentry *dentry, umode_t mode) { - int err = vfs_create(&init_user_ns, dir, dentry, mode, true); + int err = vfs_create(ovl_upper_mnt_userns(ofs), dir, dentry, mode, true); pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); return err; } -static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry, +static inline int ovl_do_mkdir(struct ovl_fs *ofs, + struct inode *dir, struct dentry *dentry, umode_t mode) { - int err = vfs_mkdir(&init_user_ns, dir, dentry, mode); + int err = vfs_mkdir(ovl_upper_mnt_userns(ofs), dir, dentry, mode); pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); return err; } -static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry, +static inline int ovl_do_mknod(struct ovl_fs *ofs, + struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { - int err = vfs_mknod(&init_user_ns, dir, dentry, mode, dev); + int err = vfs_mknod(ovl_upper_mnt_userns(ofs), dir, dentry, mode, dev); pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err); return err; } -static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry, +static inline int ovl_do_symlink(struct ovl_fs *ofs, + struct inode
*dir, struct dentry *dentry, const char *oldname) { - int err = vfs_symlink(&init_user_ns, dir, dentry, oldname); + int err = vfs_symlink(ovl_upper_mnt_userns(ofs), dir, dentry, oldname); pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err); return err; } -static inline ssize_t ovl_do_getxattr(struct ovl_fs *ofs, struct dentry *dentry, - enum ovl_xattr ox, void *value, - size_t size) +static inline ssize_t ovl_do_getxattr(struct path *path, const char *name, + void *value, size_t size) { - const char *name = ovl_xattr(ofs, ox); - int err = vfs_getxattr(&init_user_ns, dentry, name, value, size); - int len = (value && err > 0) ? err : 0; + int err, len; + + WARN_ON(path->dentry->d_sb != path->mnt->mnt_sb); + + err = vfs_getxattr(mnt_user_ns(path->mnt), path->dentry, + name, value, size); + len = (value && err > 0) ? err : 0; pr_debug("getxattr(%pd2, \"%s\", \"%*pE\", %zu, 0) = %i\n", - dentry, name, min(len, 48), value, size, err); + path->dentry, name, min(len, 48), value, size, err); return err; } +static inline ssize_t ovl_getxattr_upper(struct ovl_fs *ofs, + struct dentry *upperdentry, + enum ovl_xattr ox, void *value, + size_t size) +{ + struct path upperpath = { + .dentry = upperdentry, + .mnt = ovl_upper_mnt(ofs), + }; + + return ovl_do_getxattr(&upperpath, ovl_xattr(ofs, ox), value, size); +} + +static inline ssize_t ovl_path_getxattr(struct ovl_fs *ofs, + struct path *path, + enum ovl_xattr ox, void *value, + size_t size) +{ + return ovl_do_getxattr(path, ovl_xattr(ofs, ox), value, size); +} + static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry, - enum ovl_xattr ox, const void *value, - size_t size) + const char *name, const void *value, + size_t size, int flags) { - const char *name = ovl_xattr(ofs, ox); - int err = vfs_setxattr(&init_user_ns, dentry, name, value, size, 0); - pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, 0) = %i\n", - dentry, name, min((int)size, 48), value, size, err); + int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name, value, size, flags); + + pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n", + dentry, name, min((int)size, 48), value, size, flags, err); return err; } +static inline int ovl_setxattr(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox, const void *value, + size_t size) +{ + return ovl_do_setxattr(ofs, dentry, ovl_xattr(ofs, ox), value, size, 0); +} + static inline int ovl_do_removexattr(struct ovl_fs *ofs, struct dentry *dentry, - enum ovl_xattr ox) + const char *name) { - const char *name = ovl_xattr(ofs, ox); - int err = vfs_removexattr(&init_user_ns, dentry, name); + int err = vfs_removexattr(ovl_upper_mnt_userns(ofs), dentry, name); pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err); return err; } -static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry, - struct inode *newdir, struct dentry *newdentry, - unsigned int flags) +static inline int ovl_removexattr(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox) +{ + return ovl_do_removexattr(ofs, dentry, ovl_xattr(ofs, ox)); +} + +static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir, + struct dentry *olddentry, struct inode *newdir, + struct dentry *newdentry, unsigned int flags) { int err; struct renamedata rd = { - .old_mnt_userns = &init_user_ns, + .old_mnt_userns = ovl_upper_mnt_userns(ofs), .old_dir = olddir, .old_dentry = olddentry, - .new_mnt_userns = &init_user_ns, + .new_mnt_userns = ovl_upper_mnt_userns(ofs), .new_dir = newdir, .new_dentry = 
newdentry, .flags = flags, @@ -239,22 +311,31 @@ static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry, return err; } -static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry) +static inline int ovl_do_whiteout(struct ovl_fs *ofs, + struct inode *dir, struct dentry *dentry) { - int err = vfs_whiteout(&init_user_ns, dir, dentry); + int err = vfs_whiteout(ovl_upper_mnt_userns(ofs), dir, dentry); pr_debug("whiteout(%pd2) = %i\n", dentry, err); return err; } -static inline struct dentry *ovl_do_tmpfile(struct dentry *dentry, umode_t mode) +static inline struct dentry *ovl_do_tmpfile(struct ovl_fs *ofs, + struct dentry *dentry, umode_t mode) { - struct dentry *ret = vfs_tmpfile(&init_user_ns, dentry, mode, 0); + struct dentry *ret = vfs_tmpfile(ovl_upper_mnt_userns(ofs), dentry, mode, 0); int err = PTR_ERR_OR_ZERO(ret); pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err); return ret; } +static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs, + const char *name, + struct dentry *base, int len) +{ + return lookup_one(ovl_upper_mnt_userns(ofs), name, base, len); +} + static inline bool ovl_open_flags_need_copy_up(int flags) { if (!flags) @@ -293,10 +374,13 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry); void ovl_path_upper(struct dentry *dentry, struct path *path); void ovl_path_lower(struct dentry *dentry, struct path *path); void ovl_path_lowerdata(struct dentry *dentry, struct path *path); +void ovl_i_path_real(struct inode *inode, struct path *path); enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); +enum ovl_path_type ovl_path_realdata(struct dentry *dentry, struct path *path); struct dentry *ovl_dentry_upper(struct dentry *dentry); struct dentry *ovl_dentry_lower(struct dentry *dentry); struct dentry *ovl_dentry_lowerdata(struct dentry *dentry); +const struct ovl_layer *ovl_i_layer_lower(struct inode *inode); const struct ovl_layer *ovl_layer_lower(struct dentry *dentry); struct dentry *ovl_dentry_real(struct dentry *dentry); struct dentry *ovl_i_dentry_upper(struct inode *inode); @@ -330,9 +414,20 @@ struct file *ovl_path_open(struct path *path, int flags); int ovl_copy_up_start(struct dentry *dentry, int flags); void ovl_copy_up_end(struct dentry *dentry); bool ovl_already_copied_up(struct dentry *dentry, int flags); -bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry); -bool ovl_check_dir_xattr(struct super_block *sb, struct dentry *dentry, - enum ovl_xattr ox); +bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, struct path *path, + enum ovl_xattr ox); +bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, struct path *path); + +static inline bool ovl_check_origin_xattr(struct ovl_fs *ofs, + struct dentry *upperdentry) +{ + struct path upperpath = { + .dentry = upperdentry, + .mnt = ovl_upper_mnt(ofs), + }; + return ovl_path_check_origin_xattr(ofs, &upperpath); +} + int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry, enum ovl_xattr ox, const void *value, size_t size, int xerr); @@ -344,10 +439,9 @@ bool ovl_need_index(struct dentry *dentry); int ovl_nlink_start(struct dentry *dentry); void ovl_nlink_end(struct dentry *dentry); int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir); -int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct dentry *dentry); +int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct path *path); bool ovl_is_metacopy_dentry(struct dentry *dentry); -char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct 
dentry *dentry, - int padding); +char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct path *path, int padding); int ovl_sync_status(struct ovl_fs *ofs); static inline void ovl_set_flag(unsigned long flag, struct inode *inode) @@ -366,9 +460,15 @@ static inline bool ovl_test_flag(unsigned long flag, struct inode *inode) } static inline bool ovl_is_impuredir(struct super_block *sb, - struct dentry *dentry) + struct dentry *upperdentry) { - return ovl_check_dir_xattr(sb, dentry, OVL_XATTR_IMPURE); + struct ovl_fs *ofs = OVL_FS(sb); + struct path upperpath = { + .dentry = upperdentry, + .mnt = ovl_upper_mnt(ofs), + }; + + return ovl_path_check_dir_xattr(ofs, &upperpath, OVL_XATTR_IMPURE); } /* @@ -461,12 +561,13 @@ static inline int ovl_verify_upper(struct ovl_fs *ofs, struct dentry *index, extern const struct file_operations ovl_dir_operations; struct file *ovl_dir_real_file(const struct file *file, bool want_upper); int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list); -void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list); +void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper, + struct list_head *list); void ovl_cache_free(struct list_head *list); void ovl_dir_cache_free(struct inode *inode); int ovl_check_d_type_supported(struct path *realpath); -int ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, - struct dentry *dentry, int level); +int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir, + struct vfsmount *mnt, struct dentry *dentry, int level); int ovl_indexdir_cleanup(struct ovl_fs *ofs); /* @@ -520,16 +621,7 @@ bool ovl_lookup_trap_inode(struct super_block *sb, struct dentry *dir); struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir); struct inode *ovl_get_inode(struct super_block *sb, struct ovl_inode_params *oip); -static inline void ovl_copyattr(struct inode *from, struct inode *to) -{ - to->i_uid = from->i_uid; - to->i_gid = from->i_gid; - to->i_mode = from->i_mode; - to->i_atime = from->i_atime; - to->i_mtime = from->i_mtime; - to->i_ctime = from->i_ctime; - i_size_write(to, i_size_read(from)); -} +void ovl_copyattr(struct inode *to); /* vfs inode flags copied from real to ovl inode */ #define OVL_COPY_I_FLAGS_MASK (S_SYNC | S_NOATIME | S_APPEND | S_IMMUTABLE) @@ -570,12 +662,15 @@ struct ovl_cattr { #define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) }) -int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode); -struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry, +int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir, + struct dentry **newdentry, umode_t mode); +struct dentry *ovl_create_real(struct ovl_fs *ofs, + struct inode *dir, struct dentry *newdentry, + struct ovl_cattr *attr); +int ovl_cleanup(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry); +struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir); +struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, struct ovl_cattr *attr); -int ovl_cleanup(struct inode *dir, struct dentry *dentry); -struct dentry *ovl_lookup_temp(struct dentry *workdir); -struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr); /* file.c */ extern const struct file_operations ovl_file_operations; @@ -591,9 +686,8 @@ int ovl_fileattr_set(struct user_namespace *mnt_userns, int ovl_copy_up(struct dentry *dentry); int ovl_copy_up_with_data(struct dentry *dentry); int ovl_maybe_copy_up(struct dentry *dentry, int flags); -int 
ovl_copy_xattr(struct super_block *sb, struct dentry *old, - struct dentry *new); -int ovl_set_attr(struct dentry *upper, struct kstat *stat); +int ovl_copy_xattr(struct super_block *sb, struct path *path, struct dentry *new); +int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat); struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, bool is_upper); int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower, diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 63efee554f69..e1af8f660698 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -90,6 +90,11 @@ static inline struct vfsmount *ovl_upper_mnt(struct ovl_fs *ofs) return ofs->layers[0].mnt; } +static inline struct user_namespace *ovl_upper_mnt_userns(struct ovl_fs *ofs) +{ + return mnt_user_ns(ovl_upper_mnt(ofs)); +} + static inline struct ovl_fs *OVL_FS(struct super_block *sb) { return (struct ovl_fs *)sb->s_fs_info; @@ -129,7 +134,7 @@ struct ovl_inode { unsigned long flags; struct inode vfs_inode; struct dentry *__upperdentry; - struct inode *lower; + struct ovl_path lowerpath; /* synchronize copy up and more */ struct mutex lock; diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 150fdf3bc68d..78f62cc1797b 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -264,11 +264,11 @@ static int ovl_fill_merge(struct dir_context *ctx, const char *name, return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type); } -static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) +static int ovl_check_whiteouts(struct path *path, struct ovl_readdir_data *rdd) { int err; struct ovl_cache_entry *p; - struct dentry *dentry; + struct dentry *dentry, *dir = path->dentry; const struct cred *old_cred; old_cred = ovl_override_creds(rdd->dentry->d_sb); @@ -278,7 +278,7 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) while (rdd->first_maybe_whiteout) { p = rdd->first_maybe_whiteout; rdd->first_maybe_whiteout = p->next_maybe_whiteout; - dentry = lookup_one_len(p->name, dir, p->len); + dentry = lookup_one(mnt_user_ns(path->mnt), p->name, dir, p->len); if (!IS_ERR(dentry)) { p->is_whiteout = ovl_is_whiteout(dentry); dput(dentry); @@ -312,7 +312,7 @@ static inline int ovl_dir_read(struct path *realpath, } while (!err && rdd->count); if (!err && rdd->first_maybe_whiteout && rdd->dentry) - err = ovl_check_whiteouts(realpath->dentry, rdd); + err = ovl_check_whiteouts(realpath, rdd); fput(realfile); @@ -479,7 +479,7 @@ static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p) goto get; } } - this = lookup_one_len(p->name, dir, p->len); + this = lookup_one(mnt_user_ns(path->mnt), p->name, dir, p->len); if (IS_ERR_OR_NULL(this) || !this->d_inode) { /* Mark a stale entry */ p->is_whiteout = true; @@ -623,8 +623,8 @@ static struct ovl_dir_cache *ovl_cache_get_impure(struct path *path) * Removing the "impure" xattr is best effort. 
*/ if (!ovl_want_write(dentry)) { - ovl_do_removexattr(ofs, ovl_dentry_upper(dentry), - OVL_XATTR_IMPURE); + ovl_removexattr(ofs, ovl_dentry_upper(dentry), + OVL_XATTR_IMPURE); ovl_drop_write(dentry); } ovl_clear_flag(OVL_IMPURE, d_inode(dentry)); @@ -1001,7 +1001,8 @@ del_entry: return err; } -void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) +void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper, + struct list_head *list) { struct ovl_cache_entry *p; @@ -1012,7 +1013,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) if (WARN_ON(!p->is_whiteout || !p->is_upper)) continue; - dentry = lookup_one_len(p->name, upper, p->len); + dentry = ovl_lookup_upper(ofs, p->name, upper, p->len); if (IS_ERR(dentry)) { pr_err("lookup '%s/%.*s' failed (%i)\n", upper->d_name.name, p->len, p->name, @@ -1020,7 +1021,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) continue; } if (dentry->d_inode) - ovl_cleanup(upper->d_inode, dentry); + ovl_cleanup(ofs, upper->d_inode, dentry); dput(dentry); } inode_unlock(upper->d_inode); @@ -1064,7 +1065,8 @@ int ovl_check_d_type_supported(struct path *realpath) #define OVL_INCOMPATDIR_NAME "incompat" -static int ovl_workdir_cleanup_recurse(struct path *path, int level) +static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, struct path *path, + int level) { int err; struct inode *dir = path->dentry->d_inode; @@ -1111,11 +1113,11 @@ static int ovl_workdir_cleanup_recurse(struct path *path, int level) err = -EINVAL; break; } - dentry = lookup_one_len(p->name, path->dentry, p->len); + dentry = ovl_lookup_upper(ofs, p->name, path->dentry, p->len); if (IS_ERR(dentry)) continue; if (dentry->d_inode) - err = ovl_workdir_cleanup(dir, path->mnt, dentry, level); + err = ovl_workdir_cleanup(ofs, dir, path->mnt, dentry, level); dput(dentry); if (err) break; @@ -1126,24 +1128,24 @@ out: return err; } -int ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, - struct dentry *dentry, int level) +int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir, + struct vfsmount *mnt, struct dentry *dentry, int level) { int err; if (!d_is_dir(dentry) || level > 1) { - return ovl_cleanup(dir, dentry); + return ovl_cleanup(ofs, dir, dentry); } - err = ovl_do_rmdir(dir, dentry); + err = ovl_do_rmdir(ofs, dir, dentry); if (err) { struct path path = { .mnt = mnt, .dentry = dentry }; inode_unlock(dir); - err = ovl_workdir_cleanup_recurse(&path, level + 1); + err = ovl_workdir_cleanup_recurse(ofs, &path, level + 1); inode_lock_nested(dir, I_MUTEX_PARENT); if (!err) - err = ovl_cleanup(dir, dentry); + err = ovl_cleanup(ofs, dir, dentry); } return err; @@ -1179,7 +1181,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) if (p->len == 2 && p->name[1] == '.') continue; } - index = lookup_one_len(p->name, indexdir, p->len); + index = ovl_lookup_upper(ofs, p->name, indexdir, p->len); if (IS_ERR(index)) { err = PTR_ERR(index); index = NULL; @@ -1187,7 +1189,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) } /* Cleanup leftover from index create/cleanup attempt */ if (index->d_name.name[0] == '#') { - err = ovl_workdir_cleanup(dir, path.mnt, index, 1); + err = ovl_workdir_cleanup(ofs, dir, path.mnt, index, 1); if (err) break; goto next; @@ -1197,7 +1199,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) goto next; } else if (err == -ESTALE) { /* Cleanup stale index entries */ - err = ovl_cleanup(dir, index); + err = ovl_cleanup(ofs, dir, index); } else if (err != -ENOENT) { /* * Abort mount to avoid 
corrupting the index if @@ -1213,7 +1215,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) err = ovl_cleanup_and_whiteout(ofs, dir, index); } else { /* Cleanup orphan index entries */ - err = ovl_cleanup(dir, index); + err = ovl_cleanup(ofs, dir, index); } if (err) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 001cdbb8f015..e0a2e0468ee7 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -184,7 +184,8 @@ static struct inode *ovl_alloc_inode(struct super_block *sb) oi->version = 0; oi->flags = 0; oi->__upperdentry = NULL; - oi->lower = NULL; + oi->lowerpath.dentry = NULL; + oi->lowerpath.layer = NULL; oi->lowerdata = NULL; mutex_init(&oi->lock); @@ -205,7 +206,7 @@ static void ovl_destroy_inode(struct inode *inode) struct ovl_inode *oi = OVL_I(inode); dput(oi->__upperdentry); - iput(oi->lower); + dput(oi->lowerpath.dentry); if (S_ISDIR(inode->i_mode)) ovl_dir_cache_free(inode); else @@ -761,7 +762,7 @@ static struct dentry *ovl_workdir_create(struct ovl_fs *ofs, inode_lock_nested(dir, I_MUTEX_PARENT); retry: - work = lookup_one_len(name, ofs->workbasedir, strlen(name)); + work = ovl_lookup_upper(ofs, name, ofs->workbasedir, strlen(name)); if (!IS_ERR(work)) { struct iattr attr = { @@ -778,7 +779,7 @@ retry: goto out_unlock; retried = true; - err = ovl_workdir_cleanup(dir, mnt, work, 0); + err = ovl_workdir_cleanup(ofs, dir, mnt, work, 0); dput(work); if (err == -EINVAL) { work = ERR_PTR(err); @@ -787,7 +788,7 @@ retry: goto retry; } - err = ovl_mkdir_real(dir, &work, attr.ia_mode); + err = ovl_mkdir_real(ofs, dir, &work, attr.ia_mode); if (err) goto out_dput; @@ -809,19 +810,19 @@ retry: * allowed as upper are limited to "normal" ones, where checking * for the above two errors is sufficient. */ - err = vfs_removexattr(&init_user_ns, work, - XATTR_NAME_POSIX_ACL_DEFAULT); + err = ovl_do_removexattr(ofs, work, + XATTR_NAME_POSIX_ACL_DEFAULT); if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; - err = vfs_removexattr(&init_user_ns, work, - XATTR_NAME_POSIX_ACL_ACCESS); + err = ovl_do_removexattr(ofs, work, + XATTR_NAME_POSIX_ACL_ACCESS); if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; /* Clear any inherited mode bits */ inode_lock(work->d_inode); - err = notify_change(&init_user_ns, work, &attr, NULL); + err = ovl_do_notify_change(ofs, work, &attr); inode_unlock(work->d_inode); if (err) goto out_dput; @@ -873,10 +874,6 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path) pr_err("filesystem on '%s' not supported\n", name); goto out_put; } - if (is_idmapped_mnt(path->mnt)) { - pr_err("idmapped layers are currently not supported\n"); - goto out_put; - } if (!d_is_dir(path->dentry)) { pr_err("'%s' not a directory\n", name); goto out_put; @@ -1256,8 +1253,9 @@ out: * Returns 1 if RENAME_WHITEOUT is supported, 0 if not supported and * negative values if error is encountered. 
*/ -static int ovl_check_rename_whiteout(struct dentry *workdir) +static int ovl_check_rename_whiteout(struct ovl_fs *ofs) { + struct dentry *workdir = ofs->workdir; struct inode *dir = d_inode(workdir); struct dentry *temp; struct dentry *dest; @@ -1267,12 +1265,12 @@ static int ovl_check_rename_whiteout(struct dentry *workdir) inode_lock_nested(dir, I_MUTEX_PARENT); - temp = ovl_create_temp(workdir, OVL_CATTR(S_IFREG | 0)); + temp = ovl_create_temp(ofs, workdir, OVL_CATTR(S_IFREG | 0)); err = PTR_ERR(temp); if (IS_ERR(temp)) goto out_unlock; - dest = ovl_lookup_temp(workdir); + dest = ovl_lookup_temp(ofs, workdir); err = PTR_ERR(dest); if (IS_ERR(dest)) { dput(temp); @@ -1281,14 +1279,14 @@ static int ovl_check_rename_whiteout(struct dentry *workdir) /* Name is inline and stable - using snapshot as a copy helper */ take_dentry_name_snapshot(&name, temp); - err = ovl_do_rename(dir, temp, dir, dest, RENAME_WHITEOUT); + err = ovl_do_rename(ofs, dir, temp, dir, dest, RENAME_WHITEOUT); if (err) { if (err == -EINVAL) err = 0; goto cleanup_temp; } - whiteout = lookup_one_len(name.name.name, workdir, name.name.len); + whiteout = ovl_lookup_upper(ofs, name.name.name, workdir, name.name.len); err = PTR_ERR(whiteout); if (IS_ERR(whiteout)) goto cleanup_temp; @@ -1297,11 +1295,11 @@ static int ovl_check_rename_whiteout(struct dentry *workdir) /* Best effort cleanup of whiteout and temp file */ if (err) - ovl_cleanup(dir, whiteout); + ovl_cleanup(ofs, dir, whiteout); dput(whiteout); cleanup_temp: - ovl_cleanup(dir, temp); + ovl_cleanup(ofs, dir, temp); release_dentry_name_snapshot(&name); dput(temp); dput(dest); @@ -1312,16 +1310,17 @@ out_unlock: return err; } -static struct dentry *ovl_lookup_or_create(struct dentry *parent, +static struct dentry *ovl_lookup_or_create(struct ovl_fs *ofs, + struct dentry *parent, const char *name, umode_t mode) { size_t len = strlen(name); struct dentry *child; inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); - child = lookup_one_len(name, parent, len); + child = ovl_lookup_upper(ofs, name, parent, len); if (!IS_ERR(child) && !child->d_inode) - child = ovl_create_real(parent->d_inode, child, + child = ovl_create_real(ofs, parent->d_inode, child, OVL_CATTR(mode)); inode_unlock(parent->d_inode); dput(parent); @@ -1343,7 +1342,7 @@ static int ovl_create_volatile_dirty(struct ovl_fs *ofs) const char *const *name = volatile_path; for (ctr = ARRAY_SIZE(volatile_path); ctr; ctr--, name++) { - d = ovl_lookup_or_create(d, *name, ctr > 1 ? S_IFDIR : S_IFREG); + d = ovl_lookup_or_create(ofs, d, *name, ctr > 1 ? 
S_IFDIR : S_IFREG); if (IS_ERR(d)) return PTR_ERR(d); } @@ -1391,7 +1390,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, pr_warn("upper fs needs to support d_type.\n"); /* Check if upper/work fs supports O_TMPFILE */ - temp = ovl_do_tmpfile(ofs->workdir, S_IFREG | 0); + temp = ovl_do_tmpfile(ofs, ofs->workdir, S_IFREG | 0); ofs->tmpfile = !IS_ERR(temp); if (ofs->tmpfile) dput(temp); @@ -1400,7 +1399,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, /* Check if upper/work fs supports RENAME_WHITEOUT */ - err = ovl_check_rename_whiteout(ofs->workdir); + err = ovl_check_rename_whiteout(ofs); if (err < 0) goto out; @@ -1411,7 +1410,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, /* * Check if upper/work fs supports (trusted|user).overlay.* xattr */ - err = ovl_do_setxattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE, "0", 1); + err = ovl_setxattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE, "0", 1); if (err) { ofs->noxattr = true; if (ofs->config.index || ofs->config.metacopy) { @@ -1429,7 +1428,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, } err = 0; } else { - ovl_do_removexattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE); + ovl_removexattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE); } /* diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index f48284a2a896..87f811c089e4 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -194,6 +194,20 @@ enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) return type; } +enum ovl_path_type ovl_path_realdata(struct dentry *dentry, struct path *path) +{ + enum ovl_path_type type = ovl_path_type(dentry); + + WARN_ON_ONCE(d_is_dir(dentry)); + + if (!OVL_TYPE_UPPER(type) || OVL_TYPE_MERGE(type)) + ovl_path_lowerdata(dentry, path); + else + ovl_path_upper(dentry, path); + + return type; +} + struct dentry *ovl_dentry_upper(struct dentry *dentry) { return ovl_upperdentry_dereference(OVL_I(d_inode(dentry))); @@ -236,6 +250,17 @@ struct dentry *ovl_i_dentry_upper(struct inode *inode) return ovl_upperdentry_dereference(OVL_I(inode)); } +void ovl_i_path_real(struct inode *inode, struct path *path) +{ + path->dentry = ovl_i_dentry_upper(inode); + if (!path->dentry) { + path->dentry = OVL_I(inode)->lowerpath.dentry; + path->mnt = OVL_I(inode)->lowerpath.layer->mnt; + } else { + path->mnt = ovl_upper_mnt(OVL_FS(inode->i_sb)); + } +} + struct inode *ovl_inode_upper(struct inode *inode) { struct dentry *upperdentry = ovl_i_dentry_upper(inode); @@ -245,7 +270,9 @@ struct inode *ovl_inode_upper(struct inode *inode) struct inode *ovl_inode_lower(struct inode *inode) { - return OVL_I(inode)->lower; + struct dentry *lowerdentry = OVL_I(inode)->lowerpath.dentry; + + return lowerdentry ? 
d_inode(lowerdentry) : NULL; } struct inode *ovl_inode_real(struct inode *inode) @@ -443,7 +470,7 @@ static void ovl_dir_version_inc(struct dentry *dentry, bool impurity) void ovl_dir_modified(struct dentry *dentry, bool impurity) { /* Copy mtime/ctime */ - ovl_copyattr(d_inode(ovl_dentry_upper(dentry)), d_inode(dentry)); + ovl_copyattr(d_inode(dentry)); ovl_dir_version_inc(dentry, impurity); } @@ -466,6 +493,7 @@ bool ovl_is_whiteout(struct dentry *dentry) struct file *ovl_path_open(struct path *path, int flags) { struct inode *inode = d_inode(path->dentry); + struct user_namespace *real_mnt_userns = mnt_user_ns(path->mnt); int err, acc_mode; if (flags & ~(O_ACCMODE | O_LARGEFILE)) @@ -482,12 +510,12 @@ struct file *ovl_path_open(struct path *path, int flags) BUG(); } - err = inode_permission(&init_user_ns, inode, acc_mode | MAY_OPEN); + err = inode_permission(real_mnt_userns, inode, acc_mode | MAY_OPEN); if (err) return ERR_PTR(err); /* O_NOATIME is an optimization, don't fail if not permitted */ - if (inode_owner_or_capable(&init_user_ns, inode)) + if (inode_owner_or_capable(real_mnt_userns, inode)) flags |= O_NOATIME; return dentry_open(path, flags, current_cred()); @@ -550,11 +578,11 @@ void ovl_copy_up_end(struct dentry *dentry) ovl_inode_unlock(d_inode(dentry)); } -bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry) +bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, struct path *path) { int res; - res = ovl_do_getxattr(ofs, dentry, OVL_XATTR_ORIGIN, NULL, 0); + res = ovl_path_getxattr(ofs, path, OVL_XATTR_ORIGIN, NULL, 0); /* Zero size value means "copied up but origin unknown" */ if (res >= 0) @@ -563,16 +591,16 @@ bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry) return false; } -bool ovl_check_dir_xattr(struct super_block *sb, struct dentry *dentry, - enum ovl_xattr ox) +bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, struct path *path, + enum ovl_xattr ox) { int res; char val; - if (!d_is_dir(dentry)) + if (!d_is_dir(path->dentry)) return false; - res = ovl_do_getxattr(OVL_FS(sb), dentry, ox, &val, 1); + res = ovl_path_getxattr(ofs, path, ox, &val, 1); if (res == 1 && val == 'y') return true; @@ -612,7 +640,7 @@ int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry, if (ofs->noxattr) return xerr; - err = ovl_do_setxattr(ofs, upperdentry, ox, value, size); + err = ovl_setxattr(ofs, upperdentry, ox, value, size); if (err == -EOPNOTSUPP) { pr_warn("cannot set %s xattr on upper\n", ovl_xattr(ofs, ox)); @@ -652,8 +680,8 @@ void ovl_check_protattr(struct inode *inode, struct dentry *upper) char buf[OVL_PROTATTR_MAX+1]; int res, n; - res = ovl_do_getxattr(ofs, upper, OVL_XATTR_PROTATTR, buf, - OVL_PROTATTR_MAX); + res = ovl_getxattr_upper(ofs, upper, OVL_XATTR_PROTATTR, buf, + OVL_PROTATTR_MAX); if (res < 0) return; @@ -708,7 +736,7 @@ int ovl_set_protattr(struct inode *inode, struct dentry *upper, err = ovl_check_setxattr(ofs, upper, OVL_XATTR_PROTATTR, buf, len, -EPERM); } else if (inode->i_flags & OVL_PROT_I_FLAGS_MASK) { - err = ovl_do_removexattr(ofs, upper, OVL_XATTR_PROTATTR); + err = ovl_removexattr(ofs, upper, OVL_XATTR_PROTATTR); if (err == -EOPNOTSUPP || err == -ENODATA) err = 0; } @@ -824,7 +852,7 @@ static void ovl_cleanup_index(struct dentry *dentry) } inode_lock_nested(dir, I_MUTEX_PARENT); - index = lookup_one_len(name.name, indexdir, name.len); + index = ovl_lookup_upper(ofs, name.name, indexdir, name.len); err = PTR_ERR(index); if (IS_ERR(index)) { index = NULL; @@ -834,7 +862,7 @@ static void 
ovl_cleanup_index(struct dentry *dentry) dir, index); } else { /* Cleanup orphan index entries */ - err = ovl_cleanup(dir, index); + err = ovl_cleanup(ofs, dir, index); } inode_unlock(dir); @@ -943,15 +971,15 @@ err: } /* err < 0, 0 if no metacopy xattr, 1 if metacopy xattr found */ -int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct dentry *dentry) +int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct path *path) { int res; /* Only regular files can have metacopy xattr */ - if (!S_ISREG(d_inode(dentry)->i_mode)) + if (!S_ISREG(d_inode(path->dentry)->i_mode)) return 0; - res = ovl_do_getxattr(ofs, dentry, OVL_XATTR_METACOPY, NULL, 0); + res = ovl_path_getxattr(ofs, path, OVL_XATTR_METACOPY, NULL, 0); if (res < 0) { if (res == -ENODATA || res == -EOPNOTSUPP) return 0; @@ -987,13 +1015,12 @@ bool ovl_is_metacopy_dentry(struct dentry *dentry) return (oe->numlower > 1); } -char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry, - int padding) +char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct path *path, int padding) { int res; char *s, *next, *buf = NULL; - res = ovl_do_getxattr(ofs, dentry, OVL_XATTR_REDIRECT, NULL, 0); + res = ovl_path_getxattr(ofs, path, OVL_XATTR_REDIRECT, NULL, 0); if (res == -ENODATA || res == -EOPNOTSUPP) return NULL; if (res < 0) @@ -1005,7 +1032,7 @@ char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry, if (!buf) return ERR_PTR(-ENOMEM); - res = ovl_do_getxattr(ofs, dentry, OVL_XATTR_REDIRECT, buf, res); + res = ovl_path_getxattr(ofs, path, OVL_XATTR_REDIRECT, buf, res); if (res < 0) goto fail; if (res == 0) @@ -1060,3 +1087,33 @@ int ovl_sync_status(struct ovl_fs *ofs) return errseq_check(&mnt->mnt_sb->s_wb_err, ofs->errseq); } + +/* + * ovl_copyattr() - copy inode attributes from layer to ovl inode + * + * When overlay copies inode information from an upper or lower layer to the + * relevant overlay inode it will apply the idmapping of the upper or lower + * layer when doing so ensuring that the ovl inode ownership will correctly + * reflect the ownership of the idmapped upper or lower layer. For example, an + * idmapped upper or lower layer mapping id 1001 to id 1000 will take care to + * map any lower or upper inode owned by id 1001 to id 1000. These mapping + * helpers are nops when the relevant layer isn't idmapped. + */ +void ovl_copyattr(struct inode *inode) +{ + struct path realpath; + struct inode *realinode; + struct user_namespace *real_mnt_userns; + + ovl_i_path_real(inode, &realpath); + realinode = d_inode(realpath.dentry); + real_mnt_userns = mnt_user_ns(realpath.mnt); + + inode->i_uid = i_uid_into_mnt(real_mnt_userns, realinode); + inode->i_gid = i_gid_into_mnt(real_mnt_userns, realinode); + inode->i_mode = realinode->i_mode; + inode->i_atime = realinode->i_atime; + inode->i_mtime = realinode->i_mtime; + inode->i_ctime = realinode->i_ctime; + i_size_write(inode, i_size_read(realinode)); +} diff --git a/fs/pipe.c b/fs/pipe.c index e140ea150bbb..74ae9fafd25a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -653,7 +653,7 @@ pipe_poll(struct file *filp, poll_table *wait) unsigned int head, tail; /* Epoll has some historical nasty semantics, this enables them */ - pipe->poll_usage = 1; + WRITE_ONCE(pipe->poll_usage, true); /* * Reading pipe state only -- no need for acquiring the semaphore. @@ -1245,30 +1245,33 @@ unsigned int round_pipe_size(unsigned long size) /* * Resize the pipe ring to a number of slots. 
+ * + * Note the pipe can be reduced in capacity, but only if the current + * occupancy doesn't exceed nr_slots; if it does, EBUSY will be + * returned instead. */ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) { struct pipe_buffer *bufs; unsigned int head, tail, mask, n; - /* - * We can shrink the pipe, if arg is greater than the ring occupancy. - * Since we don't expect a lot of shrink+grow operations, just free and - * allocate again like we would do for growing. If the pipe currently - * contains more buffers than arg, then return busy. - */ - mask = pipe->ring_size - 1; - head = pipe->head; - tail = pipe->tail; - n = pipe_occupancy(pipe->head, pipe->tail); - if (nr_slots < n) - return -EBUSY; - bufs = kcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (unlikely(!bufs)) return -ENOMEM; + spin_lock_irq(&pipe->rd_wait.lock); + mask = pipe->ring_size - 1; + head = pipe->head; + tail = pipe->tail; + + n = pipe_occupancy(head, tail); + if (nr_slots < n) { + spin_unlock_irq(&pipe->rd_wait.lock); + kfree(bufs); + return -EBUSY; + } + /* * The pipe array wraps around, so just start the new one at zero * and adjust the indices. @@ -1300,6 +1303,8 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) pipe->tail = tail; pipe->head = head; + spin_unlock_irq(&pipe->rd_wait.lock); + /* This might have made more room for writers */ wake_up_interruptible(&pipe->wr_wait); return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index c1031843cc6a..8dfa36a99c74 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3154,6 +3154,22 @@ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns, } #endif /* CONFIG_LIVEPATCH */ +#ifdef CONFIG_KSM +static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct mm_struct *mm; + + mm = get_task_mm(task); + if (mm) { + seq_printf(m, "%lu\n", mm->ksm_merging_pages); + mmput(mm); + } + + return 0; +} +#endif /* CONFIG_KSM */ + #ifdef CONFIG_STACKLEAK_METRICS static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) @@ -3285,6 +3301,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_SECCOMP_CACHE_DEBUG ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), #endif +#ifdef CONFIG_KSM + ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), +#endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) @@ -3618,6 +3637,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_SECCOMP_CACHE_DEBUG ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), #endif +#ifdef CONFIG_KSM + ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), +#endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index f2132407e133..587b91d9d998 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -448,6 +448,9 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, proc_set_user(ent, (*parent)->uid, (*parent)->gid); ent->proc_dops = &proc_misc_dentry_ops; + /* Revalidate everything under /proc/${pid}/net */ + if ((*parent)->proc_dops == &proc_net_dentry_ops) + pde_force_lookup(ent); out: return ent; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 982e694aae77..dff921f7ca33 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -479,10 +479,15 @@ read_kcore(struct file *file, char __user 
*buffer, size_t buflen, loff_t *fpos) * the previous entry, search for a matching entry. */ if (!m || start < m->addr || start >= m->addr + m->size) { - list_for_each_entry(m, &kclist_head, list) { - if (start >= m->addr && - start < m->addr + m->size) + struct kcore_list *iter; + + m = NULL; + list_for_each_entry(iter, &kclist_head, list) { + if (start >= iter->addr && + start < iter->addr + iter->size) { + m = iter; break; + } } } @@ -492,12 +497,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) page_offline_freeze(); } - if (&m->list == &kclist_head) { + if (!m) { if (clear_user(buffer, tsz)) { ret = -EFAULT; goto out; } - m = NULL; /* skip the list anchor */ goto skip; } diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 6fa761c9cc78..6e89f0e2fd20 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -86,6 +86,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "SwapTotal: ", i.totalswap); show_val_kb(m, "SwapFree: ", i.freeswap); +#ifdef CONFIG_ZSWAP + seq_printf(m, "Zswap: %8lu kB\n", + (unsigned long)(zswap_pool_total_size >> 10)); + seq_printf(m, "Zswapped: %8lu kB\n", + (unsigned long)atomic_read(&zswap_stored_pages) << + (PAGE_SHIFT - 10)); +#endif show_val_kb(m, "Dirty: ", global_node_page_state(NR_FILE_DIRTY)); show_val_kb(m, "Writeback: ", diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index e1cfeda397f3..913e5acefbb6 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -376,6 +376,9 @@ static __net_init int proc_net_ns_init(struct net *net) proc_set_user(netd, uid, gid); + /* Seed dentry revalidation for /proc/${pid}/net */ + pde_force_lookup(netd); + err = -EEXIST; net_statd = proc_net_mkdir(net, "stat", netd); if (!net_statd) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 7d9cfc730bd4..021e83fe831f 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -19,6 +19,9 @@ #include <linux/kmemleak.h> #include "internal.h" +#define list_for_each_table_entry(entry, table) \ + for ((entry) = (table); (entry)->procname; (entry)++) + static const struct dentry_operations proc_sys_dentry_operations; static const struct file_operations proc_sys_file_operations; static const struct inode_operations proc_sys_inode_operations; @@ -26,7 +29,7 @@ static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; /* shared constants to be used in various sysctls */ -const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 200, 1000, 3000, INT_MAX, 65535 }; +const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 }; EXPORT_SYMBOL(sysctl_vals); const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX }; @@ -215,15 +218,19 @@ static void init_header(struct ctl_table_header *head, INIT_HLIST_HEAD(&head->inodes); if (node) { struct ctl_table *entry; - for (entry = table; entry->procname; entry++, node++) + + list_for_each_table_entry(entry, table) { node->header = head; + node++; + } } } static void erase_header(struct ctl_table_header *head) { struct ctl_table *entry; - for (entry = head->ctl_table; entry->procname; entry++) + + list_for_each_table_entry(entry, head->ctl_table) erase_entry(head, entry); } @@ -248,7 +255,7 @@ static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) err = insert_links(header); if (err) goto fail_links; - for (entry = header->ctl_table; entry->procname; entry++) { + list_for_each_table_entry(entry, header->ctl_table) { err = insert_entry(header, 
entry); if (err) goto fail; @@ -978,7 +985,6 @@ static struct ctl_dir *new_dir(struct ctl_table_set *set, table = (struct ctl_table *)(node + 1); new_name = (char *)(table + 2); memcpy(new_name, name, namelen); - new_name[namelen] = '\0'; table[0].procname = new_name; table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO; init_header(&new->header, set->dir.header.root, set, node, table); @@ -1130,35 +1136,36 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table) static int sysctl_check_table(const char *path, struct ctl_table *table) { + struct ctl_table *entry; int err = 0; - for (; table->procname; table++) { - if (table->child) - err |= sysctl_err(path, table, "Not a file"); - - if ((table->proc_handler == proc_dostring) || - (table->proc_handler == proc_dointvec) || - (table->proc_handler == proc_douintvec) || - (table->proc_handler == proc_douintvec_minmax) || - (table->proc_handler == proc_dointvec_minmax) || - (table->proc_handler == proc_dou8vec_minmax) || - (table->proc_handler == proc_dointvec_jiffies) || - (table->proc_handler == proc_dointvec_userhz_jiffies) || - (table->proc_handler == proc_dointvec_ms_jiffies) || - (table->proc_handler == proc_doulongvec_minmax) || - (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { - if (!table->data) - err |= sysctl_err(path, table, "No data"); - if (!table->maxlen) - err |= sysctl_err(path, table, "No maxlen"); + list_for_each_table_entry(entry, table) { + if (entry->child) + err |= sysctl_err(path, entry, "Not a file"); + + if ((entry->proc_handler == proc_dostring) || + (entry->proc_handler == proc_dointvec) || + (entry->proc_handler == proc_douintvec) || + (entry->proc_handler == proc_douintvec_minmax) || + (entry->proc_handler == proc_dointvec_minmax) || + (entry->proc_handler == proc_dou8vec_minmax) || + (entry->proc_handler == proc_dointvec_jiffies) || + (entry->proc_handler == proc_dointvec_userhz_jiffies) || + (entry->proc_handler == proc_dointvec_ms_jiffies) || + (entry->proc_handler == proc_doulongvec_minmax) || + (entry->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { + if (!entry->data) + err |= sysctl_err(path, entry, "No data"); + if (!entry->maxlen) + err |= sysctl_err(path, entry, "No maxlen"); else - err |= sysctl_check_table_array(path, table); + err |= sysctl_check_table_array(path, entry); } - if (!table->proc_handler) - err |= sysctl_err(path, table, "No proc_handler"); + if (!entry->proc_handler) + err |= sysctl_err(path, entry, "No proc_handler"); - if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode) - err |= sysctl_err(path, table, "bogus .mode 0%o", - table->mode); + if ((entry->mode & (S_IRUGO|S_IWUGO)) != entry->mode) + err |= sysctl_err(path, entry, "bogus .mode 0%o", + entry->mode); } return err; } @@ -1174,7 +1181,7 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table name_bytes = 0; nr_entries = 0; - for (entry = table; entry->procname; entry++) { + list_for_each_table_entry(entry, table) { nr_entries++; name_bytes += strlen(entry->procname) + 1; } @@ -1191,14 +1198,16 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table node = (struct ctl_node *)(links + 1); link_table = (struct ctl_table *)(node + nr_entries); link_name = (char *)&link_table[nr_entries + 1]; + link = link_table; - for (link = link_table, entry = table; entry->procname; link++, entry++) { + list_for_each_table_entry(entry, table) { int len = strlen(entry->procname) + 1; memcpy(link_name, entry->procname, len); link->procname = link_name; link->mode = 
S_IFLNK|S_IRWXUGO; link->data = link_root; link_name += len; + link++; } init_header(links, dir->header.root, dir->header.set, node, link_table); links->nreg = nr_entries; @@ -1213,7 +1222,7 @@ static bool get_links(struct ctl_dir *dir, struct ctl_table *entry, *link; /* Are there links available for every entry in table? */ - for (entry = table; entry->procname; entry++) { + list_for_each_table_entry(entry, table) { const char *procname = entry->procname; link = find_entry(&head, dir, procname, strlen(procname)); if (!link) @@ -1226,7 +1235,7 @@ static bool get_links(struct ctl_dir *dir, } /* The checks passed. Increase the registration count on the links */ - for (entry = table; entry->procname; entry++) { + list_for_each_table_entry(entry, table) { const char *procname = entry->procname; link = find_entry(&head, dir, procname, strlen(procname)); head->nreg++; @@ -1329,11 +1338,11 @@ struct ctl_table_header *__register_sysctl_table( struct ctl_node *node; int nr_entries = 0; - for (entry = table; entry->procname; entry++) + list_for_each_table_entry(entry, table) nr_entries++; header = kzalloc(sizeof(struct ctl_table_header) + - sizeof(struct ctl_node)*nr_entries, GFP_KERNEL); + sizeof(struct ctl_node)*nr_entries, GFP_KERNEL_ACCOUNT); if (!header) return NULL; @@ -1456,7 +1465,7 @@ static int count_subheaders(struct ctl_table *table) if (!table || !table->procname) return 1; - for (entry = table; entry->procname; entry++) { + list_for_each_table_entry(entry, table) { if (entry->child) nr_subheaders += count_subheaders(entry->child); else @@ -1475,7 +1484,7 @@ static int register_leaf_sysctl_tables(const char *path, char *pos, int nr_dirs = 0; int err = -ENOMEM; - for (entry = table; entry->procname; entry++) { + list_for_each_table_entry(entry, table) { if (entry->child) nr_dirs++; else @@ -1492,7 +1501,9 @@ static int register_leaf_sysctl_tables(const char *path, char *pos, goto out; ctl_table_arg = files; - for (new = files, entry = table; entry->procname; entry++) { + new = files; + + list_for_each_table_entry(entry, table) { if (entry->child) continue; *new = *entry; @@ -1516,7 +1527,7 @@ static int register_leaf_sysctl_tables(const char *path, char *pos, } /* Recurse into the subdirectories. 
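
As an aside, list_for_each_table_entry() leans on the long-standing convention that a ctl_table array is terminated by an empty sentinel entry whose ->procname is NULL. A minimal sketch of that convention, with a hypothetical table and counter (not part of this patch):

/* Hypothetical backing variable and table, for illustration only. */
static int example_value;

static struct ctl_table example_table[] = {
	{
		.procname	= "example_value",
		.data		= &example_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }	/* sentinel: ->procname is NULL, which ends the walk */
};

static int count_example_entries(void)
{
	struct ctl_table *entry;
	int nr_entries = 0;

	list_for_each_table_entry(entry, example_table)
		nr_entries++;

	return nr_entries;	/* 1 for the table above */
}
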
*/ - for (entry = table; entry->procname; entry++) { + list_for_each_table_entry(entry, table) { char *child_pos; if (!entry->child) @@ -1670,7 +1681,7 @@ static void put_links(struct ctl_table_header *header) if (IS_ERR(core_parent)) return; - for (entry = header->ctl_table; entry->procname; entry++) { + list_for_each_table_entry(entry, header->ctl_table) { struct ctl_table_header *link_head; struct ctl_table *link; const char *name = entry->procname; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f46060eb91b5..2d04e3470d4c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1421,6 +1421,8 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, migration = is_migration_entry(entry); if (is_pfn_swap_entry(entry)) page = pfn_swap_entry_to_page(entry); + if (pte_marker_entry_uffd_wp(entry)) + flags |= PM_UFFD_WP; } if (page && !PageAnon(page)) @@ -1556,10 +1558,15 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, if (page_mapcount(page) == 1) flags |= PM_MMAP_EXCLUSIVE; + if (huge_pte_uffd_wp(pte)) + flags |= PM_UFFD_WP; + flags |= PM_PRESENT; if (pm->show_pfn) frame = pte_pfn(pte) + ((addr & ~hmask) >> PAGE_SHIFT); + } else if (pte_swp_uffd_wp_any(pte)) { + flags |= PM_UFFD_WP; } for (; addr != end; addr += PAGE_SIZE) { @@ -1873,8 +1880,6 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, return 0; page = pte_page(huge_pte); - if (!page) - return 0; md = walk->private; gather_stats(page, md, pte_dirty(huge_pte), 1); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 6f1b8ddc6f7a..4eaeb645e759 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -26,6 +26,7 @@ #include <linux/vmalloc.h> #include <linux/pagemap.h> #include <linux/uaccess.h> +#include <linux/uio.h> #include <linux/cc_platform.h> #include <asm/io.h> #include "internal.h" @@ -128,9 +129,8 @@ static int open_vmcore(struct inode *inode, struct file *file) } /* Reads a page from the oldmem device from given offset. 
*/ -ssize_t read_from_oldmem(char *buf, size_t count, - u64 *ppos, int userbuf, - bool encrypted) +ssize_t read_from_oldmem(struct iov_iter *iter, size_t count, + u64 *ppos, bool encrypted) { unsigned long pfn, offset; size_t nr_bytes; @@ -152,29 +152,23 @@ ssize_t read_from_oldmem(char *buf, size_t count, /* If pfn is not ram, return zeros for sparse dump files */ if (!pfn_is_ram(pfn)) { - tmp = 0; - if (!userbuf) - memset(buf, 0, nr_bytes); - else if (clear_user(buf, nr_bytes)) - tmp = -EFAULT; + tmp = iov_iter_zero(nr_bytes, iter); } else { if (encrypted) - tmp = copy_oldmem_page_encrypted(pfn, buf, + tmp = copy_oldmem_page_encrypted(iter, pfn, nr_bytes, - offset, - userbuf); + offset); else - tmp = copy_oldmem_page(pfn, buf, nr_bytes, - offset, userbuf); + tmp = copy_oldmem_page(iter, pfn, nr_bytes, + offset); } - if (tmp < 0) { + if (tmp < nr_bytes) { srcu_read_unlock(&vmcore_cb_srcu, idx); - return tmp; + return -EFAULT; } *ppos += nr_bytes; count -= nr_bytes; - buf += nr_bytes; read += nr_bytes; ++pfn; offset = 0; @@ -203,7 +197,12 @@ void __weak elfcorehdr_free(unsigned long long addr) */ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0, false); + struct kvec kvec = { .iov_base = buf, .iov_len = count }; + struct iov_iter iter; + + iov_iter_kvec(&iter, READ, &kvec, 1, count); + + return read_from_oldmem(&iter, count, ppos, false); } /* @@ -211,7 +210,13 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) */ ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0, cc_platform_has(CC_ATTR_MEM_ENCRYPT)); + struct kvec kvec = { .iov_base = buf, .iov_len = count }; + struct iov_iter iter; + + iov_iter_kvec(&iter, READ, &kvec, 1, count); + + return read_from_oldmem(&iter, count, ppos, + cc_platform_has(CC_ATTR_MEM_ENCRYPT)); } /* @@ -228,29 +233,14 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, /* * Architectures which support memory encryption override this. */ -ssize_t __weak -copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf) +ssize_t __weak copy_oldmem_page_encrypted(struct iov_iter *iter, + unsigned long pfn, size_t csize, unsigned long offset) { - return copy_oldmem_page(pfn, buf, csize, offset, userbuf); -} - -/* - * Copy to either kernel or user space - */ -static int copy_to(void *target, void *src, size_t size, int userbuf) -{ - if (userbuf) { - if (copy_to_user((char __user *) target, src, size)) - return -EFAULT; - } else { - memcpy(target, src, size); - } - return 0; + return copy_oldmem_page(iter, pfn, csize, offset); } #ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP -static int vmcoredd_copy_dumps(void *dst, u64 start, size_t size, int userbuf) +static int vmcoredd_copy_dumps(struct iov_iter *iter, u64 start, size_t size) { struct vmcoredd_node *dump; u64 offset = 0; @@ -263,14 +253,13 @@ static int vmcoredd_copy_dumps(void *dst, u64 start, size_t size, int userbuf) if (start < offset + dump->size) { tsz = min(offset + (u64)dump->size - start, (u64)size); buf = dump->buf + start - offset; - if (copy_to(dst, buf, tsz, userbuf)) { + if (copy_to_iter(buf, tsz, iter) < tsz) { ret = -EFAULT; goto out_unlock; } size -= tsz; start += tsz; - dst += tsz; /* Leave now if buffer filled already */ if (!size) @@ -326,33 +315,28 @@ out_unlock: /* Read from the ELF header and then the crash dump. 
On error, negative value is * returned otherwise number of bytes read are returned. */ -static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, - int userbuf) +static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos) { ssize_t acc = 0, tmp; size_t tsz; u64 start; struct vmcore *m = NULL; - if (buflen == 0 || *fpos >= vmcore_size) + if (!iov_iter_count(iter) || *fpos >= vmcore_size) return 0; - /* trim buflen to not go beyond EOF */ - if (buflen > vmcore_size - *fpos) - buflen = vmcore_size - *fpos; + iov_iter_truncate(iter, vmcore_size - *fpos); /* Read ELF core header */ if (*fpos < elfcorebuf_sz) { - tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen); - if (copy_to(buffer, elfcorebuf + *fpos, tsz, userbuf)) + tsz = min(elfcorebuf_sz - (size_t)*fpos, iov_iter_count(iter)); + if (copy_to_iter(elfcorebuf + *fpos, tsz, iter) < tsz) return -EFAULT; - buflen -= tsz; *fpos += tsz; - buffer += tsz; acc += tsz; /* leave now if filled buffer already */ - if (buflen == 0) + if (!iov_iter_count(iter)) return acc; } @@ -373,35 +357,32 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, /* Read device dumps */ if (*fpos < elfcorebuf_sz + vmcoredd_orig_sz) { tsz = min(elfcorebuf_sz + vmcoredd_orig_sz - - (size_t)*fpos, buflen); + (size_t)*fpos, iov_iter_count(iter)); start = *fpos - elfcorebuf_sz; - if (vmcoredd_copy_dumps(buffer, start, tsz, userbuf)) + if (vmcoredd_copy_dumps(iter, start, tsz)) return -EFAULT; - buflen -= tsz; *fpos += tsz; - buffer += tsz; acc += tsz; /* leave now if filled buffer already */ - if (!buflen) + if (!iov_iter_count(iter)) return acc; } #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ /* Read remaining elf notes */ - tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen); + tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, + iov_iter_count(iter)); kaddr = elfnotes_buf + *fpos - elfcorebuf_sz - vmcoredd_orig_sz; - if (copy_to(buffer, kaddr, tsz, userbuf)) + if (copy_to_iter(kaddr, tsz, iter) < tsz) return -EFAULT; - buflen -= tsz; *fpos += tsz; - buffer += tsz; acc += tsz; /* leave now if filled buffer already */ - if (buflen == 0) + if (!iov_iter_count(iter)) return acc; } @@ -409,19 +390,17 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, if (*fpos < m->offset + m->size) { tsz = (size_t)min_t(unsigned long long, m->offset + m->size - *fpos, - buflen); + iov_iter_count(iter)); start = m->paddr + *fpos - m->offset; - tmp = read_from_oldmem(buffer, tsz, &start, - userbuf, cc_platform_has(CC_ATTR_MEM_ENCRYPT)); + tmp = read_from_oldmem(iter, tsz, &start, + cc_platform_has(CC_ATTR_MEM_ENCRYPT)); if (tmp < 0) return tmp; - buflen -= tsz; *fpos += tsz; - buffer += tsz; acc += tsz; /* leave now if filled buffer already */ - if (buflen == 0) + if (!iov_iter_count(iter)) return acc; } } @@ -429,15 +408,14 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, return acc; } -static ssize_t read_vmcore(struct file *file, char __user *buffer, - size_t buflen, loff_t *fpos) +static ssize_t read_vmcore(struct kiocb *iocb, struct iov_iter *iter) { - return __read_vmcore((__force char *) buffer, buflen, fpos, 1); + return __read_vmcore(iter, &iocb->ki_pos); } /* * The vmcore fault handler uses the page cache and fills data using the - * standard __vmcore_read() function. + * standard __read_vmcore() function. * * On s390 the fault handler is used for memory regions that can't be mapped * directly with remap_pfn_range(). 
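The vmcore conversion above replaces the char *buf plus userbuf-flag convention with an iov_iter, so a single copy path (iov_iter_zero()/copy_to_iter()) serves read(2), the in-kernel ELF header readers, and the s390 fault handler alike. A minimal sketch of the kvec wrapping pattern it uses for kernel buffers, assuming the iov_iter_kvec() signature of this kernel generation (which still takes a READ/WRITE direction):

	struct kvec kvec = { .iov_base = buf, .iov_len = count };
	struct iov_iter iter;

	iov_iter_kvec(&iter, READ, &kvec, 1, count);
	ret = read_from_oldmem(&iter, count, &pos, false);

Note also the error-handling change in read_from_oldmem(): the copy helpers now return the number of bytes copied, so a short copy (tmp < nr_bytes) maps to -EFAULT instead of propagating a negative return.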
@@ -447,9 +425,10 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf) #ifdef CONFIG_S390 struct address_space *mapping = vmf->vma->vm_file->f_mapping; pgoff_t index = vmf->pgoff; + struct iov_iter iter; + struct kvec kvec; struct page *page; loff_t offset; - char *buf; int rc; page = find_or_create_page(mapping, index, GFP_KERNEL); @@ -457,8 +436,11 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf) return VM_FAULT_OOM; if (!PageUptodate(page)) { offset = (loff_t) index << PAGE_SHIFT; - buf = __va((page_to_pfn(page) << PAGE_SHIFT)); - rc = __read_vmcore(buf, PAGE_SIZE, &offset, 0); + kvec.iov_base = page_address(page); + kvec.iov_len = PAGE_SIZE; + iov_iter_kvec(&iter, READ, &kvec, 1, PAGE_SIZE); + + rc = __read_vmcore(&iter, &offset); if (rc < 0) { unlock_page(page); put_page(page); @@ -708,7 +690,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) static const struct proc_ops vmcore_proc_ops = { .proc_open = open_vmcore, - .proc_read = read_vmcore, + .proc_read_iter = read_vmcore, .proc_lseek = default_llseek, .proc_mmap = mmap_vmcore, }; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index a74aef99bd3d..09d1307959d0 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -79,6 +79,7 @@ #include <linux/capability.h> #include <linux/quotaops.h> #include <linux/blkdev.h> +#include <linux/sched/mm.h> #include "../internal.h" /* ugh */ #include <linux/uaccess.h> @@ -425,9 +426,11 @@ EXPORT_SYMBOL(mark_info_dirty); int dquot_acquire(struct dquot *dquot) { int ret = 0, ret2 = 0; + unsigned int memalloc; struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); mutex_lock(&dquot->dq_lock); + memalloc = memalloc_nofs_save(); if (!test_bit(DQ_READ_B, &dquot->dq_flags)) { ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot); if (ret < 0) @@ -458,6 +461,7 @@ int dquot_acquire(struct dquot *dquot) smp_mb__before_atomic(); set_bit(DQ_ACTIVE_B, &dquot->dq_flags); out_iolock: + memalloc_nofs_restore(memalloc); mutex_unlock(&dquot->dq_lock); return ret; } @@ -469,9 +473,11 @@ EXPORT_SYMBOL(dquot_acquire); int dquot_commit(struct dquot *dquot) { int ret = 0; + unsigned int memalloc; struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); mutex_lock(&dquot->dq_lock); + memalloc = memalloc_nofs_save(); if (!clear_dquot_dirty(dquot)) goto out_lock; /* Inactive dquot can be only if there was error during read/init @@ -481,6 +487,7 @@ int dquot_commit(struct dquot *dquot) else ret = -EIO; out_lock: + memalloc_nofs_restore(memalloc); mutex_unlock(&dquot->dq_lock); return ret; } @@ -492,9 +499,11 @@ EXPORT_SYMBOL(dquot_commit); int dquot_release(struct dquot *dquot) { int ret = 0, ret2 = 0; + unsigned int memalloc; struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); mutex_lock(&dquot->dq_lock); + memalloc = memalloc_nofs_save(); /* Check whether we are not racing with some other dqget() */ if (dquot_is_busy(dquot)) goto out_dqlock; @@ -510,6 +519,7 @@ int dquot_release(struct dquot *dquot) } clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); out_dqlock: + memalloc_nofs_restore(memalloc); mutex_unlock(&dquot->dq_lock); return ret; } diff --git a/fs/read_write.c b/fs/read_write.c index e643aec2b0ef..b1b1cdfee9d3 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -682,6 +682,14 @@ SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, return ksys_pread64(fd, buf, count, pos); } +#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64) +COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf, + size_t, count, compat_arg_u64_dual(pos)) +{ + return 
ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos)); +} +#endif + ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos) { @@ -708,6 +716,14 @@ SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, return ksys_pwrite64(fd, buf, count, pos); } +#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64) +COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf, + size_t, count, compat_arg_u64_dual(pos)) +{ + return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos)); +} +#endif + static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { diff --git a/fs/seq_file.c b/fs/seq_file.c index 7ab8a58c29b6..9456a2032224 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -931,6 +931,38 @@ struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos) } EXPORT_SYMBOL(seq_list_next); +struct list_head *seq_list_start_rcu(struct list_head *head, loff_t pos) +{ + struct list_head *lh; + + list_for_each_rcu(lh, head) + if (pos-- == 0) + return lh; + + return NULL; +} +EXPORT_SYMBOL(seq_list_start_rcu); + +struct list_head *seq_list_start_head_rcu(struct list_head *head, loff_t pos) +{ + if (!pos) + return head; + + return seq_list_start_rcu(head, pos - 1); +} +EXPORT_SYMBOL(seq_list_start_head_rcu); + +struct list_head *seq_list_next_rcu(void *v, struct list_head *head, + loff_t *ppos) +{ + struct list_head *lh; + + lh = list_next_rcu((struct list_head *)v); + ++*ppos; + return lh == head ? NULL : lh; +} +EXPORT_SYMBOL(seq_list_next_rcu); + /** * seq_hlist_start - start an iteration of a hlist * @head: the head of the hlist diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h index 0507aecfc669..2cab413fffee 100644 --- a/fs/smbfs_common/smb2pdu.h +++ b/fs/smbfs_common/smb2pdu.h @@ -1244,6 +1244,106 @@ struct file_zero_data_information { __le64 BeyondFinalZero; } __packed; +/* See MS-FSCC 2.3.7 */ +struct duplicate_extents_to_file { + __u64 PersistentFileHandle; /* source file handle, opaque endianness */ + __u64 VolatileFileHandle; + __le64 SourceFileOffset; + __le64 TargetFileOffset; + __le64 ByteCount; /* Bytes to be copied */ +} __packed; + +/* See MS-FSCC 2.3.8 */ +#define DUPLICATE_EXTENTS_DATA_EX_SOURCE_ATOMIC 0x00000001 +struct duplicate_extents_to_file_ex { + __u64 PersistentFileHandle; /* source file handle, opaque endianness */ + __u64 VolatileFileHandle; + __le64 SourceFileOffset; + __le64 TargetFileOffset; + __le64 ByteCount; /* Bytes to be copied */ + __le32 Flags; + __le32 Reserved; +} __packed; + + +/* See MS-FSCC 2.3.20 */ +struct fsctl_get_integrity_information_rsp { + __le16 ChecksumAlgorithm; + __le16 Reserved; + __le32 Flags; + __le32 ChecksumChunkSizeInBytes; + __le32 ClusterSizeInBytes; +} __packed; + +/* See MS-FSCC 2.3.55 */ +struct fsctl_query_file_regions_req { + __le64 FileOffset; + __le64 Length; + __le32 DesiredUsage; + __le32 Reserved; +} __packed; + +/* DesiredUsage flags see MS-FSCC 2.3.56.1 */ +#define FILE_USAGE_INVALID_RANGE 0x00000000 +#define FILE_USAGE_VALID_CACHED_DATA 0x00000001 +#define FILE_USAGE_NONCACHED_DATA 0x00000002 + +struct file_region_info { + __le64 FileOffset; + __le64 Length; + __le32 DesiredUsage; + __le32 Reserved; +} __packed; + +/* See MS-FSCC 2.3.56 */ +struct fsctl_query_file_region_rsp { + __le32 Flags; + __le32 TotalRegionEntryCount; + __le32 RegionEntryCount; + __u32 Reserved; + struct file_region_info Regions[]; +} __packed; + +/* See MS-FSCC 2.3.58 */ +struct 
fsctl_query_on_disk_vol_info_rsp { + __le64 DirectoryCount; + __le64 FileCount; + __le16 FsFormatMajVersion; + __le16 FsFormatMinVersion; + __u8 FsFormatName[24]; + __le64 FormatTime; + __le64 LastUpdateTime; + __u8 CopyrightInfo[68]; + __u8 AbstractInfo[68]; + __u8 FormatImplInfo[68]; + __u8 LastModifyImplInfo[68]; +} __packed; + +/* See MS-FSCC 2.3.73 */ +struct fsctl_set_integrity_information_req { + __le16 ChecksumAlgorithm; + __le16 Reserved; + __le32 Flags; +} __packed; + +/* See MS-FSCC 2.3.75 */ +struct fsctl_set_integrity_info_ex_req { + __u8 EnableIntegrity; + __u8 KeepState; + __u16 Reserved; + __le32 Flags; + __u8 Version; + __u8 Reserved2[7]; +} __packed; + +/* Integrity ChecksumAlgorithm choices for above */ +#define CHECKSUM_TYPE_NONE 0x0000 +#define CHECKSUM_TYPE_CRC64 0x0002 +#define CHECKSUM_TYPE_UNCHANGED 0xFFFF /* set only */ + +/* Integrity flags for above */ +#define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF 0x00000001 + /* Reparse structures - see MS-FSCC 2.1.2 */ /* struct fsctl_reparse_info_req is empty, only response structs (see below) */ @@ -1304,13 +1404,6 @@ struct validate_negotiate_info_rsp { __le16 Dialect; /* Dialect in use for the connection */ } __packed; -struct duplicate_extents_to_file { - __u64 PersistentFileHandle; /* source file handle, opaque endianness */ - __u64 VolatileFileHandle; - __le64 SourceFileOffset; - __le64 TargetFileOffset; - __le64 ByteCount; /* Bytes to be copied */ -} __packed; /* Possible InfoType values */ #define SMB2_O_INFO_FILE 0x01 @@ -1419,6 +1512,7 @@ struct smb2_query_info_rsp { * PDU query infolevel structure definitions */ +/* See MS-FSCC 2.3.52 */ struct file_allocated_range_buffer { __le64 file_offset; __le64 length; diff --git a/fs/smbfs_common/smbfsctl.h b/fs/smbfs_common/smbfsctl.h index d51939c43ad7..edd7fc2a7921 100644 --- a/fs/smbfs_common/smbfsctl.h +++ b/fs/smbfs_common/smbfsctl.h @@ -88,21 +88,27 @@ #define FSCTL_READ_RAW_ENCRYPTED 0x000900E3 /* BB add struct */ #define FSCTL_READ_FILE_USN_DATA 0x000900EB /* BB add struct */ #define FSCTL_WRITE_USN_CLOSE_RECORD 0x000900EF /* BB add struct */ +#define FSCTL_MARK_HANDLE 0x000900FC /* BB add struct */ #define FSCTL_SIS_COPYFILE 0x00090100 /* BB add struct */ #define FSCTL_RECALL_FILE 0x00090117 /* BB add struct */ #define FSCTL_QUERY_SPARING_INFO 0x00090138 /* BB add struct */ +#define FSCTL_QUERY_ON_DISK_VOLUME_INFO 0x0009013C #define FSCTL_SET_ZERO_ON_DEALLOC 0x00090194 /* BB add struct */ #define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */ #define FSCTL_GET_INTEGRITY_INFORMATION 0x0009027C +#define FSCTL_QUERY_FILE_REGIONS 0x00090284 #define FSCTL_GET_REFS_VOLUME_DATA 0x000902D8 /* See MS-FSCC 2.3.24 */ #define FSCTL_SET_INTEGRITY_INFORMATION_EXT 0x00090380 #define FSCTL_GET_RETRIEVAL_POINTERS_AND_REFCOUNT 0x000903d3 #define FSCTL_GET_RETRIEVAL_POINTER_COUNT 0x0009042b #define FSCTL_REFS_STREAM_SNAPSHOT_MANAGEMENT 0x00090440 #define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF +#define FSCTL_OFFLOAD_READ 0x00094264 /* BB add struct */ +#define FSCTL_OFFLOAD_WRITE 0x00098268 /* BB add struct */ #define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */ #define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */ #define FSCTL_DUPLICATE_EXTENTS_TO_FILE 0x00098344 +#define FSCTL_DUPLICATE_EXTENTS_TO_FILE_EX 0x000983E8 #define FSCTL_SIS_LINK_FILES 0x0009C104 #define FSCTL_SET_INTEGRITY_INFORMATION 0x0009C280 #define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */ diff --git a/fs/stat.c b/fs/stat.c index 5c2c94464e8b..9ced8860e0f3 100644 --- 
a/fs/stat.c +++ b/fs/stat.c @@ -659,7 +659,7 @@ SYSCALL_DEFINE5(statx, return ret; } -#ifdef CONFIG_COMPAT +#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_STAT) static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) { struct compat_stat tmp; diff --git a/fs/sync.c b/fs/sync.c index c7690016453e..dc725914e1ed 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -373,6 +373,15 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, return ksys_sync_file_range(fd, offset, nbytes, flags); } +#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_SYNC_FILE_RANGE) +COMPAT_SYSCALL_DEFINE6(sync_file_range, int, fd, compat_arg_u64_dual(offset), + compat_arg_u64_dual(nbytes), unsigned int, flags) +{ + return ksys_sync_file_range(fd, compat_arg_u64_glue(offset), + compat_arg_u64_glue(nbytes), flags); +} +#endif + /* It would be nice if people remember that not all the world's an i386 when they introduce new system calls */ SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags, diff --git a/fs/sysv/super.c b/fs/sysv/super.c index d1def0771a40..3365a30dc1e0 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -312,7 +312,9 @@ static int complete_read_super(struct super_block *sb, int silent, int size) sbi->s_firstinodezone = 2; flavour_setup[sbi->s_type](sbi, &sb->s_max_links); - + if (sbi->s_firstdatazone < sbi->s_firstinodezone) + return 0; + sbi->s_ndatazones = sbi->s_nzones - sbi->s_firstdatazone; sbi->s_inodes_per_block = bsize >> 6; sbi->s_inodes_per_block_1 = (bsize >> 6)-1; diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index de7252715b12..81d26abf486f 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -553,7 +553,7 @@ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent) * * Only one instances directory is allowed. * - * The instances directory is special as it allows for mkdir and rmdir to + * The instances directory is special as it allows for mkdir and rmdir * to be done by userspace. 
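The compat wrappers added for pread64/pwrite64 above and for sync_file_range here share one idiom: a 32-bit compat task cannot pass a 64-bit loff_t in one register, so the syscall receives it as two 32-bit halves, which compat_arg_u64_dual() declares and compat_arg_u64_glue() reassembles. Roughly, and ignoring the endianness and register-pair alignment handling the real <linux/compat.h> macros perform, the idea is:

/* Illustrative stand-ins, not the real kernel definitions: */
#define demo_arg_u64_dual(name)	u32, name##_lo, u32, name##_hi
#define demo_arg_u64_glue(name)	(((u64)name##_hi << 32) | (name##_lo))

This is also why the compat entry points are DEFINE5/DEFINE6 where the native syscalls are DEFINE4: each 64-bit argument consumes two slots.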
When a mkdir or rmdir is performed, the inode * locks are released and the methods passed in (@mkdir and @rmdir) are * called without locks and with the name of the directory being created diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index c0b84e960b20..e8b9b756f0ac 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -65,7 +65,7 @@ static void shrink_liability(struct ubifs_info *c, int nr_to_write) */ static int run_gc(struct ubifs_info *c) { - int err, lnum; + int lnum; /* Make some free space by garbage-collecting dirty space */ down_read(&c->commit_sem); @@ -76,10 +76,7 @@ static int run_gc(struct ubifs_info *c) /* GC freed one LEB, return it to lprops */ dbg_budg("GC freed LEB %d", lnum); - err = ubifs_return_leb(c, lnum); - if (err) - return err; - return 0; + return ubifs_return_leb(c, lnum); } /** diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index e4f193eae4b2..e4c4761aff7f 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -677,7 +677,7 @@ int ubifs_init_security(struct inode *dentry, struct inode *inode, int err; err = security_inode_init_security(inode, dentry, qstr, - &init_xattrs, 0); + &init_xattrs, NULL); if (err) { struct ubifs_info *c = dentry->i_sb->s_fs_info; ubifs_err(c, "cannot initialize security for inode %lu, error %d", diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index aa0c47cb0d16..e943370107d0 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -29,6 +29,7 @@ #include <linux/ioctl.h> #include <linux/security.h> #include <linux/hugetlb.h> +#include <linux/swapops.h> int sysctl_unprivileged_userfaultfd __read_mostly; @@ -249,9 +250,10 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, /* * Lockless access: we're in a wait_event so it's ok if it - * changes under us. + * changes under us. PTE markers should be handled the same as none + * ptes here. */ - if (huge_pte_none(pte)) + if (huge_pte_none_mostly(pte)) ret = true; if (!huge_pte_write(pte) && (reason & VM_UFFD_WP)) ret = true; @@ -330,9 +332,10 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, pte = pte_offset_map(pmd, address); /* * Lockless access: we're in a wait_event so it's ok if it - * changes under us. + * changes under us. PTE markers should be handled the same as none + * ptes here. 
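The *_mostly() predicates these userfaultfd hunks switch to come from the PTE marker work elsewhere in this series (hence the new <linux/swapops.h> include above). Conceptually they widen the "none" test to include marker entries, which carry no page and must be waited on like none ptes; a hedged sketch of the shape of the helper:

/* Sketch only; see pte_none_mostly()/huge_pte_none_mostly() near the
 * PTE marker helpers for the real definitions. */
static inline bool demo_pte_none_mostly(pte_t pte)
{
	return pte_none(pte) || is_pte_marker(pte);
}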
*/ - if (pte_none(*pte)) + if (pte_none_mostly(*pte)) ret = true; if (!pte_write(*pte) && (reason & VM_UFFD_WP)) ret = true; @@ -1255,24 +1258,6 @@ static __always_inline int validate_range(struct mm_struct *mm, return 0; } -static inline bool vma_can_userfault(struct vm_area_struct *vma, - unsigned long vm_flags) -{ - /* FIXME: add WP support to hugetlbfs and shmem */ - if (vm_flags & VM_UFFD_WP) { - if (is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) - return false; - } - - if (vm_flags & VM_UFFD_MINOR) { - if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma))) - return false; - } - - return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || - vma_is_shmem(vma); -} - static int userfaultfd_register(struct userfaultfd_ctx *ctx, unsigned long arg) { @@ -1954,6 +1939,9 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; #endif +#ifndef CONFIG_PTE_MARKER_UFFD_WP + uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; +#endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 04611a1068b4..b056cfc6398e 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -102,6 +102,7 @@ xfs-y += xfs_log.o \ xfs_buf_item_recover.o \ xfs_dquot_item_recover.o \ xfs_extfree_item.o \ + xfs_attr_item.o \ xfs_icreate_item.o \ xfs_inode_item.o \ xfs_inode_item_recover.o \ diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 1e4ee042d52f..3e920cf1b454 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -173,7 +173,6 @@ __xfs_free_perag( struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); ASSERT(!delayed_work_pending(&pag->pag_blockgc_work)); - ASSERT(atomic_read(&pag->pag_ref) == 0); kmem_free(pag); } @@ -192,7 +191,7 @@ xfs_free_perag( pag = radix_tree_delete(&mp->m_perag_tree, agno); spin_unlock(&mp->m_perag_lock); ASSERT(pag); - ASSERT(atomic_read(&pag->pag_ref) == 0); + XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0); cancel_delayed_work_sync(&pag->pag_blockgc_work); xfs_iunlink_destroy(pag); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index b52ed339727f..d3f2886fdc08 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2511,7 +2511,7 @@ __xfs_free_extent_later( ASSERT(bno != NULLFSBLOCK); ASSERT(len > 0); - ASSERT(len <= MAXEXTLEN); + ASSERT(len <= XFS_MAX_BMBT_EXTLEN); ASSERT(!isnullstartblock(bno)); agno = XFS_FSB_TO_AGNO(mp, bno); agbno = XFS_FSB_TO_AGBNO(mp, bno); @@ -2777,7 +2777,7 @@ xfs_alloc_get_freelist( xfs_agblock_t bno; __be32 *agfl_bno; int error; - int logflags; + uint32_t logflags; struct xfs_mount *mp = tp->t_mountp; struct xfs_perag *pag; @@ -2830,9 +2830,9 @@ xfs_alloc_get_freelist( */ void xfs_alloc_log_agf( - xfs_trans_t *tp, /* transaction pointer */ - struct xfs_buf *bp, /* buffer for a.g. freelist header */ - int fields) /* mask of fields to be logged (XFS_AGF_...) 
*/ + struct xfs_trans *tp, + struct xfs_buf *bp, + uint32_t fields) { int first; /* first byte offset */ int last; /* last byte offset */ @@ -2902,7 +2902,7 @@ xfs_alloc_put_freelist( struct xfs_perag *pag; __be32 *blockp; int error; - int logflags; + uint32_t logflags; __be32 *agfl_bno; int startoff; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index d4c057b764f9..84ca09b2223f 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -121,7 +121,7 @@ void xfs_alloc_log_agf( struct xfs_trans *tp, /* transaction pointer */ struct xfs_buf *bp, /* buffer for a.g. freelist header */ - int fields);/* mask of fields to be logged (XFS_AGF_...) */ + uint32_t fields);/* mask of fields to be logged (XFS_AGF_...) */ /* * Interface for inode allocation to force the pag data to be initialized. diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 23523b802539..1824f61621a2 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -24,6 +24,10 @@ #include "xfs_quota.h" #include "xfs_trans_space.h" #include "xfs_trace.h" +#include "xfs_attr_item.h" +#include "xfs_xattr.h" + +struct kmem_cache *xfs_attr_intent_cache; /* * xfs_attr.c @@ -53,26 +57,22 @@ STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args, struct xfs_buf *bp); */ STATIC int xfs_attr_node_get(xfs_da_args_t *args); STATIC void xfs_attr_restore_rmt_blk(struct xfs_da_args *args); -STATIC int xfs_attr_node_addname(struct xfs_delattr_context *dac); -STATIC int xfs_attr_node_addname_find_attr(struct xfs_delattr_context *dac); -STATIC int xfs_attr_node_addname_clear_incomplete( - struct xfs_delattr_context *dac); -STATIC int xfs_attr_node_hasname(xfs_da_args_t *args, - struct xfs_da_state **state); -STATIC int xfs_attr_fillstate(xfs_da_state_t *state); -STATIC int xfs_attr_refillstate(xfs_da_state_t *state); -STATIC int xfs_attr_set_iter(struct xfs_delattr_context *dac, - struct xfs_buf **leaf_bp); -STATIC int xfs_attr_node_removename(struct xfs_da_args *args, - struct xfs_da_state *state); +static int xfs_attr_node_try_addname(struct xfs_attr_intent *attr); +STATIC int xfs_attr_node_addname_find_attr(struct xfs_attr_intent *attr); +STATIC int xfs_attr_node_remove_attr(struct xfs_attr_intent *attr); +STATIC int xfs_attr_node_lookup(struct xfs_da_args *args, + struct xfs_da_state *state); int xfs_inode_hasattr( struct xfs_inode *ip) { - if (!XFS_IFORK_Q(ip) || - (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && - ip->i_afp->if_nextents == 0)) + if (!XFS_IFORK_Q(ip)) + return 0; + if (!ip->i_afp) + return 0; + if (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && + ip->i_afp->if_nextents == 0) return 0; return 1; } @@ -97,6 +97,123 @@ xfs_attr_is_leaf( return imap.br_startoff == 0 && imap.br_blockcount == 1; } +/* + * XXX (dchinner): name path state saving and refilling is an optimisation to + * avoid needing to look up name entries after rolling transactions removing + * remote xattr blocks between the name entry lookup and name entry removal. + * This optimisation got sidelined when combining the set and remove state + * machines, but the code has been left in place because it is worthwhile to + * restore the optimisation once the combined state machine paths have settled. + * + * This comment is a public service announcement to remind Future Dave that he + * still needs to restore this code to working order. + */ +#if 0 +/* + * Fill in the disk block numbers in the state structure for the buffers + * that are attached to the state structure. 
+ * This is done so that we can quickly reattach ourselves to those buffers + * after some set of transaction commits have released these buffers. + */ +static int +xfs_attr_fillstate(xfs_da_state_t *state) +{ + xfs_da_state_path_t *path; + xfs_da_state_blk_t *blk; + int level; + + trace_xfs_attr_fillstate(state->args); + + /* + * Roll down the "path" in the state structure, storing the on-disk + * block number for those buffers in the "path". + */ + path = &state->path; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->bp) { + blk->disk_blkno = xfs_buf_daddr(blk->bp); + blk->bp = NULL; + } else { + blk->disk_blkno = 0; + } + } + + /* + * Roll down the "altpath" in the state structure, storing the on-disk + * block number for those buffers in the "altpath". + */ + path = &state->altpath; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->bp) { + blk->disk_blkno = xfs_buf_daddr(blk->bp); + blk->bp = NULL; + } else { + blk->disk_blkno = 0; + } + } + + return 0; +} + +/* + * Reattach the buffers to the state structure based on the disk block + * numbers stored in the state structure. + * This is done after some set of transaction commits have released those + * buffers from our grip. + */ +static int +xfs_attr_refillstate(xfs_da_state_t *state) +{ + xfs_da_state_path_t *path; + xfs_da_state_blk_t *blk; + int level, error; + + trace_xfs_attr_refillstate(state->args); + + /* + * Roll down the "path" in the state structure, storing the on-disk + * block number for those buffers in the "path". + */ + path = &state->path; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->disk_blkno) { + error = xfs_da3_node_read_mapped(state->args->trans, + state->args->dp, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); + if (error) + return error; + } else { + blk->bp = NULL; + } + } + + /* + * Roll down the "altpath" in the state structure, storing the on-disk + * block number for those buffers in the "altpath". + */ + path = &state->altpath; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->disk_blkno) { + error = xfs_da3_node_read_mapped(state->args->trans, + state->args->dp, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); + if (error) + return error; + } else { + blk->bp = NULL; + } + } + + return 0; +} +#else +static int xfs_attr_fillstate(xfs_da_state_t *state) { return 0; } +#endif + /*======================================================================== * Overall external interface routines. 
*========================================================================*/ @@ -166,7 +283,7 @@ xfs_attr_get( /* * Calculate how many blocks we need for the new attribute, */ -STATIC int +int xfs_attr_calc_size( struct xfs_da_args *args, int *local) @@ -199,6 +316,33 @@ xfs_attr_calc_size( return nblks; } +/* Initialize transaction reservation for attr operations */ +void +xfs_init_attr_trans( + struct xfs_da_args *args, + struct xfs_trans_res *tres, + unsigned int *total) +{ + struct xfs_mount *mp = args->dp->i_mount; + + if (args->value) { + tres->tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * + args->total; + tres->tr_logcount = XFS_ATTRSET_LOG_COUNT; + tres->tr_logflags = XFS_TRANS_PERM_LOG_RES; + *total = args->total; + } else { + *tres = M_RES(mp)->tr_attrrm; + *total = XFS_ATTRRM_SPACE_RES(mp); + } +} + +/* + * Add an attr to a shortform fork. If there is no space, + * xfs_attr_shortform_addname() will convert to leaf format and return -ENOSPC. + */ STATIC int xfs_attr_try_sf_addname( struct xfs_inode *dp, @@ -230,411 +374,487 @@ xfs_attr_try_sf_addname( return error; } -/* - * Check to see if the attr should be upgraded from non-existent or shortform to - * single-leaf-block attribute list. - */ -static inline bool -xfs_attr_is_shortform( - struct xfs_inode *ip) +static int +xfs_attr_sf_addname( + struct xfs_attr_intent *attr) { - return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL || - (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && - ip->i_afp->if_nextents == 0); + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_inode *dp = args->dp; + int error = 0; + + error = xfs_attr_try_sf_addname(dp, args); + if (error != -ENOSPC) { + ASSERT(!error || error == -EEXIST); + attr->xattri_dela_state = XFS_DAS_DONE; + goto out; + } + + /* + * It won't fit in the shortform, transform to a leaf block. GROT: + * another possible req'mt for a double-split btree op. + */ + error = xfs_attr_shortform_to_leaf(args, &attr->xattri_leaf_bp); + if (error) + return error; + + /* + * Prevent the leaf buffer from being unlocked so that a concurrent AIL + * push cannot grab the half-baked leaf buffer and run into problems + * with the write verifier. + */ + xfs_trans_bhold(args->trans, attr->xattri_leaf_bp); + attr->xattri_dela_state = XFS_DAS_LEAF_ADD; +out: + trace_xfs_attr_sf_addname_return(attr->xattri_dela_state, args->dp); + return error; } /* - * Checks to see if a delayed attribute transaction should be rolled. If so, - * transaction is finished or rolled as needed. + * Handle the state change on completion of a multi-state attr operation. + * + * If the XFS_DA_OP_REPLACE flag is set, this means the operation was the first + * modification in an attr replace operation and we still have to do the second + * state, indicated by @replace_state. + * + * We consume the XFS_DA_OP_REPLACE flag so that when we are called again on + * completion of the second half of the attr replace operation we correctly + * signal that it is done. 
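A hedged usage sketch for the new xfs_init_attr_trans() helper above, mirroring how the updated xfs_attr_set() later in this patch consumes it: the helper picks the set reservation (scaled by args->total for remote blocks) when a value is present, and the remove reservation otherwise, so callers no longer open-code either:

	struct xfs_trans_res	tres;
	unsigned int		total;
	int			error;

	xfs_init_attr_trans(args, &tres, &total);
	/* rsvd is the caller's "may use reserved blocks" flag */
	error = xfs_trans_alloc_inode(args->dp, &tres, total, 0, rsvd,
				      &args->trans);
	if (error)
		return error;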
*/ -STATIC int -xfs_attr_trans_roll( - struct xfs_delattr_context *dac) +static enum xfs_delattr_state +xfs_attr_complete_op( + struct xfs_attr_intent *attr, + enum xfs_delattr_state replace_state) { - struct xfs_da_args *args = dac->da_args; - int error; + struct xfs_da_args *args = attr->xattri_da_args; + bool do_replace = args->op_flags & XFS_DA_OP_REPLACE; + + args->op_flags &= ~XFS_DA_OP_REPLACE; + if (do_replace) { + args->attr_filter &= ~XFS_ATTR_INCOMPLETE; + return replace_state; + } + return XFS_DAS_DONE; +} + +static int +xfs_attr_leaf_addname( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + int error; + + ASSERT(xfs_attr_is_leaf(args->dp)); + + /* + * Use the leaf buffer we may already hold locked as a result of + * a sf-to-leaf conversion. The held buffer is no longer valid + * after this call, regardless of the result. + */ + error = xfs_attr_leaf_try_add(args, attr->xattri_leaf_bp); + attr->xattri_leaf_bp = NULL; + + if (error == -ENOSPC) { + error = xfs_attr3_leaf_to_node(args); + if (error) + return error; - if (dac->flags & XFS_DAC_DEFER_FINISH) { /* - * The caller wants us to finish all the deferred ops so that we - * avoid pinning the log tail with a large number of deferred - * ops. + * We're not in leaf format anymore, so roll the transaction and + * retry the add to the newly allocated node block. */ - dac->flags &= ~XFS_DAC_DEFER_FINISH; - error = xfs_defer_finish(&args->trans); - } else - error = xfs_trans_roll_inode(&args->trans, args->dp); + attr->xattri_dela_state = XFS_DAS_NODE_ADD; + goto out; + } + if (error) + return error; + /* + * We need to commit and roll if we need to allocate remote xattr blocks + * or perform more xattr manipulations. Otherwise there is nothing more + * to do and we can return success. + */ + if (args->rmtblkno) + attr->xattri_dela_state = XFS_DAS_LEAF_SET_RMT; + else + attr->xattri_dela_state = xfs_attr_complete_op(attr, + XFS_DAS_LEAF_REPLACE); +out: + trace_xfs_attr_leaf_addname_return(attr->xattri_dela_state, args->dp); return error; } /* - * Set the attribute specified in @args. + * Add an entry to a node format attr tree. + * + * Note that we might still have a leaf here - xfs_attr_is_leaf() cannot tell + * the difference between leaf + remote attr blocks and a node format tree, + * so we may still end up having to convert from leaf to node format here. */ -int -xfs_attr_set_args( - struct xfs_da_args *args) +static int +xfs_attr_node_addname( + struct xfs_attr_intent *attr) { - struct xfs_buf *leaf_bp = NULL; - int error = 0; - struct xfs_delattr_context dac = { - .da_args = args, - }; + struct xfs_da_args *args = attr->xattri_da_args; + int error; - do { - error = xfs_attr_set_iter(&dac, &leaf_bp); - if (error != -EAGAIN) - break; + ASSERT(!attr->xattri_leaf_bp); + + error = xfs_attr_node_addname_find_attr(attr); + if (error) + return error; - error = xfs_attr_trans_roll(&dac); - if (error) { - if (leaf_bp) - xfs_trans_brelse(args->trans, leaf_bp); + error = xfs_attr_node_try_addname(attr); + if (error == -ENOSPC) { + error = xfs_attr3_leaf_to_node(args); + if (error) return error; - } - } while (true); + /* + * No state change, we really are in node form now + * but we need the transaction rolled to continue. 
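For orientation, these helpers no longer loop internally on -EAGAIN: the attr intent is driven as a deferred op, so the per-transaction re-entry lives in the new fs/xfs/xfs_attr_item.c. A hedged sketch of that driving pattern (paraphrased from the deferred-ops convention, not quoted from the patch):

/* One state's worth of work per transaction; asking to be re-queued
 * makes the deferred-ops machinery roll the transaction and call
 * back in until the state machine reports done. */
static int demo_attr_finish_item(struct xfs_attr_intent *attr)
{
	int error = xfs_attr_set_iter(attr);

	if (!error && attr->xattri_dela_state != XFS_DAS_DONE)
		error = -EAGAIN;
	return error;
}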
+ */ + goto out; + } + if (error) + return error; + if (args->rmtblkno) + attr->xattri_dela_state = XFS_DAS_NODE_SET_RMT; + else + attr->xattri_dela_state = xfs_attr_complete_op(attr, + XFS_DAS_NODE_REPLACE); +out: + trace_xfs_attr_node_addname_return(attr->xattri_dela_state, args->dp); return error; } -STATIC int -xfs_attr_sf_addname( - struct xfs_delattr_context *dac, - struct xfs_buf **leaf_bp) +static int +xfs_attr_rmtval_alloc( + struct xfs_attr_intent *attr) { - struct xfs_da_args *args = dac->da_args; - struct xfs_inode *dp = args->dp; + struct xfs_da_args *args = attr->xattri_da_args; int error = 0; /* - * Try to add the attr to the attribute list in the inode. + * If there was an out-of-line value, allocate the blocks we + * identified for its storage and copy the value. This is done + * after we create the attribute so that we don't overflow the + * maximum size of a transaction and/or hit a deadlock. */ - error = xfs_attr_try_sf_addname(dp, args); + if (attr->xattri_blkcnt > 0) { + error = xfs_attr_rmtval_set_blk(attr); + if (error) + return error; + /* Roll the transaction only if there is more to allocate. */ + if (attr->xattri_blkcnt > 0) + goto out; + } - /* Should only be 0, -EEXIST or -ENOSPC */ - if (error != -ENOSPC) + error = xfs_attr_rmtval_set_value(args); + if (error) return error; + attr->xattri_dela_state = xfs_attr_complete_op(attr, + ++attr->xattri_dela_state); /* - * It won't fit in the shortform, transform to a leaf block. GROT: - * another possible req'mt for a double-split btree op. + * If we are not doing a rename, we've finished the operation but still + * have to clear the incomplete flag protecting the new attr from + * exposing partially initialised state if we crash during creation. */ - error = xfs_attr_shortform_to_leaf(args, leaf_bp); - if (error) - return error; + if (attr->xattri_dela_state == XFS_DAS_DONE) + error = xfs_attr3_leaf_clearflag(args); +out: + trace_xfs_attr_rmtval_alloc(attr->xattri_dela_state, args->dp); + return error; +} + +/* + * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers + * for later deletion of the entry. + */ +static int +xfs_attr_leaf_mark_incomplete( + struct xfs_da_args *args, + struct xfs_da_state *state) +{ + int error; /* - * Prevent the leaf buffer from being unlocked so that a concurrent AIL - * push cannot grab the half-baked leaf buffer and run into problems - * with the write verifier. + * Fill in disk block numbers in the state structure + * so that we can get the buffers back after we commit + * several transactions in the following calls. */ - xfs_trans_bhold(args->trans, *leaf_bp); + error = xfs_attr_fillstate(state); + if (error) + return error; /* - * We're still in XFS_DAS_UNINIT state here. We've converted - * the attr fork to leaf format and will restart with the leaf - * add. + * Mark the attribute as INCOMPLETE */ - trace_xfs_attr_sf_addname_return(XFS_DAS_UNINIT, args->dp); - dac->flags |= XFS_DAC_DEFER_FINISH; - return -EAGAIN; + return xfs_attr3_leaf_setflag(args); +} + +/* Ensure the da state of an xattr deferred work item is ready to go. */ +static inline void +xfs_attr_item_init_da_state( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + + if (!attr->xattri_da_state) + attr->xattri_da_state = xfs_da_state_alloc(args); + else + xfs_da_state_reset(attr->xattri_da_state, args); } /* - * Set the attribute specified in @args. 
- * This routine is meant to function as a delayed operation, and may return - * -EAGAIN when the transaction needs to be rolled. Calling functions will need - * to handle this, and recall the function until a successful error code is - * returned. + * Initial setup for xfs_attr_node_removename. Make sure the attr is there and + * the blocks are valid. Attr keys with remote blocks will be marked + * incomplete. */ -int -xfs_attr_set_iter( - struct xfs_delattr_context *dac, - struct xfs_buf **leaf_bp) +static +int xfs_attr_node_removename_setup( + struct xfs_attr_intent *attr) { - struct xfs_da_args *args = dac->da_args; - struct xfs_inode *dp = args->dp; - struct xfs_buf *bp = NULL; - int forkoff, error = 0; - - /* State machine switch */ - switch (dac->dela_state) { - case XFS_DAS_UNINIT: - /* - * If the fork is shortform, attempt to add the attr. If there - * is no space, this converts to leaf format and returns - * -EAGAIN with the leaf buffer held across the roll. The caller - * will deal with a transaction roll error, but otherwise - * release the hold once we return with a clean transaction. - */ - if (xfs_attr_is_shortform(dp)) - return xfs_attr_sf_addname(dac, leaf_bp); - if (*leaf_bp != NULL) { - xfs_trans_bhold_release(args->trans, *leaf_bp); - *leaf_bp = NULL; - } + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_da_state *state; + int error; - if (xfs_attr_is_leaf(dp)) { - error = xfs_attr_leaf_try_add(args, *leaf_bp); - if (error == -ENOSPC) { - error = xfs_attr3_leaf_to_node(args); - if (error) - return error; - - /* - * Finish any deferred work items and roll the - * transaction once more. The goal here is to - * call node_addname with the inode and - * transaction in the same state (inode locked - * and joined, transaction clean) no matter how - * we got to this step. - * - * At this point, we are still in - * XFS_DAS_UNINIT, but when we come back, we'll - * be a node, so we'll fall down into the node - * handling code below - */ - dac->flags |= XFS_DAC_DEFER_FINISH; - trace_xfs_attr_set_iter_return( - dac->dela_state, args->dp); - return -EAGAIN; - } else if (error) { - return error; - } + xfs_attr_item_init_da_state(attr); + error = xfs_attr_node_lookup(args, attr->xattri_da_state); + if (error != -EEXIST) + goto out; + error = 0; - dac->dela_state = XFS_DAS_FOUND_LBLK; - } else { - error = xfs_attr_node_addname_find_attr(dac); - if (error) - return error; + state = attr->xattri_da_state; + ASSERT(state->path.blk[state->path.active - 1].bp != NULL); + ASSERT(state->path.blk[state->path.active - 1].magic == + XFS_ATTR_LEAF_MAGIC); - error = xfs_attr_node_addname(dac); - if (error) - return error; + error = xfs_attr_leaf_mark_incomplete(args, state); + if (error) + goto out; + if (args->rmtblkno > 0) + error = xfs_attr_rmtval_invalidate(args); +out: + if (error) { + xfs_da_state_free(attr->xattri_da_state); + attr->xattri_da_state = NULL; + } - dac->dela_state = XFS_DAS_FOUND_NBLK; - } - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); - return -EAGAIN; - case XFS_DAS_FOUND_LBLK: - /* - * If there was an out-of-line value, allocate the blocks we - * identified for its storage and copy the value. This is done - * after we create the attribute so that we don't overflow the - * maximum size of a transaction and/or hit a deadlock. 
- */ + return error; +} - /* Open coded xfs_attr_rmtval_set without trans handling */ - if ((dac->flags & XFS_DAC_LEAF_ADDNAME_INIT) == 0) { - dac->flags |= XFS_DAC_LEAF_ADDNAME_INIT; - if (args->rmtblkno > 0) { - error = xfs_attr_rmtval_find_space(dac); - if (error) - return error; - } - } +/* + * Remove the original attr we have just replaced. This is dependent on the + * original lookup and insert placing the old attr in args->blkno/args->index + * and the new attr in args->blkno2/args->index2. + */ +static int +xfs_attr_leaf_remove_attr( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_inode *dp = args->dp; + struct xfs_buf *bp = NULL; + int forkoff; + int error; - /* - * Repeat allocating remote blocks for the attr value until - * blkcnt drops to zero. - */ - if (dac->blkcnt > 0) { - error = xfs_attr_rmtval_set_blk(dac); - if (error) - return error; - trace_xfs_attr_set_iter_return(dac->dela_state, - args->dp); - return -EAGAIN; - } + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, + &bp); + if (error) + return error; - error = xfs_attr_rmtval_set_value(args); - if (error) - return error; + xfs_attr3_leaf_remove(bp, args); - /* - * If this is not a rename, clear the incomplete flag and we're - * done. - */ - if (!(args->op_flags & XFS_DA_OP_RENAME)) { - if (args->rmtblkno > 0) - error = xfs_attr3_leaf_clearflag(args); - return error; - } + forkoff = xfs_attr_shortform_allfit(bp, dp); + if (forkoff) + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ - /* - * If this is an atomic rename operation, we must "flip" the - * incomplete flags on the "new" and "old" attribute/value pairs - * so that one disappears and one appears atomically. Then we - * must remove the "old" attribute/value pair. - * - * In a separate transaction, set the incomplete flag on the - * "old" attr and clear the incomplete flag on the "new" attr. - */ - error = xfs_attr3_leaf_flipflags(args); - if (error) - return error; - /* - * Commit the flag value change and start the next trans in - * series. - */ - dac->dela_state = XFS_DAS_FLIP_LFLAG; - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); - return -EAGAIN; - case XFS_DAS_FLIP_LFLAG: - /* - * Dismantle the "old" attribute/value pair by removing a - * "remote" value (if it exists). - */ - xfs_attr_restore_rmt_blk(args); - error = xfs_attr_rmtval_invalidate(args); - if (error) - return error; + return error; +} - fallthrough; - case XFS_DAS_RM_LBLK: - /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */ - dac->dela_state = XFS_DAS_RM_LBLK; - if (args->rmtblkno) { - error = xfs_attr_rmtval_remove(dac); - if (error == -EAGAIN) - trace_xfs_attr_set_iter_return( - dac->dela_state, args->dp); - if (error) - return error; +/* + * Shrink an attribute from leaf to shortform. Used by the node format remove + * path when the node format collapses to a single block and so we have to check + * if it can be collapsed further. + */ +static int +xfs_attr_leaf_shrink( + struct xfs_da_args *args) +{ + struct xfs_inode *dp = args->dp; + struct xfs_buf *bp; + int forkoff; + int error; - dac->dela_state = XFS_DAS_RD_LEAF; - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); - return -EAGAIN; - } + if (!xfs_attr_is_leaf(dp)) + return 0; - fallthrough; - case XFS_DAS_RD_LEAF: - /* - * This is the last step for leaf format. Read the block with - * the old attr, remove the old attr, check for shortform - * conversion and return. 
- */ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, - &bp); - if (error) - return error; + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); + if (error) + return error; - xfs_attr3_leaf_remove(bp, args); + forkoff = xfs_attr_shortform_allfit(bp, dp); + if (forkoff) { + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + } else { + xfs_trans_brelse(args->trans, bp); + } - forkoff = xfs_attr_shortform_allfit(bp, dp); - if (forkoff) - error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); - /* bp is gone due to xfs_da_shrink_inode */ + return error; +} - return error; +/* + * Run the attribute operation specified in @attr. + * + * This routine is meant to function as a delayed operation and will set the + * state to XFS_DAS_DONE when the operation is complete. Calling functions will + * need to handle this, and recall the function until either an error or + * XFS_DAS_DONE is detected. + */ +int +xfs_attr_set_iter( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + int error = 0; - case XFS_DAS_FOUND_NBLK: - /* - * Find space for remote blocks and fall into the allocation - * state. - */ - if (args->rmtblkno > 0) { - error = xfs_attr_rmtval_find_space(dac); - if (error) - return error; + /* State machine switch */ +next_state: + switch (attr->xattri_dela_state) { + case XFS_DAS_UNINIT: + ASSERT(0); + return -EFSCORRUPTED; + case XFS_DAS_SF_ADD: + return xfs_attr_sf_addname(attr); + case XFS_DAS_LEAF_ADD: + return xfs_attr_leaf_addname(attr); + case XFS_DAS_NODE_ADD: + return xfs_attr_node_addname(attr); + + case XFS_DAS_SF_REMOVE: + error = xfs_attr_sf_removename(args); + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); + break; + case XFS_DAS_LEAF_REMOVE: + error = xfs_attr_leaf_removename(args); + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); + break; + case XFS_DAS_NODE_REMOVE: + error = xfs_attr_node_removename_setup(attr); + if (error == -ENOATTR && + (args->op_flags & XFS_DA_OP_RECOVERY)) { + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); + error = 0; + break; } + if (error) + return error; + attr->xattri_dela_state = XFS_DAS_NODE_REMOVE_RMT; + if (args->rmtblkno == 0) + attr->xattri_dela_state++; + break; + case XFS_DAS_LEAF_SET_RMT: + case XFS_DAS_NODE_SET_RMT: + error = xfs_attr_rmtval_find_space(attr); + if (error) + return error; + attr->xattri_dela_state++; fallthrough; - case XFS_DAS_ALLOC_NODE: - /* - * If there was an out-of-line value, allocate the blocks we - * identified for its storage and copy the value. This is done - * after we create the attribute so that we don't overflow the - * maximum size of a transaction and/or hit a deadlock. - */ - dac->dela_state = XFS_DAS_ALLOC_NODE; - if (args->rmtblkno > 0) { - if (dac->blkcnt > 0) { - error = xfs_attr_rmtval_set_blk(dac); - if (error) - return error; - trace_xfs_attr_set_iter_return( - dac->dela_state, args->dp); - return -EAGAIN; - } - - error = xfs_attr_rmtval_set_value(args); - if (error) - return error; - } - /* - * If this was not a rename, clear the incomplete flag and we're - * done. 
- */ - if (!(args->op_flags & XFS_DA_OP_RENAME)) { - if (args->rmtblkno > 0) - error = xfs_attr3_leaf_clearflag(args); - goto out; - } + case XFS_DAS_LEAF_ALLOC_RMT: + case XFS_DAS_NODE_ALLOC_RMT: + error = xfs_attr_rmtval_alloc(attr); + if (error) + return error; + if (attr->xattri_dela_state == XFS_DAS_DONE) + break; + goto next_state; + case XFS_DAS_LEAF_REPLACE: + case XFS_DAS_NODE_REPLACE: /* - * If this is an atomic rename operation, we must "flip" the - * incomplete flags on the "new" and "old" attribute/value pairs - * so that one disappears and one appears atomically. Then we - * must remove the "old" attribute/value pair. - * - * In a separate transaction, set the incomplete flag on the - * "old" attr and clear the incomplete flag on the "new" attr. + * We must "flip" the incomplete flags on the "new" and "old" + * attribute/value pairs so that one disappears and one appears + * atomically. */ error = xfs_attr3_leaf_flipflags(args); if (error) - goto out; + return error; /* - * Commit the flag value change and start the next trans in - * series + * We must commit the flag value change now to make it atomic + * and then we can start the next trans in series at REMOVE_OLD. */ - dac->dela_state = XFS_DAS_FLIP_NFLAG; - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); - return -EAGAIN; + attr->xattri_dela_state++; + break; - case XFS_DAS_FLIP_NFLAG: + case XFS_DAS_LEAF_REMOVE_OLD: + case XFS_DAS_NODE_REMOVE_OLD: /* - * Dismantle the "old" attribute/value pair by removing a - * "remote" value (if it exists). + * If we have a remote attr, start the process of removing it + * by invalidating any cached buffers. + * + * If we don't have a remote attr, we skip the remote block + * removal state altogether with a second state increment. */ xfs_attr_restore_rmt_blk(args); - - error = xfs_attr_rmtval_invalidate(args); - if (error) - return error; - - fallthrough; - case XFS_DAS_RM_NBLK: - /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */ - dac->dela_state = XFS_DAS_RM_NBLK; if (args->rmtblkno) { - error = xfs_attr_rmtval_remove(dac); - if (error == -EAGAIN) - trace_xfs_attr_set_iter_return( - dac->dela_state, args->dp); - + error = xfs_attr_rmtval_invalidate(args); if (error) return error; + } else { + attr->xattri_dela_state++; + } - dac->dela_state = XFS_DAS_CLR_FLAG; - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); - return -EAGAIN; + attr->xattri_dela_state++; + goto next_state; + + case XFS_DAS_LEAF_REMOVE_RMT: + case XFS_DAS_NODE_REMOVE_RMT: + error = xfs_attr_rmtval_remove(attr); + if (error == -EAGAIN) { + error = 0; + break; } + if (error) + return error; - fallthrough; - case XFS_DAS_CLR_FLAG: /* - * The last state for node format. Look up the old attr and - * remove it. + * We've finished removing the remote attr blocks, so commit the + * transaction and move on to removing the attr name from the + * leaf/node block. Removing the attr might require a full + * transaction reservation for btree block freeing, so we + * can't do that in the same transaction where we removed the + * remote attr blocks. 
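Putting the removal states together, a hypothetical trace of a node-format remove with a remote value, with one transaction roll at each arrow:

/*
 * XFS_DAS_NODE_REMOVE            lookup, mark INCOMPLETE, invalidate rmt
 *   -> XFS_DAS_NODE_REMOVE_RMT   punch remote blocks; repeats via -EAGAIN
 *   -> XFS_DAS_NODE_REMOVE_ATTR  remove the name, maybe shrink node->leaf
 *   -> XFS_DAS_DONE              (or an add state if XFS_DA_OP_REPLACE
 *                                 was set and the add half is still owed)
 */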
*/ - error = xfs_attr_node_addname_clear_incomplete(dac); + attr->xattri_dela_state++; + break; + + case XFS_DAS_LEAF_REMOVE_ATTR: + error = xfs_attr_leaf_remove_attr(attr); + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); + break; + + case XFS_DAS_NODE_REMOVE_ATTR: + error = xfs_attr_node_remove_attr(attr); + if (!error) + error = xfs_attr_leaf_shrink(args); + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); break; default: ASSERT(0); break; } -out: + + trace_xfs_attr_set_iter_return(attr->xattri_dela_state, args->dp); return error; } @@ -648,6 +868,7 @@ xfs_attr_lookup( { struct xfs_inode *dp = args->dp; struct xfs_buf *bp = NULL; + struct xfs_da_state *state; int error; if (!xfs_inode_hasattr(dp)) @@ -665,33 +886,85 @@ xfs_attr_lookup( return error; } - return xfs_attr_node_hasname(args, NULL); + state = xfs_da_state_alloc(args); + error = xfs_attr_node_lookup(args, state); + xfs_da_state_free(state); + return error; +} + +static int +xfs_attr_intent_init( + struct xfs_da_args *args, + unsigned int op_flags, /* op flag (set or remove) */ + struct xfs_attr_intent **attr) /* new xfs_attr_intent */ +{ + + struct xfs_attr_intent *new; + + new = kmem_cache_zalloc(xfs_attr_intent_cache, GFP_NOFS | __GFP_NOFAIL); + new->xattri_op_flags = op_flags; + new->xattri_da_args = args; + + *attr = new; + return 0; } -/* - * Remove the attribute specified in @args. - */ -int -xfs_attr_remove_args( +/* Sets an attribute for an inode as a deferred operation */ +static int +xfs_attr_defer_add( struct xfs_da_args *args) { - int error; - struct xfs_delattr_context dac = { - .da_args = args, - }; + struct xfs_attr_intent *new; + int error = 0; - do { - error = xfs_attr_remove_iter(&dac); - if (error != -EAGAIN) - break; + error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_SET, &new); + if (error) + return error; - error = xfs_attr_trans_roll(&dac); - if (error) - return error; + new->xattri_dela_state = xfs_attr_init_add_state(args); + xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp); - } while (true); + return 0; +} - return error; +/* Sets an attribute for an inode as a deferred operation */ +static int +xfs_attr_defer_replace( + struct xfs_da_args *args) +{ + struct xfs_attr_intent *new; + int error = 0; + + error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REPLACE, &new); + if (error) + return error; + + new->xattri_dela_state = xfs_attr_init_replace_state(args); + xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + trace_xfs_attr_defer_replace(new->xattri_dela_state, args->dp); + + return 0; +} + +/* Removes an attribute for an inode as a deferred operation */ +static int +xfs_attr_defer_remove( + struct xfs_da_args *args) +{ + + struct xfs_attr_intent *new; + int error; + + error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REMOVE, &new); + if (error) + return error; + + new->xattri_dela_state = xfs_attr_init_remove_state(args); + xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + trace_xfs_attr_defer_remove(new->xattri_dela_state, args->dp); + + return 0; } /* @@ -724,14 +997,14 @@ xfs_attr_set( /* * We have no control over the attribute names that userspace passes us * to remove, so we have to allow the name lookup prior to attribute - * removal to fail as well. + * removal to fail as well. Preserve the logged flag, since we need + * to pass that through to the logging code. 
*/ - args->op_flags = XFS_DA_OP_OKNOENT; + args->op_flags = XFS_DA_OP_OKNOENT | + (args->op_flags & XFS_DA_OP_LOGGED); if (args->value) { XFS_STATS_INC(mp, xs_attr_set); - - args->op_flags |= XFS_DA_OP_ADDNAME; args->total = xfs_attr_calc_size(args, &local); /* @@ -748,20 +1021,10 @@ xfs_attr_set( return error; } - tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + - M_RES(mp)->tr_attrsetrt.tr_logres * - args->total; - tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - total = args->total; - if (!local) rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen); } else { XFS_STATS_INC(mp, xs_attr_remove); - - tres = M_RES(mp)->tr_attrrm; - total = XFS_ATTRRM_SPACE_RES(mp); rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX); } @@ -769,6 +1032,7 @@ xfs_attr_set( * Root fork attributes can use reserved data blocks for this * operation if necessary */ + xfs_init_attr_trans(args, &tres, &total); error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans); if (error) return error; @@ -776,33 +1040,43 @@ xfs_attr_set( if (args->value || xfs_inode_hasattr(dp)) { error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK, XFS_IEXT_ATTR_MANIP_CNT(rmt_blks)); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(args->trans, dp, + XFS_IEXT_ATTR_MANIP_CNT(rmt_blks)); if (error) goto out_trans_cancel; } error = xfs_attr_lookup(args); - if (args->value) { - if (error == -EEXIST && (args->attr_flags & XATTR_CREATE)) - goto out_trans_cancel; - if (error == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) - goto out_trans_cancel; - if (error != -ENOATTR && error != -EEXIST) + switch (error) { + case -EEXIST: + /* if no value, we are performing a remove operation */ + if (!args->value) { + error = xfs_attr_defer_remove(args); + break; + } + /* Pure create fails if the attr already exists */ + if (args->attr_flags & XATTR_CREATE) goto out_trans_cancel; - error = xfs_attr_set_args(args); - if (error) - goto out_trans_cancel; - /* shortform attribute has already been committed */ - if (!args->trans) - goto out_unlock; - } else { - if (error != -EEXIST) + error = xfs_attr_defer_replace(args); + break; + case -ENOATTR: + /* Can't remove what isn't there. */ + if (!args->value) goto out_trans_cancel; - error = xfs_attr_remove_args(args); - if (error) + /* Pure replace fails if no existing attr to replace. */ + if (args->attr_flags & XATTR_REPLACE) goto out_trans_cancel; + + error = xfs_attr_defer_add(args); + break; + default: + goto out_trans_cancel; } + if (error) + goto out_trans_cancel; /* * If this is a synchronous mount, make sure that the @@ -845,28 +1119,41 @@ static inline int xfs_attr_sf_totsize(struct xfs_inode *dp) * Add a name to the shortform attribute list structure * This is the external routine. 
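The switch above is where the user-visible create/replace semantics land: XATTR_CREATE and XATTR_REPLACE arrive via setxattr(2), and a set with no value is a removal. A hedged userspace illustration (hypothetical path and attr name):

#include <sys/xattr.h>

/* fails with EEXIST if user.demo already exists */
setxattr("/mnt/xfs/file", "user.demo", "v1", 2, XATTR_CREATE);

/* fails with ENODATA (ENOATTR) if user.demo does not exist */
setxattr("/mnt/xfs/file", "user.demo", "v2", 2, XATTR_REPLACE);

/* flags == 0 creates or replaces; a replace becomes the two-pass
 * XFS_DA_OP_REPLACE operation handled by the state machine above */
setxattr("/mnt/xfs/file", "user.demo", "v3", 2, 0);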
*/ -STATIC int -xfs_attr_shortform_addname(xfs_da_args_t *args) +static int +xfs_attr_shortform_addname( + struct xfs_da_args *args) { - int newsize, forkoff, retval; + int newsize, forkoff; + int error; trace_xfs_attr_sf_addname(args); - retval = xfs_attr_shortform_lookup(args); - if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) - return retval; - if (retval == -EEXIST) { - if (args->attr_flags & XATTR_CREATE) - return retval; - retval = xfs_attr_sf_removename(args); - if (retval) - return retval; + error = xfs_attr_shortform_lookup(args); + switch (error) { + case -ENOATTR: + if (args->op_flags & XFS_DA_OP_REPLACE) + return error; + break; + case -EEXIST: + if (!(args->op_flags & XFS_DA_OP_REPLACE)) + return error; + + error = xfs_attr_sf_removename(args); + if (error) + return error; + /* - * Since we have removed the old attr, clear ATTR_REPLACE so - * that the leaf format add routine won't trip over the attr - * not being around. + * Since we have removed the old attr, clear XFS_DA_OP_REPLACE + * so that the new attr doesn't fit in shortform format, the + * leaf format add routine won't trip over the attr not being + * around. */ - args->attr_flags &= ~XATTR_REPLACE; + args->op_flags &= ~XFS_DA_OP_REPLACE; + break; + case 0: + break; + default: + return error; } if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || @@ -889,8 +1176,8 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) * External routines when attribute list is one block *========================================================================*/ -/* Store info about a remote block */ -STATIC void +/* Save the current remote block info and clear the current pointers. */ +static void xfs_attr_save_rmt_blk( struct xfs_da_args *args) { @@ -899,10 +1186,13 @@ xfs_attr_save_rmt_blk( args->rmtblkno2 = args->rmtblkno; args->rmtblkcnt2 = args->rmtblkcnt; args->rmtvaluelen2 = args->rmtvaluelen; + args->rmtblkno = 0; + args->rmtblkcnt = 0; + args->rmtvaluelen = 0; } /* Set stored info about a remote block */ -STATIC void +static void xfs_attr_restore_rmt_blk( struct xfs_da_args *args) { @@ -928,45 +1218,54 @@ xfs_attr_leaf_try_add( struct xfs_da_args *args, struct xfs_buf *bp) { - int retval; + int error; /* - * Look up the given attribute in the leaf block. Figure out if - * the given flags produce an error or call for an atomic rename. + * If the caller provided a buffer to us, it is locked and held in + * the transaction because it just did a shortform to leaf conversion. + * Hence we don't need to read it again. Otherwise read in the leaf + * buffer. */ - retval = xfs_attr_leaf_hasname(args, &bp); - if (retval != -ENOATTR && retval != -EEXIST) - return retval; - if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) - goto out_brelse; - if (retval == -EEXIST) { - if (args->attr_flags & XATTR_CREATE) + if (bp) { + xfs_trans_bhold_release(args->trans, bp); + } else { + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); + if (error) + return error; + } + + /* + * Look up the xattr name to set the insertion point for the new xattr. 
+ */ + error = xfs_attr3_leaf_lookup_int(bp, args); + switch (error) { + case -ENOATTR: + if (args->op_flags & XFS_DA_OP_REPLACE) + goto out_brelse; + break; + case -EEXIST: + if (!(args->op_flags & XFS_DA_OP_REPLACE)) goto out_brelse; trace_xfs_attr_leaf_replace(args); - - /* save the attribute state for later removal*/ - args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ - xfs_attr_save_rmt_blk(args); - /* - * clear the remote attr state now that it is saved so that the - * values reflect the state of the attribute we are about to + * Save the existing remote attr state so that the current + * values reflect the state of the new attribute we are about to * add, not the attribute we just found and will remove later. */ - args->rmtblkno = 0; - args->rmtblkcnt = 0; - args->rmtvaluelen = 0; + xfs_attr_save_rmt_blk(args); + break; + case 0: + break; + default: + goto out_brelse; } - /* - * Add the attribute to the leaf block - */ return xfs_attr3_leaf_add(bp, args); out_brelse: xfs_trans_brelse(args->trans, bp); - return retval; + return error; } /* @@ -1012,9 +1311,10 @@ xfs_attr_leaf_removename( dp = args->dp; error = xfs_attr_leaf_hasname(args, &bp); - if (error == -ENOATTR) { xfs_trans_brelse(args->trans, bp); + if (args->op_flags & XFS_DA_OP_RECOVERY) + return 0; return error; } else if (error != -EEXIST) return error; @@ -1062,32 +1362,20 @@ xfs_attr_leaf_get(xfs_da_args_t *args) return error; } -/* - * Return EEXIST if attr is found, or ENOATTR if not - * statep: If not null is set to point at the found state. Caller will - * be responsible for freeing the state in this case. - */ +/* Return EEXIST if attr is found, or ENOATTR if not. */ STATIC int -xfs_attr_node_hasname( +xfs_attr_node_lookup( struct xfs_da_args *args, - struct xfs_da_state **statep) + struct xfs_da_state *state) { - struct xfs_da_state *state; int retval, error; - state = xfs_da_state_alloc(args); - if (statep != NULL) - *statep = state; - /* * Search to see if name exists, and get back a pointer to it. */ error = xfs_da3_node_lookup_int(state, &retval); if (error) - retval = error; - - if (!statep) - xfs_da_state_free(state); + return error; return retval; } @@ -1098,46 +1386,48 @@ xfs_attr_node_hasname( STATIC int xfs_attr_node_addname_find_attr( - struct xfs_delattr_context *dac) + struct xfs_attr_intent *attr) { - struct xfs_da_args *args = dac->da_args; - int retval; + struct xfs_da_args *args = attr->xattri_da_args; + int error; /* * Search to see if name already exists, and get back a pointer * to where it should go. 
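A note on xfs_attr_save_rmt_blk() above: it now both stashes the live remote-block description in the *2 fields and zeroes the live ones, so callers no longer have to clear them by hand. A standalone sketch of this save-and-clear swap; the field names merely mirror xfs_da_args, and restore_rmt_blk() is the obvious inverse (the kernel's version is not fully shown in this hunk):

#include <stdio.h>

struct rmt_state {
	unsigned long long rmtblkno, rmtblkno2;
	int rmtblkcnt, rmtblkcnt2;
	int rmtvaluelen, rmtvaluelen2;
};

static void save_rmt_blk(struct rmt_state *a)
{
	/* stash the attr we found; the live fields now describe the new attr */
	a->rmtblkno2 = a->rmtblkno;
	a->rmtblkcnt2 = a->rmtblkcnt;
	a->rmtvaluelen2 = a->rmtvaluelen;
	a->rmtblkno = 0;
	a->rmtblkcnt = 0;
	a->rmtvaluelen = 0;
}

static void restore_rmt_blk(struct rmt_state *a)
{
	a->rmtblkno = a->rmtblkno2;
	a->rmtblkcnt = a->rmtblkcnt2;
	a->rmtvaluelen = a->rmtvaluelen2;
}

int main(void)
{
	struct rmt_state a = { .rmtblkno = 42, .rmtblkcnt = 3, .rmtvaluelen = 8192 };

	save_rmt_blk(&a);	/* old attr stashed, live fields clean */
	printf("live=%llu saved=%llu\n", a.rmtblkno, a.rmtblkno2);
	restore_rmt_blk(&a);	/* e.g. on the removal leg of a replace */
	printf("live=%llu\n", a.rmtblkno);
	return 0;
}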
*/ - retval = xfs_attr_node_hasname(args, &dac->da_state); - if (retval != -ENOATTR && retval != -EEXIST) - goto error; - - if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) - goto error; - if (retval == -EEXIST) { - if (args->attr_flags & XATTR_CREATE) + xfs_attr_item_init_da_state(attr); + error = xfs_attr_node_lookup(args, attr->xattri_da_state); + switch (error) { + case -ENOATTR: + if (args->op_flags & XFS_DA_OP_REPLACE) + goto error; + break; + case -EEXIST: + if (!(args->op_flags & XFS_DA_OP_REPLACE)) goto error; - trace_xfs_attr_node_replace(args); - - /* save the attribute state for later removal*/ - args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ - xfs_attr_save_rmt_blk(args); + trace_xfs_attr_node_replace(args); /* - * clear the remote attr state now that it is saved so that the - * values reflect the state of the attribute we are about to + * Save the existing remote attr state so that the current + * values reflect the state of the new attribute we are about to * add, not the attribute we just found and will remove later. */ - args->rmtblkno = 0; - args->rmtblkcnt = 0; - args->rmtvaluelen = 0; + xfs_attr_save_rmt_blk(args); + break; + case 0: + break; + default: + goto error; } return 0; error: - if (dac->da_state) - xfs_da_state_free(dac->da_state); - return retval; + if (attr->xattri_da_state) { + xfs_da_state_free(attr->xattri_da_state); + attr->xattri_da_state = NULL; + } + return error; } /* @@ -1146,25 +1436,16 @@ error: * This will involve walking down the Btree, and may involve splitting * leaf nodes and even splitting intermediate nodes up to and including * the root node (a special case of an intermediate node). - * - * "Remote" attribute values confuse the issue and atomic rename operations - * add a whole extra layer of confusion on top of that. - * - * This routine is meant to function as a delayed operation, and may return - * -EAGAIN when the transaction needs to be rolled. Calling functions will need - * to handle this, and recall the function until a successful error code is - *returned. */ -STATIC int -xfs_attr_node_addname( - struct xfs_delattr_context *dac) +static int +xfs_attr_node_try_addname( + struct xfs_attr_intent *attr) { - struct xfs_da_args *args = dac->da_args; - struct xfs_da_state *state = dac->da_state; + struct xfs_da_state *state = attr->xattri_da_state; struct xfs_da_state_blk *blk; int error; - trace_xfs_attr_node_addname(args); + trace_xfs_attr_node_addname(state->args); blk = &state->path.blk[state->path.active-1]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); @@ -1175,25 +1456,9 @@ xfs_attr_node_addname( /* * Its really a single leaf node, but it had * out-of-line values so it looked like it *might* - * have been a b-tree. - */ - xfs_da_state_free(state); - state = NULL; - error = xfs_attr3_leaf_to_node(args); - if (error) - goto out; - - /* - * Now that we have converted the leaf to a node, we can - * roll the transaction, and try xfs_attr3_leaf_add - * again on re-entry. No need to set dela_state to do - * this. dela_state is still unset by this function at - * this point. + * have been a b-tree. Let the caller deal with this. */ - dac->flags |= XFS_DAC_DEFER_FINISH; - trace_xfs_attr_node_addname_return( - dac->dela_state, args->dp); - return -EAGAIN; + goto out; } /* @@ -1205,7 +1470,6 @@ xfs_attr_node_addname( error = xfs_da3_split(state); if (error) goto out; - dac->flags |= XFS_DAC_DEFER_FINISH; } else { /* * Addition succeeded, update Btree hashvals. 
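The renamed xfs_attr_node_try_addname() above no longer converts a leaf to a node itself; it returns to the state machine, which performs the conversion in its own transaction and re-enters the add state. A toy sketch of that try/convert/re-enter contract, with all names invented and no transactions modelled:

#include <errno.h>
#include <stdio.h>

struct tree { int is_node; int full; };

static int try_addname(struct tree *t)
{
	if (!t->is_node)
		return -ENOSPC;		/* caller must convert leaf to node */
	if (t->full)
		t->full = 0;		/* pretend a split made room */
	return 0;
}

static int add_state_machine(struct tree *t)
{
	int err = try_addname(t);

	if (err == -ENOSPC) {
		t->is_node = 1;		/* leaf-to-node conversion ... */
		/* ... a transaction roll would happen here ... */
		err = try_addname(t);	/* re-enter the add state */
	}
	return err;
}

int main(void)
{
	struct tree t = { .is_node = 0, .full = 1 };

	printf("%d\n", add_state_machine(&t));	/* 0 after convert + retry */
	return 0;
}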
@@ -1214,148 +1478,12 @@ xfs_attr_node_addname( } out: - if (state) - xfs_da_state_free(state); + xfs_da_state_free(state); + attr->xattri_da_state = NULL; return error; } - -STATIC int -xfs_attr_node_addname_clear_incomplete( - struct xfs_delattr_context *dac) -{ - struct xfs_da_args *args = dac->da_args; - struct xfs_da_state *state = NULL; - int retval = 0; - int error = 0; - - /* - * Re-find the "old" attribute entry after any split ops. The INCOMPLETE - * flag means that we will find the "old" attr, not the "new" one. - */ - args->attr_filter |= XFS_ATTR_INCOMPLETE; - state = xfs_da_state_alloc(args); - state->inleaf = 0; - error = xfs_da3_node_lookup_int(state, &retval); - if (error) - goto out; - - error = xfs_attr_node_removename(args, state); - - /* - * Check to see if the tree needs to be collapsed. - */ - if (retval && (state->path.active > 1)) { - error = xfs_da3_join(state); - if (error) - goto out; - } - retval = error = 0; - -out: - if (state) - xfs_da_state_free(state); - if (error) - return error; - return retval; -} - -/* - * Shrink an attribute from leaf to shortform - */ -STATIC int -xfs_attr_node_shrink( - struct xfs_da_args *args, - struct xfs_da_state *state) -{ - struct xfs_inode *dp = args->dp; - int error, forkoff; - struct xfs_buf *bp; - - /* - * Have to get rid of the copy of this dabuf in the state. - */ - ASSERT(state->path.active == 1); - ASSERT(state->path.blk[0].bp); - state->path.blk[0].bp = NULL; - - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); - if (error) - return error; - - forkoff = xfs_attr_shortform_allfit(bp, dp); - if (forkoff) { - error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); - /* bp is gone due to xfs_da_shrink_inode */ - } else - xfs_trans_brelse(args->trans, bp); - - return error; -} - -/* - * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers - * for later deletion of the entry. - */ -STATIC int -xfs_attr_leaf_mark_incomplete( - struct xfs_da_args *args, - struct xfs_da_state *state) -{ - int error; - - /* - * Fill in disk block numbers in the state structure - * so that we can get the buffers back after we commit - * several transactions in the following calls. - */ - error = xfs_attr_fillstate(state); - if (error) - return error; - - /* - * Mark the attribute as INCOMPLETE - */ - return xfs_attr3_leaf_setflag(args); -} - -/* - * Initial setup for xfs_attr_node_removename. Make sure the attr is there and - * the blocks are valid. Attr keys with remote blocks will be marked - * incomplete. - */ -STATIC -int xfs_attr_node_removename_setup( - struct xfs_delattr_context *dac) -{ - struct xfs_da_args *args = dac->da_args; - struct xfs_da_state **state = &dac->da_state; - int error; - - error = xfs_attr_node_hasname(args, state); - if (error != -EEXIST) - goto out; - error = 0; - - ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL); - ASSERT((*state)->path.blk[(*state)->path.active - 1].magic == - XFS_ATTR_LEAF_MAGIC); - - if (args->rmtblkno > 0) { - error = xfs_attr_leaf_mark_incomplete(args, *state); - if (error) - goto out; - - error = xfs_attr_rmtval_invalidate(args); - } -out: - if (error) - xfs_da_state_free(*state); - - return error; -} - -STATIC int +static int xfs_attr_node_removename( struct xfs_da_args *args, struct xfs_da_state *state) @@ -1374,246 +1502,42 @@ xfs_attr_node_removename( return retval; } -/* - * Remove the attribute specified in @args. 
- * - * This will involve walking down the Btree, and may involve joining - * leaf nodes and even joining intermediate nodes up to and including - * the root node (a special case of an intermediate node). - * - * This routine is meant to function as either an in-line or delayed operation, - * and may return -EAGAIN when the transaction needs to be rolled. Calling - * functions will need to handle this, and call the function until a - * successful error code is returned. - */ -int -xfs_attr_remove_iter( - struct xfs_delattr_context *dac) -{ - struct xfs_da_args *args = dac->da_args; - struct xfs_da_state *state = dac->da_state; - int retval, error = 0; - struct xfs_inode *dp = args->dp; - - trace_xfs_attr_node_removename(args); - - switch (dac->dela_state) { - case XFS_DAS_UNINIT: - if (!xfs_inode_hasattr(dp)) - return -ENOATTR; - - /* - * Shortform or leaf formats don't require transaction rolls and - * thus state transitions. Call the right helper and return. - */ - if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) - return xfs_attr_sf_removename(args); - - if (xfs_attr_is_leaf(dp)) - return xfs_attr_leaf_removename(args); - - /* - * Node format may require transaction rolls. Set up the - * state context and fall into the state machine. - */ - if (!dac->da_state) { - error = xfs_attr_node_removename_setup(dac); - if (error) - return error; - state = dac->da_state; - } - - fallthrough; - case XFS_DAS_RMTBLK: - dac->dela_state = XFS_DAS_RMTBLK; - - /* - * If there is an out-of-line value, de-allocate the blocks. - * This is done before we remove the attribute so that we don't - * overflow the maximum size of a transaction and/or hit a - * deadlock. - */ - if (args->rmtblkno > 0) { - /* - * May return -EAGAIN. Roll and repeat until all remote - * blocks are removed. - */ - error = xfs_attr_rmtval_remove(dac); - if (error == -EAGAIN) { - trace_xfs_attr_remove_iter_return( - dac->dela_state, args->dp); - return error; - } else if (error) { - goto out; - } - - /* - * Refill the state structure with buffers (the prior - * calls released our buffers) and close out this - * transaction before proceeding. - */ - ASSERT(args->rmtblkno == 0); - error = xfs_attr_refillstate(state); - if (error) - goto out; - dac->dela_state = XFS_DAS_RM_NAME; - dac->flags |= XFS_DAC_DEFER_FINISH; - trace_xfs_attr_remove_iter_return(dac->dela_state, args->dp); - return -EAGAIN; - } - - fallthrough; - case XFS_DAS_RM_NAME: - /* - * If we came here fresh from a transaction roll, reattach all - * the buffers to the current transaction. - */ - if (dac->dela_state == XFS_DAS_RM_NAME) { - error = xfs_attr_refillstate(state); - if (error) - goto out; - } - - retval = xfs_attr_node_removename(args, state); - - /* - * Check to see if the tree needs to be collapsed. If so, roll - * the transacton and fall into the shrink state. - */ - if (retval && (state->path.active > 1)) { - error = xfs_da3_join(state); - if (error) - goto out; - - dac->flags |= XFS_DAC_DEFER_FINISH; - dac->dela_state = XFS_DAS_RM_SHRINK; - trace_xfs_attr_remove_iter_return( - dac->dela_state, args->dp); - return -EAGAIN; - } - - fallthrough; - case XFS_DAS_RM_SHRINK: - /* - * If the result is small enough, push it all into the inode. - * This is our final state so it's safe to return a dirty - * transaction. 
- */ - if (xfs_attr_is_leaf(dp)) - error = xfs_attr_node_shrink(args, state); - ASSERT(error != -EAGAIN); - break; - default: - ASSERT(0); - error = -EINVAL; - goto out; - } -out: - if (state) - xfs_da_state_free(state); - return error; -} - -/* - * Fill in the disk block numbers in the state structure for the buffers - * that are attached to the state structure. - * This is done so that we can quickly reattach ourselves to those buffers - * after some set of transaction commits have released these buffers. - */ -STATIC int -xfs_attr_fillstate(xfs_da_state_t *state) +static int +xfs_attr_node_remove_attr( + struct xfs_attr_intent *attr) { - xfs_da_state_path_t *path; - xfs_da_state_blk_t *blk; - int level; - - trace_xfs_attr_fillstate(state->args); - - /* - * Roll down the "path" in the state structure, storing the on-disk - * block number for those buffers in the "path". - */ - path = &state->path; - ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); - for (blk = path->blk, level = 0; level < path->active; blk++, level++) { - if (blk->bp) { - blk->disk_blkno = xfs_buf_daddr(blk->bp); - blk->bp = NULL; - } else { - blk->disk_blkno = 0; - } - } + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_da_state *state = xfs_da_state_alloc(args); + int retval = 0; + int error = 0; /* - * Roll down the "altpath" in the state structure, storing the on-disk - * block number for those buffers in the "altpath". + * The attr we are removing has already been marked incomplete, so + * we need to set the filter appropriately to re-find the "old" + * attribute entry after any split ops. */ - path = &state->altpath; - ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); - for (blk = path->blk, level = 0; level < path->active; blk++, level++) { - if (blk->bp) { - blk->disk_blkno = xfs_buf_daddr(blk->bp); - blk->bp = NULL; - } else { - blk->disk_blkno = 0; - } - } - - return 0; -} - -/* - * Reattach the buffers to the state structure based on the disk block - * numbers stored in the state structure. - * This is done after some set of transaction commits have released those - * buffers from our grip. - */ -STATIC int -xfs_attr_refillstate(xfs_da_state_t *state) -{ - xfs_da_state_path_t *path; - xfs_da_state_blk_t *blk; - int level, error; - - trace_xfs_attr_refillstate(state->args); + args->attr_filter |= XFS_ATTR_INCOMPLETE; + error = xfs_da3_node_lookup_int(state, &retval); + if (error) + goto out; - /* - * Roll down the "path" in the state structure, storing the on-disk - * block number for those buffers in the "path". - */ - path = &state->path; - ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); - for (blk = path->blk, level = 0; level < path->active; blk++, level++) { - if (blk->disk_blkno) { - error = xfs_da3_node_read_mapped(state->args->trans, - state->args->dp, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK); - if (error) - return error; - } else { - blk->bp = NULL; - } - } + error = xfs_attr_node_removename(args, state); /* - * Roll down the "altpath" in the state structure, storing the on-disk - * block number for those buffers in the "altpath". + * Check to see if the tree needs to be collapsed. 
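xfs_attr_node_remove_attr() above relies on the entry having been flagged INCOMPLETE in an earlier step, then re-finds it with the INCOMPLETE filter so that node splits in between cannot lose track of it. A self-contained sketch of that two-phase delete over a toy attr table (the flag and lookup are stand-ins, not the dabtree code):

#include <stdio.h>
#include <string.h>

#define F_INCOMPLETE 0x1

struct entry { char name[16]; unsigned flags; int live; };

static struct entry *lookup(struct entry *tab, int n, const char *name,
			    unsigned filter)
{
	for (int i = 0; i < n; i++)
		if (tab[i].live && !strcmp(tab[i].name, name) &&
		    (tab[i].flags & F_INCOMPLETE) == (filter & F_INCOMPLETE))
			return &tab[i];
	return NULL;
}

int main(void)
{
	struct entry tab[2] = { { "user.a", 0, 1 }, { "user.b", 0, 1 } };
	struct entry *e;

	/* phase 1: mark incomplete (logged and committed in the kernel) */
	e = lookup(tab, 2, "user.a", 0);
	e->flags |= F_INCOMPLETE;

	/* phase 2: re-find with the INCOMPLETE filter, then remove */
	e = lookup(tab, 2, "user.a", F_INCOMPLETE);
	if (e)
		e->live = 0;
	printf("removed=%d\n", e && !e->live);
	return 0;
}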
*/ - path = &state->altpath; - ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); - for (blk = path->blk, level = 0; level < path->active; blk++, level++) { - if (blk->disk_blkno) { - error = xfs_da3_node_read_mapped(state->args->trans, - state->args->dp, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK); - if (error) - return error; - } else { - blk->bp = NULL; - } + if (retval && (state->path.active > 1)) { + error = xfs_da3_join(state); + if (error) + goto out; } + retval = error = 0; - return 0; +out: + xfs_da_state_free(state); + if (error) + return error; + return retval; } /* @@ -1639,7 +1563,8 @@ xfs_attr_node_get( /* * Search to see if name exists, and get back a pointer to it. */ - error = xfs_attr_node_hasname(args, &state); + state = xfs_da_state_alloc(args); + error = xfs_attr_node_lookup(args, state); if (error != -EEXIST) goto out_release; @@ -1658,8 +1583,7 @@ out_release: state->path.blk[i].bp = NULL; } - if (state) - xfs_da_state_free(state); + xfs_da_state_free(state); return error; } @@ -1679,3 +1603,20 @@ xfs_attr_namecheck( /* There shouldn't be any nulls here */ return !memchr(name, 0, length); } + +int __init +xfs_attr_intent_init_cache(void) +{ + xfs_attr_intent_cache = kmem_cache_create("xfs_attr_intent", + sizeof(struct xfs_attr_intent), + 0, 0, NULL); + + return xfs_attr_intent_cache != NULL ? 0 : -ENOMEM; +} + +void +xfs_attr_intent_destroy_cache(void) +{ + kmem_cache_destroy(xfs_attr_intent_cache); + xfs_attr_intent_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 5e71f719bdd5..b4a2fc77017e 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -425,7 +425,7 @@ struct xfs_attr_list_context { */ /* - * Enum values for xfs_delattr_context.da_state + * Enum values for xfs_attr_intent.xattri_da_state * * These values are used by delayed attribute operations to keep track of where * they were before they returned -EAGAIN. A return code of -EAGAIN signals the @@ -434,46 +434,107 @@ struct xfs_attr_list_context { * to where it was and resume executing where it left off. */ enum xfs_delattr_state { - XFS_DAS_UNINIT = 0, /* No state has been set yet */ - XFS_DAS_RMTBLK, /* Removing remote blks */ - XFS_DAS_RM_NAME, /* Remove attr name */ - XFS_DAS_RM_SHRINK, /* We are shrinking the tree */ - XFS_DAS_FOUND_LBLK, /* We found leaf blk for attr */ - XFS_DAS_FOUND_NBLK, /* We found node blk for attr */ - XFS_DAS_FLIP_LFLAG, /* Flipped leaf INCOMPLETE attr flag */ - XFS_DAS_RM_LBLK, /* A rename is removing leaf blocks */ - XFS_DAS_RD_LEAF, /* Read in the new leaf */ - XFS_DAS_ALLOC_NODE, /* We are allocating node blocks */ - XFS_DAS_FLIP_NFLAG, /* Flipped node INCOMPLETE attr flag */ - XFS_DAS_RM_NBLK, /* A rename is removing node blocks */ - XFS_DAS_CLR_FLAG, /* Clear incomplete flag */ + XFS_DAS_UNINIT = 0, /* No state has been set yet */ + + /* + * Initial sequence states. The replace setup code relies on the + * ADD and REMOVE states for a specific format to be sequential so + * that we can transform the initial operation to be performed + * according to the xfs_has_larp() state easily. 
+ */ + XFS_DAS_SF_ADD, /* Initial sf add state */ + XFS_DAS_SF_REMOVE, /* Initial sf replace/remove state */ + + XFS_DAS_LEAF_ADD, /* Initial leaf add state */ + XFS_DAS_LEAF_REMOVE, /* Initial leaf replace/remove state */ + + XFS_DAS_NODE_ADD, /* Initial node add state */ + XFS_DAS_NODE_REMOVE, /* Initial node replace/remove state */ + + /* Leaf state set/replace/remove sequence */ + XFS_DAS_LEAF_SET_RMT, /* set a remote xattr from a leaf */ + XFS_DAS_LEAF_ALLOC_RMT, /* We are allocating remote blocks */ + XFS_DAS_LEAF_REPLACE, /* Perform replace ops on a leaf */ + XFS_DAS_LEAF_REMOVE_OLD, /* Start removing old attr from leaf */ + XFS_DAS_LEAF_REMOVE_RMT, /* A rename is removing remote blocks */ + XFS_DAS_LEAF_REMOVE_ATTR, /* Remove the old attr from a leaf */ + + /* Node state sequence, must match leaf state above */ + XFS_DAS_NODE_SET_RMT, /* set a remote xattr from a node */ + XFS_DAS_NODE_ALLOC_RMT, /* We are allocating remote blocks */ + XFS_DAS_NODE_REPLACE, /* Perform replace ops on a node */ + XFS_DAS_NODE_REMOVE_OLD, /* Start removing old attr from node */ + XFS_DAS_NODE_REMOVE_RMT, /* A rename is removing remote blocks */ + XFS_DAS_NODE_REMOVE_ATTR, /* Remove the old attr from a node */ + + XFS_DAS_DONE, /* finished operation */ }; -/* - * Defines for xfs_delattr_context.flags - */ -#define XFS_DAC_DEFER_FINISH 0x01 /* finish the transaction */ -#define XFS_DAC_LEAF_ADDNAME_INIT 0x02 /* xfs_attr_leaf_addname init*/ +#define XFS_DAS_STRINGS \ + { XFS_DAS_UNINIT, "XFS_DAS_UNINIT" }, \ + { XFS_DAS_SF_ADD, "XFS_DAS_SF_ADD" }, \ + { XFS_DAS_SF_REMOVE, "XFS_DAS_SF_REMOVE" }, \ + { XFS_DAS_LEAF_ADD, "XFS_DAS_LEAF_ADD" }, \ + { XFS_DAS_LEAF_REMOVE, "XFS_DAS_LEAF_REMOVE" }, \ + { XFS_DAS_NODE_ADD, "XFS_DAS_NODE_ADD" }, \ + { XFS_DAS_NODE_REMOVE, "XFS_DAS_NODE_REMOVE" }, \ + { XFS_DAS_LEAF_SET_RMT, "XFS_DAS_LEAF_SET_RMT" }, \ + { XFS_DAS_LEAF_ALLOC_RMT, "XFS_DAS_LEAF_ALLOC_RMT" }, \ + { XFS_DAS_LEAF_REPLACE, "XFS_DAS_LEAF_REPLACE" }, \ + { XFS_DAS_LEAF_REMOVE_OLD, "XFS_DAS_LEAF_REMOVE_OLD" }, \ + { XFS_DAS_LEAF_REMOVE_RMT, "XFS_DAS_LEAF_REMOVE_RMT" }, \ + { XFS_DAS_LEAF_REMOVE_ATTR, "XFS_DAS_LEAF_REMOVE_ATTR" }, \ + { XFS_DAS_NODE_SET_RMT, "XFS_DAS_NODE_SET_RMT" }, \ + { XFS_DAS_NODE_ALLOC_RMT, "XFS_DAS_NODE_ALLOC_RMT" }, \ + { XFS_DAS_NODE_REPLACE, "XFS_DAS_NODE_REPLACE" }, \ + { XFS_DAS_NODE_REMOVE_OLD, "XFS_DAS_NODE_REMOVE_OLD" }, \ + { XFS_DAS_NODE_REMOVE_RMT, "XFS_DAS_NODE_REMOVE_RMT" }, \ + { XFS_DAS_NODE_REMOVE_ATTR, "XFS_DAS_NODE_REMOVE_ATTR" }, \ + { XFS_DAS_DONE, "XFS_DAS_DONE" } + +struct xfs_attri_log_nameval; /* * Context used for keeping track of delayed attribute operations */ -struct xfs_delattr_context { - struct xfs_da_args *da_args; - - /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */ - struct xfs_bmbt_irec map; - xfs_dablk_t lblkno; - int blkcnt; +struct xfs_attr_intent { + /* + * used to log this item to an intent containing a list of attrs to + * commit later + */ + struct list_head xattri_list; /* Used in xfs_attr_node_removename to roll through removing blocks */ - struct xfs_da_state *da_state; + struct xfs_da_state *xattri_da_state; + + struct xfs_da_args *xattri_da_args; + + /* + * Shared buffer containing the attr name and value so that the logging + * code can share large memory buffers between log items. 
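The XFS_DAS_STRINGS table above pairs each state value with a printable name; the kernel feeds tables like this to __print_symbolic() in its tracepoints. A userspace sketch of expanding such a { value, "name" } macro into a lookup helper (a cut-down enum, not the real state list):

#include <stdio.h>

enum das { DAS_UNINIT = 0, DAS_SF_ADD, DAS_SF_REMOVE, DAS_DONE };

#define DAS_STRINGS \
	{ DAS_UNINIT,	"DAS_UNINIT" }, \
	{ DAS_SF_ADD,	"DAS_SF_ADD" }, \
	{ DAS_SF_REMOVE, "DAS_SF_REMOVE" }, \
	{ DAS_DONE,	"DAS_DONE" }

static const struct { int val; const char *name; } das_names[] = {
	DAS_STRINGS
};

static const char *das_name(int state)
{
	for (unsigned i = 0; i < sizeof(das_names) / sizeof(das_names[0]); i++)
		if (das_names[i].val == state)
			return das_names[i].name;
	return "?";
}

int main(void)
{
	printf("%s -> %s\n", das_name(DAS_SF_ADD), das_name(DAS_DONE));
	return 0;
}

Keeping the enum and the string table in one macro means a new state only has to be added in one place to show up in trace output.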
+ */ + struct xfs_attri_log_nameval *xattri_nameval; + + /* + * Used by xfs_attr_set to hold a leaf buffer across a transaction roll + */ + struct xfs_buf *xattri_leaf_bp; /* Used to keep track of current state of delayed operation */ - unsigned int flags; - enum xfs_delattr_state dela_state; + enum xfs_delattr_state xattri_dela_state; + + /* + * Attr operation being performed - XFS_ATTRI_OP_FLAGS_* + */ + unsigned int xattri_op_flags; + + /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */ + xfs_dablk_t xattri_lblkno; + int xattri_blkcnt; + struct xfs_bmbt_irec xattri_map; }; + /*======================================================================== * Function prototypes for the kernel. *========================================================================*/ @@ -489,11 +550,77 @@ bool xfs_attr_is_leaf(struct xfs_inode *ip); int xfs_attr_get_ilocked(struct xfs_da_args *args); int xfs_attr_get(struct xfs_da_args *args); int xfs_attr_set(struct xfs_da_args *args); -int xfs_attr_set_args(struct xfs_da_args *args); -int xfs_attr_remove_args(struct xfs_da_args *args); -int xfs_attr_remove_iter(struct xfs_delattr_context *dac); +int xfs_attr_set_iter(struct xfs_attr_intent *attr); +int xfs_attr_remove_iter(struct xfs_attr_intent *attr); bool xfs_attr_namecheck(const void *name, size_t length); -void xfs_delattr_context_init(struct xfs_delattr_context *dac, - struct xfs_da_args *args); +int xfs_attr_calc_size(struct xfs_da_args *args, int *local); +void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres, + unsigned int *total); + +/* + * Check to see if the attr should be upgraded from non-existent or shortform to a + * single-leaf-block attribute list. + */ +static inline bool +xfs_attr_is_shortform( + struct xfs_inode *ip) +{ + return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL || + (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && + ip->i_afp->if_nextents == 0); +} + +static inline enum xfs_delattr_state +xfs_attr_init_add_state(struct xfs_da_args *args) +{ + /* + * When called from the completion of an attr remove to determine the + * next state, the attribute fork may be null. This can only occur + * on a pure remove, but we grab the next state before we check if a + * replace operation is being performed. If we are called from any other + * context, i_afp is guaranteed to exist. Hence if the attr fork is + * null, we were called from a pure remove operation and so we are done. + */ + if (!args->dp->i_afp) + return XFS_DAS_DONE; + + args->op_flags |= XFS_DA_OP_ADDNAME; + if (xfs_attr_is_shortform(args->dp)) + return XFS_DAS_SF_ADD; + if (xfs_attr_is_leaf(args->dp)) + return XFS_DAS_LEAF_ADD; + return XFS_DAS_NODE_ADD; +} + +static inline enum xfs_delattr_state +xfs_attr_init_remove_state(struct xfs_da_args *args) +{ + args->op_flags |= XFS_DA_OP_REMOVE; + if (xfs_attr_is_shortform(args->dp)) + return XFS_DAS_SF_REMOVE; + if (xfs_attr_is_leaf(args->dp)) + return XFS_DAS_LEAF_REMOVE; + return XFS_DAS_NODE_REMOVE; +} + +/* + * If we are logging the attributes, then we have to start with removal of the + * old attribute so that there is always consistent state that we can recover + * from if the system goes down part way through. We always log the new attr + * value, so even when we remove the attr first we still have the information in + * the log to finish the replace operation atomically.
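Putting the three init helpers above together, a compressed sketch of the initial-state selection, with a plain boolean standing in for xfs_has_larp() and an enum standing in for the fork-format checks:

#include <stdbool.h>
#include <stdio.h>

enum fmt { FMT_SF, FMT_LEAF, FMT_NODE };
enum state { SF_ADD, SF_REMOVE, LEAF_ADD, LEAF_REMOVE, NODE_ADD, NODE_REMOVE };

static enum state init_add_state(enum fmt f)
{
	return f == FMT_SF ? SF_ADD : f == FMT_LEAF ? LEAF_ADD : NODE_ADD;
}

static enum state init_remove_state(enum fmt f)
{
	return f == FMT_SF ? SF_REMOVE : f == FMT_LEAF ? LEAF_REMOVE : NODE_REMOVE;
}

/*
 * A logged (LARP) replace begins with removal of the old attr so that
 * recovery always sees consistent state; an unlogged replace begins
 * with the add and uses the INCOMPLETE flag trick instead.
 */
static enum state init_replace_state(enum fmt f, bool logged)
{
	return logged ? init_remove_state(f) : init_add_state(f);
}

int main(void)
{
	printf("%d %d\n", init_replace_state(FMT_LEAF, true),
			  init_replace_state(FMT_LEAF, false));
	return 0;
}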
+ */ +static inline enum xfs_delattr_state +xfs_attr_init_replace_state(struct xfs_da_args *args) +{ + args->op_flags |= XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE; + if (args->op_flags & XFS_DA_OP_LOGGED) + return xfs_attr_init_remove_state(args); + return xfs_attr_init_add_state(args); +} + +extern struct kmem_cache *xfs_attr_intent_cache; +int __init xfs_attr_intent_init_cache(void); +void xfs_attr_intent_destroy_cache(void); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 014daa8c542d..37e7c33f6283 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -28,6 +28,7 @@ #include "xfs_dir2.h" #include "xfs_log.h" #include "xfs_ag.h" +#include "xfs_errortag.h" /* @@ -310,6 +311,15 @@ xfs_attr3_leaf_verify( return fa; /* + * Empty leaf blocks should never occur; they imply the existence of a + * software bug that needs fixing. xfs_repair also flags them as a + * corruption that needs fixing, so we should never let these go to + * disk. + */ + if (ichdr.count == 0) + return __this_address; + + /* * firstused is the block offset of the first name info structure. * Make sure it doesn't go off the block or crash into the header. */ @@ -445,6 +455,14 @@ xfs_attr3_leaf_read( * Namespace helper routines *========================================================================*/ +/* + * If we are in log recovery, then we want the lookup to ignore the INCOMPLETE + * flag on disk - if there's an incomplete attr then recovery needs to tear it + * down. If there's no incomplete attr, then recovery needs to tear that attr + * down to replace it with the attr that has been logged. In this case, the + * INCOMPLETE flag will not be set in attr->attr_filter, but rather + * XFS_DA_OP_RECOVERY will be set in args->op_flags. + */ static bool xfs_attr_match( struct xfs_da_args *args, @@ -452,14 +470,18 @@ xfs_attr_match( unsigned char *name, int flags) { + if (args->namelen != namelen) return false; if (memcmp(args->name, name, namelen) != 0) return false; - /* - * If we are looking for incomplete entries, show only those, else only - * show complete entries. - */ + + /* Recovery ignores the INCOMPLETE flag. */ + if ((args->op_flags & XFS_DA_OP_RECOVERY) && + args->attr_filter == (flags & XFS_ATTR_NSP_ONDISK_MASK)) + return true; + + /* All remaining matches need to be filtered by INCOMPLETE state. */ if (args->attr_filter != (flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE))) return false; @@ -798,6 +820,14 @@ xfs_attr_sf_removename( sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; error = xfs_attr_sf_findname(args, &sfe, &base); + + /* + * If we are recovering an operation, finding nothing to + * remove is not an error - it just means there was nothing + * to clean up. 
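The recovery special case above makes removal idempotent: during log replay, finding nothing to remove counts as success rather than an error. A toy illustration of that convention (OP_RECOVERY and the helpers are invented stand-ins):

#include <errno.h>
#include <stdio.h>

#ifndef ENOATTR
#define ENOATTR ENODATA		/* common userspace spelling */
#endif

#define OP_RECOVERY 0x1

static int do_lookup(int present)
{
	return present ? 0 : -ENOATTR;
}

static int remove_attr(int present, unsigned op_flags)
{
	int error = do_lookup(present);

	/* replay may find the remove already done: that's fine */
	if (error == -ENOATTR && (op_flags & OP_RECOVERY))
		return 0;
	if (error)
		return error;
	/* ... actually remove the entry here ... */
	return 0;
}

int main(void)
{
	printf("%d %d\n", remove_attr(0, 0), remove_attr(0, OP_RECOVERY));
	return 0;
}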
+ */ + if (error == -ENOATTR && (args->op_flags & XFS_DA_OP_RECOVERY)) + return 0; if (error != -EEXIST) return error; size = xfs_attr_sf_entsize(sfe); @@ -818,7 +848,7 @@ totsize -= size; if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && - !(args->op_flags & XFS_DA_OP_ADDNAME)) { + !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) { xfs_attr_fork_remove(dp, args->trans); } else { xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); @@ -1127,9 +1157,17 @@ xfs_attr3_leaf_to_shortform( goto out; if (forkoff == -1) { - ASSERT(xfs_has_attr2(dp->i_mount)); - ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE); - xfs_attr_fork_remove(dp, args->trans); + /* + * Don't remove the attr fork if this operation is the first + * part of an attr replace operation. We're going to add a new + * attr immediately, so we need to keep the attr fork around in + * this case. + */ + if (!(args->op_flags & XFS_DA_OP_REPLACE)) { + ASSERT(xfs_has_attr2(dp->i_mount)); + ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE); + xfs_attr_fork_remove(dp, args->trans); + } goto out; } @@ -1189,6 +1227,11 @@ xfs_attr3_leaf_to_node( trace_xfs_attr_leaf_to_node(args); + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) { + error = -EIO; + goto out; + } + error = xfs_da_grow_inode(args, &blkno); if (error) goto out; @@ -1486,8 +1529,9 @@ xfs_attr3_leaf_add_work( entry->flags = args->attr_filter; if (tmp) entry->flags |= XFS_ATTR_LOCAL; - if (args->op_flags & XFS_DA_OP_RENAME) { - entry->flags |= XFS_ATTR_INCOMPLETE; + if (args->op_flags & XFS_DA_OP_REPLACE) { + if (!(args->op_flags & XFS_DA_OP_LOGGED)) + entry->flags |= XFS_ATTR_INCOMPLETE; if ((args->blkno2 == args->blkno) && (args->index2 <= args->index)) { args->index2++; diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 83b95be9ded8..7298c148f848 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -568,14 +568,14 @@ xfs_attr_rmtval_stale( */ int xfs_attr_rmtval_find_space( - struct xfs_delattr_context *dac) + struct xfs_attr_intent *attr) { - struct xfs_da_args *args = dac->da_args; - struct xfs_bmbt_irec *map = &dac->map; + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_bmbt_irec *map = &attr->xattri_map; int error; - dac->lblkno = 0; - dac->blkcnt = 0; + attr->xattri_lblkno = 0; + attr->xattri_blkcnt = 0; args->rmtblkcnt = 0; args->rmtblkno = 0; memset(map, 0, sizeof(struct xfs_bmbt_irec)); @@ -584,8 +584,8 @@ xfs_attr_rmtval_find_space( if (error) return error; - dac->blkcnt = args->rmtblkcnt; - dac->lblkno = args->rmtblkno; + attr->xattri_blkcnt = args->rmtblkcnt; + attr->xattri_lblkno = args->rmtblkno; return 0; } @@ -598,17 +598,18 @@ xfs_attr_rmtval_find_space( */ int xfs_attr_rmtval_set_blk( - struct xfs_delattr_context *dac) + struct xfs_attr_intent *attr) { - struct xfs_da_args *args = dac->da_args; + struct xfs_da_args *args = attr->xattri_da_args; struct xfs_inode *dp = args->dp; - struct xfs_bmbt_irec *map = &dac->map; + struct xfs_bmbt_irec *map = &attr->xattri_map; int nmap; int error; nmap = 1; - error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)dac->lblkno, - dac->blkcnt, XFS_BMAPI_ATTRFORK, args->total, + error = xfs_bmapi_write(args->trans, dp, + (xfs_fileoff_t)attr->xattri_lblkno, + attr->xattri_blkcnt, XFS_BMAPI_ATTRFORK, args->total, map, &nmap); if (error) return error; @@ -618,8 +619,8 @@ xfs_attr_rmtval_set_blk( (map->br_startblock != HOLESTARTBLOCK)); /* roll
attribute extent map forwards */ - dac->lblkno += map->br_blockcount; - dac->blkcnt -= map->br_blockcount; + attr->xattri_lblkno += map->br_blockcount; + attr->xattri_blkcnt -= map->br_blockcount; return 0; } @@ -673,9 +674,9 @@ xfs_attr_rmtval_invalidate( */ int xfs_attr_rmtval_remove( - struct xfs_delattr_context *dac) + struct xfs_attr_intent *attr) { - struct xfs_da_args *args = dac->da_args; + struct xfs_da_args *args = attr->xattri_da_args; int error, done; /* @@ -695,8 +696,8 @@ xfs_attr_rmtval_remove( * the parent */ if (!done) { - dac->flags |= XFS_DAC_DEFER_FINISH; - trace_xfs_attr_rmtval_remove_return(dac->dela_state, args->dp); + trace_xfs_attr_rmtval_remove_return(attr->xattri_dela_state, + args->dp); return -EAGAIN; } diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index d72eff30ca18..d097ec6c4dc3 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -12,9 +12,9 @@ int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, xfs_buf_flags_t incore_flags); int xfs_attr_rmtval_invalidate(struct xfs_da_args *args); -int xfs_attr_rmtval_remove(struct xfs_delattr_context *dac); +int xfs_attr_rmtval_remove(struct xfs_attr_intent *attr); int xfs_attr_rmt_find_hole(struct xfs_da_args *args); int xfs_attr_rmtval_set_value(struct xfs_da_args *args); -int xfs_attr_rmtval_set_blk(struct xfs_delattr_context *dac); -int xfs_attr_rmtval_find_space(struct xfs_delattr_context *dac); +int xfs_attr_rmtval_set_blk(struct xfs_attr_intent *attr); +int xfs_attr_rmtval_find_space(struct xfs_attr_intent *attr); #endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 74198dd82b03..6833110d1bd4 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -52,19 +52,17 @@ xfs_bmap_compute_maxlevels( xfs_mount_t *mp, /* file system mount structure */ int whichfork) /* data or attr fork */ { + uint64_t maxblocks; /* max blocks at this level */ + xfs_extnum_t maxleafents; /* max leaf entries possible */ int level; /* btree level */ - uint maxblocks; /* max blocks at this level */ - uint maxleafents; /* max leaf entries possible */ int maxrootrecs; /* max records in root block */ int minleafrecs; /* min records in leaf block */ int minnoderecs; /* min records in node block */ int sz; /* root block size */ /* - * The maximum number of extents in a file, hence the maximum number of - * leaf entries, is controlled by the size of the on-disk extent count, - * either a signed 32-bit number for the data fork, or a signed 16-bit - * number for the attr fork. + * The maximum number of extents in a fork, hence the maximum number of + * leaf entries, is controlled by the size of the on-disk extent count. * * Note that we can no longer assume that if we are in ATTR1 that the * fork offset of all the inodes will be @@ -74,22 +72,22 @@ xfs_bmap_compute_maxlevels( * ATTR2 we have to assume the worst case scenario of a minimum size * available. 
*/ - if (whichfork == XFS_DATA_FORK) { - maxleafents = MAXEXTNUM; + maxleafents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp), + whichfork); + if (whichfork == XFS_DATA_FORK) sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); - } else { - maxleafents = MAXAEXTNUM; + else sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); - } + maxrootrecs = xfs_bmdr_maxrecs(sz, 0); minleafrecs = mp->m_bmap_dmnr[0]; minnoderecs = mp->m_bmap_dmnr[1]; - maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + maxblocks = howmany_64(maxleafents, minleafrecs); for (level = 1; maxblocks > 1; level++) { if (maxblocks <= maxrootrecs) maxblocks = 1; else - maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + maxblocks = howmany_64(maxblocks, minnoderecs); } mp->m_bm_maxlevels[whichfork] = level; ASSERT(mp->m_bm_maxlevels[whichfork] <= xfs_bmbt_maxlevels_ondisk()); @@ -468,7 +466,7 @@ error0: if (bp_release) xfs_trans_brelse(NULL, bp); error_norelse: - xfs_warn(mp, "%s: BAD after btree leaves for %d extents", + xfs_warn(mp, "%s: BAD after btree leaves for %llu extents", __func__, i); xfs_err(mp, "%s: CORRUPTED BTREE OR SOMETHING", __func__); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); @@ -485,7 +483,7 @@ STATIC void xfs_bmap_validate_ret( xfs_fileoff_t bno, xfs_filblks_t len, - int flags, + uint32_t flags, xfs_bmbt_irec_t *mval, int nmap, int ret_nmap) @@ -1399,7 +1397,7 @@ xfs_bmap_add_extent_delay_real( xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); xfs_filblks_t da_new; /* new count del alloc blocks used */ xfs_filblks_t da_old; /* old count del alloc blocks used */ xfs_filblks_t temp=0; /* value for da_new calculations */ @@ -1452,7 +1450,7 @@ xfs_bmap_add_extent_delay_real( LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && LEFT.br_state == new->br_state && - LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) + LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; /* @@ -1470,13 +1468,13 @@ xfs_bmap_add_extent_delay_real( new_endoff == RIGHT.br_startoff && new->br_startblock + new->br_blockcount == RIGHT.br_startblock && new->br_state == RIGHT.br_state && - new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN && ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) != (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING) || LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount - <= MAXEXTLEN)) + <= XFS_MAX_BMBT_EXTLEN)) state |= BMAP_RIGHT_CONTIG; error = 0; @@ -1950,7 +1948,7 @@ xfs_bmap_add_extent_unwritten_real( xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_mount *mp = ip->i_mount; struct xfs_bmbt_irec old; @@ -2000,7 +1998,7 @@ xfs_bmap_add_extent_unwritten_real( LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && LEFT.br_state == new->br_state && - LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) + LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; /* @@ -2018,13 +2016,13 @@ 
xfs_bmap_add_extent_unwritten_real( new_endoff == RIGHT.br_startoff && new->br_startblock + new->br_blockcount == RIGHT.br_startblock && new->br_state == RIGHT.br_state && - new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN && ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) != (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING) || LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount - <= MAXEXTLEN)) + <= XFS_MAX_BMBT_EXTLEN)) state |= BMAP_RIGHT_CONTIG; /* @@ -2479,7 +2477,7 @@ xfs_bmap_add_extent_hole_delay( xfs_filblks_t newlen=0; /* new indirect size */ xfs_filblks_t oldlen=0; /* old indirect size */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); xfs_filblks_t temp; /* temp for indirect calculations */ ifp = XFS_IFORK_PTR(ip, whichfork); @@ -2510,15 +2508,15 @@ xfs_bmap_add_extent_hole_delay( */ if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && left.br_startoff + left.br_blockcount == new->br_startoff && - left.br_blockcount + new->br_blockcount <= MAXEXTLEN) + left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && new->br_startoff + new->br_blockcount == right.br_startoff && - new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && (!(state & BMAP_LEFT_CONTIG) || (left.br_blockcount + new->br_blockcount + - right.br_blockcount <= MAXEXTLEN))) + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))) state |= BMAP_RIGHT_CONTIG; /* @@ -2616,7 +2614,7 @@ xfs_bmap_add_extent_hole_real( struct xfs_btree_cur **curp, struct xfs_bmbt_irec *new, int *logflagsp, - int flags) + uint32_t flags) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_mount *mp = ip->i_mount; @@ -2626,7 +2624,7 @@ xfs_bmap_add_extent_hole_real( xfs_bmbt_irec_t left; /* left neighbor extent entry */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ int rval=0; /* return value (logging flags) */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_bmbt_irec old; ASSERT(!isnullstartblock(new->br_startblock)); @@ -2661,17 +2659,17 @@ xfs_bmap_add_extent_hole_real( left.br_startoff + left.br_blockcount == new->br_startoff && left.br_startblock + left.br_blockcount == new->br_startblock && left.br_state == new->br_state && - left.br_blockcount + new->br_blockcount <= MAXEXTLEN) + left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && new->br_startoff + new->br_blockcount == right.br_startoff && new->br_startblock + new->br_blockcount == right.br_startblock && new->br_state == right.br_state && - new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && (!(state & BMAP_LEFT_CONTIG) || left.br_blockcount + new->br_blockcount + - right.br_blockcount <= MAXEXTLEN)) + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)) state |= BMAP_RIGHT_CONTIG; error = 0; @@ -2906,15 +2904,15 @@ xfs_bmap_extsize_align( /* * For large extent hint sizes, the aligned extent might be larger than - * MAXEXTLEN. In that case, reduce the size by an extsz so that it pulls - * the length back under MAXEXTLEN. 
The outer allocation loops handle - * short allocation just fine, so it is safe to do this. We only want to - * do it when we are forced to, though, because it means more allocation - * operations are required. + * XFS_BMBT_MAX_EXTLEN. In that case, reduce the size by an extsz so + * that it pulls the length back under XFS_BMBT_MAX_EXTLEN. The outer + * allocation loops handle short allocation just fine, so it is safe to + * do this. We only want to do it when we are forced to, though, because + * it means more allocation operations are required. */ - while (align_alen > MAXEXTLEN) + while (align_alen > XFS_MAX_BMBT_EXTLEN) align_alen -= extsz; - ASSERT(align_alen <= MAXEXTLEN); + ASSERT(align_alen <= XFS_MAX_BMBT_EXTLEN); /* * If the previous block overlaps with this proposed allocation @@ -3004,9 +3002,9 @@ xfs_bmap_extsize_align( return -EINVAL; } else { ASSERT(orig_off >= align_off); - /* see MAXEXTLEN handling above */ + /* see XFS_BMBT_MAX_EXTLEN handling above */ ASSERT(orig_end <= align_off + align_alen || - align_alen + extsz > MAXEXTLEN); + align_alen + extsz > XFS_MAX_BMBT_EXTLEN); } #ifdef DEBUG @@ -3766,7 +3764,7 @@ xfs_bmapi_trim_map( xfs_fileoff_t obno, xfs_fileoff_t end, int n, - int flags) + uint32_t flags) { if ((flags & XFS_BMAPI_ENTIRE) || got->br_startoff + got->br_blockcount <= obno) { @@ -3811,7 +3809,7 @@ xfs_bmapi_update_map( xfs_fileoff_t obno, xfs_fileoff_t end, int *n, - int flags) + uint32_t flags) { xfs_bmbt_irec_t *mval = *map; @@ -3864,7 +3862,7 @@ xfs_bmapi_read( xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, - int flags) + uint32_t flags) { struct xfs_mount *mp = ip->i_mount; int whichfork = xfs_bmapi_whichfork(flags); @@ -3971,7 +3969,7 @@ xfs_bmapi_reserve_delalloc( * Cap the alloc length. Keep track of prealloc so we know whether to * tag the inode before we return. */ - alen = XFS_FILBLKS_MIN(len + prealloc, MAXEXTLEN); + alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); if (!eof) alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); if (prealloc && alen >= len) @@ -4104,7 +4102,7 @@ xfs_bmapi_allocate( if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev)) bma->prev.br_startoff = NULLFILEOFF; } else { - bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN); + bma->length = XFS_FILBLKS_MIN(bma->length, XFS_MAX_BMBT_EXTLEN); if (!bma->eof) bma->length = XFS_FILBLKS_MIN(bma->length, bma->got.br_startoff - bma->offset); @@ -4184,7 +4182,7 @@ xfs_bmapi_convert_unwritten( struct xfs_bmalloca *bma, struct xfs_bmbt_irec *mval, xfs_filblks_t len, - int flags) + uint32_t flags) { int whichfork = xfs_bmapi_whichfork(flags); struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); @@ -4312,7 +4310,7 @@ xfs_bmapi_write( struct xfs_inode *ip, /* incore inode */ xfs_fileoff_t bno, /* starting file offs. mapped */ xfs_filblks_t len, /* length to map in file */ - int flags, /* XFS_BMAPI_... */ + uint32_t flags, /* XFS_BMAPI_... */ xfs_extlen_t total, /* total blocks needed */ struct xfs_bmbt_irec *mval, /* output: map values */ int *nmap) /* i/o: mval size/count */ @@ -4424,8 +4422,8 @@ xfs_bmapi_write( * xfs_extlen_t and therefore 32 bits. Hence we have to * check for 32-bit overflows and handle them here. 
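The clamp loop above in miniature: if extent-size alignment pushed the length over the per-record limit, it is walked back down in whole extsz units so the result stays aligned. A standalone sketch with made-up sizes; the real limit is XFS_MAX_BMBT_EXTLEN (2^21 - 1 blocks, the old MAXEXTLEN):

#include <assert.h>
#include <stdio.h>

#define MAX_EXTLEN 2097151ULL	/* 2^21 - 1 blocks, as in the BMBT record */

static unsigned long long clamp_aligned_len(unsigned long long alen,
					    unsigned long long extsz)
{
	/* shrink by whole extsz units so alignment is preserved */
	while (alen > MAX_EXTLEN)
		alen -= extsz;
	return alen;
}

int main(void)
{
	/* 16 GiB worth of 4k blocks, 1 MiB (256 block) alignment hint */
	unsigned long long alen = clamp_aligned_len(4194304ULL, 256);

	assert(alen <= MAX_EXTLEN && alen % 256 == 0);
	printf("%llu\n", alen);
	return 0;
}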
*/ - if (len > (xfs_filblks_t)MAXEXTLEN) - bma.length = MAXEXTLEN; + if (len > (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN) + bma.length = XFS_MAX_BMBT_EXTLEN; else bma.length = len; @@ -4526,14 +4524,16 @@ xfs_bmapi_convert_delalloc( return error; xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); error = xfs_iext_count_may_overflow(ip, whichfork, XFS_IEXT_ADD_NOSPLIT_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto out_trans_cancel; - xfs_trans_ijoin(tp, ip, 0); - if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) || bma.got.br_startoff > offset_fsb) { /* @@ -4560,7 +4560,8 @@ xfs_bmapi_convert_delalloc( bma.ip = ip; bma.wasdel = true; bma.offset = bma.got.br_startoff; - bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN); + bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, + XFS_MAX_BMBT_EXTLEN); bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); /* @@ -4629,7 +4630,7 @@ xfs_bmapi_remap( xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, - int flags) + uint32_t flags) { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; @@ -4641,7 +4642,7 @@ xfs_bmapi_remap( ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(len > 0); - ASSERT(len <= (xfs_filblks_t)MAXEXTLEN); + ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC | XFS_BMAPI_NORMAP))); @@ -4801,7 +4802,7 @@ xfs_bmap_del_extent_delay( int64_t da_old, da_new, da_diff = 0; xfs_fileoff_t del_endoff, got_endoff; xfs_filblks_t got_indlen, new_indlen, stolen; - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); int error = 0; bool isrt; @@ -4926,7 +4927,7 @@ xfs_bmap_del_extent_cow( struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); struct xfs_bmbt_irec new; xfs_fileoff_t del_endoff, got_endoff; - int state = BMAP_COWFORK; + uint32_t state = BMAP_COWFORK; XFS_STATS_INC(mp, xs_del_exlist); @@ -4999,7 +5000,7 @@ xfs_bmap_del_extent_real( xfs_bmbt_irec_t *del, /* data to remove from extents */ int *logflagsp, /* inode logging flags */ int whichfork, /* data or attr fork */ - int bflags) /* bmapi flags */ + uint32_t bflags) /* bmapi flags */ { xfs_fsblock_t del_endblock=0; /* first block past del */ xfs_fileoff_t del_endoff; /* first offset past del */ @@ -5015,7 +5016,7 @@ xfs_bmap_del_extent_real( xfs_bmbt_irec_t new; /* new record to be inserted */ /* REFERENCED */ uint qfield; /* quota field to update */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_bmbt_irec old; mp = ip->i_mount; @@ -5148,26 +5149,6 @@ xfs_bmap_del_extent_real( * Deleting the middle of the extent. */ - /* - * For directories, -ENOSPC is returned since a directory entry - * remove operation must not fail due to low extent count - * availability. -ENOSPC will be handled by higher layers of XFS - * by letting the corresponding empty Data/Free blocks to linger - * until a future remove operation. Dabtree blocks would be - * swapped with the last block in the leaf space and then the - * new last block will be unmapped. - * - * The above logic also applies to the source directory entry of - * a rename operation. 
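Both hunks above add the same pattern: when the extent-count overflow check fails with -EFBIG, try once to upgrade the inode to large extent counters before giving up. A toy model of that check-then-upgrade flow, with all functions standing in for the xfs_iext_count_* calls:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_inode { unsigned long long nextents, max; bool can_upgrade; };

static int count_may_overflow(struct toy_inode *ip, int nr_to_add)
{
	return ip->nextents + nr_to_add > ip->max ? -EFBIG : 0;
}

static int count_upgrade(struct toy_inode *ip, int nr_to_add)
{
	(void)nr_to_add;
	if (!ip->can_upgrade)
		return -EFBIG;	/* fs lacks the large-counter feature */
	ip->max = ~0ULL;	/* "large" counters: effectively unbounded */
	return 0;
}

int main(void)
{
	struct toy_inode ip = { .nextents = 65535, .max = 65535,
				.can_upgrade = true };
	int error = count_may_overflow(&ip, 1);

	if (error == -EFBIG)
		error = count_upgrade(&ip, 1);
	printf("%d\n", error);	/* 0: the operation may proceed */
	return 0;
}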
- */ - error = xfs_iext_count_may_overflow(ip, whichfork, 1); - if (error) { - ASSERT(S_ISDIR(VFS_I(ip)->i_mode) && - whichfork == XFS_DATA_FORK); - error = -ENOSPC; - goto done; - } - old = got; got.br_blockcount = del->br_startoff - got.br_startoff; @@ -5281,7 +5262,7 @@ __xfs_bunmapi( struct xfs_inode *ip, /* incore inode */ xfs_fileoff_t start, /* first file offset deleted */ xfs_filblks_t *rlen, /* i/o: amount remaining */ - int flags, /* misc flags */ + uint32_t flags, /* misc flags */ xfs_extnum_t nexts) /* number of extents max */ { struct xfs_btree_cur *cur; /* bmap btree cursor */ @@ -5299,7 +5280,6 @@ __xfs_bunmapi( int whichfork; /* data or attribute fork */ xfs_fsblock_t sum; xfs_filblks_t len = *rlen; /* length to unmap in file */ - xfs_fileoff_t max_len; xfs_fileoff_t end; struct xfs_iext_cursor icur; bool done = false; @@ -5318,16 +5298,6 @@ __xfs_bunmapi( ASSERT(len > 0); ASSERT(nexts >= 0); - /* - * Guesstimate how many blocks we can unmap without running the risk of - * blowing out the transaction with a mix of EFIs and reflink - * adjustments. - */ - if (tp && xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) - max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res)); - else - max_len = len; - error = xfs_iread_extents(tp, ip, whichfork); if (error) return error; @@ -5366,7 +5336,7 @@ __xfs_bunmapi( extno = 0; while (end != (xfs_fileoff_t)-1 && end >= start && - (nexts == 0 || extno < nexts) && max_len > 0) { + (nexts == 0 || extno < nexts)) { /* * Is the found extent after a hole in which end lives? * Just back up to the previous extent, if so. @@ -5400,14 +5370,6 @@ __xfs_bunmapi( if (del.br_startoff + del.br_blockcount > end + 1) del.br_blockcount = end + 1 - del.br_startoff; - /* How much can we safely unmap? */ - if (max_len < del.br_blockcount) { - del.br_startoff += del.br_blockcount - max_len; - if (!wasdel) - del.br_startblock += del.br_blockcount - max_len; - del.br_blockcount = max_len; - } - if (!isrt) goto delete; @@ -5543,7 +5505,6 @@ delete: if (error) goto error0; - max_len -= del.br_blockcount; end = del.br_startoff - 1; nodelete: /* @@ -5609,7 +5570,7 @@ xfs_bunmapi( struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, - int flags, + uint32_t flags, xfs_extnum_t nexts, int *done) { @@ -5641,7 +5602,7 @@ xfs_bmse_can_merge( if ((left->br_startoff + left->br_blockcount != startoff) || (left->br_startblock + left->br_blockcount != got->br_startblock) || (left->br_state != got->br_state) || - (left->br_blockcount + got->br_blockcount > MAXEXTLEN)) + (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN)) return false; return true; diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 03d9aaf87413..16db95b11589 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -39,7 +39,7 @@ struct xfs_bmalloca { bool aeof; /* allocated space at eof */ bool conv; /* overwriting unwritten extents */ int datatype;/* data type being allocated */ - int flags; + uint32_t flags; }; #define XFS_BMAP_MAX_NMAP 4 @@ -47,17 +47,17 @@ struct xfs_bmalloca { /* * Flags for xfs_bmapi_* */ -#define XFS_BMAPI_ENTIRE 0x001 /* return entire extent, not trimmed */ -#define XFS_BMAPI_METADATA 0x002 /* mapping metadata not user data */ -#define XFS_BMAPI_ATTRFORK 0x004 /* use attribute fork not data */ -#define XFS_BMAPI_PREALLOC 0x008 /* preallocation op: unwritten space */ -#define XFS_BMAPI_CONTIG 0x020 /* must allocate only one extent */ +#define XFS_BMAPI_ENTIRE (1u << 0) /* return entire extent untrimmed */ +#define 
XFS_BMAPI_METADATA (1u << 1) /* mapping metadata not user data */ +#define XFS_BMAPI_ATTRFORK (1u << 2) /* use attribute fork not data */ +#define XFS_BMAPI_PREALLOC (1u << 3) /* preallocating unwritten space */ +#define XFS_BMAPI_CONTIG (1u << 4) /* must allocate only one extent */ /* * unwritten extent conversion - this needs write cache flushing and no additional * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts * from written to unwritten, otherwise convert from unwritten to written. */ -#define XFS_BMAPI_CONVERT 0x040 +#define XFS_BMAPI_CONVERT (1u << 5) /* * allocate zeroed extents - this requires all newly allocated user data extents @@ -65,7 +65,7 @@ struct xfs_bmalloca { * Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found * during the allocation range to zeroed written extents. */ -#define XFS_BMAPI_ZERO 0x080 +#define XFS_BMAPI_ZERO (1u << 6) /* * Map the inode offset to the block given in ap->firstblock. Primarily @@ -75,16 +75,16 @@ struct xfs_bmalloca { * For bunmapi, this flag unmaps the range without adjusting quota, reducing * refcount, or freeing the blocks. */ -#define XFS_BMAPI_REMAP 0x100 +#define XFS_BMAPI_REMAP (1u << 7) /* Map something in the CoW fork. */ -#define XFS_BMAPI_COWFORK 0x200 +#define XFS_BMAPI_COWFORK (1u << 8) /* Skip online discard of freed extents */ -#define XFS_BMAPI_NODISCARD 0x1000 +#define XFS_BMAPI_NODISCARD (1u << 9) /* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */ -#define XFS_BMAPI_NORMAP 0x2000 +#define XFS_BMAPI_NORMAP (1u << 10) #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ @@ -106,7 +106,7 @@ static inline int xfs_bmapi_aflag(int w) (w == XFS_COW_FORK ? XFS_BMAPI_COWFORK : 0)); } -static inline int xfs_bmapi_whichfork(int bmapi_flags) +static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags) { if (bmapi_flags & XFS_BMAPI_COWFORK) return XFS_COW_FORK; @@ -124,16 +124,16 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags) /* * Flags for xfs_bmap_add_extent*. 
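The conversion above from hex constants to (1u << n) with uint32_t flag words is not cosmetic: with a plain int, forming bit 31 via (1 << 31) is undefined behaviour, while unsigned shifts are well defined across the whole word. A small demonstration with invented flag names:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define FLAG_ENTIRE	(1u << 0)
#define FLAG_METADATA	(1u << 1)
#define FLAG_TOPBIT	(1u << 31)	/* fine: unsigned shift */

static int has_flag(uint32_t flags, uint32_t f)
{
	return (flags & f) != 0;
}

int main(void)
{
	uint32_t flags = FLAG_ENTIRE | FLAG_TOPBIT;

	printf("entire=%d meta=%d top=%d flags=0x%" PRIx32 "\n",
	       has_flag(flags, FLAG_ENTIRE), has_flag(flags, FLAG_METADATA),
	       has_flag(flags, FLAG_TOPBIT), flags);
	return 0;
}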
*/ -#define BMAP_LEFT_CONTIG (1 << 0) -#define BMAP_RIGHT_CONTIG (1 << 1) -#define BMAP_LEFT_FILLING (1 << 2) -#define BMAP_RIGHT_FILLING (1 << 3) -#define BMAP_LEFT_DELAY (1 << 4) -#define BMAP_RIGHT_DELAY (1 << 5) -#define BMAP_LEFT_VALID (1 << 6) -#define BMAP_RIGHT_VALID (1 << 7) -#define BMAP_ATTRFORK (1 << 8) -#define BMAP_COWFORK (1 << 9) +#define BMAP_LEFT_CONTIG (1u << 0) +#define BMAP_RIGHT_CONTIG (1u << 1) +#define BMAP_LEFT_FILLING (1u << 2) +#define BMAP_RIGHT_FILLING (1u << 3) +#define BMAP_LEFT_DELAY (1u << 4) +#define BMAP_RIGHT_DELAY (1u << 5) +#define BMAP_LEFT_VALID (1u << 6) +#define BMAP_RIGHT_VALID (1u << 7) +#define BMAP_ATTRFORK (1u << 8) +#define BMAP_COWFORK (1u << 9) #define XFS_BMAP_EXT_FLAGS \ { BMAP_LEFT_CONTIG, "LC" }, \ @@ -183,15 +183,15 @@ int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused, int whichfork); int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, - int *nmap, int flags); + int *nmap, uint32_t flags); int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap); int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t bno, xfs_filblks_t *rlen, int flags, + xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags, xfs_extnum_t nexts); int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extnum_t nexts, int *done); int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, @@ -243,7 +243,7 @@ void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); -static inline int xfs_bmap_fork_to_state(int whichfork) +static inline uint32_t xfs_bmap_fork_to_state(int whichfork) { switch (whichfork) { case XFS_ATTR_FORK: @@ -260,7 +260,7 @@ xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork, int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, - int flags); + uint32_t flags); extern struct kmem_cache *xfs_bmap_intent_cache; diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 453309fc85f2..2b77d45c215f 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -597,7 +597,11 @@ xfs_bmbt_maxrecs( return xfs_bmbt_block_maxrecs(blocklen, leaf); } -/* Compute the max possible height for block mapping btrees. */ +/* + * Calculate the maximum possible height of the btree that the on-disk format + * supports. This is used for sizing structures large enough to support every + * possible configuration of a filesystem that might get mounted. + */ unsigned int xfs_bmbt_maxlevels_ondisk(void) { @@ -611,7 +615,8 @@ xfs_bmbt_maxlevels_ondisk(void) minrecs[1] = xfs_bmbt_block_maxrecs(blocklen, false) / 2; /* One extra level for the inode root. 
*/ - return xfs_btree_compute_maxlevels(minrecs, MAXEXTNUM) + 1; + return xfs_btree_compute_maxlevels(minrecs, + XFS_MAX_EXTCNT_DATA_FORK_LARGE) + 1; } /* diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index c1500b238520..2eecc49fc1b2 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -52,6 +52,71 @@ xfs_btree_magic( } /* + * These sibling pointer checks are optimised for null sibling pointers. This + * happens a lot, and we don't need to byte swap at runtime if the sibling + * pointer is NULL. + * + * These are explicitly marked at inline because the cost of calling them as + * functions instead of inlining them is about 36 bytes extra code per call site + * on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these + * two sibling check functions reduces the compiled code size by over 300 + * bytes. + */ +static inline xfs_failaddr_t +xfs_btree_check_lblock_siblings( + struct xfs_mount *mp, + struct xfs_btree_cur *cur, + int level, + xfs_fsblock_t fsb, + __be64 dsibling) +{ + xfs_fsblock_t sibling; + + if (dsibling == cpu_to_be64(NULLFSBLOCK)) + return NULL; + + sibling = be64_to_cpu(dsibling); + if (sibling == fsb) + return __this_address; + if (level >= 0) { + if (!xfs_btree_check_lptr(cur, sibling, level + 1)) + return __this_address; + } else { + if (!xfs_verify_fsbno(mp, sibling)) + return __this_address; + } + + return NULL; +} + +static inline xfs_failaddr_t +xfs_btree_check_sblock_siblings( + struct xfs_mount *mp, + struct xfs_btree_cur *cur, + int level, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + __be32 dsibling) +{ + xfs_agblock_t sibling; + + if (dsibling == cpu_to_be32(NULLAGBLOCK)) + return NULL; + + sibling = be32_to_cpu(dsibling); + if (sibling == agbno) + return __this_address; + if (level >= 0) { + if (!xfs_btree_check_sptr(cur, sibling, level + 1)) + return __this_address; + } else { + if (!xfs_verify_agbno(mp, agno, sibling)) + return __this_address; + } + return NULL; +} + +/* * Check a long btree block header. Return the address of the failing check, * or NULL if everything is ok. */ @@ -65,6 +130,8 @@ __xfs_btree_check_lblock( struct xfs_mount *mp = cur->bc_mp; xfs_btnum_t btnum = cur->bc_btnum; int crc = xfs_has_crc(mp); + xfs_failaddr_t fa; + xfs_fsblock_t fsb = NULLFSBLOCK; if (crc) { if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) @@ -83,16 +150,16 @@ __xfs_btree_check_lblock( if (be16_to_cpu(block->bb_numrecs) > cur->bc_ops->get_maxrecs(cur, level)) return __this_address; - if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib), - level + 1)) - return __this_address; - if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib), - level + 1)) - return __this_address; - return NULL; + if (bp) + fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); + + fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, + block->bb_u.l.bb_leftsib); + if (!fa) + fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, + block->bb_u.l.bb_rightsib); + return fa; } /* Check a long btree block header. 
*/ @@ -130,6 +197,9 @@ __xfs_btree_check_sblock( struct xfs_mount *mp = cur->bc_mp; xfs_btnum_t btnum = cur->bc_btnum; int crc = xfs_has_crc(mp); + xfs_failaddr_t fa; + xfs_agblock_t agbno = NULLAGBLOCK; + xfs_agnumber_t agno = NULLAGNUMBER; if (crc) { if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) @@ -146,16 +216,18 @@ __xfs_btree_check_sblock( if (be16_to_cpu(block->bb_numrecs) > cur->bc_ops->get_maxrecs(cur, level)) return __this_address; - if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib), - level + 1)) - return __this_address; - if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib), - level + 1)) - return __this_address; - return NULL; + if (bp) { + agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); + agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); + } + + fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, agbno, + block->bb_u.s.bb_leftsib); + if (!fa) + fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, + agbno, block->bb_u.s.bb_rightsib); + return fa; } /* Check a short btree block header. */ @@ -373,8 +445,14 @@ xfs_btree_del_cursor( break; } + /* + * If we are doing a BMBT update, the number of unaccounted blocks + * allocated during this cursor life time should be zero. If it's not + * zero, then we should be shut down or on our way to shutdown due to + * cancelling a dirty transaction on error. + */ ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 || - xfs_is_shutdown(cur->bc_mp)); + xfs_is_shutdown(cur->bc_mp) || error != 0); if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) kmem_free(cur->bc_ops); if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag) @@ -751,20 +829,20 @@ xfs_btree_lastrec( */ void xfs_btree_offsets( - int64_t fields, /* bitmask of fields */ + uint32_t fields, /* bitmask of fields */ const short *offsets, /* table of field offsets */ int nbits, /* number of bits to inspect */ int *first, /* output: first byte offset */ int *last) /* output: last byte offset */ { int i; /* current bit number */ - int64_t imask; /* mask for current bit number */ + uint32_t imask; /* mask for current bit number */ ASSERT(fields != 0); /* * Find the lowest bit, so the first byte offset. */ - for (i = 0, imask = 1LL; ; i++, imask <<= 1) { + for (i = 0, imask = 1u; ; i++, imask <<= 1) { if (imask & fields) { *first = offsets[i]; break; @@ -773,7 +851,7 @@ xfs_btree_offsets( /* * Find the highest bit, so the last byte offset. */ - for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) { + for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) { if (imask & fields) { *last = offsets[i + 1] - 1; break; @@ -1456,7 +1534,7 @@ void xfs_btree_log_block( struct xfs_btree_cur *cur, /* btree cursor */ struct xfs_buf *bp, /* buffer containing btree block */ - int fields) /* mask of fields: XFS_BB_... */ + uint32_t fields) /* mask of fields: XFS_BB_... 
*/ { int first; /* first byte offset logged */ int last; /* last byte offset logged */ @@ -3194,7 +3272,7 @@ xfs_btree_insrec( struct xfs_btree_block *block; /* btree block */ struct xfs_buf *bp; /* buffer for block */ union xfs_btree_ptr nptr; /* new block ptr */ - struct xfs_btree_cur *ncur; /* new btree cursor */ + struct xfs_btree_cur *ncur = NULL; /* new btree cursor */ union xfs_btree_key nkey; /* new block key */ union xfs_btree_key *lkey; int optr; /* old key/record index */ @@ -3274,7 +3352,7 @@ xfs_btree_insrec( #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) - return error; + goto error0; #endif /* @@ -3294,7 +3372,7 @@ xfs_btree_insrec( for (i = numrecs - ptr; i >= 0; i--) { error = xfs_btree_debug_check_ptr(cur, pp, i, level); if (error) - return error; + goto error0; } xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); @@ -3379,6 +3457,8 @@ xfs_btree_insrec( return 0; error0: + if (ncur) + xfs_btree_del_cursor(ncur, error); return error; } @@ -4271,6 +4351,21 @@ xfs_btree_visit_block( if (xfs_btree_ptr_is_null(cur, &rptr)) return -ENOENT; + /* + * We only visit blocks once in this walk, so we have to avoid the + * internal xfs_btree_lookup_get_block() optimisation where it will + * return the same block without checking if the right sibling points + * back to us and creates a cyclic reference in the btree. + */ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp, + xfs_buf_daddr(bp))) + return -EFSCORRUPTED; + } else { + if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp, + xfs_buf_daddr(bp))) + return -EFSCORRUPTED; + } return xfs_btree_lookup_get_block(cur, level, &rptr, &block); } @@ -4445,20 +4540,21 @@ xfs_btree_lblock_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_fsblock_t fsb; + xfs_failaddr_t fa; /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) return __this_address; /* sibling pointer verification */ - if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))) - return __this_address; - if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))) - return __this_address; - - return NULL; + fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); + fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, + block->bb_u.l.bb_leftsib); + if (!fa) + fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, + block->bb_u.l.bb_rightsib); + return fa; } /** @@ -4499,7 +4595,9 @@ xfs_btree_sblock_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - xfs_agblock_t agno; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_failaddr_t fa; /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) @@ -4507,14 +4605,13 @@ xfs_btree_sblock_verify( /* sibling pointer verification */ agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); - if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib))) - return __this_address; - if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib))) - return __this_address; - - return NULL; + agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); + fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, + block->bb_u.s.bb_leftsib); + if (!fa) + fa = 
xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, + block->bb_u.s.bb_rightsib); + return fa; } /* diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 22d9f411fde6..eef27858a013 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -68,19 +68,19 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum); /* * For logging record fields. */ -#define XFS_BB_MAGIC (1 << 0) -#define XFS_BB_LEVEL (1 << 1) -#define XFS_BB_NUMRECS (1 << 2) -#define XFS_BB_LEFTSIB (1 << 3) -#define XFS_BB_RIGHTSIB (1 << 4) -#define XFS_BB_BLKNO (1 << 5) -#define XFS_BB_LSN (1 << 6) -#define XFS_BB_UUID (1 << 7) -#define XFS_BB_OWNER (1 << 8) +#define XFS_BB_MAGIC (1u << 0) +#define XFS_BB_LEVEL (1u << 1) +#define XFS_BB_NUMRECS (1u << 2) +#define XFS_BB_LEFTSIB (1u << 3) +#define XFS_BB_RIGHTSIB (1u << 4) +#define XFS_BB_BLKNO (1u << 5) +#define XFS_BB_LSN (1u << 6) +#define XFS_BB_UUID (1u << 7) +#define XFS_BB_OWNER (1u << 8) #define XFS_BB_NUM_BITS 5 -#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) +#define XFS_BB_ALL_BITS ((1u << XFS_BB_NUM_BITS) - 1) #define XFS_BB_NUM_BITS_CRC 9 -#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1) +#define XFS_BB_ALL_BITS_CRC ((1u << XFS_BB_NUM_BITS_CRC) - 1) /* * Generic stats interface @@ -345,7 +345,7 @@ xfs_btree_dup_cursor( */ void xfs_btree_offsets( - int64_t fields, /* bitmask of fields */ + uint32_t fields, /* bitmask of fields */ const short *offsets,/* table of field offsets */ int nbits, /* number of bits to inspect */ int *first, /* output: first byte offset */ @@ -435,7 +435,7 @@ bool xfs_btree_sblock_verify_crc(struct xfs_buf *); /* * Internal btree helpers also used by xfs_bmap.c. */ -void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int); +void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, uint32_t); void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int); /* diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 9dc1ecb9713d..e7201dc68f43 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -22,6 +22,7 @@ #include "xfs_trace.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_errortag.h" /* * xfs_da_btree.c @@ -116,6 +117,17 @@ xfs_da_state_free(xfs_da_state_t *state) kmem_cache_free(xfs_da_state_cache, state); } +void +xfs_da_state_reset( + struct xfs_da_state *state, + struct xfs_da_args *args) +{ + xfs_da_state_kill_altpath(state); + memset(state, 0, sizeof(struct xfs_da_state)); + state->args = args; + state->mp = state->args->dp->i_mount; +} + static inline int xfs_dabuf_nfsb(struct xfs_mount *mp, int whichfork) { if (whichfork == XFS_DATA_FORK) @@ -482,6 +494,9 @@ xfs_da3_split( trace_xfs_da_split(state->args); + if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT)) + return -EIO; + /* * Walk back up the tree splitting/inserting/adjusting as necessary. * If we need to insert and there isn't room, split the node, then diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 0faf7d9ac241..ffa3df5b2893 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -30,6 +30,7 @@ struct xfs_da_geometry { unsigned int free_hdr_size; /* dir2 free header size */ unsigned int free_max_bests; /* # of bests entries in dir2 free */ xfs_dablk_t freeblk; /* blockno of free data v2 */ + xfs_extnum_t max_extents; /* Max. 
extents in corresponding fork */ xfs_dir2_data_aoff_t data_first_offset; size_t data_entry_offset; @@ -76,27 +77,33 @@ typedef struct xfs_da_args { xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ int rmtblkcnt2; /* remote attr value block count */ int rmtvaluelen2; /* remote attr value length in bytes */ - int op_flags; /* operation flags */ + uint32_t op_flags; /* operation flags */ enum xfs_dacmp cmpresult; /* name compare result for lookups */ } xfs_da_args_t; /* * Operation flags: */ -#define XFS_DA_OP_JUSTCHECK 0x0001 /* check for ok with no space */ -#define XFS_DA_OP_RENAME 0x0002 /* this is an atomic rename op */ -#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */ -#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ -#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ -#define XFS_DA_OP_NOTIME 0x0020 /* don't update inode timestamps */ +#define XFS_DA_OP_JUSTCHECK (1u << 0) /* check for ok with no space */ +#define XFS_DA_OP_REPLACE (1u << 1) /* this is an atomic replace op */ +#define XFS_DA_OP_ADDNAME (1u << 2) /* this is an add operation */ +#define XFS_DA_OP_OKNOENT (1u << 3) /* lookup op, ENOENT ok, else die */ +#define XFS_DA_OP_CILOOKUP (1u << 4) /* lookup returns CI name if found */ +#define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */ +#define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */ +#define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */ +#define XFS_DA_OP_LOGGED (1u << 8) /* Use intent items to track op */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ - { XFS_DA_OP_RENAME, "RENAME" }, \ + { XFS_DA_OP_REPLACE, "REPLACE" }, \ { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ - { XFS_DA_OP_NOTIME, "NOTIME" } + { XFS_DA_OP_NOTIME, "NOTIME" }, \ + { XFS_DA_OP_REMOVE, "REMOVE" }, \ + { XFS_DA_OP_RECOVERY, "RECOVERY" }, \ + { XFS_DA_OP_LOGGED, "LOGGED" } /* * Storage for holding state during Btree searches and split/join ops. @@ -197,7 +204,7 @@ int xfs_da3_node_read_mapped(struct xfs_trans *tp, struct xfs_inode *dp, * Utility routines. */ -#define XFS_DABUF_MAP_HOLE_OK (1 << 0) +#define XFS_DABUF_MAP_HOLE_OK (1u << 0) int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno); int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno, @@ -220,6 +227,7 @@ enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, struct xfs_da_state *xfs_da_state_alloc(struct xfs_da_args *args); void xfs_da_state_free(xfs_da_state_t *state); +void xfs_da_state_reset(struct xfs_da_state *state, struct xfs_da_args *args); void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp, struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from); diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 5a49caa5c9df..25e2841084e1 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -277,6 +277,7 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr) * Directory address space divided into sections, * spaces separated by 32GB. 
*/ +#define XFS_DIR2_MAX_SPACES 3 #define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG)) #define XFS_DIR2_DATA_SPACE 0 #define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE) @@ -688,10 +689,10 @@ struct xfs_attr3_leafblock { #define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ #define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */ #define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */ -#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT) -#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT) -#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT) -#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) +#define XFS_ATTR_LOCAL (1u << XFS_ATTR_LOCAL_BIT) +#define XFS_ATTR_ROOT (1u << XFS_ATTR_ROOT_BIT) +#define XFS_ATTR_SECURE (1u << XFS_ATTR_SECURE_BIT) +#define XFS_ATTR_INCOMPLETE (1u << XFS_ATTR_INCOMPLETE_BIT) #define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) /* diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 0805ade2d300..5a321b783398 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -22,6 +22,10 @@ #include "xfs_refcount.h" #include "xfs_bmap.h" #include "xfs_alloc.h" +#include "xfs_buf.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" static struct kmem_cache *xfs_defer_pending_cache; @@ -184,36 +188,61 @@ static const struct xfs_defer_op_type *defer_op_types[] = { [XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type, [XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type, [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type, + [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type, }; -static void +/* + * Ensure there's a log intent item associated with this deferred work item if + * the operation must be restarted on crash. Returns 1 if there's a log item; + * 0 if there isn't; or a negative errno. + */ +static int xfs_defer_create_intent( struct xfs_trans *tp, struct xfs_defer_pending *dfp, bool sort) { const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + struct xfs_log_item *lip; - if (!dfp->dfp_intent) - dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work, - dfp->dfp_count, sort); + if (dfp->dfp_intent) + return 1; + + lip = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort); + if (!lip) + return 0; + if (IS_ERR(lip)) + return PTR_ERR(lip); + + dfp->dfp_intent = lip; + return 1; } /* * For each pending item in the intake list, log its intent item and the * associated extents, then add the entire intake list to the end of * the pending list. + * + * Returns 1 if at least one log item was associated with the deferred work; + * 0 if there are no log items; or a negative errno. */ -STATIC void +static int xfs_defer_create_intents( struct xfs_trans *tp) { struct xfs_defer_pending *dfp; + int ret = 0; list_for_each_entry(dfp, &tp->t_dfops, dfp_list) { + int ret2; + trace_xfs_defer_create_intent(tp->t_mountp, dfp); - xfs_defer_create_intent(tp, dfp, true); + ret2 = xfs_defer_create_intent(tp, dfp, true); + if (ret2 < 0) + return ret2; + ret |= ret2; } + return ret; } /* Abort all the intents that were committed. 
*/ @@ -449,6 +478,8 @@ xfs_defer_finish_one( dfp->dfp_count--; error = ops->finish_item(tp, dfp->dfp_done, li, &state); if (error == -EAGAIN) { + int ret; + /* * Caller wants a fresh transaction; put the work item * back on the list and log a new log intent item to @@ -459,7 +490,9 @@ xfs_defer_finish_one( dfp->dfp_count++; dfp->dfp_done = NULL; dfp->dfp_intent = NULL; - xfs_defer_create_intent(tp, dfp, false); + ret = xfs_defer_create_intent(tp, dfp, false); + if (ret < 0) + error = ret; } if (error) @@ -487,7 +520,7 @@ int xfs_defer_finish_noroll( struct xfs_trans **tp) { - struct xfs_defer_pending *dfp; + struct xfs_defer_pending *dfp = NULL; int error = 0; LIST_HEAD(dop_pending); @@ -506,17 +539,24 @@ xfs_defer_finish_noroll( * of time that any one intent item can stick around in memory, * pinning the log tail. */ - xfs_defer_create_intents(*tp); - list_splice_init(&(*tp)->t_dfops, &dop_pending); + int has_intents = xfs_defer_create_intents(*tp); - error = xfs_defer_trans_roll(tp); - if (error) - goto out_shutdown; + list_splice_init(&(*tp)->t_dfops, &dop_pending); - /* Possibly relog intent items to keep the log moving. */ - error = xfs_defer_relog(tp, &dop_pending); - if (error) + if (has_intents < 0) { + error = has_intents; goto out_shutdown; + } + if (has_intents || dfp) { + error = xfs_defer_trans_roll(tp); + if (error) + goto out_shutdown; + + /* Relog intent items to keep the log moving. */ + error = xfs_defer_relog(tp, &dop_pending); + if (error) + goto out_shutdown; + } dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, dfp_list); @@ -665,13 +705,15 @@ xfs_defer_ops_capture( if (list_empty(&tp->t_dfops)) return NULL; + error = xfs_defer_create_intents(tp); + if (error < 0) + return ERR_PTR(error); + /* Create an object to capture the defer ops. */ dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS); INIT_LIST_HEAD(&dfc->dfc_list); INIT_LIST_HEAD(&dfc->dfc_dfops); - xfs_defer_create_intents(tp); - /* Move the dfops chain and transaction state to the capture struct. */ list_splice_init(&tp->t_dfops, &dfc->dfc_dfops); dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE; @@ -748,6 +790,10 @@ xfs_defer_ops_capture_and_commit( /* If we don't capture anything, commit transaction and exit. */ dfc = xfs_defer_ops_capture(tp); + if (IS_ERR(dfc)) { + xfs_trans_cancel(tp); + return PTR_ERR(dfc); + } if (!dfc) return xfs_trans_commit(tp); @@ -774,17 +820,25 @@ xfs_defer_ops_continue( struct xfs_trans *tp, struct xfs_defer_resources *dres) { + unsigned int i; + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY)); - /* Lock and join the captured inode to the new transaction. */ + /* Lock the captured resources to the new transaction. */ if (dfc->dfc_held.dr_inos == 2) xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL, dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL); else if (dfc->dfc_held.dr_inos == 1) xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL); + + for (i = 0; i < dfc->dfc_held.dr_bufs; i++) + xfs_buf_lock(dfc->dfc_held.dr_bp[i]); + + /* Join the captured resources to the new transaction. */ xfs_defer_restore_resources(tp, &dfc->dfc_held); memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources)); + dres->dr_bufs = 0; /* Move captured dfops chain and state to the transaction. 
*/ list_splice_init(&dfc->dfc_dfops, &tp->t_dfops); @@ -854,7 +908,9 @@ xfs_defer_init_item_caches(void) error = xfs_extfree_intent_init_cache(); if (error) goto err; - + error = xfs_attr_intent_init_cache(); + if (error) + goto err; return 0; err: xfs_defer_destroy_item_caches(); @@ -865,6 +921,7 @@ err: void xfs_defer_destroy_item_caches(void) { + xfs_attr_intent_destroy_cache(); xfs_extfree_intent_destroy_cache(); xfs_bmap_intent_destroy_cache(); xfs_refcount_intent_destroy_cache(); diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 7bb8a31ad65b..114a3a4930a3 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -19,6 +19,7 @@ enum xfs_defer_ops_type { XFS_DEFER_OPS_TYPE_RMAP, XFS_DEFER_OPS_TYPE_FREE, XFS_DEFER_OPS_TYPE_AGFL_FREE, + XFS_DEFER_OPS_TYPE_ATTR, XFS_DEFER_OPS_TYPE_MAX, }; @@ -63,6 +64,8 @@ extern const struct xfs_defer_op_type xfs_refcount_update_defer_type; extern const struct xfs_defer_op_type xfs_rmap_update_defer_type; extern const struct xfs_defer_op_type xfs_extent_free_defer_type; extern const struct xfs_defer_op_type xfs_agfl_free_defer_type; +extern const struct xfs_defer_op_type xfs_attr_defer_type; + /* * Deferred operation item relogging limits. diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 5f1e4799e8fa..3cd51fa3837b 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -150,6 +150,8 @@ xfs_da_mount( dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET); dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) / (uint)sizeof(xfs_da_node_entry_t); + dageo->max_extents = (XFS_DIR2_MAX_SPACES * XFS_DIR2_SPACE_SIZE) >> + mp->m_sb.sb_blocklog; dageo->magicpct = (dageo->blksize * 37) / 100; /* set up attribute geometry - single fsb only */ @@ -161,6 +163,12 @@ xfs_da_mount( dageo->node_hdr_size = mp->m_dir_geo->node_hdr_size; dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) / (uint)sizeof(xfs_da_node_entry_t); + + if (xfs_has_large_extent_counts(mp)) + dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_LARGE; + else + dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_SMALL; + dageo->magicpct = (dageo->blksize * 37) / 100; return 0; } diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index a23a52e643ad..5362908164b0 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -59,7 +59,10 @@ #define XFS_ERRTAG_REDUCE_MAX_IEXTENTS 36 #define XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT 37 #define XFS_ERRTAG_AG_RESV_FAIL 38 -#define XFS_ERRTAG_MAX 39 +#define XFS_ERRTAG_LARP 39 +#define XFS_ERRTAG_DA_LEAF_SPLIT 40 +#define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41 +#define XFS_ERRTAG_MAX 42 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. 
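The three tags added above each receive a matching XFS_RANDOM_* frequency of 1 in the hunk that follows, meaning an armed tag fires on every call. As a minimal sketch of how such a tag is consumed, the hypothetical function below mirrors the XFS_ERRTAG_DA_LEAF_SPLIT hook that this patch wires into xfs_da3_split() earlier in the series; only XFS_TEST_ERROR() and the tag itself come from the patch.

static int
xfs_example_split(
	struct xfs_da_state	*state)
{
	/*
	 * XFS_TEST_ERROR() evaluates to its first argument unless the
	 * tag has been armed through the per-mount errortag knobs, in
	 * which case it also triggers at the configured frequency.
	 * Here we fail the operation as if the split hit an I/O error.
	 */
	if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT))
		return -EIO;

	/* ... the real split work would continue here ... */
	return 0;
}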
@@ -103,5 +106,8 @@ #define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1 #define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1 #define XFS_RANDOM_AG_RESV_FAIL 1 +#define XFS_RANDOM_LARP 1 +#define XFS_RANDOM_DA_LEAF_SPLIT 1 +#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index d665c04e69dd..afdfc8108c5f 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -372,12 +372,14 @@ xfs_sb_has_ro_compat_feature( #define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */ #define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */ #define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */ +#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */ #define XFS_SB_FEAT_INCOMPAT_ALL \ (XFS_SB_FEAT_INCOMPAT_FTYPE| \ XFS_SB_FEAT_INCOMPAT_SPINODES| \ XFS_SB_FEAT_INCOMPAT_META_UUID| \ XFS_SB_FEAT_INCOMPAT_BIGTIME| \ - XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR) + XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \ + XFS_SB_FEAT_INCOMPAT_NREXT64) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool @@ -388,7 +390,9 @@ xfs_sb_has_incompat_feature( return (sbp->sb_features_incompat & feature) != 0; } -#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0 +#define XFS_SB_FEAT_INCOMPAT_LOG_XATTRS (1 << 0) /* Delayed Attributes */ +#define XFS_SB_FEAT_INCOMPAT_LOG_ALL \ + (XFS_SB_FEAT_INCOMPAT_LOG_XATTRS) #define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL static inline bool xfs_sb_has_incompat_log_feature( @@ -413,6 +417,11 @@ xfs_sb_add_incompat_log_features( sbp->sb_features_log_incompat |= features; } +static inline bool xfs_sb_version_haslogxattrs(struct xfs_sb *sbp) +{ + return xfs_sb_is_v5(sbp) && (sbp->sb_features_log_incompat & + XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); +} static inline bool xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) @@ -525,26 +534,26 @@ typedef struct xfs_agf { #define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc) -#define XFS_AGF_MAGICNUM 0x00000001 -#define XFS_AGF_VERSIONNUM 0x00000002 -#define XFS_AGF_SEQNO 0x00000004 -#define XFS_AGF_LENGTH 0x00000008 -#define XFS_AGF_ROOTS 0x00000010 -#define XFS_AGF_LEVELS 0x00000020 -#define XFS_AGF_FLFIRST 0x00000040 -#define XFS_AGF_FLLAST 0x00000080 -#define XFS_AGF_FLCOUNT 0x00000100 -#define XFS_AGF_FREEBLKS 0x00000200 -#define XFS_AGF_LONGEST 0x00000400 -#define XFS_AGF_BTREEBLKS 0x00000800 -#define XFS_AGF_UUID 0x00001000 -#define XFS_AGF_RMAP_BLOCKS 0x00002000 -#define XFS_AGF_REFCOUNT_BLOCKS 0x00004000 -#define XFS_AGF_REFCOUNT_ROOT 0x00008000 -#define XFS_AGF_REFCOUNT_LEVEL 0x00010000 -#define XFS_AGF_SPARE64 0x00020000 +#define XFS_AGF_MAGICNUM (1u << 0) +#define XFS_AGF_VERSIONNUM (1u << 1) +#define XFS_AGF_SEQNO (1u << 2) +#define XFS_AGF_LENGTH (1u << 3) +#define XFS_AGF_ROOTS (1u << 4) +#define XFS_AGF_LEVELS (1u << 5) +#define XFS_AGF_FLFIRST (1u << 6) +#define XFS_AGF_FLLAST (1u << 7) +#define XFS_AGF_FLCOUNT (1u << 8) +#define XFS_AGF_FREEBLKS (1u << 9) +#define XFS_AGF_LONGEST (1u << 10) +#define XFS_AGF_BTREEBLKS (1u << 11) +#define XFS_AGF_UUID (1u << 12) +#define XFS_AGF_RMAP_BLOCKS (1u << 13) +#define XFS_AGF_REFCOUNT_BLOCKS (1u << 14) +#define XFS_AGF_REFCOUNT_ROOT (1u << 15) +#define XFS_AGF_REFCOUNT_LEVEL (1u << 16) +#define XFS_AGF_SPARE64 (1u << 17) #define XFS_AGF_NUM_BITS 18 -#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) +#define XFS_AGF_ALL_BITS ((1u << XFS_AGF_NUM_BITS) - 1) #define XFS_AGF_FLAGS \ { XFS_AGF_MAGICNUM, "MAGICNUM" }, \ @@ -619,22 
+628,22 @@ typedef struct xfs_agi { #define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc) -#define XFS_AGI_MAGICNUM (1 << 0) -#define XFS_AGI_VERSIONNUM (1 << 1) -#define XFS_AGI_SEQNO (1 << 2) -#define XFS_AGI_LENGTH (1 << 3) -#define XFS_AGI_COUNT (1 << 4) -#define XFS_AGI_ROOT (1 << 5) -#define XFS_AGI_LEVEL (1 << 6) -#define XFS_AGI_FREECOUNT (1 << 7) -#define XFS_AGI_NEWINO (1 << 8) -#define XFS_AGI_DIRINO (1 << 9) -#define XFS_AGI_UNLINKED (1 << 10) +#define XFS_AGI_MAGICNUM (1u << 0) +#define XFS_AGI_VERSIONNUM (1u << 1) +#define XFS_AGI_SEQNO (1u << 2) +#define XFS_AGI_LENGTH (1u << 3) +#define XFS_AGI_COUNT (1u << 4) +#define XFS_AGI_ROOT (1u << 5) +#define XFS_AGI_LEVEL (1u << 6) +#define XFS_AGI_FREECOUNT (1u << 7) +#define XFS_AGI_NEWINO (1u << 8) +#define XFS_AGI_DIRINO (1u << 9) +#define XFS_AGI_UNLINKED (1u << 10) #define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */ -#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1) -#define XFS_AGI_FREE_ROOT (1 << 11) -#define XFS_AGI_FREE_LEVEL (1 << 12) -#define XFS_AGI_IBLOCKS (1 << 13) /* both inobt/finobt block counters */ +#define XFS_AGI_ALL_BITS_R1 ((1u << XFS_AGI_NUM_BITS_R1) - 1) +#define XFS_AGI_FREE_ROOT (1u << 11) +#define XFS_AGI_FREE_LEVEL (1u << 12) +#define XFS_AGI_IBLOCKS (1u << 13) /* both inobt/finobt block counters */ #define XFS_AGI_NUM_BITS_R2 14 /* disk block (xfs_daddr_t) in the AG */ @@ -791,16 +800,41 @@ struct xfs_dinode { __be32 di_nlink; /* number of links to file */ __be16 di_projid_lo; /* lower part of owner's project id */ __be16 di_projid_hi; /* higher part owner's project id */ - __u8 di_pad[6]; /* unused, zeroed space */ - __be16 di_flushiter; /* incremented on flush */ + union { + /* Number of data fork extents if NREXT64 is set */ + __be64 di_big_nextents; + + /* Padding for V3 inodes without NREXT64 set. */ + __be64 di_v3_pad; + + /* Padding and inode flush counter for V2 inodes. */ + struct { + __u8 di_v2_pad[6]; + __be16 di_flushiter; + }; + }; xfs_timestamp_t di_atime; /* time last accessed */ xfs_timestamp_t di_mtime; /* time last modified */ xfs_timestamp_t di_ctime; /* time created/inode modified */ __be64 di_size; /* number of bytes in file */ __be64 di_nblocks; /* # of direct & btree blocks used */ __be32 di_extsize; /* basic/minimum extent size for file */ - __be32 di_nextents; /* number of extents in data fork */ - __be16 di_anextents; /* number of extents in attribute fork*/ + union { + /* + * For V2 inodes and V3 inodes without NREXT64 set, this + * is the number of data and attr fork extents. + */ + struct { + __be32 di_nextents; + __be16 di_anextents; + } __packed; + + /* Number of attr fork extents if NREXT64 is set. */ + struct { + __be32 di_big_anextents; + __be16 di_nrext64_pad; + } __packed; + } __packed; __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */ __s8 di_aformat; /* format of attr fork's data */ __be32 di_dmevmask; /* DMIG event mask */ @@ -870,6 +904,56 @@ enum xfs_dinode_fmt { { XFS_DINODE_FMT_UUID, "uuid" } /* + * Max values for extnum and aextnum. + * + * The original on-disk extent counts were held in signed fields, resulting in + * maximum extent counts of 2^31 and 2^15 for the data and attr forks + * respectively. Similarly the maximum extent length is limited to 2^21 blocks + * by the 21-bit wide blockcount field of a BMBT extent record. 
+ * + * The newly introduced data fork extent counter can hold a 64-bit value, + * however the maximum number of extents in a file is also limited to 2^54 + * extents by the 54-bit wide startoff field of a BMBT extent record. + * + * It is further limited by the maximum supported file size of 2^63 + * *bytes*. This leads to a maximum extent count for maximally sized filesystem + * blocks (64kB) of: + * + * 2^63 bytes / 2^16 bytes per block = 2^47 blocks + * + * Rounding up 47 to the nearest multiple of bits-per-byte results in 48. Hence + * 2^48 was chosen as the maximum data fork extent count. + * + * The maximum file size that can be represented by the data fork extent counter + * in the worst case occurs when all extents are 1 block in length and each + * block is 1KB in size. + * + * With XFS_MAX_EXTCNT_DATA_FORK_SMALL representing maximum extent count and + * with 1KB sized blocks, a file can reach upto, + * 1KB * (2^31) = 2TB + * + * This is much larger than the theoretical maximum size of a directory + * i.e. XFS_DIR2_SPACE_SIZE * XFS_DIR2_MAX_SPACES = ~96GB. + * + * Hence, a directory inode can never overflow its data fork extent counter. + */ +#define XFS_MAX_EXTCNT_DATA_FORK_LARGE ((xfs_extnum_t)((1ULL << 48) - 1)) +#define XFS_MAX_EXTCNT_ATTR_FORK_LARGE ((xfs_extnum_t)((1ULL << 32) - 1)) +#define XFS_MAX_EXTCNT_DATA_FORK_SMALL ((xfs_extnum_t)((1ULL << 31) - 1)) +#define XFS_MAX_EXTCNT_ATTR_FORK_SMALL ((xfs_extnum_t)((1ULL << 15) - 1)) + +/* + * When we upgrade an inode to the large extent counts, the maximum value by + * which the extent count can increase is bound by the change in size of the + * on-disk field. No upgrade operation should ever be adding more than a few + * tens of extents, so if we get a really large value it is a sign of a code bug + * or corruption. + */ +#define XFS_MAX_EXTCNT_UPGRADE_NR \ + min(XFS_MAX_EXTCNT_ATTR_FORK_LARGE - XFS_MAX_EXTCNT_ATTR_FORK_SMALL, \ + XFS_MAX_EXTCNT_DATA_FORK_LARGE - XFS_MAX_EXTCNT_DATA_FORK_SMALL) + +/* * Inode minimum and maximum sizes. */ #define XFS_DINODE_MIN_LOG 8 @@ -918,10 +1002,6 @@ enum xfs_dinode_fmt { ((w) == XFS_DATA_FORK ? \ (dip)->di_format : \ (dip)->di_aformat) -#define XFS_DFORK_NEXTENTS(dip,w) \ - ((w) == XFS_DATA_FORK ? 
\ - be32_to_cpu((dip)->di_nextents) : \ - be16_to_cpu((dip)->di_anextents)) /* * For block and character special files the 32bit dev_t is stored at the @@ -988,15 +1068,17 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */ #define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ #define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ +#define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */ #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) #define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) #define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) #define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) +#define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT) #define XFS_DIFLAG2_ANY \ (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ - XFS_DIFLAG2_BIGTIME) + XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64) static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) { @@ -1004,6 +1086,13 @@ static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_BIGTIME)); } +static inline bool xfs_dinode_has_large_extent_counts( + const struct xfs_dinode *dip) +{ + return dip->di_version >= 3 && + (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_NREXT64)); +} + /* * Inode number format: * low inopblog bits - offset in block @@ -1085,10 +1174,10 @@ static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) #define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ #define XFS_DQUOT_VERSION (uint8_t)0x01 /* latest version number */ -#define XFS_DQTYPE_USER 0x01 /* user dquot record */ -#define XFS_DQTYPE_PROJ 0x02 /* project dquot record */ -#define XFS_DQTYPE_GROUP 0x04 /* group dquot record */ -#define XFS_DQTYPE_BIGTIME 0x80 /* large expiry timestamps */ +#define XFS_DQTYPE_USER (1u << 0) /* user dquot record */ +#define XFS_DQTYPE_PROJ (1u << 1) /* project dquot record */ +#define XFS_DQTYPE_GROUP (1u << 2) /* group dquot record */ +#define XFS_DQTYPE_BIGTIME (1u << 7) /* large expiry timestamps */ /* bitmask to determine if this is a user/group/project dquot */ #define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \ @@ -1596,6 +1685,8 @@ typedef struct xfs_bmdr_block { #define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1) #define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1) +#define XFS_MAX_BMBT_EXTLEN ((xfs_extlen_t)(BMBT_BLOCKCOUNT_MASK)) + /* * bmbt records have a file offset (block) field that is 54 bits wide, so this * is the largest xfs_fileoff_t that we ever expect to see. diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 505533c43a92..1cfd5bc6520a 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -236,6 +236,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */ #define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */ #define XFS_FSOP_GEOM_FLAGS_INOBTCNT (1 << 22) /* inobt btree counter */ +#define XFS_FSOP_GEOM_FLAGS_NREXT64 (1 << 23) /* large extent counters */ /* * Minimum and maximum sizes need for growth checks. 
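The new geometry flag is how userspace discovers large extent counter support before requesting 64-bit counts from bulkstat (next hunk). A hedged userspace sketch, assuming only the long-standing XFS_IOC_FSGEOMETRY ioctl and the uapi headers shipped by xfsprogs:

#include <sys/ioctl.h>
#include <xfs/xfs.h>

/*
 * Return 1 if the filesystem backing fd advertises NREXT64, 0 if it
 * does not, -1 on error. A kernel predating the flag simply leaves
 * the bit clear, so old kernels report 0 rather than failing.
 */
static int
fs_has_nrext64(int fd)
{
	struct xfs_fsop_geom	geo;

	if (ioctl(fd, XFS_IOC_FSGEOMETRY, &geo) < 0)
		return -1;
	return (geo.flags & XFS_FSOP_GEOM_FLAGS_NREXT64) != 0;
}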
@@ -377,7 +378,7 @@ struct xfs_bulkstat { uint32_t bs_extsize_blks; /* extent size hint, blocks */ uint32_t bs_nlink; /* number of links */ - uint32_t bs_extents; /* number of extents */ + uint32_t bs_extents; /* 32-bit data fork extent counter */ uint32_t bs_aextents; /* attribute number of extents */ uint16_t bs_version; /* structure version */ uint16_t bs_forkoff; /* inode fork offset in bytes */ @@ -386,8 +387,9 @@ struct xfs_bulkstat { uint16_t bs_checked; /* checked inode metadata */ uint16_t bs_mode; /* type and mode */ uint16_t bs_pad2; /* zeroed */ + uint64_t bs_extents64; /* 64-bit data fork extent counter */ - uint64_t bs_pad[7]; /* zeroed */ + uint64_t bs_pad[6]; /* zeroed */ }; #define XFS_BULKSTAT_VERSION_V1 (1) @@ -459,17 +461,28 @@ struct xfs_bulk_ireq { * Only return results from the specified @agno. If @ino is zero, start * with the first inode of @agno. */ -#define XFS_BULK_IREQ_AGNO (1 << 0) +#define XFS_BULK_IREQ_AGNO (1U << 0) /* * Return bulkstat information for a single inode, where @ino value is a * special value, not a literal inode number. See the XFS_BULK_IREQ_SPECIAL_* * values below. Not compatible with XFS_BULK_IREQ_AGNO. */ -#define XFS_BULK_IREQ_SPECIAL (1 << 1) +#define XFS_BULK_IREQ_SPECIAL (1U << 1) -#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \ - XFS_BULK_IREQ_SPECIAL) +/* + * Return data fork extent count via xfs_bulkstat->bs_extents64 field and assign + * 0 to xfs_bulkstat->bs_extents when the flag is set. Otherwise, use + * xfs_bulkstat->bs_extents for returning data fork extent count and set + * xfs_bulkstat->bs_extents64 to 0. In the second case, return -EOVERFLOW and + * assign 0 to xfs_bulkstat->bs_extents if data fork extent count is larger than + * XFS_MAX_EXTCNT_DATA_FORK_OLD. + */ +#define XFS_BULK_IREQ_NREXT64 (1U << 2) + +#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \ + XFS_BULK_IREQ_SPECIAL | \ + XFS_BULK_IREQ_NREXT64) /* Operate on the root directory inode. */ #define XFS_BULK_IREQ_SPECIAL_ROOT (1) @@ -699,34 +712,34 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_NR 25 /* i: Repair this metadata. */ -#define XFS_SCRUB_IFLAG_REPAIR (1 << 0) +#define XFS_SCRUB_IFLAG_REPAIR (1u << 0) /* o: Metadata object needs repair. */ -#define XFS_SCRUB_OFLAG_CORRUPT (1 << 1) +#define XFS_SCRUB_OFLAG_CORRUPT (1u << 1) /* * o: Metadata object could be optimized. It's not corrupt, but * we could improve on it somehow. */ -#define XFS_SCRUB_OFLAG_PREEN (1 << 2) +#define XFS_SCRUB_OFLAG_PREEN (1u << 2) /* o: Cross-referencing failed. */ -#define XFS_SCRUB_OFLAG_XFAIL (1 << 3) +#define XFS_SCRUB_OFLAG_XFAIL (1u << 3) /* o: Metadata object disagrees with cross-referenced metadata. */ -#define XFS_SCRUB_OFLAG_XCORRUPT (1 << 4) +#define XFS_SCRUB_OFLAG_XCORRUPT (1u << 4) /* o: Scan was not complete. */ -#define XFS_SCRUB_OFLAG_INCOMPLETE (1 << 5) +#define XFS_SCRUB_OFLAG_INCOMPLETE (1u << 5) /* o: Metadata object looked funny but isn't corrupt. */ -#define XFS_SCRUB_OFLAG_WARNING (1 << 6) +#define XFS_SCRUB_OFLAG_WARNING (1u << 6) /* * o: IFLAG_REPAIR was set but metadata object did not need fixing or * optimization and has therefore not been altered. 
*/ -#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1 << 7) +#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1u << 7) #define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR) #define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index b418fe0c0679..bf2f4bc89193 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2414,9 +2414,9 @@ out_drop: */ void xfs_ialloc_log_agi( - xfs_trans_t *tp, /* transaction pointer */ - struct xfs_buf *bp, /* allocation group header buffer */ - int fields) /* bitmask of fields to log */ + struct xfs_trans *tp, + struct xfs_buf *bp, + uint32_t fields) { int first; /* first byte number */ int last; /* last byte number */ @@ -2772,6 +2772,8 @@ xfs_ialloc_setup_geometry( igeo->new_diflags2 = 0; if (xfs_has_bigtime(mp)) igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME; + if (xfs_has_large_extent_counts(mp)) + igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64; /* Compute inode btree geometry. */ igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog; diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 8b5c2b709022..a7705b6a1fd3 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -60,7 +60,7 @@ void xfs_ialloc_log_agi( struct xfs_trans *tp, /* transaction pointer */ struct xfs_buf *bp, /* allocation group header buffer */ - int fields); /* bitmask of fields to log */ + uint32_t fields); /* bitmask of fields to log */ /* * Read in the allocation group header (inode allocation section) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index cae9708c8587..3b1b63f9d886 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -279,6 +279,25 @@ xfs_inode_to_disk_ts( return ts; } +static inline void +xfs_inode_to_disk_iext_counters( + struct xfs_inode *ip, + struct xfs_dinode *to) +{ + if (xfs_inode_has_large_extent_counts(ip)) { + to->di_big_nextents = cpu_to_be64(xfs_ifork_nextents(&ip->i_df)); + to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(ip->i_afp)); + /* + * We might be upgrading the inode to use larger extent counters + * than was previously used. Hence zero the unused field. 
+ */ + to->di_nrext64_pad = cpu_to_be16(0); + } else { + to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df)); + to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp)); + } +} + void xfs_inode_to_disk( struct xfs_inode *ip, @@ -296,7 +315,6 @@ xfs_inode_to_disk( to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff); to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16); - memset(to->di_pad, 0, sizeof(to->di_pad)); to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime); to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime); to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime); @@ -307,8 +325,6 @@ xfs_inode_to_disk( to->di_size = cpu_to_be64(ip->i_disk_size); to->di_nblocks = cpu_to_be64(ip->i_nblocks); to->di_extsize = cpu_to_be32(ip->i_extsize); - to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df)); - to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp)); to->di_forkoff = ip->i_forkoff; to->di_aformat = xfs_ifork_format(ip->i_afp); to->di_flags = cpu_to_be16(ip->i_diflags); @@ -323,11 +339,14 @@ xfs_inode_to_disk( to->di_lsn = cpu_to_be64(lsn); memset(to->di_pad2, 0, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); - to->di_flushiter = 0; + to->di_v3_pad = 0; } else { to->di_version = 2; to->di_flushiter = cpu_to_be16(ip->i_flushiter); + memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); } + + xfs_inode_to_disk_iext_counters(ip, to); } static xfs_failaddr_t @@ -336,20 +355,40 @@ xfs_dinode_verify_fork( struct xfs_mount *mp, int whichfork) { - uint32_t di_nextents = XFS_DFORK_NEXTENTS(dip, whichfork); + xfs_extnum_t di_nextents; + xfs_extnum_t max_extents; + mode_t mode = be16_to_cpu(dip->di_mode); + uint32_t fork_size = XFS_DFORK_SIZE(dip, mp, whichfork); + uint32_t fork_format = XFS_DFORK_FORMAT(dip, whichfork); + + di_nextents = xfs_dfork_nextents(dip, whichfork); + + /* + * For fork types that can contain local data, check that the fork + * format matches the size of local data contained within the fork. + * + * For all types, check that when the size says the should be in extent + * or btree format, the inode isn't claiming it is in local format. + */ + if (whichfork == XFS_DATA_FORK) { + if (S_ISDIR(mode) || S_ISLNK(mode)) { + if (be64_to_cpu(dip->di_size) <= fork_size && + fork_format != XFS_DINODE_FMT_LOCAL) + return __this_address; + } - switch (XFS_DFORK_FORMAT(dip, whichfork)) { + if (be64_to_cpu(dip->di_size) > fork_size && + fork_format == XFS_DINODE_FMT_LOCAL) + return __this_address; + } + + switch (fork_format) { case XFS_DINODE_FMT_LOCAL: /* - * no local regular files yet + * No local regular files yet. 
*/ - if (whichfork == XFS_DATA_FORK) { - if (S_ISREG(be16_to_cpu(dip->di_mode))) - return __this_address; - if (be64_to_cpu(dip->di_size) > - XFS_DFORK_SIZE(dip, mp, whichfork)) - return __this_address; - } + if (S_ISREG(mode) && whichfork == XFS_DATA_FORK) + return __this_address; if (di_nextents) return __this_address; break; @@ -358,12 +397,11 @@ xfs_dinode_verify_fork( return __this_address; break; case XFS_DINODE_FMT_BTREE: - if (whichfork == XFS_ATTR_FORK) { - if (di_nextents > MAXAEXTNUM) - return __this_address; - } else if (di_nextents > MAXEXTNUM) { + max_extents = xfs_iext_max_nextents( + xfs_dinode_has_large_extent_counts(dip), + whichfork); + if (di_nextents > max_extents) return __this_address; - } break; default: return __this_address; @@ -396,6 +434,24 @@ xfs_dinode_verify_forkoff( return NULL; } +static xfs_failaddr_t +xfs_dinode_verify_nrext64( + struct xfs_mount *mp, + struct xfs_dinode *dip) +{ + if (xfs_dinode_has_large_extent_counts(dip)) { + if (!xfs_has_large_extent_counts(mp)) + return __this_address; + if (dip->di_nrext64_pad != 0) + return __this_address; + } else if (dip->di_version >= 3) { + if (dip->di_v3_pad != 0) + return __this_address; + } + + return NULL; +} + xfs_failaddr_t xfs_dinode_verify( struct xfs_mount *mp, @@ -407,6 +463,9 @@ xfs_dinode_verify( uint16_t flags; uint64_t flags2; uint64_t di_size; + xfs_extnum_t nextents; + xfs_extnum_t naextents; + xfs_filblks_t nblocks; if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) return __this_address; @@ -437,10 +496,19 @@ xfs_dinode_verify( if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) return __this_address; + fa = xfs_dinode_verify_nrext64(mp, dip); + if (fa) + return fa; + + nextents = xfs_dfork_data_extents(dip); + naextents = xfs_dfork_attr_extents(dip); + nblocks = be64_to_cpu(dip->di_nblocks); + /* Fork checks carried over from xfs_iformat_fork */ - if (mode && - be32_to_cpu(dip->di_nextents) + be16_to_cpu(dip->di_anextents) > - be64_to_cpu(dip->di_nblocks)) + if (mode && nextents + naextents > nblocks) + return __this_address; + + if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents) return __this_address; if (mode && XFS_DFORK_BOFF(dip) > mp->m_sb.sb_inodesize) @@ -497,7 +565,7 @@ xfs_dinode_verify( default: return __this_address; } - if (dip->di_anextents) + if (naextents) return __this_address; } @@ -639,7 +707,7 @@ xfs_inode_validate_extsize( if (extsize_bytes % blocksize_bytes) return __this_address; - if (extsize > MAXEXTLEN) + if (extsize > XFS_MAX_BMBT_EXTLEN) return __this_address; if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2) @@ -696,7 +764,7 @@ xfs_inode_validate_cowextsize( if (cowextsize_bytes % mp->m_sb.sb_blocksize) return __this_address; - if (cowextsize > MAXEXTLEN) + if (cowextsize > XFS_MAX_BMBT_EXTLEN) return __this_address; if (cowextsize > mp->m_sb.sb_agblocks / 2) diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 9149f4f796fc..1a4cdf550f6d 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -36,7 +36,7 @@ xfs_init_local_fork( int64_t size) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - int mem_size = size, real_size = 0; + int mem_size = size; bool zero_terminate; /* @@ -50,8 +50,7 @@ xfs_init_local_fork( mem_size++; if (size) { - real_size = roundup(mem_size, 4); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS); + ifp->if_u1.if_data = kmem_alloc(mem_size, KM_NOFS); memcpy(ifp->if_u1.if_data, data, size); if (zero_terminate) ifp->if_u1.if_data[size] = '\0'; @@ -105,7 +104,7 
@@ xfs_iformat_extents( struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); int state = xfs_bmap_fork_to_state(whichfork); - int nex = XFS_DFORK_NEXTENTS(dip, whichfork); + xfs_extnum_t nex = xfs_dfork_nextents(dip, whichfork); int size = nex * sizeof(xfs_bmbt_rec_t); struct xfs_iext_cursor icur; struct xfs_bmbt_rec *dp; @@ -117,8 +116,8 @@ xfs_iformat_extents( * we just bail out rather than crash in kmem_alloc() or memcpy() below. */ if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) { - xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).", - (unsigned long long) ip->i_ino, nex); + xfs_warn(ip->i_mount, "corrupt inode %llu ((a)extents = %llu).", + ip->i_ino, nex); xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_extents(1)", dip, sizeof(*dip), __this_address); @@ -230,7 +229,7 @@ xfs_iformat_data_fork( * depend on it. */ ip->i_df.if_format = dip->di_format; - ip->i_df.if_nextents = be32_to_cpu(dip->di_nextents); + ip->i_df.if_nextents = xfs_dfork_data_extents(dip); switch (inode->i_mode & S_IFMT) { case S_IFIFO: @@ -295,14 +294,14 @@ xfs_iformat_attr_fork( struct xfs_inode *ip, struct xfs_dinode *dip) { + xfs_extnum_t naextents = xfs_dfork_attr_extents(dip); int error = 0; /* * Initialize the extent count early, as the per-format routines may * depend on it. */ - ip->i_afp = xfs_ifork_alloc(dip->di_aformat, - be16_to_cpu(dip->di_anextents)); + ip->i_afp = xfs_ifork_alloc(dip->di_aformat, naextents); switch (ip->i_afp->if_format) { case XFS_DINODE_FMT_LOCAL: @@ -497,12 +496,7 @@ xfs_idata_realloc( return; } - /* - * For inline data, the underlying buffer must be a multiple of 4 bytes - * in size so that it can be logged and stay on word boundaries. - * We enforce that here. - */ - ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, roundup(new_size, 4), + ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, new_size, GFP_NOFS | __GFP_NOFAIL); ifp->if_bytes = new_size; } @@ -744,7 +738,8 @@ xfs_iext_count_may_overflow( if (whichfork == XFS_COW_FORK) return 0; - max_exts = (whichfork == XFS_ATTR_FORK) ? MAXAEXTNUM : MAXEXTNUM; + max_exts = xfs_iext_max_nextents(xfs_inode_has_large_extent_counts(ip), + whichfork); if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS)) max_exts = 10; @@ -755,3 +750,27 @@ xfs_iext_count_may_overflow( return 0; } + +/* + * Upgrade this inode's extent counter fields to be able to handle a potential + * increase in the extent count by nr_to_add. Normally this is the same + * quantity that caused xfs_iext_count_may_overflow() to return -EFBIG. 
+ */ +int +xfs_iext_count_upgrade( + struct xfs_trans *tp, + struct xfs_inode *ip, + uint nr_to_add) +{ + ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR); + + if (!xfs_has_large_extent_counts(ip->i_mount) || + xfs_inode_has_large_extent_counts(ip) || + XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS)) + return -EFBIG; + + ip->i_diflags2 |= XFS_DIFLAG2_NREXT64; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + return 0; +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 3d64a3acb0ed..4f68c1f20beb 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -21,9 +21,9 @@ struct xfs_ifork { void *if_root; /* extent tree root */ char *if_data; /* inline file data */ } if_u1; + xfs_extnum_t if_nextents; /* # of extents in this fork */ short if_broot_bytes; /* bytes allocated for root */ int8_t if_format; /* format of this fork */ - xfs_extnum_t if_nextents; /* # of extents in this fork */ }; /* @@ -40,19 +40,6 @@ struct xfs_ifork { #define XFS_IEXT_PUNCH_HOLE_CNT (1) /* - * Directory entry addition can cause the following, - * 1. Data block can be added/removed. - * A new extent can cause extent count to increase by 1. - * 2. Free disk block can be added/removed. - * Same behaviour as described above for Data block. - * 3. Dabtree blocks. - * XFS_DA_NODE_MAXDEPTH blocks can be added. Each of these can be new - * extents. Hence extent count can increase by XFS_DA_NODE_MAXDEPTH. - */ -#define XFS_IEXT_DIR_MANIP_CNT(mp) \ - ((XFS_DA_NODE_MAXDEPTH + 1 + 1) * (mp)->m_dir_geo->fsbcount) - -/* * Adding/removing an xattr can cause XFS_DA_NODE_MAXDEPTH extents to * be added. One extra extent for dabtree in case a local attr is * large enough to cause a double split. It can also cause extent @@ -133,6 +120,65 @@ static inline int8_t xfs_ifork_format(struct xfs_ifork *ifp) return ifp->if_format; } +static inline xfs_extnum_t xfs_iext_max_nextents(bool has_large_extent_counts, + int whichfork) +{ + switch (whichfork) { + case XFS_DATA_FORK: + case XFS_COW_FORK: + if (has_large_extent_counts) + return XFS_MAX_EXTCNT_DATA_FORK_LARGE; + return XFS_MAX_EXTCNT_DATA_FORK_SMALL; + + case XFS_ATTR_FORK: + if (has_large_extent_counts) + return XFS_MAX_EXTCNT_ATTR_FORK_LARGE; + return XFS_MAX_EXTCNT_ATTR_FORK_SMALL; + + default: + ASSERT(0); + return 0; + } +} + +static inline xfs_extnum_t +xfs_dfork_data_extents( + struct xfs_dinode *dip) +{ + if (xfs_dinode_has_large_extent_counts(dip)) + return be64_to_cpu(dip->di_big_nextents); + + return be32_to_cpu(dip->di_nextents); +} + +static inline xfs_extnum_t +xfs_dfork_attr_extents( + struct xfs_dinode *dip) +{ + if (xfs_dinode_has_large_extent_counts(dip)) + return be32_to_cpu(dip->di_big_anextents); + + return be16_to_cpu(dip->di_anextents); +} + +static inline xfs_extnum_t +xfs_dfork_nextents( + struct xfs_dinode *dip, + int whichfork) +{ + switch (whichfork) { + case XFS_DATA_FORK: + return xfs_dfork_data_extents(dip); + case XFS_ATTR_FORK: + return xfs_dfork_attr_extents(dip); + default: + ASSERT(0); + break; + } + + return 0; +} + struct xfs_ifork *xfs_ifork_alloc(enum xfs_dinode_fmt format, xfs_extnum_t nextents); struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); @@ -229,6 +275,8 @@ int xfs_ifork_verify_local_data(struct xfs_inode *ip); int xfs_ifork_verify_local_attr(struct xfs_inode *ip); int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork, int nr_to_add); +int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip, + uint nr_to_add); 
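/*
 * Editorial sketch, not a hunk from this patch: the comment on
 * xfs_iext_count_upgrade() above implies the expected caller pattern.
 * The pre-flight overflow check runs first; only when it reports
 * -EFBIG does the caller attempt the one-way upgrade to large extent
 * counters, passing the same nr_to_add. The surrounding function is
 * hypothetical; the two helpers and XFS_IEXT_PUNCH_HOLE_CNT are the
 * ones declared in this header.
 */
static int
xfs_example_punch_hole(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip)
{
	int			error;

	error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
			XFS_IEXT_PUNCH_HOLE_CNT);
	if (error == -EFBIG)
		error = xfs_iext_count_upgrade(tp, ip,
				XFS_IEXT_PUNCH_HOLE_CNT);
	return error;
}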
/* returns true if the fork has extents but they are not read in yet. */ static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index b322db523d65..b351b9dc6561 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -69,7 +69,6 @@ static inline uint xlog_get_cycle(char *ptr) /* Log Clients */ #define XFS_TRANSACTION 0x69 -#define XFS_VOLUME 0x2 #define XFS_LOG 0xaa #define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */ @@ -114,7 +113,12 @@ struct xfs_unmount_log_format { #define XLOG_REG_TYPE_CUD_FORMAT 24 #define XLOG_REG_TYPE_BUI_FORMAT 25 #define XLOG_REG_TYPE_BUD_FORMAT 26 -#define XLOG_REG_TYPE_MAX 26 +#define XLOG_REG_TYPE_ATTRI_FORMAT 27 +#define XLOG_REG_TYPE_ATTRD_FORMAT 28 +#define XLOG_REG_TYPE_ATTR_NAME 29 +#define XLOG_REG_TYPE_ATTR_VALUE 30 +#define XLOG_REG_TYPE_MAX 30 + /* * Flags to log operation header @@ -237,6 +241,8 @@ typedef struct xfs_trans_header { #define XFS_LI_CUD 0x1243 #define XFS_LI_BUI 0x1244 /* bmbt update intent */ #define XFS_LI_BUD 0x1245 +#define XFS_LI_ATTRI 0x1246 /* attr set/remove intent*/ +#define XFS_LI_ATTRD 0x1247 /* attr set/remove done */ #define XFS_LI_TYPE_DESC \ { XFS_LI_EFI, "XFS_LI_EFI" }, \ @@ -252,7 +258,9 @@ typedef struct xfs_trans_header { { XFS_LI_CUI, "XFS_LI_CUI" }, \ { XFS_LI_CUD, "XFS_LI_CUD" }, \ { XFS_LI_BUI, "XFS_LI_BUI" }, \ - { XFS_LI_BUD, "XFS_LI_BUD" } + { XFS_LI_BUD, "XFS_LI_BUD" }, \ + { XFS_LI_ATTRI, "XFS_LI_ATTRI" }, \ + { XFS_LI_ATTRD, "XFS_LI_ATTRD" } /* * Inode Log Item Format definitions. @@ -388,16 +396,41 @@ struct xfs_log_dinode { uint32_t di_nlink; /* number of links to file */ uint16_t di_projid_lo; /* lower part of owner's project id */ uint16_t di_projid_hi; /* higher part of owner's project id */ - uint8_t di_pad[6]; /* unused, zeroed space */ - uint16_t di_flushiter; /* incremented on flush */ + union { + /* Number of data fork extents if NREXT64 is set */ + uint64_t di_big_nextents; + + /* Padding for V3 inodes without NREXT64 set. */ + uint64_t di_v3_pad; + + /* Padding and inode flush counter for V2 inodes. */ + struct { + uint8_t di_v2_pad[6]; /* V2 inode zeroed space */ + uint16_t di_flushiter; /* V2 inode incremented on flush */ + }; + }; xfs_log_timestamp_t di_atime; /* time last accessed */ xfs_log_timestamp_t di_mtime; /* time last modified */ xfs_log_timestamp_t di_ctime; /* time created/inode modified */ xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ - xfs_extnum_t di_nextents; /* number of extents in data fork */ - xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ + union { + /* + * For V2 inodes and V3 inodes without NREXT64 set, this + * is the number of data and attr fork extents. + */ + struct { + uint32_t di_nextents; + uint16_t di_anextents; + } __packed; + + /* Number of attr fork extents if NREXT64 is set. */ + struct { + uint32_t di_big_anextents; + uint16_t di_nrext64_pad; + } __packed; + } __packed; uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ int8_t di_aformat; /* format of attr fork's data */ uint32_t di_dmevmask; /* DMIG event mask */ @@ -869,4 +902,44 @@ struct xfs_icreate_log { __be32 icl_gen; /* inode generation number to use */ }; +/* + * Flags for deferred attribute operations. 
+ * Upper bits are flags, lower byte is type code + */ +#define XFS_ATTRI_OP_FLAGS_SET 1 /* Set the attribute */ +#define XFS_ATTRI_OP_FLAGS_REMOVE 2 /* Remove the attribute */ +#define XFS_ATTRI_OP_FLAGS_REPLACE 3 /* Replace the attribute */ +#define XFS_ATTRI_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */ + +/* + * alfi_attr_filter captures the state of xfs_da_args.attr_filter, so it should + * never have any other bits set. + */ +#define XFS_ATTRI_FILTER_MASK (XFS_ATTR_ROOT | \ + XFS_ATTR_SECURE | \ + XFS_ATTR_INCOMPLETE) + +/* + * This is the structure used to lay out an attr log item in the + * log. + */ +struct xfs_attri_log_format { + uint16_t alfi_type; /* attri log item type */ + uint16_t alfi_size; /* size of this item */ + uint32_t __pad; /* pad to 64 bit aligned */ + uint64_t alfi_id; /* attri identifier */ + uint64_t alfi_ino; /* the inode for this attr operation */ + uint32_t alfi_op_flags; /* marks the op as a set or remove */ + uint32_t alfi_name_len; /* attr name length */ + uint32_t alfi_value_len; /* attr value length */ + uint32_t alfi_attr_filter;/* attr filter flags */ +}; + +struct xfs_attrd_log_format { + uint16_t alfd_type; /* attrd log item type */ + uint16_t alfd_size; /* size of this item */ + uint32_t __pad; /* pad to 64 bit aligned */ + uint64_t alfd_alf_id; /* id of corresponding attri */ +}; + #endif /* __XFS_LOG_FORMAT_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index ff69a0000817..2420865f3007 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -72,6 +72,8 @@ extern const struct xlog_recover_item_ops xlog_rui_item_ops; extern const struct xlog_recover_item_ops xlog_rud_item_ops; extern const struct xlog_recover_item_ops xlog_cui_item_ops; extern const struct xlog_recover_item_ops xlog_cud_item_ops; +extern const struct xlog_recover_item_ops xlog_attri_item_ops; +extern const struct xlog_recover_item_ops xlog_attrd_item_ops; /* * Macros, structures, prototypes for internal log manager use. @@ -108,12 +110,6 @@ struct xlog_recover { #define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr) -/* - * This is the number of entries in the l_buf_cancel_table used during - * recovery. - */ -#define XLOG_BC_TABLE_SIZE 64 - #define XLOG_RECOVER_CRCPASS 0 #define XLOG_RECOVER_PASS1 1 #define XLOG_RECOVER_PASS2 2 @@ -126,5 +122,13 @@ int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_inode **ipp); void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type, uint64_t intent_id); +int xlog_alloc_buf_cancel_table(struct xlog *log); +void xlog_free_buf_cancel_table(struct xlog *log); + +#ifdef DEBUG +void xlog_check_buf_cancel_table(struct xlog *log); +#else +#define xlog_check_buf_cancel_table(log) do { } while (0) +#endif #endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index 67798ff5e14e..9975b93a7412 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -14,6 +14,7 @@ #include "xfs_trans_space.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" +#include "xfs_trace.h" /* * Calculate the maximum length in bytes that would be required for a local @@ -37,6 +38,65 @@ xfs_log_calc_max_attrsetm_res( } /* + * Compute an alternate set of log reservation sizes for use exclusively with + * minimum log size calculations. 
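+ *
+ * These tweaked reservations are meant to be fed only to the maximum-
+ * reservation scan in xfs_log_get_max_trans_res() below; runtime
+ * reservations still come from xfs_trans_resv_calc().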
+ */ +static void +xfs_log_calc_trans_resv_for_minlogblocks( + struct xfs_mount *mp, + struct xfs_trans_resv *resv) +{ + unsigned int rmap_maxlevels = mp->m_rmap_maxlevels; + + /* + * In the early days of rmap+reflink, we always set the rmap maxlevels + * to 9 even if the AG was small enough that it would never grow to + * that height. Transaction reservation sizes influence the minimum + * log size calculation, which influences the size of the log that mkfs + * creates. Use the old value here to ensure that newly formatted + * small filesystems will mount on older kernels. + */ + if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp)) + mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS; + + xfs_trans_resv_calc(mp, resv); + + if (xfs_has_reflink(mp)) { + /* + * In the early days of reflink, typical log operation counts + * were greatly overestimated. + */ + resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; + resv->tr_itruncate.tr_logcount = + XFS_ITRUNCATE_LOG_COUNT_REFLINK; + resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; + } else if (xfs_has_rmapbt(mp)) { + /* + * In the early days of non-reflink rmap, the impact of rmapbt + * updates on log counts were not taken into account at all. + */ + resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + resv->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; + } + + /* + * In the early days of reflink, we did not use deferred refcount + * update log items, so log reservations must be recomputed using the + * old calculations. + */ + resv->tr_write.tr_logres = + xfs_calc_write_reservation_minlogsize(mp); + resv->tr_itruncate.tr_logres = + xfs_calc_itruncate_reservation_minlogsize(mp); + resv->tr_qm_dqalloc.tr_logres = + xfs_calc_qm_dqalloc_reservation_minlogsize(mp); + + /* Put everything back the way it was. This goes at the end. */ + mp->m_rmap_maxlevels = rmap_maxlevels; +} + +/* * Iterate over the log space reservation table to figure out and return * the maximum one in terms of the pre-calculated values which were done * at mount time. @@ -46,19 +106,25 @@ xfs_log_get_max_trans_res( struct xfs_mount *mp, struct xfs_trans_res *max_resp) { + struct xfs_trans_resv resv = {}; struct xfs_trans_res *resp; struct xfs_trans_res *end_resp; + unsigned int i; int log_space = 0; int attr_space; attr_space = xfs_log_calc_max_attrsetm_res(mp); - resp = (struct xfs_trans_res *)M_RES(mp); - end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1); - for (; resp < end_resp; resp++) { + xfs_log_calc_trans_resv_for_minlogblocks(mp, &resv); + + resp = (struct xfs_trans_res *)&resv; + end_resp = (struct xfs_trans_res *)(&resv + 1); + for (i = 0; resp < end_resp; i++, resp++) { int tmp = resp->tr_logcount > 1 ? resp->tr_logres * resp->tr_logcount : resp->tr_logres; + + trace_xfs_trans_resv_calc_minlogsize(mp, i, resp); if (log_space < tmp) { log_space = tmp; *max_resp = *resp; /* struct copy */ @@ -66,9 +132,10 @@ xfs_log_get_max_trans_res( } if (attr_space > log_space) { - *max_resp = M_RES(mp)->tr_attrsetm; /* struct copy */ + *max_resp = resv.tr_attrsetm; /* struct copy */ max_resp->tr_logres = attr_space; } + trace_xfs_log_get_max_trans_res(mp, max_resp); } /* diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index a02c5062f9b2..cb035da3f990 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -16,7 +16,6 @@ * and quota-limits. This is a waste in the common case, but hey ... 
*/ typedef uint64_t xfs_qcnt_t; -typedef uint16_t xfs_qwarncnt_t; typedef uint8_t xfs_dqtype_t; @@ -29,8 +28,8 @@ typedef uint8_t xfs_dqtype_t; /* * flags for q_flags field in the dquot. */ -#define XFS_DQFLAG_DIRTY (1 << 0) /* dquot is dirty */ -#define XFS_DQFLAG_FREEING (1 << 1) /* dquot is being torn down */ +#define XFS_DQFLAG_DIRTY (1u << 0) /* dquot is dirty */ +#define XFS_DQFLAG_FREEING (1u << 1) /* dquot is being torn down */ #define XFS_DQFLAG_STRINGS \ { XFS_DQFLAG_DIRTY, "DIRTY" }, \ @@ -73,29 +72,45 @@ typedef uint8_t xfs_dqtype_t; * to a single function. None of these XFS_QMOPT_* flags are meant to have * persistent values (ie. their values can and will change between versions) */ -#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ -#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ -#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ -#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ -#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ +#define XFS_QMOPT_UQUOTA (1u << 0) /* user dquot requested */ +#define XFS_QMOPT_GQUOTA (1u << 1) /* group dquot requested */ +#define XFS_QMOPT_PQUOTA (1u << 2) /* project dquot requested */ +#define XFS_QMOPT_FORCE_RES (1u << 3) /* ignore quota limits */ +#define XFS_QMOPT_SBVERSION (1u << 4) /* change superblock version num */ /* * flags to xfs_trans_mod_dquot to indicate which field needs to be * modified. */ -#define XFS_QMOPT_RES_REGBLKS 0x0010000 -#define XFS_QMOPT_RES_RTBLKS 0x0020000 -#define XFS_QMOPT_BCOUNT 0x0040000 -#define XFS_QMOPT_ICOUNT 0x0080000 -#define XFS_QMOPT_RTBCOUNT 0x0100000 -#define XFS_QMOPT_DELBCOUNT 0x0200000 -#define XFS_QMOPT_DELRTBCOUNT 0x0400000 -#define XFS_QMOPT_RES_INOS 0x0800000 +#define XFS_QMOPT_RES_REGBLKS (1u << 7) +#define XFS_QMOPT_RES_RTBLKS (1u << 8) +#define XFS_QMOPT_BCOUNT (1u << 9) +#define XFS_QMOPT_ICOUNT (1u << 10) +#define XFS_QMOPT_RTBCOUNT (1u << 11) +#define XFS_QMOPT_DELBCOUNT (1u << 12) +#define XFS_QMOPT_DELRTBCOUNT (1u << 13) +#define XFS_QMOPT_RES_INOS (1u << 14) /* * flags for dqalloc. */ -#define XFS_QMOPT_INHERIT 0x1000000 +#define XFS_QMOPT_INHERIT (1u << 31) + +#define XFS_QMOPT_FLAGS \ + { XFS_QMOPT_UQUOTA, "UQUOTA" }, \ + { XFS_QMOPT_PQUOTA, "PQUOTA" }, \ + { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \ + { XFS_QMOPT_SBVERSION, "SBVERSION" }, \ + { XFS_QMOPT_GQUOTA, "GQUOTA" }, \ + { XFS_QMOPT_INHERIT, "INHERIT" }, \ + { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \ + { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \ + { XFS_QMOPT_BCOUNT, "BCOUNT" }, \ + { XFS_QMOPT_ICOUNT, "ICOUNT" }, \ + { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \ + { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \ + { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \ + { XFS_QMOPT_RES_INOS, "RES_INOS" } /* * flags to xfs_trans_mod_dquot. 
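The new XFS_QMOPT_FLAGS table follows the usual tracepoint convention, so a trace event can render the bits symbolically; a sketch of how a hypothetical event with a flags field might use it:

	TP_printk("flags %s", __print_flags(__entry->flags, "|", XFS_QMOPT_FLAGS))

Rewriting the definitions in (1u << n) form also keeps the shifts unsigned: 1 << 31 on a 32-bit int is undefined behaviour, while 1u << 31 is well defined, which matters for XFS_QMOPT_INHERIT.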
@@ -114,6 +129,7 @@ typedef uint8_t xfs_dqtype_t; (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA) #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) + extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp, struct xfs_disk_dquot *ddq, xfs_dqid_t id); extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 327ba25e9e17..97e9e6020596 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -886,8 +886,13 @@ xfs_refcount_still_have_space( { unsigned long overhead; - overhead = cur->bc_ag.refc.shape_changes * - xfs_allocfree_log_count(cur->bc_mp, 1); + /* + * Worst case estimate: full splits of the free space and rmap btrees + * to handle each of the shape changes to the refcount btree. + */ + overhead = xfs_allocfree_block_count(cur->bc_mp, + cur->bc_ag.refc.shape_changes); + overhead += cur->bc_mp->m_refc_maxlevels; overhead *= cur->bc_mp->m_sb.sb_blocksize; /* @@ -960,6 +965,7 @@ xfs_refcount_adjust_extents( * Either cover the hole (increment) or * delete the range (decrement). */ + cur->bc_ag.refc.nr_ops++; if (tmp.rc_refcount) { error = xfs_refcount_insert(cur, &tmp, &found_tmp); @@ -970,7 +976,6 @@ xfs_refcount_adjust_extents( error = -EFSCORRUPTED; goto out_error; } - cur->bc_ag.refc.nr_ops++; } else { fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, @@ -1001,11 +1006,11 @@ xfs_refcount_adjust_extents( ext.rc_refcount += adj; trace_xfs_refcount_modify_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, &ext); + cur->bc_ag.refc.nr_ops++; if (ext.rc_refcount > 1) { error = xfs_refcount_update(cur, &ext); if (error) goto out_error; - cur->bc_ag.refc.nr_ops++; } else if (ext.rc_refcount == 1) { error = xfs_refcount_delete(cur, &found_rec); if (error) @@ -1014,7 +1019,6 @@ xfs_refcount_adjust_extents( error = -EFSCORRUPTED; goto out_error; } - cur->bc_ag.refc.nr_ops++; goto advloop; } else { fsbno = XFS_AGB_TO_FSB(cur->bc_mp, diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 9eb01edbd89d..e8b322de7f3d 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -67,14 +67,17 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, * log (plus any key updates) so we'll conservatively assume 32 bytes * per record. We must also leave space for btree splits on both ends * of the range and space for the CUD and a new CUI. + * + * Each EFI that we attach to the transaction is assumed to consume ~32 bytes. + * This is a low estimate for an EFI tracking a single extent (16 bytes for the + * EFI header, 16 for the extent, and 12 for the xlog op header), but the + * estimate is acceptable if there's more than one extent being freed. + * In the worst case of freeing every other block during a refcount decrease + * operation, we amortize the space used for one EFI log item across 16 + * extents. 
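+ *
+ * Working that through with the numbers above: one EFI covering 16
+ * extents costs roughly 16 (EFI header) + 12 (op header) + 16 * 16
+ * (extents) = 284 bytes, just under 18 bytes per extent, so the 32-byte
+ * per-record estimate stays conservative even in that worst case.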
*/ #define XFS_REFCOUNT_ITEM_OVERHEAD 32 -static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res) -{ - return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD; -} - extern int xfs_refcount_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, bool *exists); union xfs_btree_rec; diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index cd322174dbff..2845019d31da 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -34,18 +34,32 @@ int xfs_rmap_lookup_le( struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, uint64_t owner, uint64_t offset, unsigned int flags, + struct xfs_rmap_irec *irec, int *stat) { + int get_stat = 0; + int error; + cur->bc_rec.r.rm_startblock = bno; - cur->bc_rec.r.rm_blockcount = len; + cur->bc_rec.r.rm_blockcount = 0; cur->bc_rec.r.rm_owner = owner; cur->bc_rec.r.rm_offset = offset; cur->bc_rec.r.rm_flags = flags; - return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); + + error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); + if (error || !(*stat) || !irec) + return error; + + error = xfs_rmap_get_rec(cur, irec, &get_stat); + if (error) + return error; + if (!get_stat) + return -EFSCORRUPTED; + + return 0; } /* @@ -251,7 +265,6 @@ out_bad_rec: struct xfs_find_left_neighbor_info { struct xfs_rmap_irec high; struct xfs_rmap_irec *irec; - int *stat; }; /* For each rmap given, figure out if it matches the key we want. */ @@ -276,7 +289,6 @@ xfs_rmap_find_left_neighbor_helper( return 0; *info->irec = *rec; - *info->stat = 1; return -ECANCELED; } @@ -285,7 +297,7 @@ xfs_rmap_find_left_neighbor_helper( * return a match with the same owner and adjacent physical and logical * block ranges. */ -int +STATIC int xfs_rmap_find_left_neighbor( struct xfs_btree_cur *cur, xfs_agblock_t bno, @@ -296,6 +308,7 @@ xfs_rmap_find_left_neighbor( int *stat) { struct xfs_find_left_neighbor_info info; + int found = 0; int error; *stat = 0; @@ -313,21 +326,44 @@ xfs_rmap_find_left_neighbor( info.high.rm_flags = flags; info.high.rm_blockcount = 0; info.irec = irec; - info.stat = stat; trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags); - error = xfs_rmap_query_range(cur, &info.high, &info.high, - xfs_rmap_find_left_neighbor_helper, &info); - if (error == -ECANCELED) - error = 0; - if (*stat) - trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, irec->rm_startblock, - irec->rm_blockcount, irec->rm_owner, - irec->rm_offset, irec->rm_flags); - return error; + /* + * Historically, we always used the range query to walk every reverse + * mapping that could possibly overlap the key that the caller asked + * for, and filter out the ones that don't. That is very slow when + * there are a lot of records. + * + * However, there are two scenarios where the classic btree search can + * produce correct results -- if the index contains a record that is an + * exact match for the lookup key; and if there are no other records + * between the record we want and the key we supplied. + * + * As an optimization, try a non-overlapped lookup first. This makes + * extent conversion and remap operations run a bit faster if the + * physical extents aren't being shared. If we don't find what we + * want, we fall back to the overlapped query. 
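+ *
+ * The exact-match probe costs a single btree descent; only if it misses,
+ * or finds a record that is not actually the left neighbor, do we pay
+ * for the full overlapped range query.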
+ */ + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec, + &found); + if (error) + return error; + if (found) + error = xfs_rmap_find_left_neighbor_helper(cur, irec, &info); + if (!error) + error = xfs_rmap_query_range(cur, &info.high, &info.high, + xfs_rmap_find_left_neighbor_helper, &info); + if (error != -ECANCELED) + return error; + + *stat = 1; + trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, + cur->bc_ag.pag->pag_agno, irec->rm_startblock, + irec->rm_blockcount, irec->rm_owner, irec->rm_offset, + irec->rm_flags); + return 0; } /* For each rmap given, figure out if it matches the key we want. */ @@ -353,7 +389,6 @@ xfs_rmap_lookup_le_range_helper( return 0; *info->irec = *rec; - *info->stat = 1; return -ECANCELED; } @@ -374,6 +409,7 @@ xfs_rmap_lookup_le_range( int *stat) { struct xfs_find_left_neighbor_info info; + int found = 0; int error; info.high.rm_startblock = bno; @@ -386,20 +422,44 @@ xfs_rmap_lookup_le_range( info.high.rm_blockcount = 0; *stat = 0; info.irec = irec; - info.stat = stat; - trace_xfs_rmap_lookup_le_range(cur->bc_mp, - cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags); - error = xfs_rmap_query_range(cur, &info.high, &info.high, - xfs_rmap_lookup_le_range_helper, &info); - if (error == -ECANCELED) - error = 0; - if (*stat) - trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, irec->rm_startblock, - irec->rm_blockcount, irec->rm_owner, - irec->rm_offset, irec->rm_flags); - return error; + trace_xfs_rmap_lookup_le_range(cur->bc_mp, cur->bc_ag.pag->pag_agno, + bno, 0, owner, offset, flags); + + /* + * Historically, we always used the range query to walk every reverse + * mapping that could possibly overlap the key that the caller asked + * for, and filter out the ones that don't. That is very slow when + * there are a lot of records. + * + * However, there are two scenarios where the classic btree search can + * produce correct results -- if the index contains a record that is an + * exact match for the lookup key; and if there are no other records + * between the record we want and the key we supplied. + * + * As an optimization, try a non-overlapped lookup first. This makes + * scrub run much faster on most filesystems because bmbt records are + * usually an exact match for rmap records. If we don't find what we + * want, we fall back to the overlapped query. + */ + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec, + &found); + if (error) + return error; + if (found) + error = xfs_rmap_lookup_le_range_helper(cur, irec, &info); + if (!error) + error = xfs_rmap_query_range(cur, &info.high, &info.high, + xfs_rmap_lookup_le_range_helper, &info); + if (error != -ECANCELED) + return error; + + *stat = 1; + trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, + cur->bc_ag.pag->pag_agno, irec->rm_startblock, + irec->rm_blockcount, irec->rm_owner, irec->rm_offset, + irec->rm_flags); + return 0; } /* @@ -510,7 +570,7 @@ xfs_rmap_unmap( * for the AG headers at rm_startblock == 0 created by mkfs/growfs that * will not ever be removed from the tree. 
*/ - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, &i); + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, <rec, &i); if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { @@ -518,13 +578,6 @@ xfs_rmap_unmap( goto out_error; } - error = xfs_rmap_get_rec(cur, <rec, &i); - if (error) - goto out_error; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto out_error; - } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, @@ -786,18 +839,11 @@ xfs_rmap_map( * record for our insertion point. This will also give us the record for * start block contiguity tests. */ - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, <rec, &have_lt); if (error) goto out_error; if (have_lt) { - error = xfs_rmap_get_rec(cur, <rec, &have_lt); - if (error) - goto out_error; - if (XFS_IS_CORRUPT(mp, have_lt != 1)) { - error = -EFSCORRUPTED; - goto out_error; - } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, @@ -1022,7 +1068,7 @@ xfs_rmap_convert( * record for our insertion point. This will also give us the record for * start block contiguity tests. */ - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i); + error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, &PREV, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { @@ -1030,13 +1076,6 @@ xfs_rmap_convert( goto done; } - error = xfs_rmap_get_rec(cur, &PREV, &i); - if (error) - goto done; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; - } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, cur->bc_ag.pag->pag_agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, @@ -1140,7 +1179,7 @@ xfs_rmap_convert( _RET_IP_); /* reset the cursor back to PREV */ - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i); + error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, NULL, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { @@ -2677,7 +2716,7 @@ xfs_rmap_record_exists( ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) || (flags & XFS_RMAP_BMBT_BLOCK)); - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &irec, &has_record); if (error) return error; @@ -2686,14 +2725,6 @@ xfs_rmap_record_exists( return 0; } - error = xfs_rmap_get_rec(cur, &irec, &has_record); - if (error) - return error; - if (!has_record) { - *has_rmap = false; - return 0; - } - *has_rmap = (irec.rm_owner == owner && irec.rm_startblock <= bno && irec.rm_startblock + irec.rm_blockcount >= bno + len); return 0; diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index b718ebeda372..54741a591a17 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -122,8 +122,8 @@ int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp, const struct xfs_owner_info *oinfo); int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, uint64_t owner, uint64_t offset, - unsigned int flags, int *stat); + uint64_t owner, uint64_t offset, unsigned int flags, + struct xfs_rmap_irec *irec, int *stat); int xfs_rmap_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner, uint64_t offset, unsigned int flags, int *stat); @@ -184,9 +184,6 @@ int xfs_rmap_finish_one(struct xfs_trans *tp, enum 
xfs_rmap_intent_type type, xfs_fsblock_t startblock, xfs_filblks_t blockcount, xfs_exntst_t state, struct xfs_btree_cur **pcur); -int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno, - uint64_t owner, uint64_t offset, unsigned int flags, - struct xfs_rmap_irec *irec, int *stat); int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, uint64_t owner, uint64_t offset, unsigned int flags, struct xfs_rmap_irec *irec, int *stat); diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 5740ba664867..fa180ab66b73 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1008,6 +1008,7 @@ xfs_rtfree_extent( /* Find all the free records within a given range. */ int xfs_rtalloc_query_range( + struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *low_rec, const struct xfs_rtalloc_rec *high_rec, @@ -1015,7 +1016,6 @@ xfs_rtalloc_query_range( void *priv) { struct xfs_rtalloc_rec rec; - struct xfs_mount *mp = tp->t_mountp; xfs_rtblock_t rtstart; xfs_rtblock_t rtend; xfs_rtblock_t high_key; @@ -1048,7 +1048,7 @@ xfs_rtalloc_query_range( rec.ar_startext = rtstart; rec.ar_extcount = rtend - rtstart + 1; - error = fn(tp, &rec, priv); + error = fn(mp, tp, &rec, priv); if (error) break; } @@ -1062,6 +1062,7 @@ xfs_rtalloc_query_range( /* Find all the free records. */ int xfs_rtalloc_query_all( + struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv) @@ -1069,10 +1070,10 @@ xfs_rtalloc_query_all( struct xfs_rtalloc_rec keys[2]; keys[0].ar_startext = 0; - keys[1].ar_startext = tp->t_mountp->m_sb.sb_rextents - 1; + keys[1].ar_startext = mp->m_sb.sb_rextents - 1; keys[0].ar_extcount = keys[1].ar_extcount = 0; - return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv); + return xfs_rtalloc_query_range(mp, tp, &keys[0], &keys[1], fn, priv); } /* Is the given extent all free? */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index f4e84aa1d50a..a20cade590e9 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -31,15 +31,66 @@ */ /* + * Check that all the V4 feature bits that the V5 filesystem format requires are + * correctly set. + */ +static bool +xfs_sb_validate_v5_features( + struct xfs_sb *sbp) +{ + /* We must not have any unknown V4 feature bits set */ + if (sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) + return false; + + /* + * The CRC bit is considered an invalid V4 flag, so we have to add it + * manually to the OKBITS mask. + */ + if (sbp->sb_features2 & ~(XFS_SB_VERSION2_OKBITS | + XFS_SB_VERSION2_CRCBIT)) + return false; + + /* Now check all the required V4 feature flags are set. */ + +#define V5_VERS_FLAGS (XFS_SB_VERSION_NLINKBIT | \ + XFS_SB_VERSION_ALIGNBIT | \ + XFS_SB_VERSION_LOGV2BIT | \ + XFS_SB_VERSION_EXTFLGBIT | \ + XFS_SB_VERSION_DIRV2BIT | \ + XFS_SB_VERSION_MOREBITSBIT) + +#define V5_FEAT_FLAGS (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ + XFS_SB_VERSION2_ATTR2BIT | \ + XFS_SB_VERSION2_PROJID32BIT | \ + XFS_SB_VERSION2_CRCBIT) + + if ((sbp->sb_versionnum & V5_VERS_FLAGS) != V5_VERS_FLAGS) + return false; + if ((sbp->sb_features2 & V5_FEAT_FLAGS) != V5_FEAT_FLAGS) + return false; + return true; +} + +/* * We support all XFS versions newer than a v4 superblock with V2 directories. 
*/ bool xfs_sb_good_version( struct xfs_sb *sbp) { - /* all v5 filesystems are supported */ + /* + * All v5 filesystems are supported, but we must check that all the + * required v4 feature flags are enabled correctly as the code checks + * those flags and not for v5 support. + */ if (xfs_sb_is_v5(sbp)) - return true; + return xfs_sb_validate_v5_features(sbp); + + /* We must not have any unknown v4 feature bits set */ + if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || + ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && + (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) + return false; /* versions prior to v4 are not supported */ if (XFS_SB_VERSION_NUM(sbp) < XFS_SB_VERSION_4) @@ -51,12 +102,6 @@ xfs_sb_good_version( if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)) return false; - /* And must not have any unknown v4 feature bits set */ - if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || - ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && - (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) - return false; - /* It's a supported v4 filesystem */ return true; } @@ -70,6 +115,8 @@ xfs_sb_version_to_features( /* optional V4 features */ if (sbp->sb_rblocks > 0) features |= XFS_FEAT_REALTIME; + if (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT) + features |= XFS_FEAT_NLINK; if (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT) features |= XFS_FEAT_ATTR; if (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT) @@ -124,6 +171,9 @@ xfs_sb_version_to_features( features |= XFS_FEAT_BIGTIME; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR) features |= XFS_FEAT_NEEDSREPAIR; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NREXT64) + features |= XFS_FEAT_NREXT64; + return features; } @@ -262,12 +312,15 @@ xfs_validate_sb_common( bool has_dalign; if (!xfs_verify_magic(bp, dsb->sb_magicnum)) { - xfs_warn(mp, "bad magic number"); + xfs_warn(mp, +"Superblock has bad magic number 0x%x. Not an XFS filesystem?", + be32_to_cpu(dsb->sb_magicnum)); return -EWRONGFS; } if (!xfs_sb_good_version(sbp)) { - xfs_warn(mp, "bad version"); + xfs_warn(mp, +"Superblock has unknown features enabled or corrupted feature masks."); return -EWRONGFS; } @@ -911,6 +964,11 @@ xfs_log_sb( * reservations that have been taken out percpu counters. If we have an * unclean shutdown, this will be corrected by log recovery rebuilding * the counters from the AGF block counts. + * + * Do not update sb_frextents here because it is not part of the lazy + * sb counters, despite having a percpu counter. It is always kept + * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas() + * and hence we don't need have to update it here. */ if (xfs_has_lazysbcount(mp)) { mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); @@ -1135,6 +1193,8 @@ xfs_fs_geometry( } else { geo->logsectsize = BBSIZE; } + if (xfs_has_large_extent_counts(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64; geo->rtsectsize = sbp->sb_blocksize; geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 25c4cab58851..c4381388c0c1 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -54,13 +54,23 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, /* * Values for t_flags. 
*/ -#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */ -#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */ -#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */ -#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ -#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ -#define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */ -#define XFS_TRANS_RES_FDBLKS 0x80 /* reserve newly freed blocks */ +/* Transaction needs to be logged */ +#define XFS_TRANS_DIRTY (1u << 0) +/* Superblock is dirty and needs to be logged */ +#define XFS_TRANS_SB_DIRTY (1u << 1) +/* Transaction took a permanent log reservation */ +#define XFS_TRANS_PERM_LOG_RES (1u << 2) +/* Synchronous transaction commit needed */ +#define XFS_TRANS_SYNC (1u << 3) +/* Transaction can use reserve block pool */ +#define XFS_TRANS_RESERVE (1u << 4) +/* Transaction should avoid VFS level superblock write accounting */ +#define XFS_TRANS_NO_WRITECOUNT (1u << 5) +/* Transaction has freed blocks returned to it's reservation */ +#define XFS_TRANS_RES_FDBLKS (1u << 6) +/* Transaction contains an intent done log item */ +#define XFS_TRANS_HAS_INTENT_DONE (1u << 7) + /* * LOWMODE is used by the allocator to activate the lowspace algorithm - when * free space is running low the extent allocator may choose to allocate an diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index f0b38f4aba80..8b9bd178a487 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -213,7 +213,7 @@ xfs_symlink_shortform_verify( /* * Zero length symlinks should never occur in memory as they are - * never alllowed to exist on disk. + * never allowed to exist on disk. */ if (!size) return __this_address; diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 6f83d9b306ee..e9913c2c5a24 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -56,15 +56,14 @@ xfs_calc_buf_res( * Per-extent log reservation for the btree changes involved in freeing or * allocating an extent. In classic XFS there were two trees that will be * modified (bnobt + cntbt). With rmap enabled, there are three trees - * (rmapbt). With reflink, there are four trees (refcountbt). The number of - * blocks reserved is based on the formula: + * (rmapbt). The number of blocks reserved is based on the formula: * * num trees * ((2 blocks/level * max depth) - 1) * * Keep in mind that max depth is calculated separately for each type of tree. */ uint -xfs_allocfree_log_count( +xfs_allocfree_block_count( struct xfs_mount *mp, uint num_ops) { @@ -73,13 +72,24 @@ xfs_allocfree_log_count( blocks = num_ops * 2 * (2 * mp->m_alloc_maxlevels - 1); if (xfs_has_rmapbt(mp)) blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1); - if (xfs_has_reflink(mp)) - blocks += num_ops * (2 * mp->m_refc_maxlevels - 1); return blocks; } /* + * Per-extent log reservation for refcount btree changes. These are never done + * in the same transaction as an allocation or a free, so we compute them + * separately. + */ +static unsigned int +xfs_refcountbt_block_count( + struct xfs_mount *mp, + unsigned int num_ops) +{ + return num_ops * (2 * mp->m_refc_maxlevels - 1); +} + +/* * Logging inodes is really tricksy. They are logged in memory format, * which means that what we write into the log doesn't directly translate into * the amount of space they use on disk. 
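To put the renamed helpers in concrete terms: with five bnobt/cntbt levels and nine rmapbt levels (depths chosen purely for illustration), one allocate-or-free operation reserves

	2 * (2 * 5 - 1) + (2 * 9 - 1) = 18 + 17 = 35

blocks via xfs_allocfree_block_count(), while a reflink filesystem now accounts the refcount btree separately through xfs_refcountbt_block_count() instead of folding it into the same figure.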
@@ -136,7 +146,7 @@ xfs_calc_inobt_res( { return xfs_calc_buf_res(M_IGEO(mp)->inobt_maxlevels, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -183,7 +193,7 @@ xfs_calc_inode_chunk_res( { uint res, size = 0; - res = xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + res = xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); if (alloc) { /* icreate tx uses ordered buffers */ @@ -199,18 +209,18 @@ xfs_calc_inode_chunk_res( /* * Per-extent log reservation for the btree changes involved in freeing or * allocating a realtime extent. We have to be able to log as many rtbitmap - * blocks as needed to mark inuse MAXEXTLEN blocks' worth of realtime extents, - * as well as the realtime summary block. + * blocks as needed to mark inuse XFS_BMBT_MAX_EXTLEN blocks' worth of realtime + * extents, as well as the realtime summary block. */ static unsigned int -xfs_rtalloc_log_count( +xfs_rtalloc_block_count( struct xfs_mount *mp, unsigned int num_ops) { unsigned int blksz = XFS_FSB_TO_B(mp, 1); unsigned int rtbmp_bytes; - rtbmp_bytes = (MAXEXTLEN / mp->m_sb.sb_rextsize) / NBBY; + rtbmp_bytes = (XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize) / NBBY; return (howmany(rtbmp_bytes, blksz) + 1) * num_ops; } @@ -233,6 +243,28 @@ xfs_rtalloc_log_count( * register overflow from temporaries in the calculations. */ +/* + * Compute the log reservation required to handle the refcount update + * transaction. Refcount updates are always done via deferred log items. + * + * This is calculated as: + * Data device refcount updates (t1): + * the agfs of the ags containing the blocks: nr_ops * sector size + * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size + */ +static unsigned int +xfs_calc_refcountbt_reservation( + struct xfs_mount *mp, + unsigned int nr_ops) +{ + unsigned int blksz = XFS_FSB_TO_B(mp, 1); + + if (!xfs_has_reflink(mp)) + return 0; + + return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz); +} /* * In a write transaction we can allocate a maximum of 2 @@ -247,7 +279,7 @@ xfs_rtalloc_log_count( * the inode's bmap btree: max depth * block size * the agfs of the ags from which the extents are allocated: 2 * sector * the superblock free block counter: sector size - * the realtime bitmap: ((MAXEXTLEN / rtextsize) / NBBY) bytes + * the realtime bitmap: ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes * the realtime summary: 1 block * the allocation btrees: 2 trees * (2 * max depth - 1) * block size * And the bmap_finish transaction can free bmap blocks in a join (t3): @@ -255,34 +287,65 @@ xfs_rtalloc_log_count( * the agfls of the ags containing the blocks: 2 * sector size * the super block free block counter: sector size * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + * And any refcount updates that happen in a separate transaction (t4). 
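+ *
+ * The reservation taken is therefore:
+ *
+ *	XFS_DQUOT_LOGRES + max(t1, t2, t3, t4)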
*/ STATIC uint xfs_calc_write_reservation( - struct xfs_mount *mp) + struct xfs_mount *mp, + bool for_minlogsize) { - unsigned int t1, t2, t3; + unsigned int t1, t2, t3, t4; unsigned int blksz = XFS_FSB_TO_B(mp, 1); t1 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); if (xfs_has_realtime(mp)) { t2 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 1), blksz) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), blksz); + xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 1), blksz) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), blksz); } else { t2 = 0; } t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); - return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + /* + * In the early days of reflink, we included enough reservation to log + * two refcountbt splits for each transaction. The codebase runs + * refcountbt updates in separate transactions now, so to compute the + * minimum log size, add the refcountbtree splits back to t1 and t3 and + * do not account them separately as t4. Reflink did not support + * realtime when the reservations were established, so no adjustment to + * t2 is needed. + */ + if (for_minlogsize) { + unsigned int adj = 0; + + if (xfs_has_reflink(mp)) + adj = xfs_calc_buf_res( + xfs_refcountbt_block_count(mp, 2), + blksz); + t1 += adj; + t3 += adj; + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + } + + t4 = xfs_calc_refcountbt_reservation(mp, 1); + return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3)); +} + +unsigned int +xfs_calc_write_reservation_minlogsize( + struct xfs_mount *mp) +{ + return xfs_calc_write_reservation(mp, true); } /* @@ -299,33 +362,62 @@ xfs_calc_write_reservation( * the agf for each of the ags: 2 * sector size * the agfl for each of the ags: 2 * sector size * the super block to reflect the freed blocks: sector size - * the realtime bitmap: 2 exts * ((MAXEXTLEN / rtextsize) / NBBY) bytes + * the realtime bitmap: + * 2 exts * ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes * the realtime summary: 2 exts * 1 block * worst case split in allocation btrees per extent assuming 2 extents: * 2 exts * 2 trees * (2 * max depth - 1) * block size + * And any refcount updates that happen in a separate transaction (t4). 
*/ STATIC uint xfs_calc_itruncate_reservation( - struct xfs_mount *mp) + struct xfs_mount *mp, + bool for_minlogsize) { - unsigned int t1, t2, t3; + unsigned int t1, t2, t3, t4; unsigned int blksz = XFS_FSB_TO_B(mp, 1); t1 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz); t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), blksz); + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz); if (xfs_has_realtime(mp)) { t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 2), blksz) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); } else { t3 = 0; } - return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + /* + * In the early days of reflink, we included enough reservation to log + * four refcountbt splits in the same transaction as bnobt/cntbt + * updates. The codebase runs refcountbt updates in separate + * transactions now, so to compute the minimum log size, add the + * refcount btree splits back here and do not compute them separately + * as t4. Reflink did not support realtime when the reservations were + * established, so do not adjust t3. + */ + if (for_minlogsize) { + if (xfs_has_reflink(mp)) + t2 += xfs_calc_buf_res( + xfs_refcountbt_block_count(mp, 4), + blksz); + + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + } + + t4 = xfs_calc_refcountbt_reservation(mp, 2); + return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3)); +} + +unsigned int +xfs_calc_itruncate_reservation_minlogsize( + struct xfs_mount *mp) +{ + return xfs_calc_itruncate_reservation(mp, true); } /* @@ -349,7 +441,7 @@ xfs_calc_rename_reservation( xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 3), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3), XFS_FSB_TO_B(mp, 1)))); } @@ -389,7 +481,7 @@ xfs_calc_link_reservation( xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)))); } @@ -427,7 +519,7 @@ xfs_calc_remove_reservation( xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), XFS_FSB_TO_B(mp, 1)))); } @@ -572,7 +664,7 @@ xfs_calc_growdata_reservation( struct xfs_mount *mp) { return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -594,7 +686,7 @@ xfs_calc_growrtalloc_reservation( xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), XFS_FSB_TO_B(mp, 1)) + xfs_calc_inode_res(mp, 1) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -670,7 +762,7 @@ xfs_calc_addafork_reservation( xfs_calc_buf_res(1, mp->m_dir_geo->blksize) + xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -693,7 +785,7 @@ xfs_calc_attrinval_reservation( 
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), XFS_FSB_TO_B(mp, 1)))); } @@ -760,7 +852,7 @@ xfs_calc_attrrm_reservation( XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)), (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), XFS_FSB_TO_B(mp, 1)))); } @@ -791,13 +883,21 @@ xfs_calc_qm_setqlim_reservation(void) */ STATIC uint xfs_calc_qm_dqalloc_reservation( - struct xfs_mount *mp) + struct xfs_mount *mp, + bool for_minlogsize) { - return xfs_calc_write_reservation(mp) + + return xfs_calc_write_reservation(mp, for_minlogsize) + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1); } +unsigned int +xfs_calc_qm_dqalloc_reservation_minlogsize( + struct xfs_mount *mp) +{ + return xfs_calc_qm_dqalloc_reservation(mp, true); +} + /* * Syncing the incore super block changes to disk. * the super block to reflect the changes: sector size @@ -814,36 +914,18 @@ xfs_trans_resv_calc( struct xfs_mount *mp, struct xfs_trans_resv *resp) { - unsigned int rmap_maxlevels = mp->m_rmap_maxlevels; - - /* - * In the early days of rmap+reflink, we always set the rmap maxlevels - * to 9 even if the AG was small enough that it would never grow to - * that height. Transaction reservation sizes influence the minimum - * log size calculation, which influences the size of the log that mkfs - * creates. Use the old value here to ensure that newly formatted - * small filesystems will mount on older kernels. - */ - if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp)) - mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS; + int logcount_adj = 0; /* * The following transactions are logged in physical format and * require a permanent reservation on space. 
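 *
 * A permanent reservation can be regranted as the transaction rolls, so
 * the worst-case log footprint of one of these transactions is roughly
 * tr_logres * tr_logcount; that product is what
 * xfs_log_get_max_trans_res() compares when sizing the minimum log.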
*/ - resp->tr_write.tr_logres = xfs_calc_write_reservation(mp); - if (xfs_has_reflink(mp)) - resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; - else - resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false); + resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp); - if (xfs_has_reflink(mp)) - resp->tr_itruncate.tr_logcount = - XFS_ITRUNCATE_LOG_COUNT_REFLINK; - else - resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false); + resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp); @@ -899,11 +981,9 @@ xfs_trans_resv_calc( resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT; resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp); - if (xfs_has_reflink(mp)) - resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; - else - resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp, + false); + resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; /* @@ -930,6 +1010,19 @@ xfs_trans_resv_calc( resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp); resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp); - /* Put everything back the way it was. This goes at the end. */ - mp->m_rmap_maxlevels = rmap_maxlevels; + /* + * Add one logcount for BUI items that appear with rmap or reflink, + * one logcount for refcount intent items, and one logcount for rmap + * intent items. + */ + if (xfs_has_reflink(mp) || xfs_has_rmapbt(mp)) + logcount_adj++; + if (xfs_has_reflink(mp)) + logcount_adj++; + if (xfs_has_rmapbt(mp)) + logcount_adj++; + + resp->tr_itruncate.tr_logcount += logcount_adj; + resp->tr_write.tr_logcount += logcount_adj; + resp->tr_qm_dqalloc.tr_logcount += logcount_adj; } diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index fc4e9b369a3a..0554b9d775d2 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -73,7 +73,6 @@ struct xfs_trans_resv { #define XFS_DEFAULT_LOG_COUNT 1 #define XFS_DEFAULT_PERM_LOG_COUNT 2 #define XFS_ITRUNCATE_LOG_COUNT 2 -#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8 #define XFS_INACTIVE_LOG_COUNT 2 #define XFS_CREATE_LOG_COUNT 2 #define XFS_CREATE_TMPFILE_LOG_COUNT 2 @@ -83,13 +82,24 @@ struct xfs_trans_resv { #define XFS_LINK_LOG_COUNT 2 #define XFS_RENAME_LOG_COUNT 2 #define XFS_WRITE_LOG_COUNT 2 -#define XFS_WRITE_LOG_COUNT_REFLINK 8 #define XFS_ADDAFORK_LOG_COUNT 2 #define XFS_ATTRINVAL_LOG_COUNT 1 #define XFS_ATTRSET_LOG_COUNT 3 #define XFS_ATTRRM_LOG_COUNT 3 +/* + * Original log operation counts were overestimated in the early days of + * reflink. These are retained here purely for minimum log size calculations + * and must not be used for runtime reservations. 
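+ *
+ * At runtime the base XFS_WRITE_LOG_COUNT/XFS_ITRUNCATE_LOG_COUNT values
+ * are used instead, bumped by one in xfs_trans_resv_calc() for each
+ * intent item type (bmap, refcount, rmap) that the filesystem can log.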
+ */ +#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8 +#define XFS_WRITE_LOG_COUNT_REFLINK 8 + void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); -uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops); +uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops); + +unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp); +unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp); +unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp); #endif /* __XFS_TRANS_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index b6da06b40989..373f64a492a4 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -12,8 +12,8 @@ typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */ typedef uint32_t xfs_agino_t; /* inode # within allocation grp */ typedef uint32_t xfs_extlen_t; /* extent length in blocks */ typedef uint32_t xfs_agnumber_t; /* allocation group number */ -typedef int32_t xfs_extnum_t; /* # of extents in a file */ -typedef int16_t xfs_aextnum_t; /* # extents in an attribute fork */ +typedef uint64_t xfs_extnum_t; /* # of extents in a file */ +typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */ typedef int64_t xfs_fsize_t; /* bytes in a file */ typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */ @@ -57,13 +57,6 @@ typedef void * xfs_failaddr_t; #define NULLAGINO ((xfs_agino_t)-1) /* - * Max values for extlen, extnum, aextnum. - */ -#define MAXEXTLEN ((xfs_extlen_t)0x001fffff) /* 21 bits */ -#define MAXEXTNUM ((xfs_extnum_t)0x7fffffff) /* signed int */ -#define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */ - -/* * Minimum and maximum blocksize and sectorsize. * The blocksize upper limit is pretty much arbitrary. * The sectorsize upper limit is due to sizeof(sb_sectsize). diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index a4cbbc346f60..285995ba3947 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -133,29 +133,13 @@ xchk_bmap_get_rmap( if (info->is_shared) { error = xfs_rmap_lookup_le_range(info->sc->sa.rmap_cur, agbno, owner, offset, rflags, rmap, &has_rmap); - if (!xchk_should_check_xref(info->sc, &error, - &info->sc->sa.rmap_cur)) - return false; - goto out; + } else { + error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno, + owner, offset, rflags, rmap, &has_rmap); } - - /* - * Otherwise, use the (faster) regular lookup. - */ - error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno, 0, owner, - offset, rflags, &has_rmap); - if (!xchk_should_check_xref(info->sc, &error, - &info->sc->sa.rmap_cur)) + if (!xchk_should_check_xref(info->sc, &error, &info->sc->sa.rmap_cur)) return false; - if (!has_rmap) - goto out; - error = xfs_rmap_get_rec(info->sc->sa.rmap_cur, rmap, &has_rmap); - if (!xchk_should_check_xref(info->sc, &error, - &info->sc->sa.rmap_cur)) - return false; - -out: if (!has_rmap) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); @@ -350,7 +334,7 @@ xchk_bmap_iextent( irec->br_startoff); /* Make sure the extent points to a valid place. 
*/ - if (irec->br_blockcount > MAXEXTLEN) + if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN) xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); if (info->is_rt && diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index bf1f3607d0b6..97b54ac3075f 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -23,6 +23,8 @@ #include "xfs_rmap_btree.h" #include "xfs_log.h" #include "xfs_trans_priv.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_reflink.h" #include "xfs_ag.h" diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index eac15af7b08c..51820b40ab1c 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -232,7 +232,8 @@ xchk_dinode( size_t fork_recs; unsigned long long isize; uint64_t flags2; - uint32_t nextents; + xfs_extnum_t nextents; + xfs_extnum_t naextents; prid_t prid; uint16_t flags; uint16_t mode; @@ -390,8 +391,10 @@ xchk_dinode( xchk_inode_extsize(sc, dip, ino, mode, flags); + nextents = xfs_dfork_data_extents(dip); + naextents = xfs_dfork_attr_extents(dip); + /* di_nextents */ - nextents = be32_to_cpu(dip->di_nextents); fork_recs = XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); switch (dip->di_format) { case XFS_DINODE_FMT_EXTENTS: @@ -411,7 +414,7 @@ xchk_dinode( /* di_forkoff */ if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize) xchk_ino_set_corrupt(sc, ino); - if (dip->di_anextents != 0 && dip->di_forkoff == 0) + if (naextents != 0 && dip->di_forkoff == 0) xchk_ino_set_corrupt(sc, ino); if (dip->di_forkoff == 0 && dip->di_aformat != XFS_DINODE_FMT_EXTENTS) xchk_ino_set_corrupt(sc, ino); @@ -423,19 +426,18 @@ xchk_dinode( xchk_ino_set_corrupt(sc, ino); /* di_anextents */ - nextents = be16_to_cpu(dip->di_anextents); fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); switch (dip->di_aformat) { case XFS_DINODE_FMT_EXTENTS: - if (nextents > fork_recs) + if (naextents > fork_recs) xchk_ino_set_corrupt(sc, ino); break; case XFS_DINODE_FMT_BTREE: - if (nextents <= fork_recs) + if (naextents <= fork_recs) xchk_ino_set_corrupt(sc, ino); break; default: - if (nextents != 0) + if (naextents != 0) xchk_ino_set_corrupt(sc, ino); } @@ -513,14 +515,14 @@ xchk_inode_xref_bmap( &nextents, &count); if (!xchk_should_check_xref(sc, &error, NULL)) return; - if (nextents < be32_to_cpu(dip->di_nextents)) + if (nextents < xfs_dfork_data_extents(dip)) xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, &nextents, &acount); if (!xchk_should_check_xref(sc, &error, NULL)) return; - if (nextents != be16_to_cpu(dip->di_anextents)) + if (nextents != xfs_dfork_attr_extents(dip)) xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); /* Check nblocks against the inode. */ diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 8fa012057405..0a3bde64c675 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -40,6 +40,7 @@ xchk_setup_rt( /* Scrub a free extent record from the realtime bitmap. 
*/ STATIC int xchk_rtbitmap_rec( + struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) @@ -48,10 +49,10 @@ xchk_rtbitmap_rec( xfs_rtblock_t startblock; xfs_rtblock_t blockcount; - startblock = rec->ar_startext * tp->t_mountp->m_sb.sb_rextsize; - blockcount = rec->ar_extcount * tp->t_mountp->m_sb.sb_rextsize; + startblock = rec->ar_startext * mp->m_sb.sb_rextsize; + blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize; - if (!xfs_verify_rtext(sc->mp, startblock, blockcount)) + if (!xfs_verify_rtext(mp, startblock, blockcount)) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); return 0; } @@ -114,7 +115,7 @@ xchk_rtbitmap( if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return error; - error = xfs_rtalloc_query_all(sc->tp, xchk_rtbitmap_rec, sc); + error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtbitmap_rec, sc); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) goto out; diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index b11870d07c56..2e8e400f10a9 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -340,20 +340,6 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { }, }; -/* This isn't a stable feature, warn once per day. */ -static inline void -xchk_experimental_warning( - struct xfs_mount *mp) -{ - static struct ratelimit_state scrub_warning = RATELIMIT_STATE_INIT( - "xchk_warning", 86400 * HZ, 1); - ratelimit_set_flags(&scrub_warning, RATELIMIT_MSG_ON_RELEASE); - - if (__ratelimit(&scrub_warning)) - xfs_alert(mp, -"EXPERIMENTAL online scrub feature in use. Use at your own risk!"); -} - static int xchk_validate_inputs( struct xfs_mount *mp, @@ -478,7 +464,8 @@ xfs_scrub_metadata( if (error) goto out; - xchk_experimental_warning(mp); + xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB, + "EXPERIMENTAL online scrub feature in use. 
Use at your own risk!"); sc = kmem_zalloc(sizeof(struct xfs_scrub), KM_NOFS | KM_MAYFAIL); if (!sc) { diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 5c52ee869272..b744c62052b6 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -10,13 +10,14 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_trace.h" #include "xfs_error.h" #include "xfs_acl.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_trans.h" +#include "xfs_xattr.h" #include <linux/posix_acl_xattr.h> @@ -202,7 +203,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) xfs_acl_to_disk(args.value, acl); } - error = xfs_attr_set(&args); + error = xfs_attr_change(&args); kmem_free(args.value); /* diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index bb6abdcb265d..263404d0bfda 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -16,11 +16,13 @@ extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); void xfs_forget_acl(struct inode *inode, const char *name); #else -static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu) +#define xfs_get_acl NULL +#define xfs_set_acl NULL +static inline int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, + int type) { - return NULL; + return 0; } -# define xfs_set_acl NULL static inline void xfs_forget_acl(struct inode *inode, const char *name) { } diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c new file mode 100644 index 000000000000..135d44133477 --- /dev/null +++ b/fs/xfs/xfs_attr_item.c @@ -0,0 +1,885 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022 Oracle. All Rights Reserved. + * Author: Allison Henderson <allison.henderson@oracle.com> + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_shared.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" +#include "xfs_inode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_attr_item.h" +#include "xfs_trace.h" +#include "xfs_trans_space.h" +#include "xfs_errortag.h" +#include "xfs_error.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" + +struct kmem_cache *xfs_attri_cache; +struct kmem_cache *xfs_attrd_cache; + +static const struct xfs_item_ops xfs_attri_item_ops; +static const struct xfs_item_ops xfs_attrd_item_ops; +static struct xfs_attrd_log_item *xfs_trans_get_attrd(struct xfs_trans *tp, + struct xfs_attri_log_item *attrip); + +static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_attri_log_item, attri_item); +} + +/* + * Shared xattr name/value buffers for logged extended attribute operations + * + * When logging updates to extended attributes, we can create quite a few + * attribute log intent items for a single xattr update. To avoid cycling the + * memory allocator and memcpy overhead, the name (and value, for setxattr) + * are kept in a refcounted object that is shared across all related log items + * and the upper-level deferred work state structure. The shared buffer has + * a control structure, followed by the name, and then the value. 
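
The layout described above is straightforward to model in userspace: one allocation carries the control structure, then the name bytes, then the value bytes, and get/put helpers manage the shared lifetime. The function names below mirror the xfs_attri_log_nameval_* helpers in this hunk, but kernel refcount_t semantics (saturation, refuse-get-at-zero) are reduced to plain C11 atomics, so this is a sketch of the pattern rather than the implementation:

    #include <stdatomic.h>
    #include <stdlib.h>
    #include <string.h>

    struct nameval {
    	atomic_int	refcount;
    	size_t		name_len;
    	size_t		value_len;
    	/* name bytes, then value bytes, follow this header */
    };

    static struct nameval *nameval_alloc(const void *name, size_t name_len,
    				     const void *value, size_t value_len)
    {
    	struct nameval *nv = malloc(sizeof(*nv) + name_len + value_len);

    	if (!nv)
    		return NULL;
    	nv->name_len = name_len;
    	nv->value_len = value_len;
    	memcpy(nv + 1, name, name_len);		/* name right after the header */
    	memcpy((char *)(nv + 1) + name_len, value, value_len);
    	atomic_init(&nv->refcount, 1);		/* caller owns the first reference */
    	return nv;
    }

    static struct nameval *nameval_get(struct nameval *nv)
    {
    	atomic_fetch_add(&nv->refcount, 1);	/* each log item takes its own ref */
    	return nv;
    }

    static void nameval_put(struct nameval *nv)
    {
    	if (nv && atomic_fetch_sub(&nv->refcount, 1) == 1)
    		free(nv);			/* last reference frees name+value too */
    }

    int main(void)
    {
    	struct nameval *nv = nameval_alloc("user.test", 9, "hi", 2);

    	if (!nv)
    		return 1;
    	nameval_get(nv);	/* e.g. the intent log item's reference */
    	nameval_put(nv);	/* log item drops it */
    	nameval_put(nv);	/* original owner drops the last one; freed */
    	return 0;
    }

This single-allocation scheme is the point of the design: however many ATTRI/ATTRD items one xattr update generates, the name and value are copied exactly once.
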
+ */ + +static inline struct xfs_attri_log_nameval * +xfs_attri_log_nameval_get( + struct xfs_attri_log_nameval *nv) +{ + if (!refcount_inc_not_zero(&nv->refcount)) + return NULL; + return nv; +} + +static inline void +xfs_attri_log_nameval_put( + struct xfs_attri_log_nameval *nv) +{ + if (!nv) + return; + if (refcount_dec_and_test(&nv->refcount)) + kvfree(nv); +} + +static inline struct xfs_attri_log_nameval * +xfs_attri_log_nameval_alloc( + const void *name, + unsigned int name_len, + const void *value, + unsigned int value_len) +{ + struct xfs_attri_log_nameval *nv; + + /* + * This could be over 64kB in length, so we have to use kvmalloc() for + * this. But kvmalloc() utterly sucks, so we use our own version. + */ + nv = xlog_kvmalloc(sizeof(struct xfs_attri_log_nameval) + + name_len + value_len); + if (!nv) + return nv; + + nv->name.i_addr = nv + 1; + nv->name.i_len = name_len; + nv->name.i_type = XLOG_REG_TYPE_ATTR_NAME; + memcpy(nv->name.i_addr, name, name_len); + + if (value_len) { + nv->value.i_addr = nv->name.i_addr + name_len; + nv->value.i_len = value_len; + memcpy(nv->value.i_addr, value, value_len); + } else { + nv->value.i_addr = NULL; + nv->value.i_len = 0; + } + nv->value.i_type = XLOG_REG_TYPE_ATTR_VALUE; + + refcount_set(&nv->refcount, 1); + return nv; +} + +STATIC void +xfs_attri_item_free( + struct xfs_attri_log_item *attrip) +{ + kmem_free(attrip->attri_item.li_lv_shadow); + xfs_attri_log_nameval_put(attrip->attri_nameval); + kmem_cache_free(xfs_attri_cache, attrip); +} + +/* + * Freeing the attrip requires that we remove it from the AIL if it has already + * been placed there. However, the ATTRI may not yet have been placed in the + * AIL when called by xfs_attri_release() from ATTRD processing due to the + * ordering of committed vs unpin operations in bulk insert operations. Hence + * the reference count to ensure only the last caller frees the ATTRI. + */ +STATIC void +xfs_attri_release( + struct xfs_attri_log_item *attrip) +{ + ASSERT(atomic_read(&attrip->attri_refcount) > 0); + if (!atomic_dec_and_test(&attrip->attri_refcount)) + return; + + xfs_trans_ail_delete(&attrip->attri_item, 0); + xfs_attri_item_free(attrip); +} + +STATIC void +xfs_attri_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); + struct xfs_attri_log_nameval *nv = attrip->attri_nameval; + + *nvecs += 2; + *nbytes += sizeof(struct xfs_attri_log_format) + + xlog_calc_iovec_len(nv->name.i_len); + + if (!nv->value.i_len) + return; + + *nvecs += 1; + *nbytes += xlog_calc_iovec_len(nv->value.i_len); +} + +/* + * This is called to fill in the log iovecs for the given attri log + * item. We use 1 iovec for the attri_format_item, 1 for the name, and + * another for the value if it is present + */ +STATIC void +xfs_attri_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + struct xfs_attri_log_nameval *nv = attrip->attri_nameval; + + attrip->attri_format.alfi_type = XFS_LI_ATTRI; + attrip->attri_format.alfi_size = 1; + + /* + * This size accounting must be done before copying the attrip into the + * iovec. If we do it after, the wrong size will be recorded to the log + * and we trip across assertion checks for bad region sizes later during + * the log recovery. 
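
The size accounting above pairs with xfs_attri_item_size(): every region later advertised in alfi_size must already be counted in nvecs/nbytes, and log regions are padded. Judging by the roundup(..., 4) calls that xlog_calc_iovec_len() replaces in the xfs_inode_item.c hunks further down, that padding is to a 4-byte multiple; a small model of the accounting under that assumption (the format size is a made-up number, not the real sizeof):

    #include <stdio.h>

    /* Assumed model of xlog_calc_iovec_len(): round a region up to 4 bytes. */
    static unsigned int iovec_len(unsigned int len)
    {
    	return (len + 3) & ~3u;
    }

    int main(void)
    {
    	unsigned int fmt_size = 40;	/* hypothetical sizeof(struct xfs_attri_log_format) */
    	unsigned int name_len = 9;	/* "user.test" */
    	unsigned int value_len = 6;
    	unsigned int nvecs = 2;		/* format + name, as in xfs_attri_item_size() */
    	unsigned int nbytes = fmt_size + iovec_len(name_len);

    	if (value_len) {		/* the value region exists only for setxattr */
    		nvecs++;
    		nbytes += iovec_len(value_len);
    	}
    	printf("nvecs=%u nbytes=%u\n", nvecs, nbytes);
    	return 0;
    }
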
+ */ + + ASSERT(nv->name.i_len > 0); + attrip->attri_format.alfi_size++; + + if (nv->value.i_len > 0) + attrip->attri_format.alfi_size++; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRI_FORMAT, + &attrip->attri_format, + sizeof(struct xfs_attri_log_format)); + xlog_copy_from_iovec(lv, &vecp, &nv->name); + if (nv->value.i_len > 0) + xlog_copy_from_iovec(lv, &vecp, &nv->value); +} + +/* + * The unpin operation is the last place an ATTRI is manipulated in the log. It + * is either inserted in the AIL or aborted in the event of a log I/O error. In + * either case, the ATTRI transaction has been successfully committed to make + * it this far. Therefore, we expect whoever committed the ATTRI to either + * construct and commit the ATTRD or drop the ATTRD's reference in the event of + * error. Simply drop the log's ATTRI reference now that the log is done with + * it. + */ +STATIC void +xfs_attri_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + xfs_attri_release(ATTRI_ITEM(lip)); +} + + +STATIC void +xfs_attri_item_release( + struct xfs_log_item *lip) +{ + xfs_attri_release(ATTRI_ITEM(lip)); +} + +/* + * Allocate and initialize an attri item. Caller may allocate an additional + * trailing buffer for name and value + */ +STATIC struct xfs_attri_log_item * +xfs_attri_init( + struct xfs_mount *mp, + struct xfs_attri_log_nameval *nv) +{ + struct xfs_attri_log_item *attrip; + + attrip = kmem_cache_zalloc(xfs_attri_cache, GFP_NOFS | __GFP_NOFAIL); + + /* + * Grab an extra reference to the name/value buffer for this log item. + * The caller retains its own reference! + */ + attrip->attri_nameval = xfs_attri_log_nameval_get(nv); + ASSERT(attrip->attri_nameval); + + xfs_log_item_init(mp, &attrip->attri_item, XFS_LI_ATTRI, + &xfs_attri_item_ops); + attrip->attri_format.alfi_id = (uintptr_t)(void *)attrip; + atomic_set(&attrip->attri_refcount, 2); + + return attrip; +} + +/* + * Copy an attr format buffer from the given buf, and into the destination attr + * format structure. + */ +STATIC int +xfs_attri_copy_format( + struct xfs_log_iovec *buf, + struct xfs_attri_log_format *dst_attr_fmt) +{ + struct xfs_attri_log_format *src_attr_fmt = buf->i_addr; + size_t len; + + len = sizeof(struct xfs_attri_log_format); + if (buf->i_len != len) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); + return -EFSCORRUPTED; + } + + memcpy((char *)dst_attr_fmt, (char *)src_attr_fmt, len); + return 0; +} + +static inline struct xfs_attrd_log_item *ATTRD_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_attrd_log_item, attrd_item); +} + +STATIC void +xfs_attrd_item_free(struct xfs_attrd_log_item *attrdp) +{ + kmem_free(attrdp->attrd_item.li_lv_shadow); + kmem_cache_free(xfs_attrd_cache, attrdp); +} + +STATIC void +xfs_attrd_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_attrd_log_format); +} + +/* + * This is called to fill in the log iovecs for the given attrd log item. We use + * only 1 iovec for the attrd_format, and we point that at the attr_log_format + * structure embedded in the attrd item. 
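
Taken together, the release, unpin and init paths above implement a two-owner lifetime: xfs_attri_init() starts the ATTRI at a reference count of two, one reference for the log (dropped at unpin, whether the commit succeeded or was aborted) and one for whoever must construct the ATTRD (dropped when the done item is released). Only the final drop pulls the item out of the AIL and frees it. A condensed, single-threaded model of who drops what:

    #include <stdio.h>
    #include <stdlib.h>

    struct intent_item {
    	int refcount;
    };

    static struct intent_item *intent_create(void)
    {
    	struct intent_item *it = calloc(1, sizeof(*it));

    	if (it)
    		it->refcount = 2;	/* one for the log, one for the done item */
    	return it;
    }

    static void intent_release(struct intent_item *it)
    {
    	if (--it->refcount > 0)
    		return;
    	/* last reference: AIL removal in the kernel, just free() here */
    	printf("intent freed\n");
    	free(it);
    }

    int main(void)
    {
    	struct intent_item *it = intent_create();

    	if (!it)
    		return 1;
    	intent_release(it);	/* unpin: the log is done with the intent */
    	intent_release(it);	/* done item released: work finished or aborted */
    	return 0;
    }
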
+ */
+STATIC void
+xfs_attrd_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_vec	*lv)
+{
+	struct xfs_attrd_log_item	*attrdp = ATTRD_ITEM(lip);
+	struct xfs_log_iovec		*vecp = NULL;
+
+	attrdp->attrd_format.alfd_type = XFS_LI_ATTRD;
+	attrdp->attrd_format.alfd_size = 1;
+
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRD_FORMAT,
+			&attrdp->attrd_format,
+			sizeof(struct xfs_attrd_log_format));
+}
+
+/*
+ * The ATTRD is either committed or aborted if the transaction is canceled. If
+ * the transaction is canceled, drop our reference to the ATTRI and free the
+ * ATTRD.
+ */
+STATIC void
+xfs_attrd_item_release(
+	struct xfs_log_item	*lip)
+{
+	struct xfs_attrd_log_item	*attrdp = ATTRD_ITEM(lip);
+
+	xfs_attri_release(attrdp->attrd_attrip);
+	xfs_attrd_item_free(attrdp);
+}
+
+static struct xfs_log_item *
+xfs_attrd_item_intent(
+	struct xfs_log_item	*lip)
+{
+	return &ATTRD_ITEM(lip)->attrd_attrip->attri_item;
+}
+
+/*
+ * Performs one step of an attribute update intent and marks the attrd item
+ * dirty. An attr operation may be a set or a remove. Note that the
+ * transaction is marked dirty regardless of whether the operation succeeds or
+ * fails, in order to support the ATTRI/ATTRD lifecycle rules.
+ */
+STATIC int
+xfs_xattri_finish_update(
+	struct xfs_attr_intent		*attr,
+	struct xfs_attrd_log_item	*attrdp)
+{
+	struct xfs_da_args	*args = attr->xattri_da_args;
+	int			error;
+
+	if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) {
+		error = -EIO;
+		goto out;
+	}
+
+	error = xfs_attr_set_iter(attr);
+	if (!error && attr->xattri_dela_state != XFS_DAS_DONE)
+		error = -EAGAIN;
+out:
+	/*
+	 * Mark the transaction dirty, even on error. This ensures the
+	 * transaction is aborted, which:
+	 *
+	 * 1.) releases the ATTRI and frees the ATTRD
+	 * 2.) shuts down the filesystem
+	 */
+	args->trans->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
+
+	/*
+	 * attr intent/done items are null when logged attributes are disabled
+	 */
+	if (attrdp)
+		set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags);
+
+	return error;
+}
+
+/* Log an attr to the intent item. */
+STATIC void
+xfs_attr_log_item(
+	struct xfs_trans		*tp,
+	struct xfs_attri_log_item	*attrip,
+	const struct xfs_attr_intent	*attr)
+{
+	struct xfs_attri_log_format	*attrp;
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	set_bit(XFS_LI_DIRTY, &attrip->attri_item.li_flags);
+
+	/*
+	 * At this point the xfs_attr_intent has been constructed, and we've
+	 * created the log intent. Fill in the attri log item and log format
+	 * structure with fields from this xfs_attr_intent.
+	 */
+	attrp = &attrip->attri_format;
+	attrp->alfi_ino = attr->xattri_da_args->dp->i_ino;
+	ASSERT(!(attr->xattri_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK));
+	attrp->alfi_op_flags = attr->xattri_op_flags;
+	attrp->alfi_value_len = attr->xattri_nameval->value.i_len;
+	attrp->alfi_name_len = attr->xattri_nameval->name.i_len;
+	ASSERT(!(attr->xattri_da_args->attr_filter & ~XFS_ATTRI_FILTER_MASK));
+	attrp->alfi_attr_filter = attr->xattri_da_args->attr_filter;
+}
+
+/* Get an ATTRI.
*/ +static struct xfs_log_item * +xfs_attr_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_attri_log_item *attrip; + struct xfs_attr_intent *attr; + struct xfs_da_args *args; + + ASSERT(count == 1); + + /* + * Each attr item only performs one attribute operation at a time, so + * this is a list of one + */ + attr = list_first_entry_or_null(items, struct xfs_attr_intent, + xattri_list); + args = attr->xattri_da_args; + + if (!(args->op_flags & XFS_DA_OP_LOGGED)) + return NULL; + + /* + * Create a buffer to store the attribute name and value. This buffer + * will be shared between the higher level deferred xattr work state + * and the lower level xattr log items. + */ + if (!attr->xattri_nameval) { + /* + * Transfer our reference to the name/value buffer to the + * deferred work state structure. + */ + attr->xattri_nameval = xfs_attri_log_nameval_alloc(args->name, + args->namelen, args->value, args->valuelen); + } + if (!attr->xattri_nameval) + return ERR_PTR(-ENOMEM); + + attrip = xfs_attri_init(mp, attr->xattri_nameval); + xfs_trans_add_item(tp, &attrip->attri_item); + xfs_attr_log_item(tp, attrip, attr); + + return &attrip->attri_item; +} + +static inline void +xfs_attr_free_item( + struct xfs_attr_intent *attr) +{ + if (attr->xattri_da_state) + xfs_da_state_free(attr->xattri_da_state); + xfs_attri_log_nameval_put(attr->xattri_nameval); + if (attr->xattri_da_args->op_flags & XFS_DA_OP_RECOVERY) + kmem_free(attr); + else + kmem_cache_free(xfs_attr_intent_cache, attr); +} + +/* Process an attr. */ +STATIC int +xfs_attr_finish_item( + struct xfs_trans *tp, + struct xfs_log_item *done, + struct list_head *item, + struct xfs_btree_cur **state) +{ + struct xfs_attr_intent *attr; + struct xfs_attrd_log_item *done_item = NULL; + int error; + + attr = container_of(item, struct xfs_attr_intent, xattri_list); + if (done) + done_item = ATTRD_ITEM(done); + + /* + * Always reset trans after EAGAIN cycle + * since the transaction is new + */ + attr->xattri_da_args->trans = tp; + + error = xfs_xattri_finish_update(attr, done_item); + if (error != -EAGAIN) + xfs_attr_free_item(attr); + + return error; +} + +/* Abort all pending ATTRs. */ +STATIC void +xfs_attr_abort_intent( + struct xfs_log_item *intent) +{ + xfs_attri_release(ATTRI_ITEM(intent)); +} + +/* Cancel an attr */ +STATIC void +xfs_attr_cancel_item( + struct list_head *item) +{ + struct xfs_attr_intent *attr; + + attr = container_of(item, struct xfs_attr_intent, xattri_list); + xfs_attr_free_item(attr); +} + +STATIC bool +xfs_attri_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return ATTRI_ITEM(lip)->attri_format.alfi_id == intent_id; +} + +/* Is this recovered ATTRI format ok? 
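
xfs_attr_finish_item() and xfs_xattri_finish_update() above encode the deferred-work contract: each call runs one bounded step under a fresh transaction, and -EAGAIN means "roll the transaction and call me again" until the state machine reaches XFS_DAS_DONE. Stripped of the logging machinery, the control flow looks like the loop below; the states and the step function are invented for illustration:

    #include <errno.h>
    #include <stdio.h>

    enum das_state { DAS_ALLOC, DAS_WRITE_VALUE, DAS_DONE };

    struct attr_work {
    	enum das_state state;
    };

    /* One bounded step; the real xfs_attr_set_iter() is a large state switch. */
    static int attr_step(struct attr_work *w)
    {
    	w->state++;		/* pretend each call completes one state */
    	return 0;
    }

    static int finish_item(struct attr_work *w)
    {
    	int error = attr_step(w);

    	if (!error && w->state != DAS_DONE)
    		return -EAGAIN;	/* caller must roll the transaction and retry */
    	return error;
    }

    int main(void)
    {
    	struct attr_work w = { .state = DAS_ALLOC };
    	int error;

    	do {
    		/* a real caller commits and allocates a new transaction here */
    		error = finish_item(&w);
    	} while (error == -EAGAIN);

    	printf("finished: %d\n", error);
    	return error ? 1 : 0;
    }
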
*/ +static inline bool +xfs_attri_validate( + struct xfs_mount *mp, + struct xfs_attri_log_format *attrp) +{ + unsigned int op = attrp->alfi_op_flags & + XFS_ATTRI_OP_FLAGS_TYPE_MASK; + + if (attrp->__pad != 0) + return false; + + if (attrp->alfi_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK) + return false; + + if (attrp->alfi_attr_filter & ~XFS_ATTRI_FILTER_MASK) + return false; + + /* alfi_op_flags should be either a set or remove */ + switch (op) { + case XFS_ATTRI_OP_FLAGS_SET: + case XFS_ATTRI_OP_FLAGS_REPLACE: + case XFS_ATTRI_OP_FLAGS_REMOVE: + break; + default: + return false; + } + + if (attrp->alfi_value_len > XATTR_SIZE_MAX) + return false; + + if ((attrp->alfi_name_len > XATTR_NAME_MAX) || + (attrp->alfi_name_len == 0)) + return false; + + return xfs_verify_ino(mp, attrp->alfi_ino); +} + +/* + * Process an attr intent item that was recovered from the log. We need to + * delete the attr that it describes. + */ +STATIC int +xfs_attri_item_recover( + struct xfs_log_item *lip, + struct list_head *capture_list) +{ + struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); + struct xfs_attr_intent *attr; + struct xfs_mount *mp = lip->li_log->l_mp; + struct xfs_inode *ip; + struct xfs_da_args *args; + struct xfs_trans *tp; + struct xfs_trans_res tres; + struct xfs_attri_log_format *attrp; + struct xfs_attri_log_nameval *nv = attrip->attri_nameval; + int error, ret = 0; + int total; + int local; + struct xfs_attrd_log_item *done_item = NULL; + + /* + * First check the validity of the attr described by the ATTRI. If any + * are bad, then assume that all are bad and just toss the ATTRI. + */ + attrp = &attrip->attri_format; + if (!xfs_attri_validate(mp, attrp) || + !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len)) + return -EFSCORRUPTED; + + error = xlog_recover_iget(mp, attrp->alfi_ino, &ip); + if (error) + return error; + + attr = kmem_zalloc(sizeof(struct xfs_attr_intent) + + sizeof(struct xfs_da_args), KM_NOFS); + args = (struct xfs_da_args *)(attr + 1); + + attr->xattri_da_args = args; + attr->xattri_op_flags = attrp->alfi_op_flags & + XFS_ATTRI_OP_FLAGS_TYPE_MASK; + + /* + * We're reconstructing the deferred work state structure from the + * recovered log item. Grab a reference to the name/value buffer and + * attach it to the new work state. 
+ */ + attr->xattri_nameval = xfs_attri_log_nameval_get(nv); + ASSERT(attr->xattri_nameval); + + args->dp = ip; + args->geo = mp->m_attr_geo; + args->whichfork = XFS_ATTR_FORK; + args->name = nv->name.i_addr; + args->namelen = nv->name.i_len; + args->hashval = xfs_da_hashname(args->name, args->namelen); + args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK; + args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT | + XFS_DA_OP_LOGGED; + + ASSERT(xfs_sb_version_haslogxattrs(&mp->m_sb)); + + switch (attr->xattri_op_flags) { + case XFS_ATTRI_OP_FLAGS_SET: + case XFS_ATTRI_OP_FLAGS_REPLACE: + args->value = nv->value.i_addr; + args->valuelen = nv->value.i_len; + args->total = xfs_attr_calc_size(args, &local); + if (xfs_inode_hasattr(args->dp)) + attr->xattri_dela_state = xfs_attr_init_replace_state(args); + else + attr->xattri_dela_state = xfs_attr_init_add_state(args); + break; + case XFS_ATTRI_OP_FLAGS_REMOVE: + if (!xfs_inode_hasattr(args->dp)) + goto out; + attr->xattri_dela_state = xfs_attr_init_remove_state(args); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + goto out; + } + + xfs_init_attr_trans(args, &tres, &total); + error = xfs_trans_alloc(mp, &tres, total, 0, XFS_TRANS_RESERVE, &tp); + if (error) + goto out; + + args->trans = tp; + done_item = xfs_trans_get_attrd(tp, attrip); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + ret = xfs_xattri_finish_update(attr, done_item); + if (ret == -EAGAIN) { + /* There's more work to do, so add it to this transaction */ + xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list); + } else + error = ret; + + if (error) { + xfs_trans_cancel(tp); + goto out_unlock; + } + + error = xfs_defer_ops_capture_and_commit(tp, capture_list); + +out_unlock: + if (attr->xattri_leaf_bp) + xfs_buf_relse(attr->xattri_leaf_bp); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_irele(ip); +out: + if (ret != -EAGAIN) + xfs_attr_free_item(attr); + return error; +} + +/* Re-log an intent item to push the log tail forward. */ +static struct xfs_log_item * +xfs_attri_item_relog( + struct xfs_log_item *intent, + struct xfs_trans *tp) +{ + struct xfs_attrd_log_item *attrdp; + struct xfs_attri_log_item *old_attrip; + struct xfs_attri_log_item *new_attrip; + struct xfs_attri_log_format *new_attrp; + struct xfs_attri_log_format *old_attrp; + + old_attrip = ATTRI_ITEM(intent); + old_attrp = &old_attrip->attri_format; + + tp->t_flags |= XFS_TRANS_DIRTY; + attrdp = xfs_trans_get_attrd(tp, old_attrip); + set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags); + + /* + * Create a new log item that shares the same name/value buffer as the + * old log item. 
+ */ + new_attrip = xfs_attri_init(tp->t_mountp, old_attrip->attri_nameval); + new_attrp = &new_attrip->attri_format; + + new_attrp->alfi_ino = old_attrp->alfi_ino; + new_attrp->alfi_op_flags = old_attrp->alfi_op_flags; + new_attrp->alfi_value_len = old_attrp->alfi_value_len; + new_attrp->alfi_name_len = old_attrp->alfi_name_len; + new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter; + + xfs_trans_add_item(tp, &new_attrip->attri_item); + set_bit(XFS_LI_DIRTY, &new_attrip->attri_item.li_flags); + + return &new_attrip->attri_item; +} + +STATIC int +xlog_recover_attri_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_attri_log_item *attrip; + struct xfs_attri_log_format *attri_formatp; + struct xfs_attri_log_nameval *nv; + const void *attr_value = NULL; + const void *attr_name; + int error; + + attri_formatp = item->ri_buf[0].i_addr; + attr_name = item->ri_buf[1].i_addr; + + /* Validate xfs_attri_log_format before the large memory allocation */ + if (!xfs_attri_validate(mp, attri_formatp)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + if (attri_formatp->alfi_value_len) + attr_value = item->ri_buf[2].i_addr; + + /* + * Memory alloc failure will cause replay to abort. We attach the + * name/value buffer to the recovered incore log item and drop our + * reference. + */ + nv = xfs_attri_log_nameval_alloc(attr_name, + attri_formatp->alfi_name_len, attr_value, + attri_formatp->alfi_value_len); + if (!nv) + return -ENOMEM; + + attrip = xfs_attri_init(mp, nv); + error = xfs_attri_copy_format(&item->ri_buf[0], &attrip->attri_format); + if (error) + goto out; + + /* + * The ATTRI has two references. One for the ATTRD and one for ATTRI to + * ensure it makes it into the AIL. Insert the ATTRI into the AIL + * directly and drop the ATTRI reference. Note that + * xfs_trans_ail_update() drops the AIL lock. + */ + xfs_trans_ail_insert(log->l_ailp, &attrip->attri_item, lsn); + xfs_attri_release(attrip); + xfs_attri_log_nameval_put(nv); + return 0; +out: + xfs_attri_item_free(attrip); + xfs_attri_log_nameval_put(nv); + return error; +} + +/* + * This routine is called to allocate an "attr free done" log item. + */ +static struct xfs_attrd_log_item * +xfs_trans_get_attrd(struct xfs_trans *tp, + struct xfs_attri_log_item *attrip) +{ + struct xfs_attrd_log_item *attrdp; + + ASSERT(tp != NULL); + + attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL); + + xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD, + &xfs_attrd_item_ops); + attrdp->attrd_attrip = attrip; + attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id; + + xfs_trans_add_item(tp, &attrdp->attrd_item); + return attrdp; +} + +/* Get an ATTRD so we can process all the attrs. 
*/ +static struct xfs_log_item * +xfs_attr_create_done( + struct xfs_trans *tp, + struct xfs_log_item *intent, + unsigned int count) +{ + if (!intent) + return NULL; + + return &xfs_trans_get_attrd(tp, ATTRI_ITEM(intent))->attrd_item; +} + +const struct xfs_defer_op_type xfs_attr_defer_type = { + .max_items = 1, + .create_intent = xfs_attr_create_intent, + .abort_intent = xfs_attr_abort_intent, + .create_done = xfs_attr_create_done, + .finish_item = xfs_attr_finish_item, + .cancel_item = xfs_attr_cancel_item, +}; + +/* + * This routine is called when an ATTRD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding ATTRI if + * it was still in the log. To do this it searches the AIL for the ATTRI with + * an id equal to that in the ATTRD format structure. If we find it we drop + * the ATTRD reference, which removes the ATTRI from the AIL and frees it. + */ +STATIC int +xlog_recover_attrd_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_attrd_log_format *attrd_formatp; + + attrd_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len != sizeof(struct xfs_attrd_log_format)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); + return -EFSCORRUPTED; + } + + xlog_recover_release_intent(log, XFS_LI_ATTRI, + attrd_formatp->alfd_alf_id); + return 0; +} + +static const struct xfs_item_ops xfs_attri_item_ops = { + .flags = XFS_ITEM_INTENT, + .iop_size = xfs_attri_item_size, + .iop_format = xfs_attri_item_format, + .iop_unpin = xfs_attri_item_unpin, + .iop_release = xfs_attri_item_release, + .iop_recover = xfs_attri_item_recover, + .iop_match = xfs_attri_item_match, + .iop_relog = xfs_attri_item_relog, +}; + +const struct xlog_recover_item_ops xlog_attri_item_ops = { + .item_type = XFS_LI_ATTRI, + .commit_pass2 = xlog_recover_attri_commit_pass2, +}; + +static const struct xfs_item_ops xfs_attrd_item_ops = { + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | + XFS_ITEM_INTENT_DONE, + .iop_size = xfs_attrd_item_size, + .iop_format = xfs_attrd_item_format, + .iop_release = xfs_attrd_item_release, + .iop_intent = xfs_attrd_item_intent, +}; + +const struct xlog_recover_item_ops xlog_attrd_item_ops = { + .item_type = XFS_LI_ATTRD, + .commit_pass2 = xlog_recover_attrd_commit_pass2, +}; diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h new file mode 100644 index 000000000000..3280a7930287 --- /dev/null +++ b/fs/xfs/xfs_attr_item.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * + * Copyright (C) 2022 Oracle. All Rights Reserved. + * Author: Allison Henderson <allison.henderson@oracle.com> + */ +#ifndef __XFS_ATTR_ITEM_H__ +#define __XFS_ATTR_ITEM_H__ + +/* kernel only ATTRI/ATTRD definitions */ + +struct xfs_mount; +struct kmem_zone; + +struct xfs_attri_log_nameval { + struct xfs_log_iovec name; + struct xfs_log_iovec value; + refcount_t refcount; + + /* name and value follow the end of this struct */ +}; + +/* + * This is the "attr intention" log item. It is used to log the fact that some + * extended attribute operations need to be processed. An operation is + * currently either a set or remove. Set or remove operations are described by + * the xfs_attr_intent which may be logged to this intent. + * + * During a normal attr operation, name and value point to the name and value + * fields of the caller's xfs_da_args structure. 
During a recovery, the name + * and value buffers are copied from the log, and stored in a trailing buffer + * attached to the xfs_attr_intent until they are committed. They are freed + * when the xfs_attr_intent itself is freed when the work is done. + */ +struct xfs_attri_log_item { + struct xfs_log_item attri_item; + atomic_t attri_refcount; + struct xfs_attri_log_nameval *attri_nameval; + struct xfs_attri_log_format attri_format; +}; + +/* + * This is the "attr done" log item. It is used to log the fact that some attrs + * earlier mentioned in an attri item have been freed. + */ +struct xfs_attrd_log_item { + struct xfs_log_item attrd_item; + struct xfs_attri_log_item *attrd_attrip; + struct xfs_attrd_log_format attrd_format; +}; + +extern struct kmem_cache *xfs_attri_cache; +extern struct kmem_cache *xfs_attrd_cache; + +#endif /* __XFS_ATTR_ITEM_H__ */ diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 2d1e5134cebe..90a14e85e76d 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -15,6 +15,7 @@ #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_bmap.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_attr_sf.h" #include "xfs_attr_leaf.h" diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 761dde155099..51f66e982484 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -39,6 +39,7 @@ STATIC void xfs_bui_item_free( struct xfs_bui_log_item *buip) { + kmem_free(buip->bui_item.li_lv_shadow); kmem_cache_free(xfs_bui_cache, buip); } @@ -54,10 +55,11 @@ xfs_bui_release( struct xfs_bui_log_item *buip) { ASSERT(atomic_read(&buip->bui_refcount) > 0); - if (atomic_dec_and_test(&buip->bui_refcount)) { - xfs_trans_ail_delete(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR); - xfs_bui_item_free(buip); - } + if (!atomic_dec_and_test(&buip->bui_refcount)) + return; + + xfs_trans_ail_delete(&buip->bui_item, 0); + xfs_bui_item_free(buip); } @@ -198,14 +200,24 @@ xfs_bud_item_release( struct xfs_bud_log_item *budp = BUD_ITEM(lip); xfs_bui_release(budp->bud_buip); + kmem_free(budp->bud_item.li_lv_shadow); kmem_cache_free(xfs_bud_cache, budp); } +static struct xfs_log_item * +xfs_bud_item_intent( + struct xfs_log_item *lip) +{ + return &BUD_ITEM(lip)->bud_buip->bui_item; +} + static const struct xfs_item_ops xfs_bud_item_ops = { - .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | + XFS_ITEM_INTENT_DONE, .iop_size = xfs_bud_item_size, .iop_format = xfs_bud_item_format, .iop_release = xfs_bud_item_release, + .iop_intent = xfs_bud_item_intent, }; static struct xfs_bud_log_item * @@ -254,7 +266,7 @@ xfs_trans_log_finish_bmap_update( * 1.) releases the BUI and frees the BUD * 2.) 
shuts down the filesystem */ - tp->t_flags |= XFS_TRANS_DIRTY; + tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); return error; @@ -506,6 +518,8 @@ xfs_bui_item_recover( iext_delta = XFS_IEXT_PUNCH_HOLE_CNT; error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, iext_delta); if (error) goto err_cancel; @@ -584,6 +598,7 @@ xfs_bui_item_relog( } static const struct xfs_item_ops xfs_bui_item_ops = { + .flags = XFS_ITEM_INTENT, .iop_size = xfs_bui_item_size, .iop_format = xfs_bui_item_format, .iop_unpin = xfs_bui_item_unpin, diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index eb2e387ba528..52be58372c63 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -119,14 +119,14 @@ retry: */ ralen = ap->length / mp->m_sb.sb_rextsize; /* - * If the old value was close enough to MAXEXTLEN that + * If the old value was close enough to XFS_BMBT_MAX_EXTLEN that * we rounded up to it, cut it back so it's valid again. * Note that if it's a really large request (bigger than - * MAXEXTLEN), we don't hear about that number, and can't + * XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't * adjust the starting point to match it. */ - if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN) - ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; + if (ralen * mp->m_sb.sb_rextsize >= XFS_MAX_BMBT_EXTLEN) + ralen = XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize; /* * Lock out modifications to both the RT bitmap and summary inodes @@ -839,9 +839,11 @@ xfs_alloc_file_space( * count, hence we need to limit the number of blocks we are * trying to reserve to avoid an overflow. We can't allocate * more than @nimaps extents, and an extent is limited on disk - * to MAXEXTLEN (21 bits), so use that to enforce the limit. + * to XFS_BMBT_MAX_EXTLEN (21 bits), so use that to enforce the + * limit. 
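
To make the clamp that follows concrete: the comment says a single on-disk BMBT extent length is limited to 21 bits, so XFS_MAX_BMBT_EXTLEN works out to 2^21 - 1 = 2,097,151 blocks, and with at most nimaps extents allocated per pass there is no point reserving more than nimaps times that. A quick check of the arithmetic (the constant is derived from the 21-bit limit the comment cites):

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_BMBT_EXTLEN	((1u << 21) - 1)	/* 21-bit extent length limit */

    static uint64_t min_u64(uint64_t a, uint64_t b)
    {
    	return a < b ? a : b;
    }

    int main(void)
    {
    	uint64_t range_blocks = (10ULL << 30) / 4096;	/* 10 GiB of 4 KiB blocks */
    	int nimaps = 1;
    	uint64_t resblks = min_u64(range_blocks,
    				   (uint64_t)MAX_BMBT_EXTLEN * nimaps);

    	/* 2621440 blocks requested, reservation clamped to 2097151 */
    	printf("requested=%llu reserved=%llu\n",
    	       (unsigned long long)range_blocks,
    	       (unsigned long long)resblks);
    	return 0;
    }
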
*/ - resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps)); + resblks = min_t(xfs_fileoff_t, (e - s), + (XFS_MAX_BMBT_EXTLEN * nimaps)); if (unlikely(rt)) { dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); rblocks = resblks; @@ -857,6 +859,9 @@ xfs_alloc_file_space( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto error; @@ -912,6 +917,8 @@ xfs_unmap_extent( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_PUNCH_HOLE_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT); if (error) goto out_trans_cancel; @@ -1193,6 +1200,8 @@ xfs_insert_file_space( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_PUNCH_HOLE_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT); if (error) goto out_trans_cancel; @@ -1421,6 +1430,9 @@ xfs_swap_extent_rmap( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_SWAP_RMAP_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_SWAP_RMAP_CNT); if (error) goto out; } @@ -1429,6 +1441,9 @@ xfs_swap_extent_rmap( error = xfs_iext_count_may_overflow(tip, XFS_DATA_FORK, XFS_IEXT_SWAP_RMAP_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_SWAP_RMAP_CNT); if (error) goto out; } diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index e11e9ef2338f..4d8a6aece995 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -8,15 +8,18 @@ /* kernel only definitions */ +struct xfs_buf; +struct xfs_mount; + /* buf log item flags */ -#define XFS_BLI_HOLD 0x01 -#define XFS_BLI_DIRTY 0x02 -#define XFS_BLI_STALE 0x04 -#define XFS_BLI_LOGGED 0x08 -#define XFS_BLI_INODE_ALLOC_BUF 0x10 -#define XFS_BLI_STALE_INODE 0x20 -#define XFS_BLI_INODE_BUF 0x40 -#define XFS_BLI_ORDERED 0x80 +#define XFS_BLI_HOLD (1u << 0) +#define XFS_BLI_DIRTY (1u << 1) +#define XFS_BLI_STALE (1u << 2) +#define XFS_BLI_LOGGED (1u << 3) +#define XFS_BLI_INODE_ALLOC_BUF (1u << 4) +#define XFS_BLI_STALE_INODE (1u << 5) +#define XFS_BLI_INODE_BUF (1u << 6) +#define XFS_BLI_ORDERED (1u << 7) #define XFS_BLI_FLAGS \ { XFS_BLI_HOLD, "HOLD" }, \ @@ -28,11 +31,6 @@ { XFS_BLI_INODE_BUF, "INODE_BUF" }, \ { XFS_BLI_ORDERED, "ORDERED" } - -struct xfs_buf; -struct xfs_mount; -struct xfs_buf_log_item; - /* * This is the in core log item structure used to track information * needed to log buffers. It tracks how many times the lock has been diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index e484251dc9c8..ffa94102094d 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -24,6 +24,15 @@ #include "xfs_quota.h" /* + * This is the number of entries in the l_buf_cancel_table used during + * recovery. + */ +#define XLOG_BC_TABLE_SIZE 64 + +#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ + ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) + +/* * This structure is used during recovery to record the buf log items which * have been canceled and should not be replayed. 
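
The recovery code below spreads cancelled buffers across XLOG_BC_TABLE_SIZE (64) list heads, keyed by block number modulo the table size, exactly as the XLOG_BUF_CANCEL_BUCKET() macro above computes. A userspace sketch of the same bucketing, with singly linked nodes standing in for the kernel's list_head and the record trimmed to its key:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define BC_TABLE_SIZE 64

    struct buf_cancel {
    	uint64_t		blkno;
    	struct buf_cancel	*next;
    };

    static struct buf_cancel *table[BC_TABLE_SIZE];

    /* Same idea as XLOG_BUF_CANCEL_BUCKET(): block number modulo table size. */
    static struct buf_cancel **bucket(uint64_t blkno)
    {
    	return &table[blkno % BC_TABLE_SIZE];
    }

    static int cancel_add(uint64_t blkno)
    {
    	struct buf_cancel *bc = malloc(sizeof(*bc));

    	if (!bc)
    		return -1;
    	bc->blkno = blkno;
    	bc->next = *bucket(blkno);	/* push onto the bucket's chain */
    	*bucket(blkno) = bc;
    	return 0;
    }

    static struct buf_cancel *cancel_find(uint64_t blkno)
    {
    	struct buf_cancel *bc;

    	for (bc = *bucket(blkno); bc; bc = bc->next)
    		if (bc->blkno == blkno)
    			return bc;
    	return NULL;
    }

    int main(void)
    {
    	if (cancel_add(12345))
    		return 1;
    	printf("hit:  %d\n", cancel_find(12345) != NULL);
    	printf("miss: %d\n", cancel_find(12346) != NULL);
    	return 0;
    }
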
*/ @@ -993,3 +1002,60 @@ const struct xlog_recover_item_ops xlog_buf_item_ops = { .commit_pass1 = xlog_recover_buf_commit_pass1, .commit_pass2 = xlog_recover_buf_commit_pass2, }; + +#ifdef DEBUG +void +xlog_check_buf_cancel_table( + struct xlog *log) +{ + int i; + + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + ASSERT(list_empty(&log->l_buf_cancel_table[i])); +} +#endif + +int +xlog_alloc_buf_cancel_table( + struct xlog *log) +{ + void *p; + int i; + + ASSERT(log->l_buf_cancel_table == NULL); + + p = kmalloc_array(XLOG_BC_TABLE_SIZE, sizeof(struct list_head), + GFP_KERNEL); + if (!p) + return -ENOMEM; + + log->l_buf_cancel_table = p; + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); + + return 0; +} + +void +xlog_free_buf_cancel_table( + struct xlog *log) +{ + int i; + + if (!log->l_buf_cancel_table) + return; + + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) { + struct xfs_buf_cancel *bc; + + while ((bc = list_first_entry_or_null( + &log->l_buf_cancel_table[i], + struct xfs_buf_cancel, bc_list))) { + list_del(&bc->bc_list); + kmem_free(bc); + } + } + + kmem_free(log->l_buf_cancel_table); + log->l_buf_cancel_table = NULL; +} diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 5afedcbc78c7..5a6c3c3c4de2 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -136,10 +136,7 @@ xfs_qm_adjust_res_timer( res->timer = xfs_dquot_set_timeout(mp, ktime_get_real_seconds() + qlim->time); } else { - if (res->timer == 0) - res->warnings = 0; - else - res->timer = 0; + res->timer = 0; } } @@ -322,6 +319,9 @@ xfs_dquot_disk_alloc( error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, quotip, + XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto err_cancel; @@ -589,10 +589,6 @@ xfs_dquot_from_disk( dqp->q_ino.count = be64_to_cpu(ddqp->d_icount); dqp->q_rtb.count = be64_to_cpu(ddqp->d_rtbcount); - dqp->q_blk.warnings = be16_to_cpu(ddqp->d_bwarns); - dqp->q_ino.warnings = be16_to_cpu(ddqp->d_iwarns); - dqp->q_rtb.warnings = be16_to_cpu(ddqp->d_rtbwarns); - dqp->q_blk.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_btimer); dqp->q_ino.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_itimer); dqp->q_rtb.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_rtbtimer); @@ -634,9 +630,9 @@ xfs_dquot_to_disk( ddqp->d_icount = cpu_to_be64(dqp->q_ino.count); ddqp->d_rtbcount = cpu_to_be64(dqp->q_rtb.count); - ddqp->d_bwarns = cpu_to_be16(dqp->q_blk.warnings); - ddqp->d_iwarns = cpu_to_be16(dqp->q_ino.warnings); - ddqp->d_rtbwarns = cpu_to_be16(dqp->q_rtb.warnings); + ddqp->d_bwarns = 0; + ddqp->d_iwarns = 0; + ddqp->d_rtbwarns = 0; ddqp->d_btimer = xfs_dquot_to_disk_ts(dqp, dqp->q_blk.timer); ddqp->d_itimer = xfs_dquot_to_disk_ts(dqp, dqp->q_ino.timer); diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 6b5e3cf40c8b..80c8f851a2f3 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -44,14 +44,6 @@ struct xfs_dquot_res { * in seconds since the Unix epoch. */ time64_t timer; - - /* - * For root dquots, this is the maximum number of warnings that will - * be issued for this quota type. Otherwise, this is the number of - * warnings issued against this quota. Note that none of this is - * implemented. 
- */ - xfs_qwarncnt_t warnings; }; static inline bool diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 749fd18c4f32..296faa41d81d 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -57,6 +57,9 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_REDUCE_MAX_IEXTENTS, XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT, XFS_RANDOM_AG_RESV_FAIL, + XFS_RANDOM_LARP, + XFS_RANDOM_DA_LEAF_SPLIT, + XFS_RANDOM_ATTR_LEAF_TO_NODE, }; struct xfs_errortag_attr { @@ -170,6 +173,9 @@ XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR); XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS); XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT); XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL); +XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP); +XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT); +XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -211,6 +217,9 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents), XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent), XFS_ERRORTAG_ATTR_LIST(ag_resv_fail), + XFS_ERRORTAG_ATTR_LIST(larp), + XFS_ERRORTAG_ATTR_LIST(da_leaf_split), + XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node), NULL, }; ATTRIBUTE_GROUPS(xfs_errortag); diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 5735d5ea87ee..5191e9145e55 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -64,16 +64,16 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); * XFS panic tags -- allow a call to xfs_alert_tag() be turned into * a panic by setting xfs_panic_mask in a sysctl. */ -#define XFS_NO_PTAG 0 -#define XFS_PTAG_IFLUSH 0x00000001 -#define XFS_PTAG_LOGRES 0x00000002 -#define XFS_PTAG_AILDELETE 0x00000004 -#define XFS_PTAG_ERROR_REPORT 0x00000008 -#define XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010 -#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020 -#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 -#define XFS_PTAG_FSBLOCK_ZERO 0x00000080 -#define XFS_PTAG_VERIFIER_ERROR 0x00000100 +#define XFS_NO_PTAG 0u +#define XFS_PTAG_IFLUSH (1u << 0) +#define XFS_PTAG_LOGRES (1u << 1) +#define XFS_PTAG_AILDELETE (1u << 2) +#define XFS_PTAG_ERROR_REPORT (1u << 3) +#define XFS_PTAG_SHUTDOWN_CORRUPT (1u << 4) +#define XFS_PTAG_SHUTDOWN_IOERROR (1u << 5) +#define XFS_PTAG_SHUTDOWN_LOGERROR (1u << 6) +#define XFS_PTAG_FSBLOCK_ZERO (1u << 7) +#define XFS_PTAG_VERIFIER_ERROR (1u << 8) #define XFS_PTAG_STRINGS \ { XFS_NO_PTAG, "none" }, \ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 0e50f2c9348e..765be054dffe 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -58,10 +58,11 @@ xfs_efi_release( struct xfs_efi_log_item *efip) { ASSERT(atomic_read(&efip->efi_refcount) > 0); - if (atomic_dec_and_test(&efip->efi_refcount)) { - xfs_trans_ail_delete(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR); - xfs_efi_item_free(efip); - } + if (!atomic_dec_and_test(&efip->efi_refcount)) + return; + + xfs_trans_ail_delete(&efip->efi_item, 0); + xfs_efi_item_free(efip); } /* @@ -306,11 +307,20 @@ xfs_efd_item_release( xfs_efd_item_free(efdp); } +static struct xfs_log_item * +xfs_efd_item_intent( + struct xfs_log_item *lip) +{ + return &EFD_ITEM(lip)->efd_efip->efi_item; +} + static const struct xfs_item_ops xfs_efd_item_ops = { - .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | + XFS_ITEM_INTENT_DONE, .iop_size = 
xfs_efd_item_size, .iop_format = xfs_efd_item_format, .iop_release = xfs_efd_item_release, + .iop_intent = xfs_efd_item_intent, }; /* @@ -380,7 +390,7 @@ xfs_trans_free_extent( * 1.) releases the EFI and frees the EFD * 2.) shuts down the filesystem */ - tp->t_flags |= XFS_TRANS_DIRTY; + tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); next_extent = efdp->efd_next_extent; @@ -688,6 +698,7 @@ xfs_efi_item_relog( } static const struct xfs_item_ops xfs_efi_item_ops = { + .flags = XFS_ITEM_INTENT, .iop_size = xfs_efi_item_size, .iop_format = xfs_efi_item_format, .iop_unpin = xfs_efi_item_unpin, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 85c412107a10..5a171c0b244b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -310,7 +310,7 @@ STATIC ssize_t xfs_file_write_checks( struct kiocb *iocb, struct iov_iter *from, - int *iolock) + unsigned int *iolock) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -513,7 +513,7 @@ xfs_file_dio_write_aligned( struct kiocb *iocb, struct iov_iter *from) { - int iolock = XFS_IOLOCK_SHARED; + unsigned int iolock = XFS_IOLOCK_SHARED; ssize_t ret; ret = xfs_ilock_iocb(iocb, iolock); @@ -566,7 +566,7 @@ xfs_file_dio_write_unaligned( { size_t isize = i_size_read(VFS_I(ip)); size_t count = iov_iter_count(from); - int iolock = XFS_IOLOCK_SHARED; + unsigned int iolock = XFS_IOLOCK_SHARED; unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY; ssize_t ret; @@ -576,9 +576,9 @@ xfs_file_dio_write_unaligned( * don't even bother trying the fast path in this case. */ if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) { -retry_exclusive: if (iocb->ki_flags & IOCB_NOWAIT) return -EAGAIN; +retry_exclusive: iolock = XFS_IOLOCK_EXCL; flags = IOMAP_DIO_FORCE_WAIT; } @@ -655,7 +655,7 @@ xfs_file_dax_write( { struct inode *inode = iocb->ki_filp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - int iolock = XFS_IOLOCK_EXCL; + unsigned int iolock = XFS_IOLOCK_EXCL; ssize_t ret, error = 0; loff_t pos; @@ -694,13 +694,11 @@ xfs_file_buffered_write( struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = iocb->ki_filp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; bool cleared_space = false; - int iolock; + unsigned int iolock; if (iocb->ki_flags & IOCB_NOWAIT) return -EOPNOTSUPP; @@ -767,9 +765,7 @@ xfs_file_write_iter( struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = iocb->ki_filp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; size_t ocount = iov_iter_count(from); @@ -1167,12 +1163,10 @@ xfs_file_open( struct inode *inode, struct file *file) { - if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) - return -EFBIG; if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; - return 0; + return generic_file_open(inode, file); } STATIC int @@ -1181,7 +1175,7 @@ xfs_dir_open( struct file *file) { struct xfs_inode *ip = XFS_I(inode); - int mode; + unsigned int mode; int error; error = xfs_file_open(inode, file); diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 6a3ce0f6dc9e..be9bcf8a1f99 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -128,11 
+128,12 @@ xfs_filestream_pick_ag( if (!pag->pagf_init) { err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); if (err) { - xfs_perag_put(pag); - if (err != -EAGAIN) + if (err != -EAGAIN) { + xfs_perag_put(pag); return err; + } /* Couldn't lock the AGF, skip this AG. */ - continue; + goto next_ag; } } diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 10e1cb71439e..bb23199f65c3 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -450,11 +450,11 @@ xfs_getfsmap_logdev( /* Transform a rtbitmap "record" into a fsmap */ STATIC int xfs_getfsmap_rtdev_rtbitmap_helper( + struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) { - struct xfs_mount *mp = tp->t_mountp; struct xfs_getfsmap_info *info = priv; struct xfs_rmap_irec irec; xfs_daddr_t rec_daddr; @@ -535,7 +535,7 @@ xfs_getfsmap_rtdev_rtbitmap_query( do_div(alow.ar_startext, mp->m_sb.sb_rextsize); if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize)) ahigh.ar_startext++; - error = xfs_rtalloc_query_range(tp, &alow, &ahigh, + error = xfs_rtalloc_query_range(mp, tp, &alow, &ahigh, xfs_getfsmap_rtdev_rtbitmap_helper, info); if (error) goto err; @@ -547,7 +547,7 @@ xfs_getfsmap_rtdev_rtbitmap_query( info->last = true; ahigh.ar_startext = min(mp->m_sb.sb_rextents, ahigh.ar_startext); - error = xfs_getfsmap_rtdev_rtbitmap_helper(tp, &ahigh, info); + error = xfs_getfsmap_rtdev_rtbitmap_helper(mp, tp, &ahigh, info); if (error) goto err; err: diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 68f74549fa22..d4a77c53f94b 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -149,12 +149,7 @@ xfs_growfs_data_private( error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount, delta, &lastag_extended); } else { - static struct ratelimit_state shrink_warning = \ - RATELIMIT_STATE_INIT("shrink_warning", 86400 * HZ, 1); - ratelimit_set_flags(&shrink_warning, RATELIMIT_MSG_ON_RELEASE); - - if (__ratelimit(&shrink_warning)) - xfs_alert(mp, + xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SHRINK, "EXPERIMENTAL online shrink feature in use. 
Use at your own risk!"); error = xfs_ag_shrink_space(mp, &tp, nagcount - 1, -delta); @@ -349,10 +344,7 @@ xfs_fs_counts( cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - xfs_fdblocks_unavailable(mp); - - spin_lock(&mp->m_sb_lock); - cnt->freertx = mp->m_sb.sb_frextents; - spin_unlock(&mp->m_sb_lock); + cnt->freertx = percpu_counter_read_positive(&mp->m_frextents); } /* @@ -512,7 +504,7 @@ xfs_fs_goingdown( void xfs_do_force_shutdown( struct xfs_mount *mp, - int flags, + uint32_t flags, char *fname, int lnnum) { diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index f62fa652c2fd..4d0a98f920ca 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -41,5 +41,6 @@ struct xfs_globals xfs_globals = { #endif #ifdef DEBUG .pwork_threads = -1, /* automatic thread detection */ + .larp = false, /* log attribute replay */ #endif }; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index bffd6eb0b298..5269354b1b69 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1916,13 +1916,16 @@ xfs_inodegc_want_queue_rt_file( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; - uint64_t freertx; if (!XFS_IS_REALTIME_INODE(ip)) return false; - freertx = READ_ONCE(mp->m_sb.sb_frextents); - return freertx < mp->m_low_rtexts[XFS_LOWSP_5_PCNT]; + if (__percpu_counter_compare(&mp->m_frextents, + mp->m_low_rtexts[XFS_LOWSP_5_PCNT], + XFS_FDBLOCKS_BATCH) < 0) + return true; + + return false; } #else # define xfs_inodegc_want_queue_rt_file(ip) (false) diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 508e184e3b8f..b05314d48176 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -63,6 +63,7 @@ STATIC void xfs_icreate_item_release( struct xfs_log_item *lip) { + kmem_free(ICR_ITEM(lip)->ic_item.li_lv_shadow); kmem_cache_free(xfs_icreate_cache, ICR_ITEM(lip)); } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 39ae53efb3ab..52d6f2c7d58b 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -416,10 +416,12 @@ xfs_lockdep_subclass_ok( * parent locking. Care must be taken to ensure we don't overrun the subclass * storage fields in the class mask we build. 
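
The subclass comment above refers to bit packing that the xfs_inode.h hunk further down spells out: iolock subclasses occupy bits 16-19 of the lock_mode word, mmaplock bits 20-23, and ilock bits 24-31, so xfs_lock_inumorder() can shift a small integer into its field and assert that it never overflows it. A worked example with the shift and mask values copied from that hunk:

    #include <assert.h>
    #include <stdio.h>

    #define IOLOCK_SHIFT	16
    #define IOLOCK_DEP_MASK	0x000f0000u
    #define ILOCK_SHIFT	24

    int main(void)
    {
    	unsigned int subclass = 3;	/* e.g. the fourth inode in a batch */
    	unsigned int class = subclass << IOLOCK_SHIFT;

    	/* The subclass must stay inside its 4-bit field. */
    	assert((class & ~IOLOCK_DEP_MASK) == 0);
    	printf("iolock subclass %u -> class bits 0x%08x\n", subclass, class);

    	/* XFS_ILOCK_PARENT_VAL (5) lands in the ilock byte. */
    	printf("ilock parent      -> class bits 0x%08x\n", 5u << ILOCK_SHIFT);
    	return 0;
    }
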
*/ -static inline int -xfs_lock_inumorder(int lock_mode, int subclass) +static inline uint +xfs_lock_inumorder( + uint lock_mode, + uint subclass) { - int class = 0; + uint class = 0; ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP | XFS_ILOCK_RTSUM))); @@ -464,7 +466,10 @@ xfs_lock_inodes( int inodes, uint lock_mode) { - int attempts = 0, i, j, try_lock; + int attempts = 0; + uint i; + int j; + bool try_lock; struct xfs_log_item *lp; /* @@ -489,9 +494,9 @@ xfs_lock_inodes( } else if (lock_mode & XFS_MMAPLOCK_EXCL) ASSERT(!(lock_mode & XFS_ILOCK_EXCL)); - try_lock = 0; - i = 0; again: + try_lock = false; + i = 0; for (; i < inodes; i++) { ASSERT(ips[i]); @@ -506,7 +511,7 @@ again: for (j = (i - 1); j >= 0 && !try_lock; j--) { lp = &ips[j]->i_itemp->ili_item; if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) - try_lock++; + try_lock = true; } } @@ -546,8 +551,6 @@ again: if ((attempts % 5) == 0) { delay(1); /* Don't just spin the CPU */ } - i = 0; - try_lock = 0; goto again; } } @@ -1024,11 +1027,6 @@ xfs_create( xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; - error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK, - XFS_IEXT_DIR_MANIP_CNT(mp)); - if (error) - goto out_trans_cancel; - /* * A newly created regular or special file just has one directory * entry pointing to them, but a directory also the "." entry @@ -1242,11 +1240,6 @@ xfs_link( if (error) goto std_return; - error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK, - XFS_IEXT_DIR_MANIP_CNT(mp)); - if (error) - goto error_return; - /* * If we are using project inheritance, we only allow hard link * creation in our tree when the project IDs are the same; else @@ -2629,7 +2622,7 @@ xfs_ifree( */ error = xfs_difree(tp, pag, ip->i_ino, &xic); if (error) - return error; + goto out; error = xfs_iunlink_remove(tp, pag, ip); if (error) @@ -3212,35 +3205,6 @@ retry: /* * Check for expected errors before we dirty the transaction * so we can return an error without a transaction abort. - * - * Extent count overflow check: - * - * From the perspective of src_dp, a rename operation is essentially a - * directory entry remove operation. Hence the only place where we check - * for extent count overflow for src_dp is in - * xfs_bmap_del_extent_real(). xfs_bmap_del_extent_real() returns - * -ENOSPC when it detects a possible extent count overflow and in - * response, the higher layers of directory handling code do the - * following: - * 1. Data/Free blocks: XFS lets these blocks linger until a - * future remove operation removes them. - * 2. Dabtree blocks: XFS swaps the blocks with the last block in the - * Leaf space and unmaps the last block. - * - * For target_dp, there are two cases depending on whether the - * destination directory entry exists or not. - * - * When destination directory entry does not exist (i.e. target_ip == - * NULL), extent count overflow check is performed only when transaction - * has a non-zero sized space reservation associated with it. With a - * zero-sized space reservation, XFS allows a rename operation to - * continue only when the directory has sufficient free space in its - * data/leaf/free space blocks to hold the new entry. - * - * When destination directory entry exists (i.e. target_ip != NULL), all - * we need to do is change the inode number associated with the already - * existing entry. Hence there is no need to perform an extent count - * overflow check. 
*/ if (target_ip == NULL) { /* @@ -3251,12 +3215,6 @@ retry: error = xfs_dir_canenter(tp, target_dp, target_name); if (error) goto out_trans_cancel; - } else { - error = xfs_iext_count_may_overflow(target_dp, - XFS_DATA_FORK, - XFS_IEXT_DIR_MANIP_CNT(mp)); - if (error) - goto out_trans_cancel; } } else { /* @@ -3424,18 +3382,12 @@ retry: * inode number of the whiteout inode rather than removing it * altogether. */ - if (wip) { + if (wip) error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, spaceres); - } else { - /* - * NOTE: We don't need to check for extent count overflow here - * because the dir remove name code will leave the dir block in - * place if the extent count would overflow. - */ + else error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, spaceres); - } if (error) goto out_trans_cancel; @@ -3517,8 +3469,8 @@ xfs_iflush( if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) > ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: detected corrupt incore inode %Lu, " - "total extents = %d, nblocks = %Ld, ptr "PTR_FMT, + "%s: detected corrupt incore inode %llu, " + "total extents = %llu nblocks = %lld, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp), ip->i_nblocks, ip); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 740ab13d1aa2..7be6f8e705ab 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -218,6 +218,11 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) return ip->i_diflags2 & XFS_DIFLAG2_BIGTIME; } +static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) +{ + return ip->i_diflags2 & XFS_DIFLAG2_NREXT64; +} + /* * Return the buftarg used for data allocations on a given inode. */ @@ -278,12 +283,12 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) * 1<<16 - 1<<32-1 -- lockdep annotation (integers) */ -#define XFS_IOLOCK_EXCL (1<<0) -#define XFS_IOLOCK_SHARED (1<<1) -#define XFS_ILOCK_EXCL (1<<2) -#define XFS_ILOCK_SHARED (1<<3) -#define XFS_MMAPLOCK_EXCL (1<<4) -#define XFS_MMAPLOCK_SHARED (1<<5) +#define XFS_IOLOCK_EXCL (1u << 0) +#define XFS_IOLOCK_SHARED (1u << 1) +#define XFS_ILOCK_EXCL (1u << 2) +#define XFS_ILOCK_SHARED (1u << 3) +#define XFS_MMAPLOCK_EXCL (1u << 4) +#define XFS_MMAPLOCK_SHARED (1u << 5) #define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \ @@ -350,19 +355,19 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) */ #define XFS_IOLOCK_SHIFT 16 #define XFS_IOLOCK_MAX_SUBCLASS 3 -#define XFS_IOLOCK_DEP_MASK 0x000f0000 +#define XFS_IOLOCK_DEP_MASK 0x000f0000u #define XFS_MMAPLOCK_SHIFT 20 #define XFS_MMAPLOCK_NUMORDER 0 #define XFS_MMAPLOCK_MAX_SUBCLASS 3 -#define XFS_MMAPLOCK_DEP_MASK 0x00f00000 +#define XFS_MMAPLOCK_DEP_MASK 0x00f00000u #define XFS_ILOCK_SHIFT 24 -#define XFS_ILOCK_PARENT_VAL 5 +#define XFS_ILOCK_PARENT_VAL 5u #define XFS_ILOCK_MAX_SUBCLASS (XFS_ILOCK_PARENT_VAL - 1) -#define XFS_ILOCK_RTBITMAP_VAL 6 -#define XFS_ILOCK_RTSUM_VAL 7 -#define XFS_ILOCK_DEP_MASK 0xff000000 +#define XFS_ILOCK_RTBITMAP_VAL 6u +#define XFS_ILOCK_RTSUM_VAL 7u +#define XFS_ILOCK_DEP_MASK 0xff000000u #define XFS_ILOCK_PARENT (XFS_ILOCK_PARENT_VAL << XFS_ILOCK_SHIFT) #define XFS_ILOCK_RTBITMAP (XFS_ILOCK_RTBITMAP_VAL << XFS_ILOCK_SHIFT) #define XFS_ILOCK_RTSUM (XFS_ILOCK_RTSUM_VAL << XFS_ILOCK_SHIFT) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 
9e6ef55cf29e..721def0639fd 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -71,7 +71,7 @@ xfs_inode_item_data_fork_size( case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { - *nbytes += roundup(ip->i_df.if_bytes, 4); + *nbytes += xlog_calc_iovec_len(ip->i_df.if_bytes); *nvecs += 1; } break; @@ -112,7 +112,7 @@ xfs_inode_item_attr_fork_size( case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & XFS_ILOG_ADATA) && ip->i_afp->if_bytes > 0) { - *nbytes += roundup(ip->i_afp->if_bytes, 4); + *nbytes += xlog_calc_iovec_len(ip->i_afp->if_bytes); *nvecs += 1; } break; @@ -204,17 +204,12 @@ xfs_inode_item_format_data_fork( ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV); if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { - /* - * Round i_bytes up to a word boundary. - * The underlying memory is guaranteed - * to be there by xfs_idata_realloc(). - */ - data_bytes = roundup(ip->i_df.if_bytes, 4); ASSERT(ip->i_df.if_u1.if_data != NULL); ASSERT(ip->i_disk_size > 0); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL, - ip->i_df.if_u1.if_data, data_bytes); - ilf->ilf_dsize = (unsigned)data_bytes; + ip->i_df.if_u1.if_data, + ip->i_df.if_bytes); + ilf->ilf_dsize = (unsigned)ip->i_df.if_bytes; ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_DDATA; @@ -288,17 +283,11 @@ xfs_inode_item_format_attr_fork( if ((iip->ili_fields & XFS_ILOG_ADATA) && ip->i_afp->if_bytes > 0) { - /* - * Round i_bytes up to a word boundary. - * The underlying memory is guaranteed - * to be there by xfs_idata_realloc(). - */ - data_bytes = roundup(ip->i_afp->if_bytes, 4); ASSERT(ip->i_afp->if_u1.if_data != NULL); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL, ip->i_afp->if_u1.if_data, - data_bytes); - ilf->ilf_asize = (unsigned)data_bytes; + ip->i_afp->if_bytes); + ilf->ilf_asize = (unsigned)ip->i_afp->if_bytes; ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_ADATA; @@ -359,6 +348,21 @@ xfs_copy_dm_fields_to_log_dinode( } } +static inline void +xfs_inode_to_log_dinode_iext_counters( + struct xfs_inode *ip, + struct xfs_log_dinode *to) +{ + if (xfs_inode_has_large_extent_counts(ip)) { + to->di_big_nextents = xfs_ifork_nextents(&ip->i_df); + to->di_big_anextents = xfs_ifork_nextents(ip->i_afp); + to->di_nrext64_pad = 0; + } else { + to->di_nextents = xfs_ifork_nextents(&ip->i_df); + to->di_anextents = xfs_ifork_nextents(ip->i_afp); + } +} + static void xfs_inode_to_log_dinode( struct xfs_inode *ip, @@ -374,7 +378,6 @@ xfs_inode_to_log_dinode( to->di_projid_lo = ip->i_projid & 0xffff; to->di_projid_hi = ip->i_projid >> 16; - memset(to->di_pad, 0, sizeof(to->di_pad)); memset(to->di_pad3, 0, sizeof(to->di_pad3)); to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime); to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime); @@ -386,8 +389,6 @@ xfs_inode_to_log_dinode( to->di_size = ip->i_disk_size; to->di_nblocks = ip->i_nblocks; to->di_extsize = ip->i_extsize; - to->di_nextents = xfs_ifork_nextents(&ip->i_df); - to->di_anextents = xfs_ifork_nextents(ip->i_afp); to->di_forkoff = ip->i_forkoff; to->di_aformat = xfs_ifork_format(ip->i_afp); to->di_flags = ip->i_diflags; @@ -407,11 +408,14 @@ xfs_inode_to_log_dinode( to->di_lsn = lsn; memset(to->di_pad2, 0, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); - to->di_flushiter = 0; + to->di_v3_pad = 0; } else { to->di_version = 2; to->di_flushiter = ip->i_flushiter; + memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); } + + 
xfs_inode_to_log_dinode_iext_counters(ip, to); } /* diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 239dd2e3384e..d28ffaebd067 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -142,6 +142,29 @@ xfs_log_dinode_to_disk_ts( return ts; } +static inline bool xfs_log_dinode_has_large_extent_counts( + const struct xfs_log_dinode *ld) +{ + return ld->di_version >= 3 && + (ld->di_flags2 & XFS_DIFLAG2_NREXT64); +} + +static inline void +xfs_log_dinode_to_disk_iext_counters( + struct xfs_log_dinode *from, + struct xfs_dinode *to) +{ + if (xfs_log_dinode_has_large_extent_counts(from)) { + to->di_big_nextents = cpu_to_be64(from->di_big_nextents); + to->di_big_anextents = cpu_to_be32(from->di_big_anextents); + to->di_nrext64_pad = cpu_to_be16(from->di_nrext64_pad); + } else { + to->di_nextents = cpu_to_be32(from->di_nextents); + to->di_anextents = cpu_to_be16(from->di_anextents); + } + +} + STATIC void xfs_log_dinode_to_disk( struct xfs_log_dinode *from, @@ -158,7 +181,6 @@ xfs_log_dinode_to_disk( to->di_nlink = cpu_to_be32(from->di_nlink); to->di_projid_lo = cpu_to_be16(from->di_projid_lo); to->di_projid_hi = cpu_to_be16(from->di_projid_hi); - memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime); to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime); @@ -167,8 +189,6 @@ xfs_log_dinode_to_disk( to->di_size = cpu_to_be64(from->di_size); to->di_nblocks = cpu_to_be64(from->di_nblocks); to->di_extsize = cpu_to_be32(from->di_extsize); - to->di_nextents = cpu_to_be32(from->di_nextents); - to->di_anextents = cpu_to_be16(from->di_anextents); to->di_forkoff = from->di_forkoff; to->di_aformat = from->di_aformat; to->di_dmevmask = cpu_to_be32(from->di_dmevmask); @@ -184,12 +204,66 @@ xfs_log_dinode_to_disk( to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(from->di_ino); to->di_lsn = cpu_to_be64(lsn); - memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + memset(to->di_pad2, 0, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &from->di_uuid); - to->di_flushiter = 0; + to->di_v3_pad = 0; } else { to->di_flushiter = cpu_to_be16(from->di_flushiter); + memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); } + + xfs_log_dinode_to_disk_iext_counters(from, to); +} + +STATIC int +xlog_dinode_verify_extent_counts( + struct xfs_mount *mp, + struct xfs_log_dinode *ldip) +{ + xfs_extnum_t nextents; + xfs_aextnum_t anextents; + + if (xfs_log_dinode_has_large_extent_counts(ldip)) { + if (!xfs_has_large_extent_counts(mp) || + (ldip->di_nrext64_pad != 0)) { + XFS_CORRUPTION_ERROR( + "Bad log dinode large extent count format", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); + xfs_alert(mp, + "Bad inode 0x%llx, large extent counts %d, padding 0x%x", + ldip->di_ino, xfs_has_large_extent_counts(mp), + ldip->di_nrext64_pad); + return -EFSCORRUPTED; + } + + nextents = ldip->di_big_nextents; + anextents = ldip->di_big_anextents; + } else { + if (ldip->di_version == 3 && ldip->di_v3_pad != 0) { + XFS_CORRUPTION_ERROR( + "Bad log dinode di_v3_pad", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); + xfs_alert(mp, + "Bad inode 0x%llx, di_v3_pad 0x%llx", + ldip->di_ino, ldip->di_v3_pad); + return -EFSCORRUPTED; + } + + nextents = ldip->di_nextents; + anextents = ldip->di_anextents; + } + + if (unlikely(nextents + anextents > ldip->di_nblocks)) { + XFS_CORRUPTION_ERROR("Bad log dinode extent counts", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); + xfs_alert(mp, + "Bad inode 
0x%llx, large extent counts %d, nextents 0x%llx, anextents 0x%x, nblocks 0x%llx", + ldip->di_ino, xfs_has_large_extent_counts(mp), nextents, + anextents, ldip->di_nblocks); + return -EFSCORRUPTED; + } + + return 0; } STATIC int @@ -317,13 +391,12 @@ xlog_recover_inode_commit_pass2( if (unlikely(S_ISREG(ldip->di_mode))) { if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && (ldip->di_format != XFS_DINODE_FMT_BTREE)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); + XFS_CORRUPTION_ERROR( + "Bad log dinode data fork format for regular file", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); xfs_alert(mp, - "%s: Bad regular inode log record, rec ptr "PTR_FMT", " - "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", - __func__, item, dip, bp, in_f->ilf_ino); + "Bad inode 0x%llx, data fork format 0x%x", + in_f->ilf_ino, ldip->di_format); error = -EFSCORRUPTED; goto out_release; } @@ -331,49 +404,37 @@ xlog_recover_inode_commit_pass2( if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && (ldip->di_format != XFS_DINODE_FMT_BTREE) && (ldip->di_format != XFS_DINODE_FMT_LOCAL)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); + XFS_CORRUPTION_ERROR( + "Bad log dinode data fork format for directory", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); xfs_alert(mp, - "%s: Bad dir inode log record, rec ptr "PTR_FMT", " - "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", - __func__, item, dip, bp, in_f->ilf_ino); + "Bad inode 0x%llx, data fork format 0x%x", + in_f->ilf_ino, ldip->di_format); error = -EFSCORRUPTED; goto out_release; } } - if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){ - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); - xfs_alert(mp, - "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " - "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld", - __func__, item, dip, bp, in_f->ilf_ino, - ldip->di_nextents + ldip->di_anextents, - ldip->di_nblocks); - error = -EFSCORRUPTED; + + error = xlog_dinode_verify_extent_counts(mp, ldip); + if (error) goto out_release; - } + if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); + XFS_CORRUPTION_ERROR("Bad log dinode fork offset", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); xfs_alert(mp, - "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " - "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__, - item, dip, bp, in_f->ilf_ino, ldip->di_forkoff); + "Bad inode 0x%llx, di_forkoff 0x%x", + in_f->ilf_ino, ldip->di_forkoff); error = -EFSCORRUPTED; goto out_release; } isize = xfs_log_dinode_size(mp); if (unlikely(item->ri_buf[1].i_len > isize)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); + XFS_CORRUPTION_ERROR("Bad log dinode size", XFS_ERRLEVEL_LOW, + mp, ldip, sizeof(*ldip)); xfs_alert(mp, - "%s: Bad inode log record length %d, rec ptr "PTR_FMT, - __func__, item->ri_buf[1].i_len, item); + "Bad inode 0x%llx log dinode size 0x%x", + in_f->ilf_ino, item->ri_buf[1].i_len); error = -EFSCORRUPTED; goto out_release; } @@ -401,7 +462,7 @@ xlog_recover_inode_commit_pass2( ASSERT(in_f->ilf_size <= 4); ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); ASSERT(!(fields & XFS_ILOG_DFORK) || - (len == in_f->ilf_dsize)); + (len == xlog_calc_iovec_len(in_f->ilf_dsize))); switch (fields 
& XFS_ILOG_DFORK) { case XFS_ILOG_DDATA: @@ -436,7 +497,7 @@ xlog_recover_inode_commit_pass2( } len = item->ri_buf[attr_index].i_len; src = item->ri_buf[attr_index].i_addr; - ASSERT(len == in_f->ilf_asize); + ASSERT(len == xlog_calc_iovec_len(in_f->ilf_asize)); switch (in_f->ilf_fields & XFS_ILOG_AFORK) { case XFS_ILOG_ADATA: diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 83481005317a..0d67ff8a8961 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -15,6 +15,8 @@ #include "xfs_iwalk.h" #include "xfs_itable.h" #include "xfs_error.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" @@ -35,8 +37,7 @@ #include "xfs_health.h" #include "xfs_reflink.h" #include "xfs_ioctl.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" +#include "xfs_xattr.h" #include <linux/mount.h> #include <linux/namei.h> @@ -524,7 +525,7 @@ xfs_attrmulti_attr_set( args.valuelen = len; } - error = xfs_attr_set(&args); + error = xfs_attr_change(&args); if (!error && (flags & XFS_IOC_ATTR_ROOT)) xfs_forget_acl(inode, name); kfree(args.value); @@ -813,6 +814,9 @@ xfs_bulk_ireq_setup( if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount) return -ECANCELED; + if (hdr->flags & XFS_BULK_IREQ_NREXT64) + breq->flags |= XFS_IBULK_NREXT64; + return 0; } @@ -1092,7 +1096,8 @@ xfs_flags2diflags2( { uint64_t di_flags2 = (ip->i_diflags2 & (XFS_DIFLAG2_REFLINK | - XFS_DIFLAG2_BIGTIME)); + XFS_DIFLAG2_BIGTIME | + XFS_DIFLAG2_NREXT64)); if (xflags & FS_XFLAG_DAX) di_flags2 |= XFS_DIFLAG2_DAX; diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index ca25ed89b706..2f54b701eead 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -17,6 +17,8 @@ #include "xfs_itable.h" #include "xfs_fsops.h" #include "xfs_rtalloc.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_ioctl.h" #include "xfs_ioctl32.h" diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index e552ce541ec2..5a393259a3a3 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -251,6 +251,8 @@ xfs_iomap_write_direct( return error; error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr_exts); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, nr_exts); if (error) goto out_trans_cancel; @@ -402,7 +404,7 @@ xfs_iomap_prealloc_size( */ plen = prev.br_blockcount; while (xfs_iext_prev_extent(ifp, &ncur, &got)) { - if (plen > MAXEXTLEN / 2 || + if (plen > XFS_MAX_BMBT_EXTLEN / 2 || isnullstartblock(got.br_startblock) || got.br_startoff + got.br_blockcount != prev.br_startoff || got.br_startblock + got.br_blockcount != prev.br_startblock) @@ -414,23 +416,23 @@ xfs_iomap_prealloc_size( /* * If the size of the extents is greater than half the maximum extent * length, then use the current offset as the basis. This ensures that - * for large files the preallocation size always extends to MAXEXTLEN - * rather than falling short due to things like stripe unit/width - * alignment of real extents. + * for large files the preallocation size always extends to + * XFS_BMBT_MAX_EXTLEN rather than falling short due to things like stripe + * unit/width alignment of real extents. */ alloc_blocks = plen * 2; - if (alloc_blocks > MAXEXTLEN) + if (alloc_blocks > XFS_MAX_BMBT_EXTLEN) alloc_blocks = XFS_B_TO_FSB(mp, offset); qblocks = alloc_blocks; /* - * MAXEXTLEN is not a power of two value but we round the prealloc down - * to the nearest power of two value after throttling. 
To prevent the - * round down from unconditionally reducing the maximum supported - * prealloc size, we round up first, apply appropriate throttling, - * round down and cap the value to MAXEXTLEN. + * XFS_BMBT_MAX_EXTLEN is not a power of two value but we round the prealloc + * down to the nearest power of two value after throttling. To prevent + * the round down from unconditionally reducing the maximum supported + * prealloc size, we round up first, apply appropriate throttling, round + * down and cap the value to XFS_BMBT_MAX_EXTLEN. */ - alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN), + alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(XFS_MAX_BMBT_EXTLEN), alloc_blocks); freesp = percpu_counter_read_positive(&mp->m_fdblocks); @@ -478,14 +480,14 @@ xfs_iomap_prealloc_size( */ if (alloc_blocks) alloc_blocks = rounddown_pow_of_two(alloc_blocks); - if (alloc_blocks > MAXEXTLEN) - alloc_blocks = MAXEXTLEN; + if (alloc_blocks > XFS_MAX_BMBT_EXTLEN) + alloc_blocks = XFS_MAX_BMBT_EXTLEN; /* * If we are still trying to allocate more space than is * available, squash the prealloc hard. This can happen if we * have a large file on a small filesystem and the above - * lowspace thresholds are smaller than MAXEXTLEN. + * lowspace thresholds are smaller than XFS_BMBT_MAX_EXTLEN. */ while (alloc_blocks && alloc_blocks >= freesp) alloc_blocks >>= 4; @@ -555,6 +557,9 @@ xfs_iomap_write_unwritten( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_WRITE_UNWRITTEN_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_WRITE_UNWRITTEN_CNT); if (error) goto error_on_bmapi_transaction; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index b34e8e4344a8..29f5b8b8aca6 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -13,6 +13,8 @@ #include "xfs_inode.h" #include "xfs_acl.h" #include "xfs_quota.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_trans.h" #include "xfs_trace.h" @@ -22,6 +24,7 @@ #include "xfs_iomap.h" #include "xfs_error.h" #include "xfs_ioctl.h" +#include "xfs_xattr.h" #include <linux/posix_acl.h> #include <linux/security.h> @@ -59,7 +62,7 @@ xfs_initxattrs( .value = xattr->value, .valuelen = xattr->value_len, }; - error = xfs_attr_set(&args); + error = xfs_attr_change(&args); if (error < 0) break; } @@ -209,7 +212,6 @@ xfs_generic_create( if (unlikely(error)) goto out_cleanup_inode; -#ifdef CONFIG_XFS_POSIX_ACL if (default_acl) { error = __xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); if (error) @@ -220,7 +222,6 @@ xfs_generic_create( if (error) goto out_cleanup_inode; } -#endif xfs_setup_iops(ip); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index c08c79d9e311..f74c9fff72bb 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -64,6 +64,7 @@ xfs_bulkstat_one_int( struct xfs_inode *ip; /* incore inode pointer */ struct inode *inode; struct xfs_bulkstat *buf = bc->buf; + xfs_extnum_t nextents; int error = -EINVAL; if (xfs_internal_inum(mp, ino)) @@ -102,7 +103,13 @@ xfs_bulkstat_one_int( buf->bs_xflags = xfs_ip2xflags(ip); buf->bs_extsize_blks = ip->i_extsize; - buf->bs_extents = xfs_ifork_nextents(&ip->i_df); + + nextents = xfs_ifork_nextents(&ip->i_df); + if (!(bc->breq->flags & XFS_IBULK_NREXT64)) + buf->bs_extents = min(nextents, XFS_MAX_EXTCNT_DATA_FORK_SMALL); + else + buf->bs_extents64 = nextents; + xfs_bulkstat_health(ip, buf); buf->bs_aextents = xfs_ifork_nextents(ip->i_afp); buf->bs_forkoff = XFS_IFORK_BOFF(ip); @@ -256,6 +263,7 @@ xfs_bulkstat( .breq = 
breq, }; struct xfs_trans *tp; + unsigned int iwalk_flags = 0; int error; if (breq->mnt_userns != &init_user_ns) { @@ -279,7 +287,10 @@ xfs_bulkstat( if (error) goto out; - error = xfs_iwalk(breq->mp, tp, breq->startino, breq->flags, + if (breq->flags & XFS_IBULK_SAME_AG) + iwalk_flags |= XFS_IWALK_SAME_AG; + + error = xfs_iwalk(breq->mp, tp, breq->startino, iwalk_flags, xfs_bulkstat_iwalk, breq->icount, &bc); xfs_trans_cancel(tp); out: diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index 7078d10c9b12..e2d0eba43f35 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -17,7 +17,10 @@ struct xfs_ibulk { }; /* Only iterate within the same AG as startino */ -#define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG) +#define XFS_IBULK_SAME_AG (1U << 0) + +/* Fill out the bs_extents64 field if set. */ +#define XFS_IBULK_NREXT64 (1U << 1) /* * Advance the user buffer pointer by one record of the given size. If the diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h index 37a795f03267..83699089755e 100644 --- a/fs/xfs/xfs_iwalk.h +++ b/fs/xfs/xfs_iwalk.h @@ -26,7 +26,7 @@ int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino, unsigned int inode_records, bool poll, void *data); /* Only iterate inodes within the same AG as @startino. */ -#define XFS_IWALK_SAME_AG (0x1) +#define XFS_IWALK_SAME_AG (1U << 0) #define XFS_IWALK_FLAGS_ALL (XFS_IWALK_SAME_AG) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 499e15b24215..1e972f884a81 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -49,7 +49,6 @@ xlog_state_get_iclog_space( int len, struct xlog_in_core **iclog, struct xlog_ticket *ticket, - int *continued_write, int *logoffsetp); STATIC void xlog_grant_push_ail( @@ -61,10 +60,6 @@ xlog_sync( struct xlog_in_core *iclog); #if defined(DEBUG) STATIC void -xlog_verify_dest_ptr( - struct xlog *log, - void *ptr); -STATIC void xlog_verify_grant_tail( struct xlog *log); STATIC void @@ -77,7 +72,6 @@ xlog_verify_tail_lsn( struct xlog *log, struct xlog_in_core *iclog); #else -#define xlog_verify_dest_ptr(a,b) #define xlog_verify_grant_tail(a) #define xlog_verify_iclog(a,b,c) #define xlog_verify_tail_lsn(a,b) @@ -90,6 +84,62 @@ xlog_iclogs_empty( static int xfs_log_cover(struct xfs_mount *); +/* + * We need to make sure the buffer pointer returned is naturally aligned for the + * biggest basic data type we put into it. We have already accounted for this + * padding when sizing the buffer. + * + * However, this padding does not get written into the log, and hence we have to + * track the space used by the log vectors separately to prevent log space hangs + * due to inaccurate accounting (i.e. a leak) of the used log space through the + * CIL context ticket. + * + * We also add space for the xlog_op_header that describes this region in the + * log. This prepends the data region we return to the caller to copy their data + * into, so do all the static initialisation of the ophdr now. Because the ophdr + * is not 8 byte aligned, we have to be careful to ensure that we align the + * start of the buffer such that the region we return to the caller is 8 byte + * aligned and packed against the tail of the ophdr.
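As a worked illustration of the alignment rule described above: the on-disk xlog_op_header is 12 bytes, so each vector has to start 4 bytes short of an 8-byte boundary for the payload packed behind the header to come out naturally aligned. The following stand-alone sketch reproduces only that arithmetic; the 12-byte header size and the sample payload lengths are assumptions for illustration, not part of the patch:

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	#define OPHDR_LEN 12u			/* sizeof(struct xlog_op_header) */

	int main(void)
	{
		uint32_t data_lens[] = { 10, 6, 16 };	/* arbitrary payload sizes */
		uint32_t buf_len = 0;			/* running lv->lv_buf_len */

		for (int i = 0; i < 3; i++) {
			/* as in xlog_prepare_iovec(): align the end of the ophdr to 8 bytes */
			uint32_t used = buf_len + OPHDR_LEN;

			if (used % 8)
				buf_len = ((used + 7u) & ~7u) - OPHDR_LEN;
			assert((buf_len + OPHDR_LEN) % 8 == 0);	/* payload naturally aligned */
			printf("vec %d: ophdr at %u, payload at %u\n",
			       i, buf_len, buf_len + OPHDR_LEN);

			/* as in xlog_finish_iovec(): pad payload to 4 bytes and consume it */
			buf_len += OPHDR_LEN + ((data_lens[i] + 3u) & ~3u);
		}
		return 0;
	}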
+ */ +void * +xlog_prepare_iovec( + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp, + uint type) +{ + struct xfs_log_iovec *vec = *vecp; + struct xlog_op_header *oph; + uint32_t len; + void *buf; + + if (vec) { + ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs); + vec++; + } else { + vec = &lv->lv_iovecp[0]; + } + + len = lv->lv_buf_len + sizeof(struct xlog_op_header); + if (!IS_ALIGNED(len, sizeof(uint64_t))) { + lv->lv_buf_len = round_up(len, sizeof(uint64_t)) - + sizeof(struct xlog_op_header); + } + + vec->i_type = type; + vec->i_addr = lv->lv_buf + lv->lv_buf_len; + + oph = vec->i_addr; + oph->oh_clientid = XFS_TRANSACTION; + oph->oh_res2 = 0; + oph->oh_flags = 0; + + buf = vec->i_addr + sizeof(struct xlog_op_header); + ASSERT(IS_ALIGNED((unsigned long)buf, sizeof(uint64_t))); + + *vecp = vec; + return buf; +} + static void xlog_grant_sub_space( struct xlog *log, @@ -322,30 +372,6 @@ xlog_grant_head_check( return error; } -static void -xlog_tic_reset_res(xlog_ticket_t *tic) -{ - tic->t_res_num = 0; - tic->t_res_arr_sum = 0; - tic->t_res_num_ophdrs = 0; -} - -static void -xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type) -{ - if (tic->t_res_num == XLOG_TIC_LEN_MAX) { - /* add to overflow and start again */ - tic->t_res_o_flow += tic->t_res_arr_sum; - tic->t_res_num = 0; - tic->t_res_arr_sum = 0; - } - - tic->t_res_arr[tic->t_res_num].r_len = len; - tic->t_res_arr[tic->t_res_num].r_type = type; - tic->t_res_arr_sum += len; - tic->t_res_num++; -} - bool xfs_log_writable( struct xfs_mount *mp) @@ -395,8 +421,6 @@ xfs_log_regrant( xlog_grant_push_ail(log, tic->t_unit_res); tic->t_curr_res = tic->t_unit_res; - xlog_tic_reset_res(tic); - if (tic->t_cnt > 0) return 0; @@ -434,10 +458,9 @@ out_error: int xfs_log_reserve( struct xfs_mount *mp, - int unit_bytes, - int cnt, + int unit_bytes, + int cnt, struct xlog_ticket **ticp, - uint8_t client, bool permanent) { struct xlog *log = mp->m_log; @@ -445,15 +468,13 @@ xfs_log_reserve( int need_bytes; int error = 0; - ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); - if (xlog_is_shutdown(log)) return -EIO; XFS_STATS_INC(mp, xs_try_logspace); ASSERT(*ticp == NULL); - tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent); + tic = xlog_ticket_alloc(log, unit_bytes, cnt, permanent); *ticp = tic; xlog_grant_push_ail(log, tic->t_cnt ? 
tic->t_unit_res * tic->t_cnt @@ -901,12 +922,22 @@ xlog_write_unmount_record( struct xlog *log, struct xlog_ticket *ticket) { - struct xfs_unmount_log_format ulf = { - .magic = XLOG_UNMOUNT_TYPE, + struct { + struct xlog_op_header ophdr; + struct xfs_unmount_log_format ulf; + } unmount_rec = { + .ophdr = { + .oh_clientid = XFS_LOG, + .oh_tid = cpu_to_be32(ticket->t_tid), + .oh_flags = XLOG_UNMOUNT_TRANS, + }, + .ulf = { + .magic = XLOG_UNMOUNT_TYPE, + }, }; struct xfs_log_iovec reg = { - .i_addr = &ulf, - .i_len = sizeof(ulf), + .i_addr = &unmount_rec, + .i_len = sizeof(unmount_rec), .i_type = XLOG_REG_TYPE_UNMOUNT, }; struct xfs_log_vec vec = { @@ -914,10 +945,14 @@ xlog_write_unmount_record( .lv_iovecp = ®, }; + BUILD_BUG_ON((sizeof(struct xlog_op_header) + + sizeof(struct xfs_unmount_log_format)) != + sizeof(unmount_rec)); + /* account for space used by record data */ - ticket->t_curr_res -= sizeof(ulf); + ticket->t_curr_res -= sizeof(unmount_rec); - return xlog_write(log, NULL, &vec, ticket, XLOG_UNMOUNT_TRANS); + return xlog_write(log, NULL, &vec, ticket, reg.i_len); } /* @@ -933,7 +968,7 @@ xlog_unmount_write( struct xlog_ticket *tic = NULL; int error; - error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0); + error = xfs_log_reserve(mp, 600, 1, &tic, 0); if (error) goto out_err; @@ -1584,9 +1619,6 @@ xlog_alloc_log( GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!iclog->ic_data) goto out_free_iclog; -#ifdef DEBUG - log->l_iclog_bak[i] = &iclog->ic_header; -#endif head = &iclog->ic_header; memset(head, 0, sizeof(xlog_rec_header_t)); head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); @@ -1602,7 +1634,7 @@ xlog_alloc_log( iclog->ic_log = log; atomic_set(&iclog->ic_refcnt, 0); INIT_LIST_HEAD(&iclog->ic_callbacks); - iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; + iclog->ic_datap = (void *)iclog->ic_data + log->l_iclog_hsize; init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -2111,63 +2143,11 @@ xlog_print_tic_res( struct xfs_mount *mp, struct xlog_ticket *ticket) { - uint i; - uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); - - /* match with XLOG_REG_TYPE_* in xfs_log.h */ -#define REG_TYPE_STR(type, str) [XLOG_REG_TYPE_##type] = str - static char *res_type_str[] = { - REG_TYPE_STR(BFORMAT, "bformat"), - REG_TYPE_STR(BCHUNK, "bchunk"), - REG_TYPE_STR(EFI_FORMAT, "efi_format"), - REG_TYPE_STR(EFD_FORMAT, "efd_format"), - REG_TYPE_STR(IFORMAT, "iformat"), - REG_TYPE_STR(ICORE, "icore"), - REG_TYPE_STR(IEXT, "iext"), - REG_TYPE_STR(IBROOT, "ibroot"), - REG_TYPE_STR(ILOCAL, "ilocal"), - REG_TYPE_STR(IATTR_EXT, "iattr_ext"), - REG_TYPE_STR(IATTR_BROOT, "iattr_broot"), - REG_TYPE_STR(IATTR_LOCAL, "iattr_local"), - REG_TYPE_STR(QFORMAT, "qformat"), - REG_TYPE_STR(DQUOT, "dquot"), - REG_TYPE_STR(QUOTAOFF, "quotaoff"), - REG_TYPE_STR(LRHEADER, "LR header"), - REG_TYPE_STR(UNMOUNT, "unmount"), - REG_TYPE_STR(COMMIT, "commit"), - REG_TYPE_STR(TRANSHDR, "trans header"), - REG_TYPE_STR(ICREATE, "inode create"), - REG_TYPE_STR(RUI_FORMAT, "rui_format"), - REG_TYPE_STR(RUD_FORMAT, "rud_format"), - REG_TYPE_STR(CUI_FORMAT, "cui_format"), - REG_TYPE_STR(CUD_FORMAT, "cud_format"), - REG_TYPE_STR(BUI_FORMAT, "bui_format"), - REG_TYPE_STR(BUD_FORMAT, "bud_format"), - }; - BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1); -#undef REG_TYPE_STR - xfs_warn(mp, "ticket reservation summary:"); - xfs_warn(mp, " unit res = %d bytes", - ticket->t_unit_res); - xfs_warn(mp, " current res = %d bytes", - ticket->t_curr_res); 
- xfs_warn(mp, " total reg = %u bytes (o/flow = %u bytes)", - ticket->t_res_arr_sum, ticket->t_res_o_flow); - xfs_warn(mp, " ophdrs = %u (ophdr space = %u bytes)", - ticket->t_res_num_ophdrs, ophdr_spc); - xfs_warn(mp, " ophdr + reg = %u bytes", - ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc); - xfs_warn(mp, " num regions = %u", - ticket->t_res_num); - - for (i = 0; i < ticket->t_res_num; i++) { - uint r_type = ticket->t_res_arr[i].r_type; - xfs_warn(mp, "region[%u]: %s - %u bytes", i, - ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? - "bad-rtype" : res_type_str[r_type]), - ticket->t_res_arr[i].r_len); - } + xfs_warn(mp, " unit res = %d bytes", ticket->t_unit_res); + xfs_warn(mp, " current res = %d bytes", ticket->t_curr_res); + xfs_warn(mp, " original count = %d", ticket->t_ocnt); + xfs_warn(mp, " remaining count = %d", ticket->t_cnt); } /* @@ -2220,187 +2200,226 @@ xlog_print_trans( } } +static inline void +xlog_write_iovec( + struct xlog_in_core *iclog, + uint32_t *log_offset, + void *data, + uint32_t write_len, + int *bytes_left, + uint32_t *record_cnt, + uint32_t *data_cnt) +{ + ASSERT(*log_offset < iclog->ic_log->l_iclog_size); + ASSERT(*log_offset % sizeof(int32_t) == 0); + ASSERT(write_len % sizeof(int32_t) == 0); + + memcpy(iclog->ic_datap + *log_offset, data, write_len); + *log_offset += write_len; + *bytes_left -= write_len; + (*record_cnt)++; + *data_cnt += write_len; +} + /* - * Calculate the potential space needed by the log vector. We may need a start - * record, and each region gets its own struct xlog_op_header and may need to be - * double word aligned. + * Write log vectors into a single iclog which is guaranteed by the caller + * to have enough space to write the entire log vector into. */ -static int -xlog_write_calc_vec_length( +static void +xlog_write_full( + struct xfs_log_vec *lv, struct xlog_ticket *ticket, - struct xfs_log_vec *log_vector, - uint optype) + struct xlog_in_core *iclog, + uint32_t *log_offset, + uint32_t *len, + uint32_t *record_cnt, + uint32_t *data_cnt) { - struct xfs_log_vec *lv; - int headers = 0; - int len = 0; - int i; + int index; - if (optype & XLOG_START_TRANS) - headers++; + ASSERT(*log_offset + *len <= iclog->ic_size || + iclog->ic_state == XLOG_STATE_WANT_SYNC); - for (lv = log_vector; lv; lv = lv->lv_next) { - /* we don't write ordered log vectors */ - if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) - continue; - - headers += lv->lv_niovecs; - - for (i = 0; i < lv->lv_niovecs; i++) { - struct xfs_log_iovec *vecp = &lv->lv_iovecp[i]; + /* + * Ordered log vectors have no regions to write so this + * loop will naturally skip them. 
+ */ + for (index = 0; index < lv->lv_niovecs; index++) { + struct xfs_log_iovec *reg = &lv->lv_iovecp[index]; + struct xlog_op_header *ophdr = reg->i_addr; - len += vecp->i_len; - xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type); - } + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + xlog_write_iovec(iclog, log_offset, reg->i_addr, + reg->i_len, len, record_cnt, data_cnt); } - - ticket->t_res_num_ophdrs += headers; - len += headers * sizeof(struct xlog_op_header); - - return len; -} - -static void -xlog_write_start_rec( - struct xlog_op_header *ophdr, - struct xlog_ticket *ticket) -{ - ophdr->oh_tid = cpu_to_be32(ticket->t_tid); - ophdr->oh_clientid = ticket->t_clientid; - ophdr->oh_len = 0; - ophdr->oh_flags = XLOG_START_TRANS; - ophdr->oh_res2 = 0; } -static xlog_op_header_t * -xlog_write_setup_ophdr( - struct xlog *log, - struct xlog_op_header *ophdr, +static int +xlog_write_get_more_iclog_space( struct xlog_ticket *ticket, - uint flags) + struct xlog_in_core **iclogp, + uint32_t *log_offset, + uint32_t len, + uint32_t *record_cnt, + uint32_t *data_cnt) { - ophdr->oh_tid = cpu_to_be32(ticket->t_tid); - ophdr->oh_clientid = ticket->t_clientid; - ophdr->oh_res2 = 0; - - /* are we copying a commit or unmount record? */ - ophdr->oh_flags = flags; + struct xlog_in_core *iclog = *iclogp; + struct xlog *log = iclog->ic_log; + int error; - /* - * We've seen logs corrupted with bad transaction client ids. This - * makes sure that XFS doesn't generate them on. Turn this into an EIO - * and shut down the filesystem. - */ - switch (ophdr->oh_clientid) { - case XFS_TRANSACTION: - case XFS_VOLUME: - case XFS_LOG: - break; - default: - xfs_warn(log->l_mp, - "Bad XFS transaction clientid 0x%x in ticket "PTR_FMT, - ophdr->oh_clientid, ticket); - return NULL; - } + spin_lock(&log->l_icloglock); + ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC); + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + error = xlog_state_release_iclog(log, iclog); + spin_unlock(&log->l_icloglock); + if (error) + return error; - return ophdr; + error = xlog_state_get_iclog_space(log, len, &iclog, ticket, + log_offset); + if (error) + return error; + *record_cnt = 0; + *data_cnt = 0; + *iclogp = iclog; + return 0; } /* - * Set up the parameters of the region copy into the log. This has - * to handle region write split across multiple log buffers - this - * state is kept external to this function so that this code can - * be written in an obvious, self documenting manner. + * Write log vectors into a single iclog which is smaller than the current chain + * length. We write until we cannot fit a full record into the remaining space + * and then stop. We return the log vector that is to be written that cannot + * wholly fit in the iclog. 
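To make the continuation protocol concrete before the implementation: a region that spans iclogs is emitted as a chain of opheaders, the first flagged XLOG_CONTINUE_TRANS, intermediate chunks XLOG_WAS_CONT_TRANS plus XLOG_CONTINUE_TRANS, and the final chunk XLOG_WAS_CONT_TRANS plus XLOG_END_TRANS. A hedged user-space sketch of that labelling follows; it assumes a fixed amount of free space per iclog, whereas the real code recomputes the remaining space for each new iclog:

	#include <stdio.h>

	/* Stand-ins for the XLOG_* opheader flags used by continuations. */
	#define CONTINUE_TRANS	0x1		/* more of this region follows */
	#define WAS_CONT_TRANS	0x2		/* continues an earlier chunk */
	#define END_TRANS	0x4		/* region completes with this chunk */

	/* Split a 'len' byte region into chunks of at most 'space' bytes and
	 * print the flag sequence log recovery would observe. */
	static void split_region(unsigned int len, unsigned int space)
	{
		unsigned int off = 0;

		while (off < len) {
			unsigned int rlen = len - off;
			unsigned int flags = off ? WAS_CONT_TRANS : 0;

			if (rlen > space) {
				rlen = space;
				flags |= CONTINUE_TRANS;
			} else if (off) {
				flags |= END_TRANS;
			}
			printf("chunk @%u: %u bytes, flags 0x%x\n", off, rlen, flags);
			off += rlen;
		}
	}

	int main(void)
	{
		split_region(1000, 384);	/* e.g. 1000-byte region, 384 bytes free per iclog */
		return 0;
	}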
*/ static int -xlog_write_setup_copy( +xlog_write_partial( + struct xfs_log_vec *lv, struct xlog_ticket *ticket, - struct xlog_op_header *ophdr, - int space_available, - int space_required, - int *copy_off, - int *copy_len, - int *last_was_partial_copy, - int *bytes_consumed) -{ - int still_to_copy; - - still_to_copy = space_required - *bytes_consumed; - *copy_off = *bytes_consumed; - - if (still_to_copy <= space_available) { - /* write of region completes here */ - *copy_len = still_to_copy; - ophdr->oh_len = cpu_to_be32(*copy_len); - if (*last_was_partial_copy) - ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); - *last_was_partial_copy = 0; - *bytes_consumed = 0; - return 0; - } + struct xlog_in_core **iclogp, + uint32_t *log_offset, + uint32_t *len, + uint32_t *record_cnt, + uint32_t *data_cnt) +{ + struct xlog_in_core *iclog = *iclogp; + struct xlog_op_header *ophdr; + int index = 0; + uint32_t rlen; + int error; + + /* walk the logvec, copying until we run out of space in the iclog */ + for (index = 0; index < lv->lv_niovecs; index++) { + struct xfs_log_iovec *reg = &lv->lv_iovecp[index]; + uint32_t reg_offset = 0; + + /* + * The first region of a continuation must have a non-zero + * length otherwise log recovery will just skip over it and + * start recovering from the next opheader it finds. Because we + * mark the next opheader as a continuation, recovery will then + * incorrectly add the continuation to the previous region and + * that breaks stuff. + * + * Hence if there isn't space for region data after the + * opheader, then we need to start afresh with a new iclog. + */ + if (iclog->ic_size - *log_offset <= + sizeof(struct xlog_op_header)) { + error = xlog_write_get_more_iclog_space(ticket, + &iclog, log_offset, *len, record_cnt, + data_cnt); + if (error) + return error; + } - /* partial write of region, needs extra log op header reservation */ - *copy_len = space_available; - ophdr->oh_len = cpu_to_be32(*copy_len); - ophdr->oh_flags |= XLOG_CONTINUE_TRANS; - if (*last_was_partial_copy) - ophdr->oh_flags |= XLOG_WAS_CONT_TRANS; - *bytes_consumed += *copy_len; - (*last_was_partial_copy)++; + ophdr = reg->i_addr; + rlen = min_t(uint32_t, reg->i_len, iclog->ic_size - *log_offset); - /* account for new log op header */ - ticket->t_curr_res -= sizeof(struct xlog_op_header); - ticket->t_res_num_ophdrs++; + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_len = cpu_to_be32(rlen - sizeof(struct xlog_op_header)); + if (rlen != reg->i_len) + ophdr->oh_flags |= XLOG_CONTINUE_TRANS; - return sizeof(struct xlog_op_header); -} + xlog_write_iovec(iclog, log_offset, reg->i_addr, + rlen, len, record_cnt, data_cnt); -static int -xlog_write_copy_finish( - struct xlog *log, - struct xlog_in_core *iclog, - uint flags, - int *record_cnt, - int *data_cnt, - int *partial_copy, - int *partial_copy_len, - int log_offset) -{ - int error; + /* If we wrote the whole region, move to the next. */ + if (rlen == reg->i_len) + continue; - if (*partial_copy) { /* - * This iclog has already been marked WANT_SYNC by - * xlog_state_get_iclog_space. + * We now have a partially written iovec, but it can span + * multiple iclogs so we loop here. First we release the iclog + * we currently have, then we get a new iclog and add a new + * opheader. Then we continue copying from where we were until + * we either complete the iovec or fill the iclog. If we + * complete the iovec, then we increment the index and go right + * back to the top of the outer loop. 
If we fill the iclog, we + * run the inner loop again. + * + * This is complicated by the tail of a region using all the + * space in an iclog and hence requiring us to release the iclog + * and get a new one before returning to the outer loop. We must + * always guarantee that we exit this inner loop with at least + * space for log transaction opheaders left in the current + * iclog, hence we cannot just terminate the loop at the end + * of the continuation. So we loop while there is no + * space left in the current iclog, and check for the end of the + * continuation after getting a new iclog. */ - spin_lock(&log->l_icloglock); - xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); - *record_cnt = 0; - *data_cnt = 0; - goto release_iclog; - } + do { + /* + * Ensure we include the continuation opheader in the + * space we need in the new iclog by adding that size + * to the length we require. This continuation opheader + * needs to be accounted to the ticket as the space it + * consumes hasn't been accounted to the lv we are + * writing. + */ + error = xlog_write_get_more_iclog_space(ticket, + &iclog, log_offset, + *len + sizeof(struct xlog_op_header), + record_cnt, data_cnt); + if (error) + return error; - *partial_copy = 0; - *partial_copy_len = 0; + ophdr = iclog->ic_datap + *log_offset; + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_clientid = XFS_TRANSACTION; + ophdr->oh_res2 = 0; + ophdr->oh_flags = XLOG_WAS_CONT_TRANS; - if (iclog->ic_size - log_offset > sizeof(xlog_op_header_t)) - return 0; + ticket->t_curr_res -= sizeof(struct xlog_op_header); + *log_offset += sizeof(struct xlog_op_header); + *data_cnt += sizeof(struct xlog_op_header); - /* no more space in this iclog - push it. */ - spin_lock(&log->l_icloglock); - xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); - *record_cnt = 0; - *data_cnt = 0; + /* + * If rlen fits in the iclog, then end the region + * continuation. Otherwise we're going around again. + */ + reg_offset += rlen; + rlen = reg->i_len - reg_offset; + if (rlen <= iclog->ic_size - *log_offset) + ophdr->oh_flags |= XLOG_END_TRANS; + else + ophdr->oh_flags |= XLOG_CONTINUE_TRANS; - if (iclog->ic_state == XLOG_STATE_ACTIVE) - xlog_state_switch_iclogs(log, iclog, 0); - else - ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || - xlog_is_shutdown(log)); -release_iclog: - error = xlog_state_release_iclog(log, iclog); - spin_unlock(&log->l_icloglock); - return error; + rlen = min_t(uint32_t, rlen, iclog->ic_size - *log_offset); + ophdr->oh_len = cpu_to_be32(rlen); + + xlog_write_iovec(iclog, log_offset, + reg->i_addr + reg_offset, + rlen, len, record_cnt, data_cnt); + + } while (ophdr->oh_flags & XLOG_CONTINUE_TRANS); + } + + /* + * No more iovecs remain in this logvec so return the next log vec to + * the caller so it can go back to fast path copying. + */ + *iclogp = iclog; + return 0; } /* @@ -2449,27 +2468,16 @@ xlog_write( struct xfs_cil_ctx *ctx, struct xfs_log_vec *log_vector, struct xlog_ticket *ticket, - uint optype) + uint32_t len) + { struct xlog_in_core *iclog = NULL; struct xfs_log_vec *lv = log_vector; - struct xfs_log_iovec *vecp = lv->lv_iovecp; - int index = 0; - int len; - int partial_copy = 0; - int partial_copy_len = 0; - int contwr = 0; - int record_cnt = 0; - int data_cnt = 0; + uint32_t record_cnt = 0; + uint32_t data_cnt = 0; int error = 0; + int log_offset; - /* - * If this is a commit or unmount transaction, we don't need a start - * record to be written.
We do, however, have to account for the - * commit or unmount header that gets written. Hence we always have - * to account for an extra xlog_op_header here. - */ - ticket->t_curr_res -= sizeof(struct xlog_op_header); if (ticket->t_curr_res < 0) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "ctx ticket reservation ran out. Need to up reservation"); @@ -2477,144 +2485,54 @@ xlog_write( xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } - len = xlog_write_calc_vec_length(ticket, log_vector, optype); - while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { - void *ptr; - int log_offset; - - error = xlog_state_get_iclog_space(log, len, &iclog, ticket, - &contwr, &log_offset); - if (error) - return error; + error = xlog_state_get_iclog_space(log, len, &iclog, ticket, + &log_offset); + if (error) + return error; - ASSERT(log_offset <= iclog->ic_size - 1); - ptr = iclog->ic_datap + log_offset; + ASSERT(log_offset <= iclog->ic_size - 1); - /* - * If we have a context pointer, pass it the first iclog we are - * writing to so it can record state needed for iclog write - * ordering. - */ - if (ctx) { - xlog_cil_set_ctx_write_state(ctx, iclog); - ctx = NULL; - } + /* + * If we have a context pointer, pass it the first iclog we are + * writing to so it can record state needed for iclog write + * ordering. + */ + if (ctx) + xlog_cil_set_ctx_write_state(ctx, iclog); + while (lv) { /* - * This loop writes out as many regions as can fit in the amount - * of space which was allocated by xlog_state_get_iclog_space(). + * If the entire log vec does not fit in the iclog, punt it to + * the partial copy loop which can handle this case. */ - while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { - struct xfs_log_iovec *reg; - struct xlog_op_header *ophdr; - int copy_len; - int copy_off; - bool ordered = false; - bool wrote_start_rec = false; - - /* ordered log vectors have no regions to write */ - if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) { - ASSERT(lv->lv_niovecs == 0); - ordered = true; - goto next_lv; - } - - reg = &vecp[index]; - ASSERT(reg->i_len % sizeof(int32_t) == 0); - ASSERT((unsigned long)ptr % sizeof(int32_t) == 0); - - /* - * Before we start formatting log vectors, we need to - * write a start record. Only do this for the first - * iclog we write to. - */ - if (optype & XLOG_START_TRANS) { - xlog_write_start_rec(ptr, ticket); - xlog_write_adv_cnt(&ptr, &len, &log_offset, - sizeof(struct xlog_op_header)); - optype &= ~XLOG_START_TRANS; - wrote_start_rec = true; - } - - ophdr = xlog_write_setup_ophdr(log, ptr, ticket, optype); - if (!ophdr) - return -EIO; - - xlog_write_adv_cnt(&ptr, &len, &log_offset, - sizeof(struct xlog_op_header)); - - len += xlog_write_setup_copy(ticket, ophdr, - iclog->ic_size-log_offset, - reg->i_len, - ©_off, ©_len, - &partial_copy, - &partial_copy_len); - xlog_verify_dest_ptr(log, ptr); - - /* - * Copy region. - * - * Unmount records just log an opheader, so can have - * empty payloads with no data region to copy. Hence we - * only copy the payload if the vector says it has data - * to copy. - */ - ASSERT(copy_len >= 0); - if (copy_len > 0) { - memcpy(ptr, reg->i_addr + copy_off, copy_len); - xlog_write_adv_cnt(&ptr, &len, &log_offset, - copy_len); - } - copy_len += sizeof(struct xlog_op_header); - record_cnt++; - if (wrote_start_rec) { - copy_len += sizeof(struct xlog_op_header); - record_cnt++; - } - data_cnt += contwr ? 
copy_len : 0; - - error = xlog_write_copy_finish(log, iclog, optype, - &record_cnt, &data_cnt, - &partial_copy, - &partial_copy_len, - log_offset); - if (error) + if (lv->lv_niovecs && + lv->lv_bytes > iclog->ic_size - log_offset) { + error = xlog_write_partial(lv, ticket, &iclog, + &log_offset, &len, &record_cnt, + &data_cnt); + if (error) { + /* + * We have no iclog to release, so just return + * the error immediately. + */ return error; - - /* - * if we had a partial copy, we need to get more iclog - * space but we don't want to increment the region - * index because there is still more is this region to - * write. - * - * If we completed writing this region, and we flushed - * the iclog (indicated by resetting of the record - * count), then we also need to get more log space. If - * this was the last record, though, we are done and - * can just return. - */ - if (partial_copy) - break; - - if (++index == lv->lv_niovecs) { -next_lv: - lv = lv->lv_next; - index = 0; - if (lv) - vecp = lv->lv_iovecp; - } - if (record_cnt == 0 && !ordered) { - if (!lv) - return 0; - break; } + } else { + xlog_write_full(lv, ticket, iclog, &log_offset, + &len, &record_cnt, &data_cnt); } + lv = lv->lv_next; } - ASSERT(len == 0); + /* + * We've already been guaranteed that the last writes will fit inside + * the current iclog, and hence it will already have the space used by + * those writes accounted to it. Hence we do not need to update the + * iclog with the number of bytes written here. + */ spin_lock(&log->l_icloglock); - xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); + xlog_state_finish_copy(log, iclog, record_cnt, 0); error = xlog_state_release_iclog(log, iclog); spin_unlock(&log->l_icloglock); @@ -2971,7 +2889,6 @@ xlog_state_get_iclog_space( int len, struct xlog_in_core **iclogp, struct xlog_ticket *ticket, - int *continued_write, int *logoffsetp) { int log_offset; @@ -3008,9 +2925,6 @@ restart: */ if (log_offset == 0) { ticket->t_curr_res -= log->l_iclog_hsize; - xlog_tic_add_region(ticket, - log->l_iclog_hsize, - XLOG_REG_TYPE_LRHEADER); head->h_cycle = cpu_to_be32(log->l_curr_cycle); head->h_lsn = cpu_to_be64( xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block)); @@ -3052,13 +2966,10 @@ restart: * iclogs (to mark it taken), this particular iclog will release/sync * to disk in xlog_write(). 
*/ - if (len <= iclog->ic_size - iclog->ic_offset) { - *continued_write = 0; + if (len <= iclog->ic_size - iclog->ic_offset) iclog->ic_offset += len; - } else { - *continued_write = 1; + else xlog_state_switch_iclogs(log, iclog, iclog->ic_size); - } *iclogp = iclog; ASSERT(iclog->ic_offset <= iclog->ic_size); @@ -3090,7 +3001,6 @@ xfs_log_ticket_regrant( xlog_grant_sub_space(log, &log->l_write_head.grant, ticket->t_curr_res); ticket->t_curr_res = ticket->t_unit_res; - xlog_tic_reset_res(ticket); trace_xfs_log_ticket_regrant_sub(log, ticket); @@ -3101,7 +3011,6 @@ xfs_log_ticket_regrant( trace_xfs_log_ticket_regrant_exit(log, ticket); ticket->t_curr_res = ticket->t_unit_res; - xlog_tic_reset_res(ticket); } xfs_log_ticket_put(ticket); @@ -3591,7 +3500,6 @@ xlog_ticket_alloc( struct xlog *log, int unit_bytes, int cnt, - char client, bool permanent) { struct xlog_ticket *tic; @@ -3609,40 +3517,14 @@ xlog_ticket_alloc( tic->t_cnt = cnt; tic->t_ocnt = cnt; tic->t_tid = prandom_u32(); - tic->t_clientid = client; if (permanent) tic->t_flags |= XLOG_TIC_PERM_RESERV; - xlog_tic_reset_res(tic); - return tic; } #if defined(DEBUG) /* - * Make sure that the destination ptr is within the valid data region of - * one of the iclogs. This uses backup pointers stored in a different - * part of the log in case we trash the log structure. - */ -STATIC void -xlog_verify_dest_ptr( - struct xlog *log, - void *ptr) -{ - int i; - int good_ptr = 0; - - for (i = 0; i < log->l_iclog_bufs; i++) { - if (ptr >= log->l_iclog_bak[i] && - ptr <= log->l_iclog_bak[i] + log->l_iclog_size) - good_ptr++; - } - - if (!good_ptr) - xfs_emerg(log->l_mp, "%s: invalid ptr", __func__); -} - -/* * Check to make sure the grant write head didn't just overlap the tail. If * the cycles are the same, we can't be overlapping. Otherwise, make sure that * the cycles differ by exactly one and check the byte count.
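The overlap rule just stated reduces to a small predicate: with the write grant head and the log tail expressed as (cycle, byte offset) pairs, the head is sane while both sit in the same cycle, or while the head has wrapped into the next cycle but its byte offset still trails the tail's. A sketch of that predicate with invented names, since xlog_verify_grant_tail() open-codes it against the grant and tail atomics:

	#include <stdbool.h>

	/* Hypothetical helper mirroring the overlap test in xlog_verify_grant_tail(). */
	static bool grant_head_sane(int head_cycle, int head_bytes,
				    int tail_cycle, int tail_bytes)
	{
		if (head_cycle == tail_cycle)
			return true;			/* same pass over the log */
		if (head_cycle == tail_cycle + 1)
			return head_bytes <= tail_bytes;	/* wrapped, still trailing */
		return false;				/* head has lapped the tail */
	}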
@@ -3769,7 +3651,7 @@ xlog_verify_iclog( if (field_offset & 0x1ff) { clientid = ophead->oh_clientid; } else { - idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap); + idx = BTOBBT((void *)&ophead->oh_clientid - iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); @@ -3780,11 +3662,12 @@ iclog->ic_header.h_cycle_data[idx]); } } - if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) + if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) { xfs_warn(log->l_mp, - "%s: invalid clientid %d op "PTR_FMT" offset 0x%lx", - __func__, clientid, ophead, + "%s: op %d invalid clientid %d op "PTR_FMT" offset 0x%lx", + __func__, i, clientid, ophead, (unsigned long)field_offset); + } /* check length */ p = &ophead->oh_len; if (field_offset & 0x1ff) { op_len = be32_to_cpu(ophead->oh_len); } else { - idx = BTOBBT((uintptr_t)&ophead->oh_len - - (uintptr_t)iclog->ic_datap); + idx = BTOBBT((void *)&ophead->oh_len - iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); @@ -3829,7 +3711,7 @@ xlog_verify_iclog( bool xlog_force_shutdown( struct xlog *log, - int shutdown_flags) + uint32_t shutdown_flags) { bool log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index dc1b77b92fc1..f3ce046a7d45 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -21,46 +21,59 @@ struct xfs_log_vec { #define XFS_LOG_VEC_ORDERED (-1) -static inline void * -xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, - uint type) +/* + * Calculate the log iovec length for a given user buffer length. Intended to be + * used by ->iop_size implementations when sizing buffers of arbitrary + * alignments. + */ +static inline int +xlog_calc_iovec_len(int len) { - struct xfs_log_iovec *vec = *vecp; + return roundup(len, sizeof(uint32_t)); +} + +void *xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + uint type); - if (vec) { - ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs); - vec++; - } else { - vec = &lv->lv_iovecp[0]; +static inline void +xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, + int data_len) +{ + struct xlog_op_header *oph = vec->i_addr; + int len; + + /* + * Always round up the length to the correct alignment so callers don't + * need to know anything about this log vec layout requirement. This + * means we have to zero the area the data to be written does not cover. + * This is complicated by the fact that the payload region is offset into the + * logvec region by the opheader that tracks the payload. + */ + len = xlog_calc_iovec_len(data_len); + if (len - data_len != 0) { + char *buf = vec->i_addr + sizeof(struct xlog_op_header); + + memset(buf + data_len, 0, len - data_len); } - vec->i_type = type; - vec->i_addr = lv->lv_buf + lv->lv_buf_len; + /* + * The opheader tracks aligned payload length, whilst the logvec tracks + * the overall region length.
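A worked example of the two rounding rules just introduced: logging a 6-byte local fork yields an 8-byte payload (xlog_calc_iovec_len() pads to sizeof(uint32_t)), the two trailing pad bytes are zeroed, oh_len records 8, and the region charged to lv_buf_len/lv_bytes is 8 plus the 12-byte opheader. A compact sketch of that accounting, with the header size taken as an assumption:

	#include <assert.h>

	#define OPHDR_LEN 12u			/* sizeof(struct xlog_op_header) */

	/* Mirror of xlog_calc_iovec_len(): pad payloads to 4-byte multiples. */
	static unsigned int calc_iovec_len(unsigned int len)
	{
		return (len + 3u) & ~3u;
	}

	int main(void)
	{
		unsigned int data_len = 6;	/* e.g. a tiny local-format fork */
		unsigned int payload = calc_iovec_len(data_len);

		assert(payload == 8);			/* 2 pad bytes are zeroed */
		assert(payload + OPHDR_LEN == 20);	/* growth of lv_buf_len/lv_bytes */
		return 0;
	}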
+ */ + oph->oh_len = cpu_to_be32(len); - ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t))); + len += sizeof(struct xlog_op_header); + lv->lv_buf_len += len; + lv->lv_bytes += len; + vec->i_len = len; - *vecp = vec; - return vec->i_addr; + /* Catch buffer overruns */ + ASSERT((void *)lv->lv_buf + lv->lv_bytes <= (void *)lv + lv->lv_size); } /* - * We need to make sure the next buffer is naturally aligned for the biggest - * basic data type we put into it. We already accounted for this padding when - * sizing the buffer. - * - * However, this padding does not get written into the log, and hence we have to - * track the space used by the log vectors separately to prevent log space hangs - * due to inaccurate accounting (i.e. a leak) of the used log space through the - * CIL context ticket. + * Copy the amount of data requested by the caller into a new log iovec. */ -static inline void -xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len) -{ - lv->lv_buf_len += round_up(len, sizeof(uint64_t)); - lv->lv_bytes += len; - vec->i_len = len; -} - static inline void * xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, uint type, void *data, int len) @@ -73,6 +86,13 @@ xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, return buf; } +static inline void * +xlog_copy_from_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + const struct xfs_log_iovec *src) +{ + return xlog_copy_iovec(lv, vecp, src->i_type, src->i_addr, src->i_len); +} + /* * By comparing each component, we don't have to worry about extra * endian issues in treating two 32 bit numbers as one 64 bit number @@ -117,15 +137,11 @@ int xfs_log_mount_finish(struct xfs_mount *mp); void xfs_log_mount_cancel(struct xfs_mount *); xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); -void xfs_log_space_wake(struct xfs_mount *mp); -int xfs_log_reserve(struct xfs_mount *mp, - int length, - int count, - struct xlog_ticket **ticket, - uint8_t clientid, - bool permanent); -int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); -void xfs_log_unmount(struct xfs_mount *mp); +void xfs_log_space_wake(struct xfs_mount *mp); +int xfs_log_reserve(struct xfs_mount *mp, int length, int count, + struct xlog_ticket **ticket, bool permanent); +int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); +void xfs_log_unmount(struct xfs_mount *mp); bool xfs_log_writable(struct xfs_mount *mp); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); @@ -140,9 +156,10 @@ void xfs_log_clean(struct xfs_mount *mp); bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes); -bool xlog_force_shutdown(struct xlog *log, int shutdown_flags); +bool xlog_force_shutdown(struct xlog *log, uint32_t shutdown_flags); void xlog_use_incompat_feat(struct xlog *log); void xlog_drop_incompat_feat(struct xlog *log); +int xfs_attr_use_log_assist(struct xfs_mount *mp); #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index c9f55e4f0957..db6cb7800251 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -37,7 +37,7 @@ xlog_cil_ticket_alloc( { struct xlog_ticket *tic; - tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0); + tic = xlog_ticket_alloc(log, 0, 1, 0); /* * set the current reservation to zero so we know to steal the basic @@ -48,6 +48,38 @@ xlog_cil_ticket_alloc( } /* + * Check if the current log 
item was first committed in this sequence. + * We can't rely on just the log item being in the CIL, we have to check + * the recorded commit sequence number. + * + * Note: for this to be used in a non-racy manner, it has to be called with + * CIL flushing locked out. As a result, it should only be used during the + * transaction commit process when deciding what to format into the item. + */ +static bool +xlog_item_in_current_chkpt( + struct xfs_cil *cil, + struct xfs_log_item *lip) +{ + if (list_empty(&lip->li_cil)) + return false; + + /* + * li_seq is written on the first commit of a log item to record the + * first checkpoint it is written to. Hence if it is different to the + * current sequence, we're in a new checkpoint. + */ + return lip->li_seq == READ_ONCE(cil->xc_current_sequence); +} + +bool +xfs_log_item_in_current_chkpt( + struct xfs_log_item *lip) +{ + return xlog_item_in_current_chkpt(lip->li_log->l_cilp, lip); +} + +/* * Unavoidable forward declaration - xlog_cil_push_work() calls * xlog_cil_ctx_alloc() itself. */ @@ -103,39 +135,6 @@ xlog_cil_iovec_space( } /* - * shadow buffers can be large, so we need to use kvmalloc() here to ensure - * success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts to fall - * back to vmalloc, so we can't actually do anything useful with gfp flags to - * control the kmalloc() behaviour within kvmalloc(). Hence kmalloc() will do - * direct reclaim and compaction in the slow path, both of which are - * horrendously expensive. We just want kmalloc to fail fast and fall back to - * vmalloc if it can't get somethign straight away from the free lists or buddy - * allocator. Hence we have to open code kvmalloc outselves here. - * - * Also, we are in memalloc_nofs_save task context here, so despite the use of - * GFP_KERNEL here, we are actually going to be doing GFP_NOFS allocations. This - * is actually the only way to make vmalloc() do GFP_NOFS allocations, so lets - * just all pretend this is a GFP_KERNEL context operation.... - */ -static inline void * -xlog_cil_kvmalloc( - size_t buf_size) -{ - gfp_t flags = GFP_KERNEL; - void *p; - - flags &= ~__GFP_DIRECT_RECLAIM; - flags |= __GFP_NOWARN | __GFP_NORETRY; - do { - p = kmalloc(buf_size, flags); - if (!p) - p = vmalloc(buf_size); - } while (!p); - - return p; -} - -/* * Allocate or pin log vector buffers for CIL insertion. * * The CIL currently uses disposable buffers for copying a snapshot of the @@ -214,13 +213,20 @@ xlog_cil_alloc_shadow_bufs( } /* - * We 64-bit align the length of each iovec so that the start - * of the next one is naturally aligned. We'll need to - * account for that slack space here. Then round nbytes up - * to 64-bit alignment so that the initial buffer alignment is - * easy to calculate and verify. + * We 64-bit align the length of each iovec so that the start of + * the next one is naturally aligned. We'll need to account for + * that slack space here. + * + * We also add the xlog_op_header to each region when + * formatting, but that's not accounted to the size of the item + * at this point. Hence we'll need an additional number of bytes + * for each vector to hold an opheader. + * + * Then round nbytes up to 64-bit alignment so that the initial + * buffer alignment is easy to calculate and verify. */ - nbytes += niovecs * sizeof(uint64_t); + nbytes += niovecs * + (sizeof(uint64_t) + sizeof(struct xlog_op_header)); + nbytes = round_up(nbytes, sizeof(uint64_t)); /* @@ -244,7 +250,7 @@ * storage.
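For the sizing rule above, a concrete case: an item formatting two iovecs with 22 payload bytes between them reserves 22 + 2 * (8 + 12) = 62 bytes of shadow data, rounded up to 64, before xlog_cil_iovec_space() adds the xfs_log_vec and iovec array overhead. A sketch of that calculation, with the 8-byte alignment slack and the 12-byte opheader size taken as assumptions:

	#include <assert.h>

	#define OPHDR_LEN	12u	/* per-region xlog_op_header */
	#define ALIGN_SLACK	8u	/* worst-case 64-bit padding per iovec */

	/* Estimate the shadow data buffer size for one log item, following
	 * the nbytes computation in xlog_cil_alloc_shadow_bufs(). */
	static unsigned int shadow_data_size(unsigned int payload, unsigned int niovecs)
	{
		unsigned int nbytes = payload + niovecs * (ALIGN_SLACK + OPHDR_LEN);

		return (nbytes + 7u) & ~7u;	/* round_up(nbytes, sizeof(uint64_t)) */
	}

	int main(void)
	{
		assert(shadow_data_size(22, 2) == 64);	/* 22 + 2 * 20 = 62 -> 64 */
		return 0;
	}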
*/ kmem_free(lip->li_lv_shadow); - lv = xlog_cil_kvmalloc(buf_size); + lv = xlog_kvmalloc(buf_size); memset(lv, 0, xlog_cil_iovec_space(niovecs)); @@ -277,22 +283,18 @@ xlog_cil_alloc_shadow_bufs( /* * Prepare the log item for insertion into the CIL. Calculate the difference in - * log space and vectors it will consume, and if it is a new item pin it as - * well. + * log space it will consume, and if it is a new item pin it as well. */ STATIC void xfs_cil_prepare_item( struct xlog *log, struct xfs_log_vec *lv, struct xfs_log_vec *old_lv, - int *diff_len, - int *diff_iovecs) + int *diff_len) { /* Account for the new LV being passed in */ - if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) { + if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) *diff_len += lv->lv_bytes; - *diff_iovecs += lv->lv_niovecs; - } /* * If there is no old LV, this is the first time we've seen the item in @@ -309,7 +311,6 @@ xfs_cil_prepare_item( ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); *diff_len -= old_lv->lv_bytes; - *diff_iovecs -= old_lv->lv_niovecs; lv->lv_item->li_lv_shadow = old_lv; } @@ -358,12 +359,10 @@ static void xlog_cil_insert_format_items( struct xlog *log, struct xfs_trans *tp, - int *diff_len, - int *diff_iovecs) + int *diff_len) { struct xfs_log_item *lip; - /* Bail out if we didn't find a log item. */ if (list_empty(&tp->t_items)) { ASSERT(0); @@ -406,7 +405,6 @@ xlog_cil_insert_format_items( * set the item up as though it is a new insertion so * that the space reservation accounting is correct. */ - *diff_iovecs -= lv->lv_niovecs; *diff_len -= lv->lv_bytes; /* Ensure the lv is set up according to ->iop_size */ @@ -431,7 +429,7 @@ xlog_cil_insert_format_items( ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); lip->li_ops->iop_format(lip, lv); insert: - xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); + xfs_cil_prepare_item(log, lv, old_lv, diff_len); } } @@ -445,13 +443,13 @@ insert: static void xlog_cil_insert_items( struct xlog *log, - struct xfs_trans *tp) + struct xfs_trans *tp, + uint32_t released_space) { struct xfs_cil *cil = log->l_cilp; struct xfs_cil_ctx *ctx = cil->xc_ctx; struct xfs_log_item *lip; int len = 0; - int diff_iovecs = 0; int iclog_space; int iovhdr_res = 0, split_res = 0, ctx_res = 0; @@ -461,15 +459,10 @@ xlog_cil_insert_items( * We can do this safely because the context can't checkpoint until we * are done so it doesn't matter exactly how we update the CIL. 
*/ - xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs); + xlog_cil_insert_format_items(log, tp, &len); spin_lock(&cil->xc_cil_lock); - /* account for space used by new iovec headers */ - iovhdr_res = diff_iovecs * sizeof(xlog_op_header_t); - len += iovhdr_res; - ctx->nvecs += diff_iovecs; - /* attach the transaction to the CIL if it has any busy extents */ if (!list_empty(&tp->t_busy)) list_splice_init(&tp->t_busy, &ctx->busy_extents); @@ -500,7 +493,9 @@ ASSERT(tp->t_ticket->t_curr_res >= len); } tp->t_ticket->t_curr_res -= len; + tp->t_ticket->t_curr_res += released_space; ctx->space_used += len; + ctx->space_used -= released_space; /* * If we've overrun the reservation, dump the tx details before we move @@ -822,7 +817,8 @@ restart: static int xlog_cil_write_chain( struct xfs_cil_ctx *ctx, - struct xfs_log_vec *chain) + struct xfs_log_vec *chain, + uint32_t chain_len) { struct xlog *log = ctx->cil->xc_log; int error; @@ -830,7 +826,7 @@ xlog_cil_write_chain( error = xlog_cil_order_write(ctx->cil, ctx->sequence, _START_RECORD); if (error) return error; - return xlog_write(log, ctx, chain, ctx->ticket, XLOG_START_TRANS); + return xlog_write(log, ctx, chain, ctx->ticket, chain_len); } /* @@ -844,9 +840,14 @@ xlog_cil_write_commit_record( struct xfs_cil_ctx *ctx) { struct xlog *log = ctx->cil->xc_log; + struct xlog_op_header ophdr = { + .oh_clientid = XFS_TRANSACTION, + .oh_tid = cpu_to_be32(ctx->ticket->t_tid), + .oh_flags = XLOG_COMMIT_TRANS, + }; struct xfs_log_iovec reg = { - .i_addr = NULL, - .i_len = 0, + .i_addr = &ophdr, + .i_len = sizeof(struct xlog_op_header), .i_type = XLOG_REG_TYPE_COMMIT, }; struct xfs_log_vec vec = { @@ -862,12 +863,138 @@ xlog_cil_write_commit_record( if (error) return error; - error = xlog_write(log, ctx, &vec, ctx->ticket, XLOG_COMMIT_TRANS); + /* account for space used by record data */ + ctx->ticket->t_curr_res -= reg.i_len; + error = xlog_write(log, ctx, &vec, ctx->ticket, reg.i_len); if (error) xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); return error; } +struct xlog_cil_trans_hdr { + struct xlog_op_header oph[2]; + struct xfs_trans_header thdr; + struct xfs_log_iovec lhdr[2]; +}; + +/* + * Build a checkpoint transaction header to begin the journal transaction. We + * need to account for the space used by the transaction header here as it is + * not accounted for in xlog_write(). + * + * This is the only place we write a transaction header, so we also build the + * log opheaders that indicate the start of a log transaction and wrap the + * transaction header. We keep the start record in its own log vector rather + * than compacting them into a single region as this ends up making the logic + * in xlog_write() for handling empty opheaders for start, commit and unmount + * records much simpler.
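The struct defined above therefore serializes as two regions: region 0 is a bare 12-byte opheader flagged XLOG_START_TRANS, and region 1 is a 12-byte opheader immediately followed by the 16-byte transaction header it wraps, 40 bytes in total charged to the ticket. A minimal layout sketch using stand-in structs sized to match the on-disk ones (the stand-ins are assumptions for illustration):

	#include <assert.h>
	#include <stdint.h>

	/* Stand-ins sized like the on-disk xlog_op_header and xfs_trans_header. */
	struct op_header { uint32_t oh_tid, oh_len; uint8_t oh_clientid, oh_flags; uint16_t oh_res2; };
	struct trans_header { uint32_t th_magic, th_type, th_tid, th_num_items; };

	int main(void)
	{
		/* Region 0: the start record is an opheader with no payload. */
		unsigned int start_len = sizeof(struct op_header);
		/* Region 1: opheader wrapping the checkpoint transaction header;
		 * the two must be contiguous so one iovec can cover both. */
		unsigned int thdr_len = sizeof(struct op_header) + sizeof(struct trans_header);

		assert(start_len == 12 && thdr_len == 28);
		assert(start_len + thdr_len == 40);	/* lv_bytes charged to the ticket */
		return 0;
	}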
+ */ +static void +xlog_cil_build_trans_hdr( + struct xfs_cil_ctx *ctx, + struct xlog_cil_trans_hdr *hdr, + struct xfs_log_vec *lvhdr, + int num_iovecs) +{ + struct xlog_ticket *tic = ctx->ticket; + __be32 tid = cpu_to_be32(tic->t_tid); + + memset(hdr, 0, sizeof(*hdr)); + + /* Log start record */ + hdr->oph[0].oh_tid = tid; + hdr->oph[0].oh_clientid = XFS_TRANSACTION; + hdr->oph[0].oh_flags = XLOG_START_TRANS; + + /* log iovec region pointer */ + hdr->lhdr[0].i_addr = &hdr->oph[0]; + hdr->lhdr[0].i_len = sizeof(struct xlog_op_header); + hdr->lhdr[0].i_type = XLOG_REG_TYPE_LRHEADER; + + /* log opheader */ + hdr->oph[1].oh_tid = tid; + hdr->oph[1].oh_clientid = XFS_TRANSACTION; + hdr->oph[1].oh_len = cpu_to_be32(sizeof(struct xfs_trans_header)); + + /* transaction header in host byte order format */ + hdr->thdr.th_magic = XFS_TRANS_HEADER_MAGIC; + hdr->thdr.th_type = XFS_TRANS_CHECKPOINT; + hdr->thdr.th_tid = tic->t_tid; + hdr->thdr.th_num_items = num_iovecs; + + /* log iovec region pointer */ + hdr->lhdr[1].i_addr = &hdr->oph[1]; + hdr->lhdr[1].i_len = sizeof(struct xlog_op_header) + + sizeof(struct xfs_trans_header); + hdr->lhdr[1].i_type = XLOG_REG_TYPE_TRANSHDR; + + lvhdr->lv_niovecs = 2; + lvhdr->lv_iovecp = &hdr->lhdr[0]; + lvhdr->lv_bytes = hdr->lhdr[0].i_len + hdr->lhdr[1].i_len; + lvhdr->lv_next = ctx->lv_chain; + + tic->t_curr_res -= lvhdr->lv_bytes; +} + +/* + * Pull all the log vectors off the items in the CIL, and remove the items from + * the CIL. We don't need the CIL lock here because it's only needed on the + * transaction commit side which is currently locked out by the flush lock. + * + * If a log item is marked with a whiteout, we do not need to write it to the + * journal and so we just move it to the whiteout list for the caller to + * dispose of appropriately. + */ +static void +xlog_cil_build_lv_chain( + struct xfs_cil *cil, + struct xfs_cil_ctx *ctx, + struct list_head *whiteouts, + uint32_t *num_iovecs, + uint32_t *num_bytes) +{ + struct xfs_log_vec *lv = NULL; + + while (!list_empty(&cil->xc_cil)) { + struct xfs_log_item *item; + + item = list_first_entry(&cil->xc_cil, + struct xfs_log_item, li_cil); + + if (test_bit(XFS_LI_WHITEOUT, &item->li_flags)) { + list_move(&item->li_cil, whiteouts); + trace_xfs_cil_whiteout_skip(item); + continue; + } + + list_del_init(&item->li_cil); + if (!ctx->lv_chain) + ctx->lv_chain = item->li_lv; + else + lv->lv_next = item->li_lv; + lv = item->li_lv; + item->li_lv = NULL; + *num_iovecs += lv->lv_niovecs; + + /* we don't write ordered log vectors */ + if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) + *num_bytes += lv->lv_bytes; + } +} + +static void +xlog_cil_cleanup_whiteouts( + struct list_head *whiteouts) +{ + while (!list_empty(whiteouts)) { + struct xfs_log_item *item = list_first_entry(whiteouts, + struct xfs_log_item, li_cil); + list_del_init(&item->li_cil); + trace_xfs_cil_whiteout_unpin(item); + item->li_ops->iop_unpin(item, 1); + } +} + /* * Push the Committed Item List to the log.
* @@ -890,16 +1017,15 @@ xlog_cil_push_work( container_of(work, struct xfs_cil_ctx, push_work); struct xfs_cil *cil = ctx->cil; struct xlog *log = cil->xc_log; - struct xfs_log_vec *lv; struct xfs_cil_ctx *new_ctx; - struct xlog_ticket *tic; - int num_iovecs; + int num_iovecs = 0; + int num_bytes = 0; int error = 0; - struct xfs_trans_header thdr; - struct xfs_log_iovec lhdr; + struct xlog_cil_trans_hdr thdr; struct xfs_log_vec lvhdr = { NULL }; xfs_csn_t push_seq; bool push_commit_stable; + LIST_HEAD (whiteouts); new_ctx = xlog_cil_ctx_alloc(); new_ctx->ticket = xlog_cil_ticket_alloc(log); @@ -968,28 +1094,7 @@ xlog_cil_push_work( list_add(&ctx->committing, &cil->xc_committing); spin_unlock(&cil->xc_push_lock); - /* - * Pull all the log vectors off the items in the CIL, and remove the - * items from the CIL. We don't need the CIL lock here because it's only - * needed on the transaction commit side which is currently locked out - * by the flush lock. - */ - lv = NULL; - num_iovecs = 0; - while (!list_empty(&cil->xc_cil)) { - struct xfs_log_item *item; - - item = list_first_entry(&cil->xc_cil, - struct xfs_log_item, li_cil); - list_del_init(&item->li_cil); - if (!ctx->lv_chain) - ctx->lv_chain = item->li_lv; - else - lv->lv_next = item->li_lv; - lv = item->li_lv; - item->li_lv = NULL; - num_iovecs += lv->lv_niovecs; - } + xlog_cil_build_lv_chain(cil, ctx, &whiteouts, &num_iovecs, &num_bytes); /* * Switch the contexts so we can drop the context lock and move out @@ -1025,26 +1130,11 @@ xlog_cil_push_work( * Build a checkpoint transaction header and write it to the log to * begin the transaction. We need to account for the space used by the * transaction header here as it is not accounted for in xlog_write(). - * - * The LSN we need to pass to the log items on transaction commit is - * the LSN reported by the first log vector write. If we use the commit - * record lsn then we can move the tail beyond the grant write head. */ - tic = ctx->ticket; - thdr.th_magic = XFS_TRANS_HEADER_MAGIC; - thdr.th_type = XFS_TRANS_CHECKPOINT; - thdr.th_tid = tic->t_tid; - thdr.th_num_items = num_iovecs; - lhdr.i_addr = &thdr; - lhdr.i_len = sizeof(xfs_trans_header_t); - lhdr.i_type = XLOG_REG_TYPE_TRANSHDR; - tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t); - - lvhdr.lv_niovecs = 1; - lvhdr.lv_iovecp = &lhdr; - lvhdr.lv_next = ctx->lv_chain; - - error = xlog_cil_write_chain(ctx, &lvhdr); + xlog_cil_build_trans_hdr(ctx, &thdr, &lvhdr, num_iovecs); + num_bytes += lvhdr.lv_bytes; + + error = xlog_cil_write_chain(ctx, &lvhdr, num_bytes); if (error) goto out_abort_free_ticket; @@ -1052,7 +1142,7 @@ xlog_cil_push_work( if (error) goto out_abort_free_ticket; - xfs_log_ticket_ungrant(log, tic); + xfs_log_ticket_ungrant(log, ctx->ticket); /* * If the checkpoint spans multiple iclogs, wait for all previous iclogs @@ -1107,6 +1197,7 @@ xlog_cil_push_work( /* Not safe to reference ctx now! 
*/ spin_unlock(&log->l_icloglock); + xlog_cil_cleanup_whiteouts(&whiteouts); return; out_skip: @@ -1116,8 +1207,9 @@ out_skip: return; out_abort_free_ticket: - xfs_log_ticket_ungrant(log, tic); + xfs_log_ticket_ungrant(log, ctx->ticket); ASSERT(xlog_is_shutdown(log)); + xlog_cil_cleanup_whiteouts(&whiteouts); if (!ctx->commit_iclog) { xlog_cil_committed(ctx); return; @@ -1267,6 +1359,43 @@ xlog_cil_empty( } /* + * If there are intent done items in this transaction and the related intent was + * committed in the current (same) CIL checkpoint, we don't need to write either + * the intent or intent done item to the journal as the change will be + * journalled atomically within this checkpoint. As we cannot remove items from + * the CIL here, mark the related intent with a whiteout so that the CIL push + * can remove it rather than writing it to the journal. Then remove the intent + * done item from the current transaction and release it so it doesn't get put + * into the CIL at all. + */ +static uint32_t +xlog_cil_process_intents( + struct xfs_cil *cil, + struct xfs_trans *tp) +{ + struct xfs_log_item *lip, *ilip, *next; + uint32_t len = 0; + + list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) { + if (!(lip->li_ops->flags & XFS_ITEM_INTENT_DONE)) + continue; + + ilip = lip->li_ops->iop_intent(lip); + if (!ilip || !xlog_item_in_current_chkpt(cil, ilip)) + continue; + set_bit(XFS_LI_WHITEOUT, &ilip->li_flags); + trace_xfs_cil_whiteout_mark(ilip); + len += ilip->li_lv->lv_bytes; + kmem_free(ilip->li_lv); + ilip->li_lv = NULL; + + xfs_trans_del_item(lip); + lip->li_ops->iop_release(lip); + } + return len; +} + +/* * Commit a transaction with the given vector to the Committed Item List. * * To do this, we need to format the item, pin it in memory if required and @@ -1288,6 +1417,7 @@ xlog_cil_commit( { struct xfs_cil *cil = log->l_cilp; struct xfs_log_item *lip, *next; + uint32_t released_space = 0; /* * Do all necessary memory allocation before we lock the CIL. @@ -1299,7 +1429,10 @@ xlog_cil_commit( /* lock out background commit */ down_read(&cil->xc_ctx_lock); - xlog_cil_insert_items(log, tp); + if (tp->t_flags & XFS_TRANS_HAS_INTENT_DONE) + released_space = xlog_cil_process_intents(cil, tp); + + xlog_cil_insert_items(log, tp, released_space); if (regrant && !xlog_is_shutdown(log)) xfs_log_ticket_regrant(log, tp->t_ticket); @@ -1456,32 +1589,6 @@ out_shutdown: } /* - * Check if the current log item was first committed in this sequence. - * We can't rely on just the log item being in the CIL, we have to check - * the recorded commit sequence number. - * - * Note: for this to be used in a non-racy manner, it has to be called with - * CIL flushing locked out. As a result, it should only be used during the - * transaction commit process when deciding what to format into the item. - */ -bool -xfs_log_item_in_current_chkpt( - struct xfs_log_item *lip) -{ - struct xfs_cil *cil = lip->li_log->l_cilp; - - if (list_empty(&lip->li_cil)) - return false; - - /* - * li_seq is written on the first commit of a log item to record the - * first checkpoint it is written to. Hence if it is different to the - * current sequence, we're in a new checkpoint. - */ - return lip->li_seq == READ_ONCE(cil->xc_current_sequence); -} - -/* * Perform initial CIL structure initialisation. 
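A hedged worked example of the whiteout credit, using the CUI/CUD intent pair that gains iop_intent and XFS_ITEM_INTENT_DONE later in this diff (the 176-byte size is invented):

/*
 * A transaction commits a CUD whose matching CUI is still in the
 * current checkpoint and carries a 176-byte formatted log vector.
 * xlog_cil_process_intents() whiteouts the CUI, frees its lv and
 * returns released_space = 176; xlog_cil_insert_items() then hands
 * those bytes back to the ticket (t_curr_res += released_space) and
 * shrinks the checkpoint (ctx->space_used -= released_space), since
 * neither item will ever reach the journal.
 */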
*/ int diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 401cdc400980..686c01eb3661 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -51,8 +51,8 @@ enum xlog_iclog_state { /* * In core log flags */ -#define XLOG_ICL_NEED_FLUSH (1 << 0) /* iclog needs REQ_PREFLUSH */ -#define XLOG_ICL_NEED_FUA (1 << 1) /* iclog needs REQ_FUA */ +#define XLOG_ICL_NEED_FLUSH (1u << 0) /* iclog needs REQ_PREFLUSH */ +#define XLOG_ICL_NEED_FUA (1u << 1) /* iclog needs REQ_FUA */ #define XLOG_ICL_STRINGS \ { XLOG_ICL_NEED_FLUSH, "XLOG_ICL_NEED_FLUSH" }, \ @@ -62,7 +62,7 @@ enum xlog_iclog_state { /* * Log ticket flags */ -#define XLOG_TIC_PERM_RESERV 0x1 /* permanent reservation */ +#define XLOG_TIC_PERM_RESERV (1u << 0) /* permanent reservation */ #define XLOG_TIC_FLAGS \ { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } @@ -142,19 +142,6 @@ enum xlog_iclog_state { #define XLOG_COVER_OPS 5 -/* Ticket reservation region accounting */ -#define XLOG_TIC_LEN_MAX 15 - -/* - * Reservation region - * As would be stored in xfs_log_iovec but without the i_addr which - * we don't care about. - */ -typedef struct xlog_res { - uint r_len; /* region length :4 */ - uint r_type; /* region's transaction type :4 */ -} xlog_res_t; - typedef struct xlog_ticket { struct list_head t_queue; /* reserve/write queue */ struct task_struct *t_task; /* task that owns this ticket */ @@ -164,15 +151,7 @@ typedef struct xlog_ticket { int t_unit_res; /* unit reservation in bytes : 4 */ char t_ocnt; /* original count : 1 */ char t_cnt; /* current count : 1 */ - char t_clientid; /* who does this belong to; : 1 */ - char t_flags; /* properties of reservation : 1 */ - - /* reservation array fields */ - uint t_res_num; /* num in array : 4 */ - uint t_res_num_ophdrs; /* num op hdrs : 4 */ - uint t_res_arr_sum; /* array sum : 4 */ - uint t_res_o_flow; /* sum overflow : 4 */ - xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */ + uint8_t t_flags; /* properties of reservation : 1 */ } xlog_ticket_t; /* @@ -211,7 +190,7 @@ typedef struct xlog_in_core { u32 ic_offset; enum xlog_iclog_state ic_state; unsigned int ic_flags; - char *ic_datap; /* pointer to iclog data */ + void *ic_datap; /* pointer to iclog data */ struct list_head ic_callbacks; /* reference counts need their own cacheline */ @@ -242,7 +221,6 @@ struct xfs_cil_ctx { xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ struct xlog_in_core *commit_iclog; struct xlog_ticket *ticket; /* chkpt ticket */ - int nvecs; /* number of regions */ int space_used; /* aggregate size of regions */ struct list_head busy_extents; /* busy extents in chkpt */ struct xfs_log_vec *lv_chain; /* logvecs being pushed */ @@ -441,10 +419,6 @@ struct xlog { struct xfs_kobj l_kobj; - /* The following field are used for debugging; need to hold icloglock */ -#ifdef DEBUG - void *l_iclog_bak[XLOG_MAX_ICLOGS]; -#endif /* log recovery lsn tracking (for buffer submission */ xfs_lsn_t l_recovery_lsn; @@ -454,9 +428,6 @@ struct xlog { struct rw_semaphore l_incompat_users; }; -#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ - ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) - /* * Bits for operational state */ @@ -509,27 +480,14 @@ extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, char *dp, int size); extern struct kmem_cache *xfs_log_ticket_cache; -struct xlog_ticket * -xlog_ticket_alloc( - struct xlog *log, - int unit_bytes, - int count, - char client, - bool permanent); - -static inline void -xlog_write_adv_cnt(void **ptr, int *len, int 
*off, size_t bytes) -{ - *ptr += bytes; - *len -= bytes; - *off += bytes; -} +struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes, + int count, bool permanent); void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); void xlog_print_trans(struct xfs_trans *); int xlog_write(struct xlog *log, struct xfs_cil_ctx *ctx, struct xfs_log_vec *log_vector, struct xlog_ticket *tic, - uint optype); + uint32_t len); void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket); void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); @@ -690,4 +648,38 @@ xlog_valid_lsn( return valid; } +/* + * Log vector and shadow buffers can be large, so we need to use kvmalloc() here + * to ensure success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts + * to fall back to vmalloc, so we can't actually do anything useful with gfp + * flags to control the kmalloc() behaviour within kvmalloc(). Hence kmalloc() + * will do direct reclaim and compaction in the slow path, both of which are + * horrendously expensive. We just want kmalloc to fail fast and fall back to + * vmalloc if it can't get something straight away from the free lists or + * buddy allocator. Hence we have to open code kvmalloc ourselves here. + * + * This assumes that the caller uses memalloc_nofs_save task context here, so + * despite the use of GFP_KERNEL here, we are going to be doing GFP_NOFS + * allocations. This is actually the only way to make vmalloc() do GFP_NOFS + * allocations, so let's just all pretend this is a GFP_KERNEL context + * operation.... + */ +static inline void * +xlog_kvmalloc( + size_t buf_size) +{ + gfp_t flags = GFP_KERNEL; + void *p; + + flags &= ~__GFP_DIRECT_RECLAIM; + flags |= __GFP_NOWARN | __GFP_NORETRY; + do { + p = kmalloc(buf_size, flags); + if (!p) + p = vmalloc(buf_size); + } while (!p); + + return p; +} + #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index c4ad4296c540..5f7e4e6e33ce 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -39,13 +39,6 @@ STATIC int xlog_clear_stale_blocks( struct xlog *, xfs_lsn_t); -#if defined(DEBUG) -STATIC void -xlog_recover_check_summary( - struct xlog *); -#else -#define xlog_recover_check_summary(log) -#endif STATIC int xlog_do_recovery_pass( struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *); @@ -1800,6 +1793,8 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = { &xlog_cud_item_ops, &xlog_bui_item_ops, &xlog_bud_item_ops, + &xlog_attri_item_ops, + &xlog_attrd_item_ops, }; static const struct xlog_recover_item_ops * @@ -3228,7 +3223,7 @@ xlog_do_log_recovery( xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { - int error, i; + int error; ASSERT(head_blk != tail_blk); @@ -3236,37 +3231,25 @@ xlog_do_log_recovery( * First do a pass to find all of the cancelled buf log items. * Store them in the buf_cancel_table for use in the second pass. */ - log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * - sizeof(struct list_head), - 0); - for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) - INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); + error = xlog_alloc_buf_cancel_table(log); + if (error) + return error; error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS1, NULL); - if (error != 0) { - kmem_free(log->l_buf_cancel_table); - log->l_buf_cancel_table = NULL; - return error; - } + if (error != 0) + goto out_cancel; + /* * Then do a second pass to actually recover the items in the log.
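The xlog_alloc/check/free_buf_cancel_table() helpers called above are defined outside this hunk; a hedged sketch of the allocation and free sides, with the shape inferred from the removed open-coded version and the call sites:

int
xlog_alloc_buf_cancel_table(
	struct xlog	*log)
{
	int		i;

	ASSERT(log->l_buf_cancel_table == NULL);

	/* one list head per hash bucket, as in the removed open coding */
	log->l_buf_cancel_table = kmalloc_array(XLOG_BC_TABLE_SIZE,
						sizeof(struct list_head),
						GFP_KERNEL);
	if (!log->l_buf_cancel_table)
		return -ENOMEM;

	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
		INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
	return 0;
}

void
xlog_free_buf_cancel_table(
	struct xlog	*log)
{
	kmem_free(log->l_buf_cancel_table);
	log->l_buf_cancel_table = NULL;
}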
* When it is complete free the table of buf cancel items. */ error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS2, NULL); -#ifdef DEBUG - if (!error) { - int i; - - for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) - ASSERT(list_empty(&log->l_buf_cancel_table[i])); - } -#endif /* DEBUG */ - - kmem_free(log->l_buf_cancel_table); - log->l_buf_cancel_table = NULL; - + if (!error) + xlog_check_buf_cancel_table(log); +out_cancel: + xlog_free_buf_cancel_table(log); return error; } @@ -3337,8 +3320,6 @@ xlog_do_recover( } mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); - xlog_recover_check_summary(log); - /* Normal transactions can now occur */ clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); return 0; @@ -3481,7 +3462,6 @@ xlog_recover_finish( } xlog_recover_process_iunlinks(log); - xlog_recover_check_summary(log); /* * Recover any CoW staging blocks that are still referenced by the @@ -3515,52 +3495,3 @@ xlog_recover_cancel( xlog_recover_cancel_intents(log); } -#if defined(DEBUG) -/* - * Read all of the agf and agi counters and check that they - * are consistent with the superblock counters. - */ -STATIC void -xlog_recover_check_summary( - struct xlog *log) -{ - struct xfs_mount *mp = log->l_mp; - struct xfs_perag *pag; - struct xfs_buf *agfbp; - struct xfs_buf *agibp; - xfs_agnumber_t agno; - uint64_t freeblks; - uint64_t itotal; - uint64_t ifree; - int error; - - freeblks = 0LL; - itotal = 0LL; - ifree = 0LL; - for_each_perag(mp, agno, pag) { - error = xfs_read_agf(mp, NULL, pag->pag_agno, 0, &agfbp); - if (error) { - xfs_alert(mp, "%s agf read failed agno %d error %d", - __func__, pag->pag_agno, error); - } else { - struct xfs_agf *agfp = agfbp->b_addr; - - freeblks += be32_to_cpu(agfp->agf_freeblks) + - be32_to_cpu(agfp->agf_flcount); - xfs_buf_relse(agfbp); - } - - error = xfs_read_agi(mp, NULL, pag->pag_agno, &agibp); - if (error) { - xfs_alert(mp, "%s agi read failed agno %d error %d", - __func__, pag->pag_agno, error); - } else { - struct xfs_agi *agi = agibp->b_addr; - - itotal += be32_to_cpu(agi->agi_count); - ifree += be32_to_cpu(agi->agi_freecount); - xfs_buf_relse(agibp); - } - } -} -#endif /* DEBUG */ diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index bc66d95c8d4c..8f495cc23903 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -27,42 +27,34 @@ __xfs_printk( printk("%sXFS: %pV\n", level, vaf); } -#define define_xfs_printk_level(func, kern_level) \ -void func(const struct xfs_mount *mp, const char *fmt, ...) \ -{ \ - struct va_format vaf; \ - va_list args; \ - int level; \ - \ - va_start(args, fmt); \ - \ - vaf.fmt = fmt; \ - vaf.va = &args; \ - \ - __xfs_printk(kern_level, mp, &vaf); \ - va_end(args); \ - \ - if (!kstrtoint(kern_level, 0, &level) && \ - level <= LOGLEVEL_ERR && \ - xfs_error_level >= XFS_ERRLEVEL_HIGH) \ - xfs_stack_trace(); \ -} \ - -define_xfs_printk_level(xfs_emerg, KERN_EMERG); -define_xfs_printk_level(xfs_alert, KERN_ALERT); -define_xfs_printk_level(xfs_crit, KERN_CRIT); -define_xfs_printk_level(xfs_err, KERN_ERR); -define_xfs_printk_level(xfs_warn, KERN_WARNING); -define_xfs_printk_level(xfs_notice, KERN_NOTICE); -define_xfs_printk_level(xfs_info, KERN_INFO); -#ifdef DEBUG -define_xfs_printk_level(xfs_debug, KERN_DEBUG); -#endif +void +xfs_printk_level( + const char *kern_level, + const struct xfs_mount *mp, + const char *fmt, ...) 
+{ + struct va_format vaf; + va_list args; + int level; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + __xfs_printk(kern_level, mp, &vaf); + + va_end(args); + + if (!kstrtoint(kern_level, 0, &level) && + level <= LOGLEVEL_ERR && + xfs_error_level >= XFS_ERRLEVEL_HIGH) + xfs_stack_trace(); +} void -xfs_alert_tag( +_xfs_alert_tag( const struct xfs_mount *mp, - int panic_tag, + uint32_t panic_tag, const char *fmt, ...) { struct va_format vaf; diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index bb9860ec9a93..cc323775a12c 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -6,33 +6,46 @@ struct xfs_mount; -extern __printf(2, 3) -void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...); extern __printf(3, 4) -void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...); -extern __printf(2, 3) -void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_err(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_info(const struct xfs_mount *mp, const char *fmt, ...); +void xfs_printk_level(const char *kern_level, const struct xfs_mount *mp, + const char *fmt, ...); +#define xfs_printk_index_wrap(kern_level, mp, fmt, ...) \ +({ \ + printk_index_subsys_emit("%sXFS%s: ", kern_level, fmt); \ + xfs_printk_level(kern_level, mp, fmt, ##__VA_ARGS__); \ +}) +#define xfs_emerg(mp, fmt, ...) \ + xfs_printk_index_wrap(KERN_EMERG, mp, fmt, ##__VA_ARGS__) +#define xfs_alert(mp, fmt, ...) \ + xfs_printk_index_wrap(KERN_ALERT, mp, fmt, ##__VA_ARGS__) +#define xfs_crit(mp, fmt, ...) \ + xfs_printk_index_wrap(KERN_CRIT, mp, fmt, ##__VA_ARGS__) +#define xfs_err(mp, fmt, ...) \ + xfs_printk_index_wrap(KERN_ERR, mp, fmt, ##__VA_ARGS__) +#define xfs_warn(mp, fmt, ...) \ + xfs_printk_index_wrap(KERN_WARNING, mp, fmt, ##__VA_ARGS__) +#define xfs_notice(mp, fmt, ...) \ + xfs_printk_index_wrap(KERN_NOTICE, mp, fmt, ##__VA_ARGS__) +#define xfs_info(mp, fmt, ...) \ + xfs_printk_index_wrap(KERN_INFO, mp, fmt, ##__VA_ARGS__) #ifdef DEBUG -extern __printf(2, 3) -void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...); +#define xfs_debug(mp, fmt, ...) \ + xfs_printk_index_wrap(KERN_DEBUG, mp, fmt, ##__VA_ARGS__) #else -static inline __printf(2, 3) -void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) -{ -} +#define xfs_debug(mp, fmt, ...) do {} while (0) #endif +#define xfs_alert_tag(mp, tag, fmt, ...) \ +({ \ + printk_index_subsys_emit("%sXFS%s: ", KERN_ALERT, fmt); \ + _xfs_alert_tag(mp, tag, fmt, ##__VA_ARGS__); \ +}) + +extern __printf(3, 4) +void _xfs_alert_tag(const struct xfs_mount *mp, uint32_t tag, + const char *fmt, ...); + #define xfs_printk_ratelimited(func, dev, fmt, ...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ @@ -62,6 +75,12 @@ do { \ #define xfs_debug_ratelimited(dev, fmt, ...) \ xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__) +#define xfs_warn_mount(mp, warntag, fmt, ...) \ +do { \ + if (xfs_should_warn((mp), (warntag))) \ + xfs_warn((mp), (fmt), ##__VA_ARGS__); \ +} while (0) + #define xfs_warn_once(dev, fmt, ...) \ xfs_printk_once(xfs_warn, dev, fmt, ##__VA_ARGS__) #define xfs_notice_once(dev, fmt, ...) 
\ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c5f153c3693f..daa8d29c46b4 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -468,6 +468,8 @@ STATIC int xfs_check_summary_counts( struct xfs_mount *mp) { + int error = 0; + /* * The AG0 superblock verifier rejects in-progress filesystems, * so we should never see the flag set this far into mounting. @@ -506,11 +508,32 @@ xfs_check_summary_counts( * superblock to be correct and we don't need to do anything here. * Otherwise, recalculate the summary counters. */ - if ((!xfs_has_lazysbcount(mp) || xfs_is_clean(mp)) && - !xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) - return 0; + if ((xfs_has_lazysbcount(mp) && !xfs_is_clean(mp)) || + xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) { + error = xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount); + if (error) + return error; + } - return xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount); + /* + * Older kernels misused sb_frextents to reflect both incore + * reservations made by running transactions and the actual count of + * free rt extents in the ondisk metadata. Transactions committed + * during runtime can therefore contain a superblock update that + * undercounts the number of free rt extents tracked in the rt bitmap. + * A clean unmount record will have the correct frextents value since + * there can be no other transactions running at that point. + * + * If we're mounting the rt volume after recovering the log, recompute + * frextents from the rtbitmap file to fix the inconsistency. + */ + if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) { + error = xfs_rtalloc_reinit_frextents(mp); + if (error) + return error; + } + + return 0; } /* @@ -784,11 +807,6 @@ xfs_mountfs( goto out_inodegc_shrinker; } - /* Make sure the summary counts are ok. */ - error = xfs_check_summary_counts(mp); - if (error) - goto out_log_dealloc; - /* Enable background inode inactivation workers. */ xfs_inodegc_start(mp); xfs_blockgc_start(mp); @@ -844,6 +862,11 @@ xfs_mountfs( goto out_rele_rip; } + /* Make sure the summary counts are ok. */ + error = xfs_check_summary_counts(mp); + if (error) + goto out_rtunmount; + /* * If this is a read-only mount defer the superblock updates until * the next remount into writeable mode. Otherwise we would never @@ -1087,24 +1110,33 @@ xfs_fs_writable( return true; } +/* Adjust m_fdblocks or m_frextents. */ int -xfs_mod_fdblocks( +xfs_mod_freecounter( struct xfs_mount *mp, + struct percpu_counter *counter, int64_t delta, bool rsvd) { int64_t lcounter; long long res_used; + uint64_t set_aside = 0; s32 batch; - uint64_t set_aside; + bool has_resv_pool; + + ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents); + has_resv_pool = (counter == &mp->m_fdblocks); + if (rsvd) + ASSERT(has_resv_pool); if (delta > 0) { /* * If the reserve pool is depleted, put blocks back into it * first. Most of the time the pool is full. */ - if (likely(mp->m_resblks == mp->m_resblks_avail)) { - percpu_counter_add(&mp->m_fdblocks, delta); + if (likely(!has_resv_pool || + mp->m_resblks == mp->m_resblks_avail)) { + percpu_counter_add(counter, delta); return 0; } @@ -1116,7 +1148,7 @@ xfs_mod_fdblocks( } else { delta -= res_used; mp->m_resblks_avail = mp->m_resblks; - percpu_counter_add(&mp->m_fdblocks, delta); + percpu_counter_add(counter, delta); } spin_unlock(&mp->m_sb_lock); return 0; @@ -1130,7 +1162,7 @@ xfs_mod_fdblocks( * then make everything serialise as we are real close to * ENOSPC. 
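To make the batching described above concrete, the core of the pattern in isolation; this mirrors the code that follows, with XFS_FDBLOCKS_BATCH being the existing percpu batch constant:

	s32	batch = XFS_FDBLOCKS_BATCH;

	/*
	 * Far from zero, a large batch keeps updates cheap and per-cpu;
	 * within 2 * XFS_FDBLOCKS_BATCH of zero, drop to a batch of 1 so
	 * every update folds the per-cpu deltas and serialises on the
	 * counter lock, closing the oversubscription window near ENOSPC.
	 */
	if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH,
				     XFS_FDBLOCKS_BATCH) < 0)
		batch = 1;
	percpu_counter_add_batch(counter, delta, batch);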
*/ - if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH, + if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH, XFS_FDBLOCKS_BATCH) < 0) batch = 1; else @@ -1147,9 +1179,10 @@ xfs_mod_fdblocks( * problems (i.e. transaction abort, pagecache discards, etc.) than * slightly premature -ENOSPC. */ - set_aside = xfs_fdblocks_unavailable(mp); - percpu_counter_add_batch(&mp->m_fdblocks, delta, batch); - if (__percpu_counter_compare(&mp->m_fdblocks, set_aside, + if (has_resv_pool) + set_aside = xfs_fdblocks_unavailable(mp); + percpu_counter_add_batch(counter, delta, batch); + if (__percpu_counter_compare(counter, set_aside, XFS_FDBLOCKS_BATCH) >= 0) { /* we had space! */ return 0; @@ -1160,8 +1193,8 @@ xfs_mod_fdblocks( * that took us to ENOSPC. */ spin_lock(&mp->m_sb_lock); - percpu_counter_add(&mp->m_fdblocks, -delta); - if (!rsvd) + percpu_counter_add(counter, -delta); + if (!has_resv_pool || !rsvd) goto fdblocks_enospc; lcounter = (long long)mp->m_resblks_avail + delta; @@ -1178,24 +1211,6 @@ fdblocks_enospc: return -ENOSPC; } -int -xfs_mod_frextents( - struct xfs_mount *mp, - int64_t delta) -{ - int64_t lcounter; - int ret = 0; - - spin_lock(&mp->m_sb_lock); - lcounter = mp->m_sb.sb_frextents + delta; - if (lcounter < 0) - ret = -ENOSPC; - else - mp->m_sb.sb_frextents = lcounter; - spin_unlock(&mp->m_sb_lock); - return ret; -} - /* * Used to free the superblock along various error paths. */ @@ -1341,7 +1356,6 @@ xfs_clear_incompat_log_features( if (xfs_sb_has_incompat_log_feature(&mp->m_sb, XFS_SB_FEAT_INCOMPAT_LOG_ALL)) { - xfs_info(mp, "Clearing log incompat feature flags."); xfs_sb_remove_incompat_log_features(&mp->m_sb); ret = true; } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index f6dc19de8322..ba5d42abf66e 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -183,6 +183,8 @@ typedef struct xfs_mount { struct percpu_counter m_icount; /* allocated inodes counter */ struct percpu_counter m_ifree; /* free inodes counter */ struct percpu_counter m_fdblocks; /* free block counter */ + struct percpu_counter m_frextents; /* free rt extent counter */ + /* * Count of data device blocks reserved for delayed allocations, * including indlen blocks. Does not include allocated CoW staging @@ -276,6 +278,7 @@ typedef struct xfs_mount { #define XFS_FEAT_INOBTCNT (1ULL << 23) /* inobt block counts */ #define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */ #define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */ +#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */ /* Mount features */ #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ @@ -338,6 +341,7 @@ __XFS_HAS_FEAT(realtime, REALTIME) __XFS_HAS_FEAT(inobtcounts, INOBTCNT) __XFS_HAS_FEAT(bigtime, BIGTIME) __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR) +__XFS_HAS_FEAT(large_extent_counts, NREXT64) /* * Mount features @@ -387,6 +391,13 @@ __XFS_HAS_FEAT(nouuid, NOUUID) */ #define XFS_OPSTATE_BLOCKGC_ENABLED 6 +/* Kernel has logged a warning about online fsck being used on this fs. */ +#define XFS_OPSTATE_WARNED_SCRUB 7 +/* Kernel has logged a warning about shrink being used on this fs. */ +#define XFS_OPSTATE_WARNED_SHRINK 8 +/* Kernel has logged a warning about logged xattr updates being used. 
*/ +#define XFS_OPSTATE_WARNED_LARP 9 + #define __XFS_IS_OPSTATE(name, NAME) \ static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ { \ @@ -409,6 +420,12 @@ __XFS_IS_OPSTATE(readonly, READONLY) __XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED) __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED) +static inline bool +xfs_should_warn(struct xfs_mount *mp, long nr) +{ + return !test_and_set_bit(nr, &mp->m_opstate); +} + #define XFS_OPSTATE_STRINGS \ { (1UL << XFS_OPSTATE_UNMOUNTING), "unmounting" }, \ { (1UL << XFS_OPSTATE_CLEAN), "clean" }, \ @@ -416,7 +433,10 @@ __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED) { (1UL << XFS_OPSTATE_INODE32), "inode32" }, \ { (1UL << XFS_OPSTATE_READONLY), "read_only" }, \ { (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" }, \ - { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" } + { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \ + { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \ + { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \ + { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" } /* * Max and min values for mount-option defined I/O @@ -425,16 +445,15 @@ __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED) #define XFS_MAX_IO_LOG 30 /* 1G */ #define XFS_MIN_IO_LOG PAGE_SHIFT -#define xfs_is_shutdown(mp) xfs_is_shutdown(mp) -void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, +void xfs_do_force_shutdown(struct xfs_mount *mp, uint32_t flags, char *fname, int lnnum); #define xfs_force_shutdown(m,f) \ xfs_do_force_shutdown(m, f, __FILE__, __LINE__) -#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */ -#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */ -#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */ -#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */ +#define SHUTDOWN_META_IO_ERROR (1u << 0) /* write attempt to metadata failed */ +#define SHUTDOWN_LOG_IO_ERROR (1u << 1) /* write attempt to the log failed */ +#define SHUTDOWN_FORCE_UMOUNT (1u << 2) /* shutdown from a forced unmount */ +#define SHUTDOWN_CORRUPT_INCORE (1u << 3) /* corrupt in-memory structures */ #define XFS_SHUTDOWN_STRINGS \ { SHUTDOWN_META_IO_ERROR, "metadata_io" }, \ @@ -494,9 +513,20 @@ xfs_fdblocks_unavailable( return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); } -extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, - bool reserved); -extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); +int xfs_mod_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, + int64_t delta, bool rsvd); + +static inline int +xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved) +{ + return xfs_mod_freecounter(mp, &mp->m_fdblocks, delta, reserved); +} + +static inline int +xfs_mod_frextents(struct xfs_mount *mp, int64_t delta) +{ + return xfs_mod_freecounter(mp, &mp->m_frextents, delta, false); +} extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 25991923c1a8..758702b9495f 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -132,6 +132,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_attri_log_format, 40); + XFS_CHECK_STRUCT_SIZE(struct xfs_attrd_log_format, 16); /* * The v5 superblock format extended several v4 header structures 
with diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index f165d1a3de1d..abf08bbf34a9 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -582,9 +582,6 @@ xfs_qm_init_timelimits( defq->blk.time = XFS_QM_BTIMELIMIT; defq->ino.time = XFS_QM_ITIMELIMIT; defq->rtb.time = XFS_QM_RTBTIMELIMIT; - defq->blk.warn = XFS_QM_BWARNLIMIT; - defq->ino.warn = XFS_QM_IWARNLIMIT; - defq->rtb.warn = XFS_QM_RTBWARNLIMIT; /* * We try to get the limits from the superuser's limits fields. @@ -608,12 +605,6 @@ xfs_qm_init_timelimits( defq->ino.time = dqp->q_ino.timer; if (dqp->q_rtb.timer) defq->rtb.time = dqp->q_rtb.timer; - if (dqp->q_blk.warnings) - defq->blk.warn = dqp->q_blk.warnings; - if (dqp->q_ino.warnings) - defq->ino.warn = dqp->q_ino.warnings; - if (dqp->q_rtb.warnings) - defq->rtb.warn = dqp->q_rtb.warnings; xfs_qm_dqdestroy(dqp); } @@ -1317,8 +1308,15 @@ xfs_qm_quotacheck( error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true, NULL); - if (error) + if (error) { + /* + * The inode walk may have partially populated the dquot + * caches. We must purge them before disabling quota and + * tearing down the quotainfo, or else the dquots will leak. + */ + xfs_qm_dqpurge_all(mp); goto error_return; + } /* * We've made all the changes that we need to make incore. Flush them diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 5bb12717ea28..9683f0457d19 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -34,7 +34,6 @@ struct xfs_quota_limits { xfs_qcnt_t hard; /* default hard limit */ xfs_qcnt_t soft; /* default soft limit */ time64_t time; /* limit for timers */ - xfs_qwarncnt_t warn; /* limit for warnings */ }; /* Defaults for each quota type: time limits, warn limits, usage limits */ @@ -134,10 +133,6 @@ struct xfs_dquot_acct { #define XFS_QM_RTBTIMELIMIT (7 * 24*60*60) /* 1 week */ #define XFS_QM_ITIMELIMIT (7 * 24*60*60) /* 1 week */ -#define XFS_QM_BWARNLIMIT 5 -#define XFS_QM_IWARNLIMIT 5 -#define XFS_QM_RTBWARNLIMIT 5 - extern void xfs_qm_destroy_quotainfo(struct xfs_mount *); /* quota ops */ diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 7d5a31827681..74ac9ca9e119 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -217,8 +217,7 @@ xfs_qm_scall_quotaon( return 0; } -#define XFS_QC_MASK \ - (QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK) +#define XFS_QC_MASK (QC_LIMIT_MASK | QC_TIMER_MASK) /* * Adjust limits of this quota, and the defaults if passed in. Returns true @@ -251,17 +250,6 @@ xfs_setqlim_limits( } static inline void -xfs_setqlim_warns( - struct xfs_dquot_res *res, - struct xfs_quota_limits *qlim, - int warns) -{ - res->warnings = warns; - if (qlim) - qlim->warn = warns; -} - -static inline void xfs_setqlim_timer( struct xfs_mount *mp, struct xfs_dquot_res *res, @@ -354,8 +342,6 @@ xfs_qm_scall_setqlim( if (xfs_setqlim_limits(mp, res, qlim, hard, soft, "blk")) xfs_dquot_set_prealloc_limits(dqp); - if (newlim->d_fieldmask & QC_SPC_WARNS) - xfs_setqlim_warns(res, qlim, newlim->d_spc_warns); if (newlim->d_fieldmask & QC_SPC_TIMER) xfs_setqlim_timer(mp, res, qlim, newlim->d_spc_timer); @@ -370,8 +356,6 @@ xfs_qm_scall_setqlim( qlim = id == 0 ? &defq->rtb : NULL; xfs_setqlim_limits(mp, res, qlim, hard, soft, "rtb"); - if (newlim->d_fieldmask & QC_RT_SPC_WARNS) - xfs_setqlim_warns(res, qlim, newlim->d_rt_spc_warns); if (newlim->d_fieldmask & QC_RT_SPC_TIMER) xfs_setqlim_timer(mp, res, qlim, newlim->d_rt_spc_timer); @@ -386,8 +370,6 @@ xfs_qm_scall_setqlim( qlim = id == 0 ? 
&defq->ino : NULL; xfs_setqlim_limits(mp, res, qlim, hard, soft, "ino"); - if (newlim->d_fieldmask & QC_INO_WARNS) - xfs_setqlim_warns(res, qlim, newlim->d_ino_warns); if (newlim->d_fieldmask & QC_INO_TIMER) xfs_setqlim_timer(mp, res, qlim, newlim->d_ino_timer); @@ -428,13 +410,13 @@ xfs_qm_scall_getquota_fill_qc( dst->d_ino_count = dqp->q_ino.reserved; dst->d_spc_timer = dqp->q_blk.timer; dst->d_ino_timer = dqp->q_ino.timer; - dst->d_ino_warns = dqp->q_ino.warnings; - dst->d_spc_warns = dqp->q_blk.warnings; + dst->d_ino_warns = 0; + dst->d_spc_warns = 0; dst->d_rt_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.hardlimit); dst->d_rt_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.softlimit); dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_rtb.reserved); dst->d_rt_spc_timer = dqp->q_rtb.timer; - dst->d_rt_spc_warns = dqp->q_rtb.warnings; + dst->d_rt_spc_warns = 0; /* * Internally, we don't reset all the timers when quota enforcement diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 07989bd67728..9c162e69976b 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -40,9 +40,9 @@ xfs_qm_fill_state( tstate->spc_timelimit = (u32)defq->blk.time; tstate->ino_timelimit = (u32)defq->ino.time; tstate->rt_spc_timelimit = (u32)defq->rtb.time; - tstate->spc_warnlimit = defq->blk.warn; - tstate->ino_warnlimit = defq->ino.warn; - tstate->rt_spc_warnlimit = defq->rtb.warn; + tstate->spc_warnlimit = 0; + tstate->ino_warnlimit = 0; + tstate->rt_spc_warnlimit = 0; if (tempqip) xfs_irele(ip); } @@ -98,7 +98,7 @@ xfs_quota_type(int type) } } -#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK | QC_WARNS_MASK) +#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK) /* * Adjust quota timers & warnings diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 0d868c93144d..7e97bf19793d 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -35,6 +35,7 @@ STATIC void xfs_cui_item_free( struct xfs_cui_log_item *cuip) { + kmem_free(cuip->cui_item.li_lv_shadow); if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS) kmem_free(cuip); else @@ -53,10 +54,11 @@ xfs_cui_release( struct xfs_cui_log_item *cuip) { ASSERT(atomic_read(&cuip->cui_refcount) > 0); - if (atomic_dec_and_test(&cuip->cui_refcount)) { - xfs_trans_ail_delete(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR); - xfs_cui_item_free(cuip); - } + if (!atomic_dec_and_test(&cuip->cui_refcount)) + return; + + xfs_trans_ail_delete(&cuip->cui_item, 0); + xfs_cui_item_free(cuip); } @@ -204,14 +206,24 @@ xfs_cud_item_release( struct xfs_cud_log_item *cudp = CUD_ITEM(lip); xfs_cui_release(cudp->cud_cuip); + kmem_free(cudp->cud_item.li_lv_shadow); kmem_cache_free(xfs_cud_cache, cudp); } +static struct xfs_log_item * +xfs_cud_item_intent( + struct xfs_log_item *lip) +{ + return &CUD_ITEM(lip)->cud_cuip->cui_item; +} + static const struct xfs_item_ops xfs_cud_item_ops = { - .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | + XFS_ITEM_INTENT_DONE, .iop_size = xfs_cud_item_size, .iop_format = xfs_cud_item_format, .iop_release = xfs_cud_item_release, + .iop_intent = xfs_cud_item_intent, }; static struct xfs_cud_log_item * @@ -259,7 +271,7 @@ xfs_trans_log_finish_refcount_update( * 1.) releases the CUI and frees the CUD * 2.) 
shuts down the filesystem */ - tp->t_flags |= XFS_TRANS_DIRTY; + tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); return error; @@ -600,6 +612,7 @@ xfs_cui_item_relog( } static const struct xfs_item_ops xfs_cui_item_ops = { + .flags = XFS_ITEM_INTENT, .iop_size = xfs_cui_item_size, .iop_format = xfs_cui_item_format, .iop_unpin = xfs_cui_item_unpin, diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 54e68e5693fd..e7a7c00d93be 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -586,21 +586,21 @@ out: STATIC int xfs_reflink_end_cow_extent( struct xfs_inode *ip, - xfs_fileoff_t offset_fsb, - xfs_fileoff_t *end_fsb) + xfs_fileoff_t *offset_fsb, + xfs_fileoff_t end_fsb) { - struct xfs_bmbt_irec got, del; struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got, del, data; struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - xfs_filblks_t rlen; unsigned int resblks; + int nmaps; int error; /* No COW extents? That's easy! */ if (ifp->if_bytes == 0) { - *end_fsb = offset_fsb; + *offset_fsb = end_fsb; return 0; } @@ -620,6 +620,9 @@ xfs_reflink_end_cow_extent( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_REFLINK_END_COW_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_REFLINK_END_COW_CNT); if (error) goto out_cancel; @@ -628,42 +631,66 @@ xfs_reflink_end_cow_extent( * left by the time I/O completes for the loser of the race. In that * case we are done. */ - if (!xfs_iext_lookup_extent_before(ip, ifp, end_fsb, &icur, &got) || - got.br_startoff + got.br_blockcount <= offset_fsb) { - *end_fsb = offset_fsb; + if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) || + got.br_startoff >= end_fsb) { + *offset_fsb = end_fsb; goto out_cancel; } /* - * Structure copy @got into @del, then trim @del to the range that we - * were asked to remap. We preserve @got for the eventual CoW fork - * deletion; from now on @del represents the mapping that we're - * actually remapping. - */ - del = got; - xfs_trim_extent(&del, offset_fsb, *end_fsb - offset_fsb); - - ASSERT(del.br_blockcount > 0); - - /* * Only remap real extents that contain data. With AIO, speculative * preallocations can leak into the range we are called upon, and we - * need to skip them. + * need to skip them. Preserve @got for the eventual CoW fork + * deletion; from now on @del represents the mapping that we're + * actually remapping. */ - if (!xfs_bmap_is_written_extent(&got)) { - *end_fsb = del.br_startoff; - goto out_cancel; + while (!xfs_bmap_is_written_extent(&got)) { + if (!xfs_iext_next_extent(ifp, &icur, &got) || + got.br_startoff >= end_fsb) { + *offset_fsb = end_fsb; + goto out_cancel; + } } + del = got; - /* Unmap the old blocks in the data fork. */ - rlen = del.br_blockcount; - error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1); + /* Grab the corresponding mapping in the data fork. */ + nmaps = 1; + error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data, + &nmaps, 0); if (error) goto out_cancel; - /* Trim the extent to whatever got unmapped. */ - xfs_trim_extent(&del, del.br_startoff + rlen, del.br_blockcount - rlen); - trace_xfs_reflink_cow_remap(ip, &del); + /* We can only remap the smaller of the two extent sizes. 
*/ + data.br_blockcount = min(data.br_blockcount, del.br_blockcount); + del.br_blockcount = data.br_blockcount; + + trace_xfs_reflink_cow_remap_from(ip, &del); + trace_xfs_reflink_cow_remap_to(ip, &data); + + if (xfs_bmap_is_real_extent(&data)) { + /* + * If the extent we're remapping is backed by storage (written + * or not), unmap the extent and drop its refcount. + */ + xfs_bmap_unmap_extent(tp, ip, &data); + xfs_refcount_decrease_extent(tp, &data); + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, + -data.br_blockcount); + } else if (data.br_startblock == DELAYSTARTBLOCK) { + int done; + + /* + * If the extent we're remapping is a delalloc reservation, + * we can use the regular bunmapi function to release the + * incore state. Dropping the delalloc reservation takes care + * of the quota reservation for us. + */ + error = xfs_bunmapi(NULL, ip, data.br_startoff, + data.br_blockcount, 0, 1, &done); + if (error) + goto out_cancel; + ASSERT(done); + } /* Free the CoW orphan record. */ xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount); @@ -684,7 +711,7 @@ xfs_reflink_end_cow_extent( return error; /* Update the caller about how much progress we made. */ - *end_fsb = del.br_startoff; + *offset_fsb = del.br_startoff + del.br_blockcount; return 0; out_cancel: @@ -712,7 +739,7 @@ xfs_reflink_end_cow( end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); /* - * Walk backwards until we're out of the I/O range. The loop function + * Walk forwards until we've remapped the I/O range. The loop function * repeatedly cycles the ILOCK to allocate one transaction per remapped * extent. * @@ -744,7 +771,7 @@ xfs_reflink_end_cow( * blocks will be remapped. */ while (end_fsb > offset_fsb && !error) - error = xfs_reflink_end_cow_extent(ip, offset_fsb, &end_fsb); + error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb); if (error) trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_); @@ -1121,6 +1148,8 @@ xfs_reflink_remap_extent( ++iext_delta; error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, iext_delta); if (error) goto out_cancel; @@ -1133,7 +1162,7 @@ xfs_reflink_remap_extent( xfs_refcount_decrease_extent(tp, &smap); qdelta -= smap.br_blockcount; } else if (smap.br_startblock == DELAYSTARTBLOCK) { - xfs_filblks_t len = smap.br_blockcount; + int done; /* * If the extent we're unmapping is a delalloc reservation, @@ -1141,10 +1170,11 @@ xfs_reflink_remap_extent( * incore state. Dropping the delalloc reservation takes care * of the quota reservation for us. 
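A hedged worked example of the forward walk with invented block counts:

/*
 * Suppose @del (the COW fork mapping) covers 8 blocks but the
 * corresponding data fork mapping @data covers only 5. Both are
 * trimmed to 5 blocks and remapped, *offset_fsb advances to
 * del.br_startoff + 5, and the loop in xfs_reflink_end_cow() comes
 * back around with a fresh transaction for the remaining 3 blocks.
 */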
*/ - error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1); + error = xfs_bunmapi(NULL, ip, smap.br_startoff, + smap.br_blockcount, 0, 1, &done); if (error) goto out_cancel; - ASSERT(len == 0); + ASSERT(done); } /* diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index a22b2d19ef91..fef92e02f3bb 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -35,6 +35,7 @@ STATIC void xfs_rui_item_free( struct xfs_rui_log_item *ruip) { + kmem_free(ruip->rui_item.li_lv_shadow); if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS) kmem_free(ruip); else @@ -53,10 +54,11 @@ xfs_rui_release( struct xfs_rui_log_item *ruip) { ASSERT(atomic_read(&ruip->rui_refcount) > 0); - if (atomic_dec_and_test(&ruip->rui_refcount)) { - xfs_trans_ail_delete(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR); - xfs_rui_item_free(ruip); - } + if (!atomic_dec_and_test(&ruip->rui_refcount)) + return; + + xfs_trans_ail_delete(&ruip->rui_item, 0); + xfs_rui_item_free(ruip); } STATIC void @@ -227,14 +229,24 @@ xfs_rud_item_release( struct xfs_rud_log_item *rudp = RUD_ITEM(lip); xfs_rui_release(rudp->rud_ruip); + kmem_free(rudp->rud_item.li_lv_shadow); kmem_cache_free(xfs_rud_cache, rudp); } +static struct xfs_log_item * +xfs_rud_item_intent( + struct xfs_log_item *lip) +{ + return &RUD_ITEM(lip)->rud_ruip->rui_item; +} + static const struct xfs_item_ops xfs_rud_item_ops = { - .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | + XFS_ITEM_INTENT_DONE, .iop_size = xfs_rud_item_size, .iop_format = xfs_rud_item_format, .iop_release = xfs_rud_item_release, + .iop_intent = xfs_rud_item_intent, }; static struct xfs_rud_log_item * @@ -327,7 +339,7 @@ xfs_trans_log_finish_rmap_update( * 1.) releases the RUI and frees the RUD * 2.) shuts down the filesystem */ - tp->t_flags |= XFS_TRANS_DIRTY; + tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); return error; @@ -630,6 +642,7 @@ xfs_rui_item_relog( } static const struct xfs_item_ops xfs_rui_item_ops = { + .flags = XFS_ITEM_INTENT, .iop_size = xfs_rui_item_size, .iop_format = xfs_rui_item_format, .iop_unpin = xfs_rui_item_unpin, diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index b8c79ee791af..292d5e54a92c 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -806,6 +806,9 @@ xfs_growfs_rt_alloc( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto out_trans_cancel; @@ -1284,6 +1287,44 @@ xfs_rtmount_init( return 0; } +static int +xfs_rtalloc_count_frextent( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv) +{ + uint64_t *valp = priv; + + *valp += rec->ar_extcount; + return 0; +} + +/* + * Reinitialize the number of free realtime extents from the realtime bitmap. + * Callers must ensure that there is no other activity in the filesystem. 
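A usage sketch for the function defined just below, tying it back to the mount-path change earlier in this diff; xfs_check_summary_counts() is the only caller visible in this series:

/*
 *	if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) {
 *		error = xfs_rtalloc_reinit_frextents(mp);
 *		if (error)
 *			return error;
 *	}
 *
 * i.e. recount from the rtbitmap only when a dirty log may have left
 * the ondisk sb_frextents stale.
 */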
+ */ +int +xfs_rtalloc_reinit_frextents( + struct xfs_mount *mp) +{ + uint64_t val = 0; + int error; + + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent, + &val); + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL); + if (error) + return error; + + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_frextents = val; + spin_unlock(&mp->m_sb_lock); + percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); + return 0; +} + /* * Get the bitmap and summary inodes and the summary cache into the mount * structure at mount time. diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 91b00289509b..62c7ad79cbb6 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -22,6 +22,7 @@ struct xfs_rtalloc_rec { }; typedef int (*xfs_rtalloc_query_range_fn)( + struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv); @@ -123,27 +124,29 @@ int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log, int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, struct xfs_buf **rbpp, xfs_fsblock_t *rsb); -int xfs_rtalloc_query_range(struct xfs_trans *tp, +int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *low_rec, const struct xfs_rtalloc_rec *high_rec, xfs_rtalloc_query_range_fn fn, void *priv); -int xfs_rtalloc_query_all(struct xfs_trans *tp, +int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv); bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, bool *is_free); +int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp); #else # define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS) # define xfs_rtfree_extent(t,b,l) (ENOSYS) # define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) # define xfs_growfs_rt(mp,in) (ENOSYS) # define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS) -# define xfs_rtalloc_query_all(t,f,p) (ENOSYS) +# define xfs_rtalloc_query_all(m,t,f,p) (ENOSYS) # define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS) # define xfs_verify_rtbno(m, r) (false) # define xfs_rtalloc_extent_is_free(m,t,s,l,i) (ENOSYS) +# define xfs_rtalloc_reinit_frextents(m) (0) static inline int /* error */ xfs_rtmount_init( xfs_mount_t *mp) /* file system mount structure */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index a276b8111f63..ed18160e6181 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -38,6 +38,8 @@ #include "xfs_pwork.h" #include "xfs_ag.h" #include "xfs_defer.h" +#include "xfs_attr_item.h" +#include "xfs_xattr.h" #include <linux/magic.h> #include <linux/fs_context.h> @@ -843,9 +845,11 @@ xfs_fs_statfs( if (XFS_IS_REALTIME_MOUNT(mp) && (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) { + s64 freertx; + statp->f_blocks = sbp->sb_rblocks; - statp->f_bavail = statp->f_bfree = - sbp->sb_frextents * sbp->sb_rextsize; + freertx = percpu_counter_sum_positive(&mp->m_frextents); + statp->f_bavail = statp->f_bfree = freertx * sbp->sb_rextsize; } return 0; @@ -1015,8 +1019,14 @@ xfs_init_percpu_counters( if (error) goto free_fdblocks; + error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL); + if (error) + goto free_delalloc; + return 0; +free_delalloc: + percpu_counter_destroy(&mp->m_delalloc_blks); free_fdblocks: percpu_counter_destroy(&mp->m_fdblocks); free_ifree: @@ -1033,6 +1043,7 @@ 
xfs_reinit_percpu_counters( percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount); percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree); percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks); + percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); } static void @@ -1045,6 +1056,7 @@ xfs_destroy_percpu_counters( ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_blks) == 0); percpu_counter_destroy(&mp->m_delalloc_blks); + percpu_counter_destroy(&mp->m_frextents); } static int @@ -1635,6 +1647,10 @@ xfs_fs_fill_super( goto out_filestream_unmount; } + if (xfs_has_large_extent_counts(mp)) + xfs_warn(mp, + "EXPERIMENTAL Large extent counts feature in use. Use at your own risk!"); + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; @@ -2065,8 +2081,24 @@ xfs_init_caches(void) if (!xfs_bui_cache) goto out_destroy_bud_cache; + xfs_attrd_cache = kmem_cache_create("xfs_attrd_item", + sizeof(struct xfs_attrd_log_item), + 0, 0, NULL); + if (!xfs_attrd_cache) + goto out_destroy_bui_cache; + + xfs_attri_cache = kmem_cache_create("xfs_attri_item", + sizeof(struct xfs_attri_log_item), + 0, 0, NULL); + if (!xfs_attri_cache) + goto out_destroy_attrd_cache; + return 0; + out_destroy_attrd_cache: + kmem_cache_destroy(xfs_attrd_cache); + out_destroy_bui_cache: + kmem_cache_destroy(xfs_bui_cache); out_destroy_bud_cache: kmem_cache_destroy(xfs_bud_cache); out_destroy_cui_cache: @@ -2113,6 +2145,8 @@ xfs_destroy_caches(void) * destroy caches. */ rcu_barrier(); + kmem_cache_destroy(xfs_attri_cache); + kmem_cache_destroy(xfs_attrd_cache); kmem_cache_destroy(xfs_bui_cache); kmem_cache_destroy(xfs_bud_cache); kmem_cache_destroy(xfs_cui_cache); diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 167d23f92ffe..3cd5a51bace1 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -91,7 +91,6 @@ extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, xfs_agnumber_t agcount); extern const struct export_operations xfs_export_operations; -extern const struct xattr_handler *xfs_xattr_handlers[]; extern const struct quotactl_ops xfs_quotactl_operations; extern void xfs_reinit_percpu_counters(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index affbedf78160..4145ba872547 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -226,11 +226,6 @@ xfs_symlink( goto out_trans_cancel; } - error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK, - XFS_IEXT_DIR_MANIP_CNT(mp)); - if (error) - goto out_trans_cancel; - /* * Allocate an inode for the symlink. 
*/ diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 7692e76ead33..f78ad6b10ea5 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -83,6 +83,7 @@ extern xfs_param_t xfs_params; struct xfs_globals { #ifdef DEBUG int pwork_threads; /* parallel workqueue threads */ + bool larp; /* log attribute replay */ #endif int log_recovery_delay; /* log recovery delay (secs) */ int mount_delay; /* mount setup delay (secs) */ diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 574b80c29fe1..f7faf6e70d7f 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -228,6 +228,29 @@ pwork_threads_show( return sysfs_emit(buf, "%d\n", xfs_globals.pwork_threads); } XFS_SYSFS_ATTR_RW(pwork_threads); + +static ssize_t +larp_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + ssize_t ret; + + ret = kstrtobool(buf, &xfs_globals.larp); + if (ret < 0) + return ret; + return count; +} + +STATIC ssize_t +larp_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.larp); +} +XFS_SYSFS_ATTR_RW(larp); #endif /* DEBUG */ static struct attribute *xfs_dbg_attrs[] = { @@ -237,6 +260,7 @@ static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(always_cow), #ifdef DEBUG ATTR_LIST(pwork_threads), + ATTR_LIST(larp), #endif NULL, }; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index b141ef78c755..d32026585c1b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -418,6 +418,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, __field(unsigned, lockval) __field(unsigned, flags) __field(unsigned long, caller_ip) + __field(const void *, buf_ops) ), TP_fast_assign( __entry->dev = bp->b_target->bt_dev; @@ -428,9 +429,10 @@ DECLARE_EVENT_CLASS(xfs_buf_class, __entry->lockval = bp->b_sema.count; __entry->flags = bp->b_flags; __entry->caller_ip = caller_ip; + __entry->buf_ops = bp->b_ops; ), TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d " - "lock %d flags %s caller %pS", + "lock %d flags %s bufops %pS caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->nblks, @@ -438,6 +440,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, __entry->pincount, __entry->lockval, __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + __entry->buf_ops, (void *)__entry->caller_ip) ) @@ -1096,22 +1099,6 @@ DEFINE_DQUOT_EVENT(xfs_dqflush_done); DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_before); DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_after); -#define XFS_QMOPT_FLAGS \ - { XFS_QMOPT_UQUOTA, "UQUOTA" }, \ - { XFS_QMOPT_PQUOTA, "PQUOTA" }, \ - { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \ - { XFS_QMOPT_SBVERSION, "SBVERSION" }, \ - { XFS_QMOPT_GQUOTA, "GQUOTA" }, \ - { XFS_QMOPT_INHERIT, "INHERIT" }, \ - { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \ - { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \ - { XFS_QMOPT_BCOUNT, "BCOUNT" }, \ - { XFS_QMOPT_ICOUNT, "ICOUNT" }, \ - { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \ - { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \ - { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \ - { XFS_QMOPT_RES_INOS, "RES_INOS" } - TRACE_EVENT(xfs_trans_mod_dquot, TP_PROTO(struct xfs_trans *tp, struct xfs_dquot *dqp, unsigned int field, int64_t delta), @@ -1348,6 +1335,9 @@ DEFINE_LOG_ITEM_EVENT(xfs_ail_push); DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned); DEFINE_LOG_ITEM_EVENT(xfs_ail_locked); DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing); +DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_mark); +DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_skip); +DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_unpin); DECLARE_EVENT_CLASS(xfs_ail_class, 
TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn), @@ -1924,7 +1914,7 @@ DECLARE_EVENT_CLASS(xfs_da_class, __field(int, namelen) __field(xfs_dahash_t, hashval) __field(xfs_ino_t, inumber) - __field(int, op_flags) + __field(uint32_t, op_flags) ), TP_fast_assign( __entry->dev = VFS_I(args->dp)->i_sb->s_dev; @@ -1990,7 +1980,7 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __field(xfs_dahash_t, hashval) __field(unsigned int, attr_filter) __field(unsigned int, attr_flags) - __field(int, op_flags) + __field(uint32_t, op_flags) ), TP_fast_assign( __entry->dev = VFS_I(args->dp)->i_sb->s_dev; @@ -2097,7 +2087,7 @@ DECLARE_EVENT_CLASS(xfs_dir2_space_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) - __field(int, op_flags) + __field(uint32_t, op_flags) __field(int, idx) ), TP_fast_assign( @@ -2128,7 +2118,7 @@ TRACE_EVENT(xfs_dir2_leafn_moveents, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) - __field(int, op_flags) + __field(uint32_t, op_flags) __field(int, src_idx) __field(int, dst_idx) __field(int, count) @@ -2169,7 +2159,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __field(int, which) __field(xfs_ino_t, ino) __field(int, format) - __field(int, nex) + __field(xfs_extnum_t, nex) __field(int, broot_size) __field(int, fork_off) ), @@ -2182,7 +2172,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __entry->broot_size = ip->i_df.if_broot_bytes; __entry->fork_off = XFS_IFORK_BOFF(ip); ), - TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, " + TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %llu, " "broot size %d, forkoff 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, @@ -3418,7 +3408,8 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); -DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to); DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); @@ -3513,7 +3504,7 @@ DEFINE_GETFSMAP_EVENT(xfs_getfsmap_low_key); DEFINE_GETFSMAP_EVENT(xfs_getfsmap_high_key); DEFINE_GETFSMAP_EVENT(xfs_getfsmap_mapping); -TRACE_EVENT(xfs_trans_resv_calc, +DECLARE_EVENT_CLASS(xfs_trans_resv_class, TP_PROTO(struct xfs_mount *mp, unsigned int type, struct xfs_trans_res *res), TP_ARGS(mp, type, res), @@ -3537,6 +3528,33 @@ TRACE_EVENT(xfs_trans_resv_calc, __entry->logres, __entry->logcount, __entry->logflags) +) + +#define DEFINE_TRANS_RESV_EVENT(name) \ +DEFINE_EVENT(xfs_trans_resv_class, name, \ + TP_PROTO(struct xfs_mount *mp, unsigned int type, \ + struct xfs_trans_res *res), \ + TP_ARGS(mp, type, res)) +DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc); +DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc_minlogsize); + +TRACE_EVENT(xfs_log_get_max_trans_res, + TP_PROTO(struct xfs_mount *mp, const struct xfs_trans_res *res), + TP_ARGS(mp, res), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(uint, logres) + __field(int, logcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->logres = res->tr_logres; + __entry->logcount = res->tr_logcount; + ), + TP_printk("dev %d:%d logres %u logcount %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->logres, + __entry->logcount) ); DECLARE_EVENT_CLASS(xfs_trans_class, @@ -4111,6 +4129,27 @@ DEFINE_ICLOG_EVENT(xlog_iclog_want_sync); DEFINE_ICLOG_EVENT(xlog_iclog_wait_on); DEFINE_ICLOG_EVENT(xlog_iclog_write); 
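/*
 * (Aside, not part of the patch.) The TRACE_DEFINE_ENUM() lines added
 * below exist to support the xfs_das_state_class change further down,
 * which switches from printing the raw state number to
 * __print_symbolic(__entry->das, XFS_DAS_STRINGS). The tracing core can
 * only resolve the enum names in that table into numeric values in the
 * event's tracefs "format" file once each value has been registered with
 * TRACE_DEFINE_ENUM(). What __print_symbolic() does at print time,
 * reduced to a stand-alone sketch; the struct, table and helper below
 * are illustrative, not kernel API, and the enum values are assumed:
 */
struct print_sym { unsigned long val; const char *name; };

static const struct print_sym das_strings[] = {
	{ 0, "XFS_DAS_UNINIT" },	/* value/name pairs as in XFS_DAS_STRINGS */
	{ 1, "XFS_DAS_SF_ADD" },
	{ 0, NULL },			/* NULL name terminates the table */
};

static const char *das_state_name(unsigned long das)
{
	const struct print_sym *p;

	for (p = das_strings; p->name; p++)	/* linear scan, first match wins */
		if (p->val == das)
			return p->name;
	return "UNKNOWN";	/* the real helper falls back to printing hex */
}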
+TRACE_DEFINE_ENUM(XFS_DAS_UNINIT); +TRACE_DEFINE_ENUM(XFS_DAS_SF_ADD); +TRACE_DEFINE_ENUM(XFS_DAS_SF_REMOVE); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ADD); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_ADD); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_SET_RMT); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ALLOC_RMT); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REPLACE); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_OLD); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_RMT); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_ATTR); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_SET_RMT); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_ALLOC_RMT); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_REPLACE); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_OLD); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_RMT); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_ATTR); +TRACE_DEFINE_ENUM(XFS_DAS_DONE); + DECLARE_EVENT_CLASS(xfs_das_state_class, TP_PROTO(int das, struct xfs_inode *ip), TP_ARGS(das, ip), @@ -4122,8 +4161,9 @@ DECLARE_EVENT_CLASS(xfs_das_state_class, __entry->das = das; __entry->ino = ip->i_ino; ), - TP_printk("state change %d ino 0x%llx", - __entry->das, __entry->ino) + TP_printk("state change %s ino 0x%llx", + __print_symbolic(__entry->das, XFS_DAS_STRINGS), + __entry->ino) ) #define DEFINE_DAS_STATE_EVENT(name) \ @@ -4132,9 +4172,15 @@ DEFINE_EVENT(xfs_das_state_class, name, \ TP_ARGS(das, ip)) DEFINE_DAS_STATE_EVENT(xfs_attr_sf_addname_return); DEFINE_DAS_STATE_EVENT(xfs_attr_set_iter_return); +DEFINE_DAS_STATE_EVENT(xfs_attr_leaf_addname_return); DEFINE_DAS_STATE_EVENT(xfs_attr_node_addname_return); DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return); +DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc); DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return); +DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add); +DEFINE_DAS_STATE_EVENT(xfs_attr_defer_replace); +DEFINE_DAS_STATE_EVENT(xfs_attr_defer_remove); + TRACE_EVENT(xfs_force_shutdown, TP_PROTO(struct xfs_mount *mp, int ptag, int flags, const char *fname, diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 0ac717aad380..82cf0189c0db 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -32,7 +32,6 @@ static void xfs_trans_trace_reservations( struct xfs_mount *mp) { - struct xfs_trans_res resv; struct xfs_trans_res *res; struct xfs_trans_res *end_res; int i; @@ -41,8 +40,6 @@ xfs_trans_trace_reservations( end_res = (struct xfs_trans_res *)(M_RES(mp) + 1); for (i = 0; res < end_res; i++, res++) trace_xfs_trans_resv_calc(mp, i, res); - xfs_log_get_max_trans_res(mp, &resv); - trace_xfs_trans_resv_calc(mp, -1, &resv); } #else # define xfs_trans_trace_reservations(mp) @@ -194,11 +191,9 @@ xfs_trans_reserve( ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES); error = xfs_log_regrant(mp, tp->t_ticket); } else { - error = xfs_log_reserve(mp, - resp->tr_logres, + error = xfs_log_reserve(mp, resp->tr_logres, resp->tr_logcount, - &tp->t_ticket, XFS_TRANSACTION, - permanent); + &tp->t_ticket, permanent); } if (error) @@ -498,10 +493,31 @@ xfs_trans_apply_sb_deltas( be64_add_cpu(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta); } - if (tp->t_frextents_delta) - be64_add_cpu(&sbp->sb_frextents, tp->t_frextents_delta); - if (tp->t_res_frextents_delta) - be64_add_cpu(&sbp->sb_frextents, tp->t_res_frextents_delta); + /* + * Updating frextents requires careful handling because it does not + * behave like the lazysb counters because we cannot rely on log + * recovery in older kenels to recompute the value from the rtbitmap. + * This means that the ondisk frextents must be consistent with the + * rtbitmap. 
+ * + * Therefore, log the frextents change to the ondisk superblock and + * update the incore superblock so that future calls to xfs_log_sb + * write the correct value ondisk. + * + * Don't touch m_frextents because it includes incore reservations, + * and those are handled by the unreserve function. + */ + if (tp->t_frextents_delta || tp->t_res_frextents_delta) { + struct xfs_mount *mp = tp->t_mountp; + int64_t rtxdelta; + + rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta; + + spin_lock(&mp->m_sb_lock); + be64_add_cpu(&sbp->sb_frextents, rtxdelta); + mp->m_sb.sb_frextents += rtxdelta; + spin_unlock(&mp->m_sb_lock); + } if (tp->t_dblocks_delta) { be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta); @@ -614,7 +630,12 @@ xfs_trans_unreserve_and_mod_sb( if (ifreedelta) percpu_counter_add(&mp->m_ifree, ifreedelta); - if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY)) + if (rtxdelta) { + error = xfs_mod_frextents(mp, rtxdelta); + ASSERT(!error); + } + + if (!(tp->t_flags & XFS_TRANS_SB_DIRTY)) return; /* apply remaining deltas */ @@ -622,7 +643,12 @@ xfs_trans_unreserve_and_mod_sb( mp->m_sb.sb_fdblocks += tp->t_fdblocks_delta + tp->t_res_fdblocks_delta; mp->m_sb.sb_icount += idelta; mp->m_sb.sb_ifree += ifreedelta; - mp->m_sb.sb_frextents += rtxdelta; + /* + * Do not touch sb_frextents here because we are dealing with incore + * reservation. sb_frextents is not part of the lazy sb counters so it + * must be consistent with the ondisk rtbitmap and must never include + * incore reservations. + */ mp->m_sb.sb_dblocks += tp->t_dblocks_delta; mp->m_sb.sb_agcount += tp->t_agcount_delta; mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 0c82673238f4..9561f193e7e1 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -55,13 +55,15 @@ struct xfs_log_item { #define XFS_LI_IN_AIL 0 #define XFS_LI_ABORTED 1 #define XFS_LI_FAILED 2 -#define XFS_LI_DIRTY 3 /* log item dirty in transaction */ +#define XFS_LI_DIRTY 3 +#define XFS_LI_WHITEOUT 4 #define XFS_LI_FLAGS \ - { (1 << XFS_LI_IN_AIL), "IN_AIL" }, \ - { (1 << XFS_LI_ABORTED), "ABORTED" }, \ - { (1 << XFS_LI_FAILED), "FAILED" }, \ - { (1 << XFS_LI_DIRTY), "DIRTY" } + { (1u << XFS_LI_IN_AIL), "IN_AIL" }, \ + { (1u << XFS_LI_ABORTED), "ABORTED" }, \ + { (1u << XFS_LI_FAILED), "FAILED" }, \ + { (1u << XFS_LI_DIRTY), "DIRTY" }, \ + { (1u << XFS_LI_WHITEOUT), "WHITEOUT" } struct xfs_item_ops { unsigned flags; @@ -78,30 +80,32 @@ struct xfs_item_ops { bool (*iop_match)(struct xfs_log_item *item, uint64_t id); struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent, struct xfs_trans *tp); + struct xfs_log_item *(*iop_intent)(struct xfs_log_item *intent_done); }; -/* Is this log item a deferred action intent? */ +/* + * Log item ops flags + */ +/* + * Release the log item when the journal commits instead of inserting into the + * AIL for writeback tracking and/or log tail pinning. + */ +#define XFS_ITEM_RELEASE_WHEN_COMMITTED (1 << 0) +#define XFS_ITEM_INTENT (1 << 1) +#define XFS_ITEM_INTENT_DONE (1 << 2) + static inline bool xlog_item_is_intent(struct xfs_log_item *lip) { - return lip->li_ops->iop_recover != NULL && - lip->li_ops->iop_match != NULL; + return lip->li_ops->flags & XFS_ITEM_INTENT; } -/* Is this a log intent-done item? 
*/ static inline bool xlog_item_is_intent_done(struct xfs_log_item *lip) { - return lip->li_ops->iop_unpin == NULL && - lip->li_ops->iop_push == NULL; + return lip->li_ops->flags & XFS_ITEM_INTENT_DONE; } -/* - * Release the log item as soon as committed. This is for items just logging - * intents that never need to be written back in place. - */ -#define XFS_ITEM_RELEASE_WHEN_COMMITTED (1 << 0) - void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, int type, const struct xfs_item_ops *ops); diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 9ba7e6b9bed3..aa00cf67ad72 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -597,13 +597,11 @@ xfs_dqresv_check( if (softlimit && total_count > softlimit) { time64_t now = ktime_get_real_seconds(); - if ((res->timer != 0 && now > res->timer) || - (res->warnings != 0 && res->warnings >= qlim->warn)) { + if (res->timer != 0 && now > res->timer) { *fatal = true; return QUOTA_NL_ISOFTLONGWARN; } - res->warnings++; return QUOTA_NL_ISOFTWARN; } diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 0d050f8829ef..c325a28b89a8 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -12,12 +12,104 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_acl.h" -#include "xfs_da_btree.h" +#include "xfs_log.h" +#include "xfs_xattr.h" #include <linux/posix_acl_xattr.h> +/* + * Get permission to use log-assisted atomic exchange of file extents. + * + * Callers must not be running any transactions or hold any inode locks, and + * they must release the permission by calling xlog_drop_incompat_feat + * when they're done. + */ +static inline int +xfs_attr_grab_log_assist( + struct xfs_mount *mp) +{ + int error = 0; + + /* + * Protect ourselves from an idle log clearing the logged xattrs log + * incompat feature bit. + */ + xlog_use_incompat_feat(mp->m_log); + + /* + * If log-assisted xattrs are already enabled, the caller can use the + * log assisted swap functions with the log-incompat reference we got. + */ + if (xfs_sb_version_haslogxattrs(&mp->m_sb)) + return 0; + + /* Enable log-assisted xattrs. */ + error = xfs_add_incompat_log_feature(mp, + XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); + if (error) + goto drop_incompat; + + xfs_warn_mount(mp, XFS_OPSTATE_WARNED_LARP, + "EXPERIMENTAL logged extended attributes feature in use. Use at your own risk!"); + + return 0; +drop_incompat: + xlog_drop_incompat_feat(mp->m_log); + return error; +} + +static inline void +xfs_attr_rele_log_assist( + struct xfs_mount *mp) +{ + xlog_drop_incompat_feat(mp->m_log); +} + +static inline bool +xfs_attr_want_log_assist( + struct xfs_mount *mp) +{ +#ifdef DEBUG + /* Logged xattrs require a V5 super for log_incompat */ + return xfs_has_crc(mp) && xfs_globals.larp; +#else + return false; +#endif +} + +/* + * Set or remove an xattr, having grabbed the appropriate logging resources + * prior to calling libxfs. 
+ */ +int +xfs_attr_change( + struct xfs_da_args *args) +{ + struct xfs_mount *mp = args->dp->i_mount; + bool use_logging = false; + int error; + + ASSERT(!(args->op_flags & XFS_DA_OP_LOGGED)); + + if (xfs_attr_want_log_assist(mp)) { + error = xfs_attr_grab_log_assist(mp); + if (error) + return error; + + args->op_flags |= XFS_DA_OP_LOGGED; + use_logging = true; + } + + error = xfs_attr_set(args); + + if (use_logging) + xfs_attr_rele_log_assist(mp); + return error; +} + static int xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, @@ -56,7 +148,7 @@ xfs_xattr_set(const struct xattr_handler *handler, }; int error; - error = xfs_attr_set(&args); + error = xfs_attr_change(&args); if (!error && (handler->flags & XFS_ATTR_ROOT)) xfs_forget_acl(inode, name); return error; diff --git a/fs/xfs/xfs_xattr.h b/fs/xfs/xfs_xattr.h new file mode 100644 index 000000000000..2b09133b1b9b --- /dev/null +++ b/fs/xfs/xfs_xattr.h @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#ifndef __XFS_XATTR_H__ +#define __XFS_XATTR_H__ + +int xfs_attr_change(struct xfs_da_args *args); + +extern const struct xattr_handler *xfs_xattr_handlers[]; + +#endif /* __XFS_XATTR_H__ */ diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index bcb21aea990a..053299758deb 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -110,15 +110,51 @@ static inline void zonefs_i_size_write(struct inode *inode, loff_t isize) } } -static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, - unsigned int flags, struct iomap *iomap, - struct iomap *srcmap) +static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, + loff_t length, unsigned int flags, + struct iomap *iomap, struct iomap *srcmap) { struct zonefs_inode_info *zi = ZONEFS_I(inode); struct super_block *sb = inode->i_sb; loff_t isize; - /* All I/Os should always be within the file maximum size */ + /* + * All blocks are always mapped below EOF. If reading past EOF, + * act as if there is a hole up to the file maximum size. + */ + mutex_lock(&zi->i_truncate_mutex); + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); + isize = i_size_read(inode); + if (iomap->offset >= isize) { + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + iomap->length = length; + } else { + iomap->type = IOMAP_MAPPED; + iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; + iomap->length = isize - iomap->offset; + } + mutex_unlock(&zi->i_truncate_mutex); + + trace_zonefs_iomap_begin(inode, iomap); + + return 0; +} + +static const struct iomap_ops zonefs_read_iomap_ops = { + .iomap_begin = zonefs_read_iomap_begin, +}; + +static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, + loff_t length, unsigned int flags, + struct iomap *iomap, struct iomap *srcmap) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct super_block *sb = inode->i_sb; + loff_t isize; + + /* All write I/Os should always be within the file maximum size */ if (WARN_ON_ONCE(offset + length > zi->i_max_size)) return -EIO; @@ -128,7 +164,7 @@ static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, * operation. 
*/ if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ && - (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT))) + !(flags & IOMAP_DIRECT))) return -EIO; /* @@ -137,47 +173,44 @@ static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, * write pointer) and unwriten beyond. */ mutex_lock(&zi->i_truncate_mutex); + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); + iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; isize = i_size_read(inode); - if (offset >= isize) + if (iomap->offset >= isize) { iomap->type = IOMAP_UNWRITTEN; - else + iomap->length = zi->i_max_size - iomap->offset; + } else { iomap->type = IOMAP_MAPPED; - if (flags & IOMAP_WRITE) - length = zi->i_max_size - offset; - else - length = min(length, isize - offset); + iomap->length = isize - iomap->offset; + } mutex_unlock(&zi->i_truncate_mutex); - iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); - iomap->length = ALIGN(offset + length, sb->s_blocksize) - iomap->offset; - iomap->bdev = inode->i_sb->s_bdev; - iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; - trace_zonefs_iomap_begin(inode, iomap); return 0; } -static const struct iomap_ops zonefs_iomap_ops = { - .iomap_begin = zonefs_iomap_begin, +static const struct iomap_ops zonefs_write_iomap_ops = { + .iomap_begin = zonefs_write_iomap_begin, }; static int zonefs_read_folio(struct file *unused, struct folio *folio) { - return iomap_read_folio(folio, &zonefs_iomap_ops); + return iomap_read_folio(folio, &zonefs_read_iomap_ops); } static void zonefs_readahead(struct readahead_control *rac) { - iomap_readahead(rac, &zonefs_iomap_ops); + iomap_readahead(rac, &zonefs_read_iomap_ops); } /* * Map blocks for page writeback. This is used only on conventional zone files, * which implies that the page range can only be within the fixed inode size. 
*/ -static int zonefs_map_blocks(struct iomap_writepage_ctx *wpc, - struct inode *inode, loff_t offset) +static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, + struct inode *inode, loff_t offset) { struct zonefs_inode_info *zi = ZONEFS_I(inode); @@ -191,12 +224,12 @@ static int zonefs_map_blocks(struct iomap_writepage_ctx *wpc, offset < wpc->iomap.offset + wpc->iomap.length) return 0; - return zonefs_iomap_begin(inode, offset, zi->i_max_size - offset, - IOMAP_WRITE, &wpc->iomap, NULL); + return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset, + IOMAP_WRITE, &wpc->iomap, NULL); } static const struct iomap_writeback_ops zonefs_writeback_ops = { - .map_blocks = zonefs_map_blocks, + .map_blocks = zonefs_write_map_blocks, }; static int zonefs_writepage(struct page *page, struct writeback_control *wbc) @@ -226,7 +259,8 @@ static int zonefs_swap_activate(struct swap_info_struct *sis, return -EINVAL; } - return iomap_swapfile_activate(sis, swap_file, span, &zonefs_iomap_ops); + return iomap_swapfile_activate(sis, swap_file, span, + &zonefs_read_iomap_ops); } static const struct address_space_operations zonefs_file_aops = { @@ -647,7 +681,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) /* Serialize against truncates */ filemap_invalidate_lock_shared(inode->i_mapping); - ret = iomap_page_mkwrite(vmf, &zonefs_iomap_ops); + ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops); filemap_invalidate_unlock_shared(inode->i_mapping); sb_end_pagefault(inode->i_sb); @@ -899,7 +933,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) if (append) ret = zonefs_file_dio_append(iocb, from); else - ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops, + ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, &zonefs_write_dio_ops, 0, NULL, 0); if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && (ret > 0 || ret == -EIOCBQUEUED)) { @@ -948,7 +982,7 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, if (ret <= 0) goto inode_unlock; - ret = iomap_file_buffered_write(iocb, from, &zonefs_iomap_ops); + ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops); if (ret > 0) iocb->ki_pos += ret; else if (ret == -EIO) @@ -1041,7 +1075,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) goto inode_unlock; } file_accessed(iocb->ki_filp); - ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops, + ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops, &zonefs_read_dio_ops, 0, NULL, 0); } else { ret = generic_file_read_iter(iocb, to); @@ -1085,7 +1119,8 @@ static int zonefs_seq_file_write_open(struct inode *inode) if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { - if (wro > sbi->s_max_wro_seq_files) { + if (sbi->s_max_wro_seq_files + && wro > sbi->s_max_wro_seq_files) { atomic_dec(&sbi->s_wro_seq_files); ret = -EBUSY; goto unlock; @@ -1760,12 +1795,6 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) atomic_set(&sbi->s_wro_seq_files, 0); sbi->s_max_wro_seq_files = bdev_max_open_zones(sb->s_bdev); - if (!sbi->s_max_wro_seq_files && - sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { - zonefs_info(sb, "No open zones limit. 
Ignoring explicit_open mount option\n");
-		sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
-	}
-
	atomic_set(&sbi->s_active_seq_files, 0);
	sbi->s_max_active_seq_files = bdev_max_active_zones(sb->s_bdev);
@@ -1790,6 +1819,14 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
	zonefs_info(sb, "Mounting %u zones",
		    blkdev_nr_zones(sb->s_bdev->bd_disk));
+	if (!sbi->s_max_wro_seq_files &&
+	    !sbi->s_max_active_seq_files &&
+	    sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
+		zonefs_info(sb,
+			"No open and active zone limits. Ignoring explicit_open mount option\n");
+		sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
+	}
+
	/* Create root directory inode */
	ret = -ENOMEM;
	inode = new_inode(sb);
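An aside on the xfs_trans.c frextents hunks above: sb_frextents is not one of
the lazy superblock counters, so the ondisk value must always match the
rtbitmap, while the incore m_frextents percpu counter additionally carries
unused transaction reservations. A toy userspace trace of that split
bookkeeping, with invented numbers (not kernel code):

#include <stdio.h>

int main(void)
{
	long sb_frextents = 100;	/* ondisk value, tracks the rtbitmap */
	long m_frextents  = 100;	/* incore value, minus reservations  */

	m_frextents -= 10;	/* xfs_trans_reserve: reserve 10 rtextents */
	sb_frextents -= 4;	/* commit: 4 rtextents actually allocated,
				 * applied via t_res_frextents_delta       */
	m_frextents += 6;	/* unreserve: give back the 6 unused,
				 * via xfs_mod_frextents() as in the hunk  */

	printf("ondisk %ld incore %ld\n", sb_frextents, m_frextents);
	return 0;		/* both end at 96 and stay consistent */
}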
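A final aside on the zonefs hunks: bdev_max_open_zones() and
bdev_max_active_zones() return 0 when the device reports no limit, so a zero
s_max_wro_seq_files or s_max_active_seq_files has to mean "unlimited" rather
than "always exceeded". That is what the added "sbi->s_max_wro_seq_files &&"
guard in zonefs_seq_file_write_open() and the relocated explicit_open warning
in zonefs_fill_super() implement. The guard, as a minimal stand-alone sketch
(zone_limit_exceeded() is a hypothetical name, not a zonefs function):

#include <stdbool.h>

/* A zero limit means the zoned device reports no open/active zone limit,
 * so only a non-zero limit can ever be exceeded. */
static bool zone_limit_exceeded(unsigned int in_use, unsigned int limit)
{
	return limit != 0 && in_use > limit;
}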