Diffstat (limited to 'fs')
373 files changed, 8957 insertions, 6981 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 64c58eb26159..9eb34701a566 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -55,42 +55,27 @@ int v9fs_random_cachetag(struct v9fs_session_info *v9ses) return scnprintf(v9ses->cachetag, CACHETAG_LEN, "%lu", jiffies); } -static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - struct v9fs_session_info *v9ses; - uint16_t klen = 0; - - v9ses = (struct v9fs_session_info *)cookie_netfs_data; - p9_debug(P9_DEBUG_FSC, "session %p buf %p size %u\n", - v9ses, buffer, bufmax); - - if (v9ses->cachetag) - klen = strlen(v9ses->cachetag); - - if (klen > bufmax) - return 0; - - memcpy(buffer, v9ses->cachetag, klen); - p9_debug(P9_DEBUG_FSC, "cache session tag %s\n", v9ses->cachetag); - return klen; -} - const struct fscache_cookie_def v9fs_cache_session_index_def = { .name = "9P.session", .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = v9fs_cache_session_get_key, }; void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) { /* If no cache session tag was specified, we generate a random one. */ - if (!v9ses->cachetag) - v9fs_random_cachetag(v9ses); + if (!v9ses->cachetag) { + if (v9fs_random_cachetag(v9ses) < 0) { + v9ses->fscache = NULL; + return; + } + } v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index, &v9fs_cache_session_index_def, - v9ses, true); + v9ses->cachetag, + strlen(v9ses->cachetag), + NULL, 0, + v9ses, 0, true); p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n", v9ses, v9ses->fscache); } @@ -99,45 +84,15 @@ void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses) { p9_debug(P9_DEBUG_FSC, "session %p put cookie %p\n", v9ses, v9ses->fscache); - fscache_relinquish_cookie(v9ses->fscache, 0); + fscache_relinquish_cookie(v9ses->fscache, NULL, false); v9ses->fscache = NULL; } - -static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct v9fs_inode *v9inode = cookie_netfs_data; - memcpy(buffer, &v9inode->qid.path, sizeof(v9inode->qid.path)); - p9_debug(P9_DEBUG_FSC, "inode %p get key %llu\n", - &v9inode->vfs_inode, v9inode->qid.path); - return sizeof(v9inode->qid.path); -} - -static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data, - uint64_t *size) -{ - const struct v9fs_inode *v9inode = cookie_netfs_data; - *size = i_size_read(&v9inode->vfs_inode); - - p9_debug(P9_DEBUG_FSC, "inode %p get attr %llu\n", - &v9inode->vfs_inode, *size); -} - -static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t buflen) -{ - const struct v9fs_inode *v9inode = cookie_netfs_data; - memcpy(buffer, &v9inode->qid.version, sizeof(v9inode->qid.version)); - p9_debug(P9_DEBUG_FSC, "inode %p get aux %u\n", - &v9inode->vfs_inode, v9inode->qid.version); - return sizeof(v9inode->qid.version); -} - static enum fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data, const void *buffer, - uint16_t buflen) + uint16_t buflen, + loff_t object_size) { const struct v9fs_inode *v9inode = cookie_netfs_data; @@ -154,9 +109,6 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data, const struct fscache_cookie_def v9fs_cache_inode_index_def = { .name = "9p.inode", .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .get_key = v9fs_cache_inode_get_key, - .get_attr = v9fs_cache_inode_get_attr, - .get_aux = v9fs_cache_inode_get_aux, .check_aux = v9fs_cache_inode_check_aux, }; @@ -175,7 +127,13 @@ void v9fs_cache_inode_get_cookie(struct inode *inode) v9ses = 
v9fs_inode2v9ses(inode); v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, &v9fs_cache_inode_index_def, - v9inode, true); + &v9inode->qid.path, + sizeof(v9inode->qid.path), + &v9inode->qid.version, + sizeof(v9inode->qid.version), + v9inode, + i_size_read(&v9inode->vfs_inode), + true); p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", inode, v9inode->fscache); @@ -190,7 +148,8 @@ void v9fs_cache_inode_put_cookie(struct inode *inode) p9_debug(P9_DEBUG_FSC, "inode %p put cookie %p\n", inode, v9inode->fscache); - fscache_relinquish_cookie(v9inode->fscache, 0); + fscache_relinquish_cookie(v9inode->fscache, &v9inode->qid.version, + false); v9inode->fscache = NULL; } @@ -203,7 +162,7 @@ void v9fs_cache_inode_flush_cookie(struct inode *inode) p9_debug(P9_DEBUG_FSC, "inode %p flush cookie %p\n", inode, v9inode->fscache); - fscache_relinquish_cookie(v9inode->fscache, 1); + fscache_relinquish_cookie(v9inode->fscache, NULL, true); v9inode->fscache = NULL; } @@ -236,12 +195,18 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode) old = v9inode->fscache; mutex_lock(&v9inode->fscache_lock); - fscache_relinquish_cookie(v9inode->fscache, 1); + fscache_relinquish_cookie(v9inode->fscache, NULL, true); v9ses = v9fs_inode2v9ses(inode); v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, &v9fs_cache_inode_index_def, - v9inode, true); + &v9inode->qid.path, + sizeof(v9inode->qid.path), + &v9inode->qid.version, + sizeof(v9inode->qid.version), + v9inode, + i_size_read(&v9inode->vfs_inode), + true); p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n", inode, old, v9inode->fscache); @@ -367,7 +332,8 @@ void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page) const struct v9fs_inode *v9inode = V9FS_I(inode); p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page); - ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL); + ret = fscache_write_page(v9inode->fscache, page, + i_size_read(&v9inode->vfs_inode), GFP_KERNEL); p9_debug(P9_DEBUG_FSC, "ret = %d\n", ret); if (ret != 0) v9fs_uncache_page(inode, page); diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 8fb89ddc6cc7..e622f0f10502 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -292,6 +292,10 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) #ifdef CONFIG_9P_FSCACHE kfree(v9ses->cachetag); v9ses->cachetag = match_strdup(&args[0]); + if (!v9ses->cachetag) { + ret = -ENOMEM; + goto free_and_return; + } #endif break; case Opt_cache: @@ -471,6 +475,9 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, return fid; err_clnt: +#ifdef CONFIG_9P_FSCACHE + kfree(v9ses->cachetag); +#endif p9_client_destroy(v9ses->clnt); err_names: kfree(v9ses->uname); diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index bdabb2765d1b..9ee534159cc6 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -579,6 +579,24 @@ static int v9fs_at_to_dotl_flags(int flags) } /** + * v9fs_dec_count - helper function to drop i_nlink. + * + * If a directory had nlink <= 2 (including . and ..), then we should not drop + * the link count, which indicates the underlying exported fs doesn't maintain + * nlink accurately. e.g. + * - overlayfs sets nlink to 1 for merged dir + * - ext4 (with dir_nlink feature enabled) sets nlink to 1 if a dir has more + * than EXT4_LINK_MAX (65000) links.
+ * + * @inode: inode whose nlink is being dropped + */ +static void v9fs_dec_count(struct inode *inode) +{ + if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) + drop_nlink(inode); +} + +/** * v9fs_remove - helper function to remove files and directories * @dir: directory inode that is being deleted * @dentry: dentry that is being deleted @@ -621,9 +639,9 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags) */ if (flags & AT_REMOVEDIR) { clear_nlink(inode); - drop_nlink(dir); + v9fs_dec_count(dir); } else - drop_nlink(inode); + v9fs_dec_count(inode); v9fs_invalidate_inode_attr(inode); v9fs_invalidate_inode_attr(dir); @@ -1024,12 +1042,12 @@ clunk_newdir: if (S_ISDIR(new_inode->i_mode)) clear_nlink(new_inode); else - drop_nlink(new_inode); + v9fs_dec_count(new_inode); } if (S_ISDIR(old_inode->i_mode)) { if (!new_inode) inc_nlink(new_dir); - drop_nlink(old_dir); + v9fs_dec_count(old_dir); } v9fs_invalidate_inode_attr(old_inode); v9fs_invalidate_inode_attr(old_dir); diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index af03c2a901eb..48ce50484e80 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -94,7 +94,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, if (v9ses->cache) sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE; - sb->s_flags |= SB_ACTIVE | SB_DIRSYNC | SB_NOATIME; + sb->s_flags |= SB_ACTIVE | SB_DIRSYNC; if (!v9ses->cache) sb->s_flags |= SB_SYNCHRONOUS; diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 58c2bbd385ad..57a27c42b5ac 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -1,6 +1,6 @@ config BINFMT_ELF bool "Kernel support for ELF binaries" - depends on MMU && (BROKEN || !FRV) + depends on MMU select ELFCORE default y ---help--- @@ -35,7 +35,7 @@ config ARCH_BINFMT_ELF_STATE config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y if !BINFMT_ELF - depends on (ARM || FRV || BLACKFIN || (SUPERH32 && !MMU) || C6X) + depends on (ARM || (SUPERH32 && !MMU) || C6X) select ELFCORE help ELF FDPIC binaries are based on ELF, but allow the individual load @@ -90,7 +90,6 @@ config BINFMT_SCRIPT config BINFMT_FLAT bool "Kernel support for flat binaries" depends on !MMU || ARM || M68K - depends on !FRV || BROKEN help Support uClinux FLAT format binaries. 
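The fscache conversion visible in the 9p hunks above (and repeated in the afs hunks below) follows one pattern: the index key, auxiliary data, and object size are now passed directly to fscache_acquire_cookie(), replacing the old ->get_key()/->get_attr()/->get_aux() callbacks, and updated auxiliary data is handed back through fscache_relinquish_cookie(). A minimal sketch of the pattern, using hypothetical example_* names rather than any real netfs:

#include <linux/fscache.h>
#include <linux/string.h>

/* Hypothetical netfs object; 9p uses qid.path/qid.version here, afs the fid
 * and data_version. */
struct example_object {
	struct fscache_cookie	*cookie;
	u64			id;		/* index key material */
	u64			version;	/* auxiliary (coherency) data */
	loff_t			size;
};

static enum fscache_checkaux example_check_aux(void *cookie_netfs_data,
					       const void *buffer,
					       uint16_t buflen,
					       loff_t object_size)
{
	const struct example_object *obj = cookie_netfs_data;

	/* Mismatched aux data means the cached copy is obsolete. */
	if (buflen != sizeof(obj->version) ||
	    memcmp(buffer, &obj->version, buflen) != 0)
		return FSCACHE_CHECKAUX_OBSOLETE;
	return FSCACHE_CHECKAUX_OKAY;
}

static const struct fscache_cookie_def example_object_def = {
	.name		= "EX.object",
	.type		= FSCACHE_COOKIE_TYPE_DATAFILE,
	.check_aux	= example_check_aux,
};

static void example_get_cookie(struct example_object *obj,
			       struct fscache_cookie *parent)
{
	obj->cookie = fscache_acquire_cookie(parent, &example_object_def,
					     &obj->id, sizeof(obj->id),
					     &obj->version, sizeof(obj->version),
					     obj,		/* netfs data */
					     obj->size,		/* object size */
					     true);		/* enable */
}

static void example_put_cookie(struct example_object *obj)
{
	/* Hand back current aux data; retire == false keeps the cache copy. */
	fscache_relinquish_cookie(obj->cookie, &obj->version, false);
	obj->cookie = NULL;
}

Index cookies (the session, cell, and volume definitions in this diff) pass NULL/0 for the auxiliary buffer and 0 for the object size, as the converted call sites show.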
diff --git a/fs/Makefile b/fs/Makefile index add789ea270a..c9375fd2c8c4 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \ ioctl.o readdir.o select.o dcache.o inode.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ - pnode.o splice.o sync.o utimes.o \ + pnode.o splice.o sync.o utimes.o d_path.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o ifeq ($(CONFIG_BLOCK),y) diff --git a/fs/afs/cache.c b/fs/afs/cache.c index f62ff71d28c9..b1c31ec4523a 100644 --- a/fs/afs/cache.c +++ b/fs/afs/cache.c @@ -12,167 +12,39 @@ #include <linux/sched.h> #include "internal.h" -static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t buflen); -static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t buflen); - -static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t buflen); -static void afs_vnode_cache_get_attr(const void *cookie_netfs_data, - uint64_t *size); -static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t buflen); static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, const void *buffer, - uint16_t buflen); + uint16_t buflen, + loff_t object_size); struct fscache_netfs afs_cache_netfs = { .name = "afs", - .version = 1, + .version = 2, }; struct fscache_cookie_def afs_cell_cache_index_def = { .name = "AFS.cell", .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = afs_cell_cache_get_key, }; struct fscache_cookie_def afs_volume_cache_index_def = { .name = "AFS.volume", .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = afs_volume_cache_get_key, }; struct fscache_cookie_def afs_vnode_cache_index_def = { - .name = "AFS.vnode", - .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .get_key = afs_vnode_cache_get_key, - .get_attr = afs_vnode_cache_get_attr, - .get_aux = afs_vnode_cache_get_aux, - .check_aux = afs_vnode_cache_check_aux, + .name = "AFS.vnode", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .check_aux = afs_vnode_cache_check_aux, }; /* - * set the key for the index entry - */ -static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct afs_cell *cell = cookie_netfs_data; - uint16_t klen; - - _enter("%p,%p,%u", cell, buffer, bufmax); - - klen = strlen(cell->name); - if (klen > bufmax) - return 0; - - memcpy(buffer, cell->name, klen); - return klen; -} - -/*****************************************************************************/ -/* - * set the key for the volume index entry - */ -static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct afs_volume *volume = cookie_netfs_data; - struct { - u64 volid; - } __packed key; - - _enter("{%u},%p,%u", volume->type, buffer, bufmax); - - if (bufmax < sizeof(key)) - return 0; - - key.volid = volume->vid; - memcpy(buffer, &key, sizeof(key)); - return sizeof(key); -} - -/*****************************************************************************/ -/* - * set the key for the index entry - */ -static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct afs_vnode *vnode = cookie_netfs_data; - struct { - u32 vnode_id[3]; - } __packed key; - - _enter("{%x,%x,%llx},%p,%u", - vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, - buffer, bufmax); - - /* Allow for a 96-bit key */ - memset(&key, 
0, sizeof(key)); - key.vnode_id[0] = vnode->fid.vnode; - key.vnode_id[1] = 0; - key.vnode_id[2] = 0; - - if (sizeof(key) > bufmax) - return 0; - - memcpy(buffer, &key, sizeof(key)); - return sizeof(key); -} - -/* - * provide updated file attributes - */ -static void afs_vnode_cache_get_attr(const void *cookie_netfs_data, - uint64_t *size) -{ - const struct afs_vnode *vnode = cookie_netfs_data; - - _enter("{%x,%x,%llx},", - vnode->fid.vnode, vnode->fid.unique, - vnode->status.data_version); - - *size = vnode->status.size; -} - -struct afs_vnode_cache_aux { - u64 data_version; - u32 fid_unique; -} __packed; - -/* - * provide new auxiliary cache data - */ -static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct afs_vnode *vnode = cookie_netfs_data; - struct afs_vnode_cache_aux aux; - - _enter("{%x,%x,%Lx},%p,%u", - vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, - buffer, bufmax); - - memset(&aux, 0, sizeof(aux)); - aux.data_version = vnode->status.data_version; - aux.fid_unique = vnode->fid.unique; - - if (bufmax < sizeof(aux)) - return 0; - - memcpy(buffer, &aux, sizeof(aux)); - return sizeof(aux); -} - -/* * check that the auxiliary data indicates that the entry is still valid */ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, const void *buffer, - uint16_t buflen) + uint16_t buflen, + loff_t object_size) { struct afs_vnode *vnode = cookie_netfs_data; struct afs_vnode_cache_aux aux; @@ -189,12 +61,6 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, return FSCACHE_CHECKAUX_OBSOLETE; } - if (vnode->fid.unique != aux.fid_unique) { - _leave(" = OBSOLETE [uniq %x != %x]", - aux.fid_unique, vnode->fid.unique); - return FSCACHE_CHECKAUX_OBSOLETE; - } - if (vnode->status.data_version != aux.data_version) { _leave(" = OBSOLETE [vers %llx != %llx]", aux.data_version, vnode->status.data_version); diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 9bb921d120d0..4235a05afc76 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -25,7 +25,7 @@ static void afs_manage_cell(struct work_struct *); static void afs_dec_cells_outstanding(struct afs_net *net) { if (atomic_dec_and_test(&net->cells_outstanding)) - wake_up_atomic_t(&net->cells_outstanding); + wake_up_var(&net->cells_outstanding); } /* @@ -522,7 +522,9 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell) #ifdef CONFIG_AFS_FSCACHE cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index, &afs_cell_cache_index_def, - cell, true); + cell->name, strlen(cell->name), + NULL, 0, + cell, 0, true); #endif ret = afs_proc_cell_setup(net, cell); if (ret < 0) @@ -547,7 +549,7 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell) spin_unlock(&net->proc_cells_lock); #ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(cell->cache, 0); + fscache_relinquish_cookie(cell->cache, NULL, false); cell->cache = NULL; #endif @@ -764,7 +766,7 @@ void afs_cell_purge(struct afs_net *net) afs_queue_cell_manager(net); _debug("wait"); - wait_on_atomic_t(&net->cells_outstanding, atomic_t_wait, - TASK_UNINTERRUPTIBLE); + wait_var_event(&net->cells_outstanding, + !atomic_read(&net->cells_outstanding)); _leave(""); } diff --git a/fs/afs/file.c b/fs/afs/file.c index a39192ced99e..79e665a35fea 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -339,7 +339,8 @@ int afs_page_filler(void *data, struct page *page) /* send the page to the cache */ #ifdef CONFIG_AFS_FSCACHE if 
(PageFsCache(page) && - fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) { + fscache_write_page(vnode->cache, page, vnode->status.size, + GFP_KERNEL) != 0) { fscache_uncache_page(vnode->cache, page); BUG_ON(PageFsCache(page)); } @@ -403,7 +404,8 @@ static void afs_readpages_page_done(struct afs_call *call, struct afs_read *req) /* send the page to the cache */ #ifdef CONFIG_AFS_FSCACHE if (PageFsCache(page) && - fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) { + fscache_write_page(vnode->cache, page, vnode->status.size, + GFP_KERNEL) != 0) { fscache_uncache_page(vnode->cache, page); BUG_ON(PageFsCache(page)); } diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 6b39d0255b72..65c5b1edd338 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -243,6 +243,33 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) } /* + * Get a cache cookie for an inode. + */ +static void afs_get_inode_cache(struct afs_vnode *vnode) +{ +#ifdef CONFIG_AFS_FSCACHE + struct { + u32 vnode_id; + u32 unique; + u32 vnode_id_ext[2]; /* Allow for a 96-bit key */ + } __packed key; + struct afs_vnode_cache_aux aux; + + key.vnode_id = vnode->fid.vnode; + key.unique = vnode->fid.unique; + key.vnode_id_ext[0] = 0; + key.vnode_id_ext[1] = 0; + aux.data_version = vnode->status.data_version; + + vnode->cache = fscache_acquire_cookie(vnode->volume->cache, + &afs_vnode_cache_index_def, + &key, sizeof(key), + &aux, sizeof(aux), + vnode, vnode->status.size, true); +#endif +} + +/* * inode retrieval */ struct inode *afs_iget(struct super_block *sb, struct key *key, @@ -307,11 +334,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, /* set up caching before mapping the status, as map-status reads the * first page of symlinks to see if they're really mountpoints */ inode->i_size = vnode->status.size; -#ifdef CONFIG_AFS_FSCACHE - vnode->cache = fscache_acquire_cookie(vnode->volume->cache, - &afs_vnode_cache_index_def, - vnode, true); -#endif + afs_get_inode_cache(vnode); ret = afs_inode_map_status(vnode, key); if (ret < 0) @@ -327,7 +350,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, /* failure */ bad_inode: #ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(vnode->cache, 0); + fscache_relinquish_cookie(vnode->cache, NULL, ret == -ENOENT); vnode->cache = NULL; #endif iget_failed(inode); @@ -343,6 +366,10 @@ void afs_zap_data(struct afs_vnode *vnode) { _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); +#ifdef CONFIG_AFS_FSCACHE + fscache_invalidate(vnode->cache); +#endif + /* nuke all the non-dirty pages that aren't locked, mapped or being * written back in a regular file and completely discard the pages in a * directory or symlink */ @@ -507,8 +534,14 @@ void afs_evict_inode(struct inode *inode) } #ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(vnode->cache, 0); - vnode->cache = NULL; + { + struct afs_vnode_cache_aux aux; + + aux.data_version = vnode->status.data_version; + fscache_relinquish_cookie(vnode->cache, &aux, + test_bit(AFS_VNODE_DELETED, &vnode->flags)); + vnode->cache = NULL; + } #endif afs_put_permits(vnode->permit_cache); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index f38d6a561a84..a6a1d75eee41 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -118,6 +118,7 @@ struct afs_call { bool ret_reply0; /* T if should return reply[0] on success */ bool upgrade; /* T to request service upgrade */ u16 service_id; /* Actual service ID (after upgrade) */ + unsigned int debug_id; /* Trace ID */ u32 operation_ID; /* operation ID for an 
incoming call */ u32 count; /* count for use in unmarshalling */ __be32 tmp; /* place to extract temporary data */ @@ -558,6 +559,13 @@ struct afs_fs_cursor { #define AFS_FS_CURSOR_NO_VSLEEP 0x0020 /* Set to prevent sleep on VBUSY, VOFFLINE, ... */ }; +/* + * Cache auxiliary data. + */ +struct afs_vnode_cache_aux { + u64 data_version; +} __packed; + #include <trace/events/afs.h> /*****************************************************************************/ diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index e1126659f043..f7ae54b6a393 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -103,8 +103,8 @@ void afs_close_socket(struct afs_net *net) } _debug("outstanding %u", atomic_read(&net->nr_outstanding_calls)); - wait_on_atomic_t(&net->nr_outstanding_calls, atomic_t_wait, - TASK_UNINTERRUPTIBLE); + wait_var_event(&net->nr_outstanding_calls, + !atomic_read(&net->nr_outstanding_calls)); _debug("no outstanding calls"); kernel_sock_shutdown(net->socket, SHUT_RDWR); @@ -131,6 +131,7 @@ static struct afs_call *afs_alloc_call(struct afs_net *net, call->type = type; call->net = net; + call->debug_id = atomic_inc_return(&rxrpc_debug_id); atomic_set(&call->usage, 1); INIT_WORK(&call->async_work, afs_process_async_call); init_waitqueue_head(&call->waitq); @@ -169,13 +170,14 @@ void afs_put_call(struct afs_call *call) afs_put_server(call->net, call->cm_server); afs_put_cb_interest(call->net, call->cbi); kfree(call->request); - kfree(call); - o = atomic_dec_return(&net->nr_outstanding_calls); trace_afs_call(call, afs_call_trace_free, 0, o, __builtin_return_address(0)); + kfree(call); + + o = atomic_dec_return(&net->nr_outstanding_calls); if (o == 0) - wake_up_atomic_t(&net->nr_outstanding_calls); + wake_up_var(&net->nr_outstanding_calls); } } @@ -378,7 +380,8 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, (async ? 
afs_wake_up_async_call : afs_wake_up_call_waiter), - call->upgrade); + call->upgrade, + call->debug_id); if (IS_ERR(rxcall)) { ret = PTR_ERR(rxcall); goto error_kill_call; } @@ -727,7 +730,8 @@ void afs_charge_preallocation(struct work_struct *work) afs_wake_up_async_call, afs_rx_attach, (unsigned long)call, - GFP_KERNEL) < 0) + GFP_KERNEL, + call->debug_id) < 0) break; call = NULL; } diff --git a/fs/afs/server.c b/fs/afs/server.c index 1880f1b6a9f1..a43ef77dabae 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -25,7 +25,7 @@ static void afs_inc_servers_outstanding(struct afs_net *net) static void afs_dec_servers_outstanding(struct afs_net *net) { if (atomic_dec_and_test(&net->servers_outstanding)) - wake_up_atomic_t(&net->servers_outstanding); + wake_up_var(&net->servers_outstanding); } /* @@ -521,8 +521,8 @@ void afs_purge_servers(struct afs_net *net) afs_queue_server_manager(net); _debug("wait"); - wait_on_atomic_t(&net->servers_outstanding, atomic_t_wait, - TASK_UNINTERRUPTIBLE); + wait_var_event(&net->servers_outstanding, + !atomic_read(&net->servers_outstanding)); _leave(""); } diff --git a/fs/afs/volume.c b/fs/afs/volume.c index b517a588781f..3037bd01f617 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -225,7 +225,9 @@ void afs_activate_volume(struct afs_volume *volume) #ifdef CONFIG_AFS_FSCACHE volume->cache = fscache_acquire_cookie(volume->cell->cache, &afs_volume_cache_index_def, - volume, true); + &volume->vid, sizeof(volume->vid), + NULL, 0, + volume, 0, true); #endif write_lock(&volume->cell->proc_lock); @@ -245,7 +247,7 @@ void afs_deactivate_volume(struct afs_volume *volume) write_unlock(&volume->cell->proc_lock); #ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(volume->cache, + fscache_relinquish_cookie(volume->cache, NULL, test_bit(AFS_VOLUME_DELETED, &volume->flags)); volume->cache = NULL; #endif diff --git a/fs/aio.c b/fs/aio.c --- a/fs/aio.c +++ b/fs/aio.c @@ -68,9 +68,9 @@ struct aio_ring { #define AIO_RING_PAGES 8 struct kioctx_table { - struct rcu_head rcu; - unsigned nr; - struct kioctx *table[]; + struct rcu_head rcu; + unsigned nr; + struct kioctx __rcu *table[]; }; struct kioctx_cpu { @@ -115,7 +115,7 @@ struct kioctx { struct page **ring_pages; long nr_pages; - struct work_struct free_work; + struct rcu_work free_rwork; /* see free_ioctx() */ /* * signals when all in-flight requests are done @@ -329,7 +329,7 @@ static int aio_ring_mremap(struct vm_area_struct *vma) for (i = 0; i < table->nr; i++) { struct kioctx *ctx; - ctx = table->table[i]; + ctx = rcu_dereference(table->table[i]); if (ctx && ctx->aio_ring_file == file) { if (!atomic_read(&ctx->dead)) { ctx->user_id = ctx->mmap_base = vma->vm_start; @@ -588,10 +588,15 @@ static int kiocb_cancel(struct aio_kiocb *kiocb) return cancel(&kiocb->common); } +/* + * free_ioctx() should be RCU delayed to synchronize against the RCU + * protected lookup_ioctx() and also needs process context to call + * aio_free_ring(). Use rcu_work.
+ */ static void free_ioctx(struct work_struct *work) { - struct kioctx *ctx = container_of(work, struct kioctx, free_work); - + struct kioctx *ctx = container_of(to_rcu_work(work), struct kioctx, + free_rwork); pr_debug("freeing %p\n", ctx); aio_free_ring(ctx); @@ -609,8 +614,9 @@ static void free_ioctx_reqs(struct percpu_ref *ref) if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count)) complete(&ctx->rq_wait->comp); - INIT_WORK(&ctx->free_work, free_ioctx); - schedule_work(&ctx->free_work); + /* Synchronize against RCU protected table->table[] dereferences */ + INIT_RCU_WORK(&ctx->free_rwork, free_ioctx); + queue_rcu_work(system_wq, &ctx->free_rwork); } /* @@ -651,9 +657,9 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) while (1) { if (table) for (i = 0; i < table->nr; i++) - if (!table->table[i]) { + if (!rcu_access_pointer(table->table[i])) { ctx->id = i; - table->table[i] = ctx; + rcu_assign_pointer(table->table[i], ctx); spin_unlock(&mm->ioctx_lock); /* While kioctx setup is in progress, @@ -834,11 +840,11 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, } table = rcu_dereference_raw(mm->ioctx_table); - WARN_ON(ctx != table->table[ctx->id]); - table->table[ctx->id] = NULL; + WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id])); + RCU_INIT_POINTER(table->table[ctx->id], NULL); spin_unlock(&mm->ioctx_lock); - /* percpu_ref_kill() will do the necessary call_rcu() */ + /* free_ioctx_reqs() will do the necessary RCU synchronization */ wake_up_all(&ctx->wait); /* @@ -880,7 +886,8 @@ void exit_aio(struct mm_struct *mm) skipped = 0; for (i = 0; i < table->nr; ++i) { - struct kioctx *ctx = table->table[i]; + struct kioctx *ctx = + rcu_dereference_protected(table->table[i], true); if (!ctx) { skipped++; @@ -1069,7 +1076,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) if (!table || id >= table->nr) goto out; - ctx = table->table[id]; + ctx = rcu_dereference(table->table[id]); if (ctx && ctx->user_id == ctx_id) { percpu_ref_get(&ctx->users); ret = ctx; diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index b7c816f39404..26f6b4f41ce6 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -310,7 +310,7 @@ static int autofs_dev_ioctl_closemount(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { - return sys_close(param->ioctlfd); + return ksys_close(param->ioctlfd); } /* diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index a7c5a9861bef..a41b48f82a70 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -241,7 +241,7 @@ ret: return retval; error: if (fd_binary > 0) - sys_close(fd_binary); + ksys_close(fd_binary); bprm->interp_flags = 0; bprm->interp_data = 0; goto ret; diff --git a/fs/block_dev.c b/fs/block_dev.c index 4a181fcb5175..7a506c55a993 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1058,6 +1058,27 @@ retry: return 0; } +static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno) +{ + struct gendisk *disk = get_gendisk(bdev->bd_dev, partno); + + if (!disk) + return NULL; + /* + * Now that we hold a gendisk reference, we make sure the bdev we looked + * up is not stale. If it is, the device was removed and re-created before + * we looked up the gendisk, and we fail the open in that case. Associating + * an unhashed bdev with a newly created gendisk could lead to two bdevs + * (and thus two independent caches) being associated with one device, + * which is bad.
+ */ if (inode_unhashed(bdev->bd_inode)) { + put_disk_and_module(disk); + return NULL; + } + return disk; +} + /** * bd_start_claiming - start claiming a block device * @bdev: block device of interest @@ -1094,7 +1115,7 @@ static struct block_device *bd_start_claiming(struct block_device *bdev, * @bdev might not have been initialized properly yet, look up * and grab the outer block device the hard way. */ - disk = get_gendisk(bdev->bd_dev, &partno); + disk = bdev_get_gendisk(bdev, &partno); if (!disk) return ERR_PTR(-ENXIO); @@ -1111,8 +1132,7 @@ static struct block_device *bd_start_claiming(struct block_device *bdev, else whole = bdgrab(bdev); - module_put(disk->fops->owner); - put_disk(disk); + put_disk_and_module(disk); if (!whole) return ERR_PTR(-ENOMEM); @@ -1304,7 +1324,8 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty) * @bdev: struct bdev to adjust. * * This routine checks to see if the bdev size does not match the disk size - * and adjusts it if it differs. + * and adjusts it if it differs. When shrinking the bdev size, all its caches + * are freed. */ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) { @@ -1317,7 +1338,8 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) "%s: detected capacity change from %lld to %lld\n", disk->disk_name, bdev_size, disk_size); i_size_write(bdev->bd_inode, disk_size); - flush_disk(bdev, false); + if (bdev_size > disk_size) + flush_disk(bdev, false); } } EXPORT_SYMBOL(check_disk_size_change); @@ -1407,10 +1429,10 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) { struct gendisk *disk; - struct module *owner; int ret; int partno; int perm = 0; + bool first_open = false; if (mode & FMODE_READ) perm |= MAY_READ; @@ -1430,14 +1452,14 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) restart: ret = -ENXIO; - disk = get_gendisk(bdev->bd_dev, &partno); + disk = bdev_get_gendisk(bdev, &partno); if (!disk) goto out; - owner = disk->fops->owner; disk_block_events(disk); mutex_lock_nested(&bdev->bd_mutex, for_part); if (!bdev->bd_openers) { + first_open = true; bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; @@ -1463,8 +1485,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_queue = NULL; mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); - put_disk(disk); - module_put(owner); + put_disk_and_module(disk); goto restart; } } @@ -1524,15 +1545,15 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (ret) goto out_unlock_bdev; } - /* only one opener holds refs to the module and disk */ - put_disk(disk); - module_put(owner); } bdev->bd_openers++; if (for_part) bdev->bd_part_count++; mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); + /* only one opener holds refs to the module and disk */ + if (!first_open) + put_disk_and_module(disk); return 0; out_clear: @@ -1546,8 +1567,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); - put_disk(disk); - module_put(owner); + put_disk_and_module(disk); out: bdput(bdev); @@ -1770,8 +1790,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) disk->fops->release(disk, mode); } if (!bdev->bd_openers) { - struct module *owner = disk->fops->owner; -
disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; @@ -1779,8 +1797,7 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) victim = bdev->bd_contains; bdev->bd_contains = NULL; - put_disk(disk); - module_put(owner); + put_disk_and_module(disk); } mutex_unlock(&bdev->bd_mutex); bdput(bdev); diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 273351ee4c46..167e5dc7eadd 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -1,7 +1,6 @@ config BTRFS_FS tristate "Btrfs filesystem support" - select CRYPTO - select CRYPTO_CRC32C + select LIBCRC32C select ZLIB_INFLATE select ZLIB_DEFLATE select LZO_COMPRESS diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 0c4373628eb4..ca693dd554e9 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -10,7 +10,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o props.o hash.o free-space-tree.o tree-checker.o + uuid-tree.o props.o free-space-tree.o tree-checker.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 1ba49ebe67da..0066d95b133f 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -46,12 +46,12 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) BUG(); } - size = __btrfs_getxattr(inode, name, "", 0); + size = btrfs_getxattr(inode, name, "", 0); if (size > 0) { value = kzalloc(size, GFP_KERNEL); if (!value) return ERR_PTR(-ENOMEM); - size = __btrfs_getxattr(inode, name, value, size); + size = btrfs_getxattr(inode, name, value, size); } if (size > 0) { acl = posix_acl_from_xattr(&init_user_ns, value, size); @@ -65,9 +65,6 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) return acl; } -/* - * Needs to be called with fs_mutex held - */ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct posix_acl *acl, int type) { @@ -101,7 +98,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, goto out; } - ret = __btrfs_setxattr(trans, inode, name, value, size, 0); + ret = btrfs_setxattr(trans, inode, name, value, size, 0); out: kfree(value); @@ -127,11 +124,6 @@ int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) return ret; } -/* - * btrfs_init_acl is already generally called under fs_mutex, so the locking - * stuff has been fixed to work with that. If the locking stuff changes, we - * need to re-evaluate the acl locking stuff. 
- */ int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir) { diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f94b2d8c744a..571024bc632e 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -170,7 +170,7 @@ int __init btrfs_prelim_ref_init(void) return 0; } -void btrfs_prelim_ref_exit(void) +void __cold btrfs_prelim_ref_exit(void) { kmem_cache_destroy(btrfs_prelim_ref_cache); } @@ -738,7 +738,8 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, BUG_ON(ref->key_for_search.type); BUG_ON(!ref->wanted_disk_byte); - eb = read_tree_block(fs_info, ref->wanted_disk_byte, 0); + eb = read_tree_block(fs_info, ref->wanted_disk_byte, 0, + ref->level - 1, NULL); if (IS_ERR(eb)) { free_pref(ref); return PTR_ERR(eb); @@ -773,15 +774,12 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, struct btrfs_delayed_extent_op *extent_op = head->extent_op; struct btrfs_key key; struct btrfs_key tmp_op_key; - struct btrfs_key *op_key = NULL; struct rb_node *n; int count; int ret = 0; - if (extent_op && extent_op->update_key) { + if (extent_op && extent_op->update_key) btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key); - op_key = &tmp_op_key; - } spin_lock(&head->lock); for (n = rb_first(&head->ref_tree); n; n = rb_next(n)) { @@ -1291,7 +1289,8 @@ again: ref->level == 0) { struct extent_buffer *eb; - eb = read_tree_block(fs_info, ref->parent, 0); + eb = read_tree_block(fs_info, ref->parent, 0, + ref->level, NULL); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; @@ -1519,6 +1518,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) if (!node) break; bytenr = node->val; + shared.share_count = 0; cond_resched(); } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 0c2fab8514ff..0a30028d5196 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -73,7 +73,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr); int __init btrfs_prelim_ref_init(void); -void btrfs_prelim_ref_exit(void); +void __cold btrfs_prelim_ref_exit(void); struct prelim_ref { struct rb_node rbnode; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 63f0ccc92a71..ca15be569d69 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -195,7 +195,6 @@ struct btrfs_inode { /* Hook into fs_info->delayed_iputs */ struct list_head delayed_iput; - long delayed_iput_count; /* * To avoid races between lockless (i_mutex not held) direct IO writes @@ -365,6 +364,4 @@ static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, logical_start, csum, csum_expected, mirror_num); } -bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end); - #endif diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 7d51b5a5b505..3baebbc021c5 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -96,9 +96,9 @@ #include <linux/blkdev.h> #include <linux/mm.h> #include <linux/string.h> +#include <linux/crc32c.h> #include "ctree.h" #include "disk-io.h" -#include "hash.h" #include "transaction.h" #include "extent_io.h" #include "volumes.h" @@ -1736,7 +1736,7 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state, size_t sublen = i ? 
PAGE_SIZE : (PAGE_SIZE - BTRFS_CSUM_SIZE); - crc = btrfs_crc32c(crc, data, sublen); + crc = crc32c(crc, data, sublen); } btrfs_csum_final(crc, csum); if (memcmp(csum, h->csum, state->csum_size)) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 07d049c0c20f..562c3e633403 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1133,7 +1133,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, return ret; } -void btrfs_exit_compress(void) +void __cold btrfs_exit_compress(void) { free_workspaces(); } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 677fa4aa0bd7..ce796557a918 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -76,7 +76,7 @@ struct compressed_bio { }; void __init btrfs_init_compress(void); -void btrfs_exit_compress(void); +void __cold btrfs_exit_compress(void); int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, u64 start, struct page **pages, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index b88a79e69ddf..a2c9d21176e2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -41,8 +41,6 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *src_buf); static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, int slot); -static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb); struct btrfs_path *btrfs_alloc_path(void) { @@ -301,11 +299,6 @@ enum mod_log_op { MOD_LOG_ROOT_REPLACE, }; -struct tree_mod_move { - int dst_slot; - int nr_items; -}; - struct tree_mod_root { u64 logical; u8 level; @@ -328,32 +321,15 @@ struct tree_mod_elem { u64 blockptr; /* this is used for op == MOD_LOG_MOVE_KEYS */ - struct tree_mod_move move; + struct { + int dst_slot; + int nr_items; + } move; /* this is used for op == MOD_LOG_ROOT_REPLACE */ struct tree_mod_root old_root; }; -static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info) -{ - read_lock(&fs_info->tree_mod_log_lock); -} - -static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info) -{ - read_unlock(&fs_info->tree_mod_log_lock); -} - -static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info) -{ - write_lock(&fs_info->tree_mod_log_lock); -} - -static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info) -{ - write_unlock(&fs_info->tree_mod_log_lock); -} - /* * Pull a new tree mod seq number for our operation. */ @@ -373,14 +349,14 @@ static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) { - tree_mod_log_write_lock(fs_info); + write_lock(&fs_info->tree_mod_log_lock); spin_lock(&fs_info->tree_mod_seq_lock); if (!elem->seq) { elem->seq = btrfs_inc_tree_mod_seq(fs_info); list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); } spin_unlock(&fs_info->tree_mod_seq_lock); - tree_mod_log_write_unlock(fs_info); + write_unlock(&fs_info->tree_mod_log_lock); return elem->seq; } @@ -422,7 +398,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, * anything that's lower than the lowest existing (read: blocked) * sequence number can be removed from the tree. 
*/ - tree_mod_log_write_lock(fs_info); + write_lock(&fs_info->tree_mod_log_lock); tm_root = &fs_info->tree_mod_log; for (node = rb_first(tm_root); node; node = next) { next = rb_next(node); @@ -432,7 +408,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, rb_erase(node, tm_root); kfree(tm); } - tree_mod_log_write_unlock(fs_info); + write_unlock(&fs_info->tree_mod_log_lock); } /* @@ -443,7 +419,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, * for root replace operations, or the logical address of the affected * block for all other operations. * - * Note: must be called with write lock (tree_mod_log_write_lock). + * Note: must be called with write lock for fs_info::tree_mod_log_lock. */ static noinline int __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) @@ -481,7 +457,7 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it * returns zero with the tree_mod_log_lock acquired. The caller must hold * this until all tree mod log insertions are recorded in the rb tree and then - * call tree_mod_log_write_unlock() to release. + * write unlock fs_info::tree_mod_log_lock. */ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { @@ -491,9 +467,9 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, if (eb && btrfs_header_level(eb) == 0) return 1; - tree_mod_log_write_lock(fs_info); + write_lock(&fs_info->tree_mod_log_lock); if (list_empty(&(fs_info)->tree_mod_seq_list)) { - tree_mod_log_write_unlock(fs_info); + write_unlock(&fs_info->tree_mod_log_lock); return 1; } @@ -536,38 +512,34 @@ alloc_tree_mod_elem(struct extent_buffer *eb, int slot, return tm; } -static noinline int -tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int slot, - enum mod_log_op op, gfp_t flags) +static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) { struct tree_mod_elem *tm; int ret; - if (!tree_mod_need_log(fs_info, eb)) + if (!tree_mod_need_log(eb->fs_info, eb)) return 0; tm = alloc_tree_mod_elem(eb, slot, op, flags); if (!tm) return -ENOMEM; - if (tree_mod_dont_log(fs_info, eb)) { + if (tree_mod_dont_log(eb->fs_info, eb)) { kfree(tm); return 0; } - ret = __tree_mod_log_insert(fs_info, tm); - tree_mod_log_write_unlock(fs_info); + ret = __tree_mod_log_insert(eb->fs_info, tm); + write_unlock(&eb->fs_info->tree_mod_log_lock); if (ret) kfree(tm); return ret; } -static noinline int -tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int dst_slot, int src_slot, - int nr_items) +static noinline int tree_mod_log_insert_move(struct extent_buffer *eb, + int dst_slot, int src_slot, int nr_items) { struct tree_mod_elem *tm = NULL; struct tree_mod_elem **tm_list = NULL; @@ -575,7 +547,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, int i; int locked = 0; - if (!tree_mod_need_log(fs_info, eb)) + if (!tree_mod_need_log(eb->fs_info, eb)) return 0; tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS); @@ -603,7 +575,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, } } - if (tree_mod_dont_log(fs_info, eb)) + if (tree_mod_dont_log(eb->fs_info, eb)) goto free_tms; locked = 1; @@ -613,26 +585,26 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, * buffer, i.e. dst_slot < src_slot. 
*/ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { - ret = __tree_mod_log_insert(fs_info, tm_list[i]); + ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]); if (ret) goto free_tms; } - ret = __tree_mod_log_insert(fs_info, tm); + ret = __tree_mod_log_insert(eb->fs_info, tm); if (ret) goto free_tms; - tree_mod_log_write_unlock(fs_info); + write_unlock(&eb->fs_info->tree_mod_log_lock); kfree(tm_list); return 0; free_tms: for (i = 0; i < nr_items; i++) { if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) - rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log); kfree(tm_list[i]); } if (locked) - tree_mod_log_write_unlock(fs_info); + write_unlock(&eb->fs_info->tree_mod_log_lock); kfree(tm_list); kfree(tm); @@ -660,12 +632,10 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, return 0; } -static noinline int -tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, - struct extent_buffer *old_root, - struct extent_buffer *new_root, - int log_removal) +static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root, + struct extent_buffer *new_root, int log_removal) { + struct btrfs_fs_info *fs_info = old_root->fs_info; struct tree_mod_elem *tm = NULL; struct tree_mod_elem **tm_list = NULL; int nritems = 0; @@ -713,7 +683,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (!ret) ret = __tree_mod_log_insert(fs_info, tm); - tree_mod_log_write_unlock(fs_info); + write_unlock(&fs_info->tree_mod_log_lock); if (ret) goto free_tms; kfree(tm_list); @@ -740,7 +710,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, struct tree_mod_elem *cur = NULL; struct tree_mod_elem *found = NULL; - tree_mod_log_read_lock(fs_info); + read_lock(&fs_info->tree_mod_log_lock); tm_root = &fs_info->tree_mod_log; node = tm_root->rb_node; while (node) { @@ -768,7 +738,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, break; } } - tree_mod_log_read_unlock(fs_info); + read_unlock(&fs_info->tree_mod_log_lock); return found; } @@ -849,7 +819,7 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, goto free_tms; } - tree_mod_log_write_unlock(fs_info); + write_unlock(&fs_info->tree_mod_log_lock); kfree(tm_list); return 0; @@ -861,36 +831,13 @@ free_tms: kfree(tm_list[i]); } if (locked) - tree_mod_log_write_unlock(fs_info); + write_unlock(&fs_info->tree_mod_log_lock); kfree(tm_list); return ret; } -static inline void -tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, - int dst_offset, int src_offset, int nr_items) -{ - int ret; - ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset, - nr_items); - BUG_ON(ret < 0); -} - -static noinline void -tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int slot, int atomic) -{ - int ret; - - ret = tree_mod_log_insert_key(fs_info, eb, slot, - MOD_LOG_KEY_REPLACE, - atomic ? 
GFP_ATOMIC : GFP_NOFS); - BUG_ON(ret < 0); -} - -static noinline int -tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) +static noinline int tree_mod_log_free_eb(struct extent_buffer *eb) { struct tree_mod_elem **tm_list = NULL; int nritems = 0; @@ -900,7 +847,7 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) if (btrfs_header_level(eb) == 0) return 0; - if (!tree_mod_need_log(fs_info, NULL)) + if (!tree_mod_need_log(eb->fs_info, NULL)) return 0; nritems = btrfs_header_nritems(eb); @@ -917,11 +864,11 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) } } - if (tree_mod_dont_log(fs_info, eb)) + if (tree_mod_dont_log(eb->fs_info, eb)) goto free_tms; - ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); - tree_mod_log_write_unlock(fs_info); + ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems); + write_unlock(&eb->fs_info->tree_mod_log_lock); if (ret) goto free_tms; kfree(tm_list); @@ -936,17 +883,6 @@ free_tms: return ret; } -static noinline void -tree_mod_log_set_root_pointer(struct btrfs_root *root, - struct extent_buffer *new_root_node, - int log_removal) -{ - int ret; - ret = tree_mod_log_insert_root(root->fs_info, root->node, - new_root_node, log_removal); - BUG_ON(ret < 0); -} - /* * check if the tree block can be shared by multiple trees */ @@ -1173,7 +1109,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, parent_start = buf->start; extent_buffer_get(cow); - tree_mod_log_set_root_pointer(root, cow, 1); + ret = tree_mod_log_insert_root(root->node, cow, 1); + BUG_ON(ret < 0); rcu_assign_pointer(root->node, cow); btrfs_free_tree_block(trans, root, buf, parent_start, @@ -1182,7 +1119,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); } else { WARN_ON(trans->transid != btrfs_header_generation(parent)); - tree_mod_log_insert_key(fs_info, parent, parent_slot, + tree_mod_log_insert_key(parent, parent_slot, MOD_LOG_KEY_REPLACE, GFP_NOFS); btrfs_set_node_blockptr(parent, parent_slot, cow->start); @@ -1190,7 +1127,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, trans->transid); btrfs_mark_buffer_dirty(parent); if (last_ref) { - ret = tree_mod_log_free_eb(fs_info, buf); + ret = tree_mod_log_free_eb(buf); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -1211,9 +1148,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, * returns the logical address of the oldest predecessor of the given root. * entries older than time_seq are ignored. */ -static struct tree_mod_elem * -__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb_root, u64 time_seq) +static struct tree_mod_elem *__tree_mod_log_oldest_root( + struct extent_buffer *eb_root, u64 time_seq) { struct tree_mod_elem *tm; struct tree_mod_elem *found = NULL; @@ -1230,7 +1166,7 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, * first operation that's logged for this root. 
*/ while (1) { - tm = tree_mod_log_search_oldest(fs_info, root_logical, + tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical, time_seq); if (!looped && !tm) return NULL; @@ -1279,7 +1215,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, unsigned long p_size = sizeof(struct btrfs_key_ptr); n = btrfs_header_nritems(eb); - tree_mod_log_read_lock(fs_info); + read_lock(&fs_info->tree_mod_log_lock); while (tm && tm->seq >= time_seq) { /* * all the operations are recorded with the operator used for @@ -1334,7 +1270,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, if (tm->logical != first_tm->logical) break; } - tree_mod_log_read_unlock(fs_info); + read_unlock(&fs_info->tree_mod_log_lock); btrfs_set_header_nritems(eb, n); } @@ -1418,9 +1354,10 @@ get_old_root(struct btrfs_root *root, u64 time_seq) struct tree_mod_root *old_root = NULL; u64 old_generation = 0; u64 logical; + int level; eb_root = btrfs_read_lock_root_node(root); - tm = __tree_mod_log_oldest_root(fs_info, eb_root, time_seq); + tm = __tree_mod_log_oldest_root(eb_root, time_seq); if (!tm) return eb_root; @@ -1428,15 +1365,17 @@ get_old_root(struct btrfs_root *root, u64 time_seq) old_root = &tm->old_root; old_generation = tm->generation; logical = old_root->logical; + level = old_root->level; } else { logical = eb_root->start; + level = btrfs_header_level(eb_root); } tm = tree_mod_log_search(fs_info, logical, time_seq); if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); - old = read_tree_block(fs_info, logical, 0); + old = read_tree_block(fs_info, logical, 0, level, NULL); if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { if (!IS_ERR(old)) free_extent_buffer(old); @@ -1484,7 +1423,7 @@ int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq) int level; struct extent_buffer *eb_root = btrfs_root_node(root); - tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq); + tm = __tree_mod_log_oldest_root(eb_root, time_seq); if (tm && tm->op == MOD_LOG_ROOT_REPLACE) { level = tm->old_root.level; } else { @@ -1502,8 +1441,8 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, if (btrfs_is_testing(root->fs_info)) return 0; - /* ensure we can see the force_cow */ - smp_rmb(); + /* Ensure we can see the FORCE_COW bit */ + smp_mb__before_atomic(); /* * We do not need to cow a block if @@ -1656,6 +1595,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, btrfs_set_lock_blocking(parent); for (i = start_slot; i <= end_slot; i++) { + struct btrfs_key first_key; int close = 1; btrfs_node_key(parent, &disk_key, i); @@ -1665,6 +1605,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, progress_passed = 1; blocknr = btrfs_node_blockptr(parent, i); gen = btrfs_node_ptr_generation(parent, i); + btrfs_node_key_to_cpu(parent, &first_key, i); if (last_block == 0) last_block = blocknr; @@ -1688,7 +1629,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, uptodate = 0; if (!cur || !uptodate) { if (!cur) { - cur = read_tree_block(fs_info, blocknr, gen); + cur = read_tree_block(fs_info, blocknr, gen, + parent_level - 1, + &first_key); if (IS_ERR(cur)) { return PTR_ERR(cur); } else if (!extent_buffer_uptodate(cur)) { @@ -1696,7 +1639,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, return -EIO; } } else if (!uptodate) { - err = btrfs_read_buffer(cur, gen, + parent_level - 1, &first_key); if
(err) { free_extent_buffer(cur); return err; @@ -1849,14 +1793,17 @@ read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent, { int level = btrfs_header_level(parent); struct extent_buffer *eb; + struct btrfs_key first_key; if (slot < 0 || slot >= btrfs_header_nritems(parent)) return ERR_PTR(-ENOENT); BUG_ON(level == 0); + btrfs_node_key_to_cpu(parent, &first_key, slot); eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot), - btrfs_node_ptr_generation(parent, slot)); + btrfs_node_ptr_generation(parent, slot), + level - 1, &first_key); if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) { free_extent_buffer(eb); eb = ERR_PTR(-EIO); @@ -1928,7 +1875,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } - tree_mod_log_set_root_pointer(root, child, 1); + ret = tree_mod_log_insert_root(root->node, child, 1); + BUG_ON(ret < 0); rcu_assign_pointer(root->node, child); add_root_to_dirty_list(root); @@ -2007,8 +1955,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } else { struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); - tree_mod_log_set_node_key(fs_info, parent, - pslot + 1, 0); + ret = tree_mod_log_insert_key(parent, pslot + 1, + MOD_LOG_KEY_REPLACE, GFP_NOFS); + BUG_ON(ret < 0); btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); } @@ -2052,7 +2001,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* update the parent key to reflect our changes */ struct btrfs_disk_key mid_key; btrfs_node_key(mid, &mid_key, 0); - tree_mod_log_set_node_key(fs_info, parent, pslot, 0); + ret = tree_mod_log_insert_key(parent, pslot, + MOD_LOG_KEY_REPLACE, GFP_NOFS); + BUG_ON(ret < 0); btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); } @@ -2153,7 +2104,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; orig_slot += left_nr; btrfs_node_key(mid, &disk_key, 0); - tree_mod_log_set_node_key(fs_info, parent, pslot, 0); + ret = tree_mod_log_insert_key(parent, pslot, + MOD_LOG_KEY_REPLACE, GFP_NOFS); + BUG_ON(ret < 0); btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); if (btrfs_header_nritems(left) > orig_slot) { @@ -2207,8 +2160,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; btrfs_node_key(right, &disk_key, 0); - tree_mod_log_set_node_key(fs_info, parent, - pslot + 1, 0); + ret = tree_mod_log_insert_key(parent, pslot + 1, + MOD_LOG_KEY_REPLACE, GFP_NOFS); + BUG_ON(ret < 0); btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -2445,10 +2399,14 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, u64 gen; struct extent_buffer *b = *eb_ret; struct extent_buffer *tmp; + struct btrfs_key first_key; int ret; + int parent_level; blocknr = btrfs_node_blockptr(b, slot); gen = btrfs_node_ptr_generation(b, slot); + parent_level = btrfs_header_level(b); + btrfs_node_key_to_cpu(b, &first_key, slot); tmp = find_extent_buffer(fs_info, blocknr); if (tmp) { @@ -2467,7 +2425,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, btrfs_set_path_blocking(p); /* now we're allowed to do a blocking uptodate check */ - ret = btrfs_read_buffer(tmp, gen); + ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key); if (!ret) { *eb_ret = tmp; return 0; @@ -2494,7 +2452,8 @@ read_block_for_search(struct btrfs_root *root, struct 
btrfs_path *p, btrfs_release_path(p); ret = -EAGAIN; - tmp = read_tree_block(fs_info, blocknr, 0); + tmp = read_tree_block(fs_info, blocknr, 0, parent_level - 1, + &first_key); if (!IS_ERR(tmp)) { /* * If the read above didn't mark this buffer up to date, @@ -3161,13 +3120,17 @@ static void fixup_low_keys(struct btrfs_fs_info *fs_info, { int i; struct extent_buffer *t; + int ret; for (i = level; i < BTRFS_MAX_LEVEL; i++) { int tslot = path->slots[i]; + if (!path->nodes[i]) break; t = path->nodes[i]; - tree_mod_log_set_node_key(fs_info, t, tslot, 1); + ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE, + GFP_ATOMIC); + BUG_ON(ret < 0); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); if (tslot != 0) @@ -3264,8 +3227,8 @@ static int push_node_left(struct btrfs_trans_handle *trans, if (push_items < src_nritems) { /* - * don't call tree_mod_log_eb_move here, key removal was already - * fully logged by tree_mod_log_eb_copy above. + * Don't call tree_mod_log_insert_move here, key removal was + * already fully logged by tree_mod_log_eb_copy above. */ memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(push_items), @@ -3320,7 +3283,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans, if (max_push < push_items) push_items = max_push; - tree_mod_log_eb_move(fs_info, dst, push_items, 0, dst_nritems); + ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); + BUG_ON(ret < 0); memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), btrfs_node_key_ptr_offset(0), (dst_nritems) * @@ -3363,6 +3327,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, struct extent_buffer *c; struct extent_buffer *old; struct btrfs_disk_key lower_key; + int ret; BUG_ON(path->nodes[level]); BUG_ON(path->nodes[level-1] != root->node); @@ -3401,7 +3366,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); old = root->node; - tree_mod_log_set_root_pointer(root, c, 0); + ret = tree_mod_log_insert_root(root->node, c, 0); + BUG_ON(ret < 0); rcu_assign_pointer(root->node, c); /* the super has an extra ref to root->node */ @@ -3438,17 +3404,19 @@ static void insert_ptr(struct btrfs_trans_handle *trans, BUG_ON(slot > nritems); BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info)); if (slot != nritems) { - if (level) - tree_mod_log_eb_move(fs_info, lower, slot + 1, - slot, nritems - slot); + if (level) { + ret = tree_mod_log_insert_move(lower, slot + 1, slot, + nritems - slot); + BUG_ON(ret < 0); + } memmove_extent_buffer(lower, btrfs_node_key_ptr_offset(slot + 1), btrfs_node_key_ptr_offset(slot), (nritems - slot) * sizeof(struct btrfs_key_ptr)); } if (level) { - ret = tree_mod_log_insert_key(fs_info, lower, slot, - MOD_LOG_KEY_ADD, GFP_NOFS); + ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD, + GFP_NOFS); BUG_ON(ret < 0); } btrfs_set_node_key(lower, key, slot); @@ -4911,17 +4879,19 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { - if (level) - tree_mod_log_eb_move(fs_info, parent, slot, - slot + 1, nritems - slot - 1); + if (level) { + ret = tree_mod_log_insert_move(parent, slot, slot + 1, + nritems - slot - 1); + BUG_ON(ret < 0); + } memmove_extent_buffer(parent, btrfs_node_key_ptr_offset(slot), btrfs_node_key_ptr_offset(slot + 1), sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); } else if (level) { - ret = tree_mod_log_insert_key(fs_info, parent, slot, - 
MOD_LOG_KEY_REMOVE, GFP_NOFS); + ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE, + GFP_NOFS); BUG_ON(ret < 0); } @@ -5145,9 +5115,6 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) * into min_key, so you can call btrfs_search_slot with cow=1 on the * key and get a writable path. * - * This does lock as it descends, and path->keep_locks should be set - * to 1 by the caller. - * * This honors path->lowest_level to prevent descent past a given level * of the tree. * diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1a462ab85c49..0eb55825862a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -40,6 +40,7 @@ #include <linux/sizes.h> #include <linux/dynamic_debug.h> #include <linux/refcount.h> +#include <linux/crc32c.h> #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" @@ -65,6 +66,8 @@ struct btrfs_ordered_sum; #define BTRFS_MAX_LEVEL 8 +#define BTRFS_OLDEST_GENERATION 0ULL + #define BTRFS_COMPAT_EXTENT_TREE_V0 /* @@ -86,9 +89,9 @@ struct btrfs_ordered_sum; */ #define BTRFS_LINK_MAX 65535U +/* four bytes for CRC32 */ static const int btrfs_csum_sizes[] = { 4 }; -/* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 /* ioprio of readahead is set to idle */ @@ -98,6 +101,7 @@ static const int btrfs_csum_sizes[] = { 4 }; #define BTRFS_MAX_EXTENT_SIZE SZ_128M + /* * Count how many BTRFS_MAX_EXTENT_SIZE cover the @size */ @@ -381,8 +385,9 @@ struct btrfs_dev_replace { /* For raid type sysfs entries */ struct raid_kobject { - int raid_type; + u64 flags; struct kobject kobj; + struct list_head list; }; struct btrfs_space_info { @@ -707,7 +712,6 @@ struct btrfs_delayed_root; #define BTRFS_FS_LOG_RECOVERING 4 #define BTRFS_FS_OPEN 5 #define BTRFS_FS_QUOTA_ENABLED 6 -#define BTRFS_FS_QUOTA_ENABLING 7 #define BTRFS_FS_UPDATE_UUID_TREE_GEN 9 #define BTRFS_FS_CREATING_FREE_SPACE_TREE 10 #define BTRFS_FS_BTREE_ERR 11 @@ -788,7 +792,7 @@ struct btrfs_fs_info { unsigned long pending_changes; unsigned long compress_type:4; unsigned int compress_level; - int commit_interval; + u32 commit_interval; /* * It is a suggestive number, the read side is safe even it gets a * wrong number because we will write out the data into a regular @@ -877,7 +881,6 @@ struct btrfs_fs_info { struct rb_root tree_mod_log; atomic_t async_delalloc_pages; - atomic_t open_ioctl_trans; /* * this is used to protect the following list -- ordered_roots. @@ -935,9 +938,11 @@ struct btrfs_fs_info { struct btrfs_workqueue *extent_workers; struct task_struct *transaction_kthread; struct task_struct *cleaner_kthread; - int thread_pool_size; + u32 thread_pool_size; struct kobject *space_info_kobj; + struct list_head pending_raid_kobjs; + spinlock_t pending_raid_kobjs_lock; /* uncontended */ u64 total_pinned; @@ -952,9 +957,9 @@ struct btrfs_fs_info { struct btrfs_fs_devices *fs_devices; /* - * the space_info list is almost entirely read only. It only changes - * when we add a new raid type to the FS, and that happens - * very rarely. RCU is used to protect it. + * The space_info list is effectively read only after initial + * setup. It is populated at mount time and cleaned up after + * all block groups are removed. RCU is used to protect it. 
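*/

The rewritten comment describes a list that is populated once at mount and
torn down at unmount, so readers only need RCU protection. A minimal sketch
of the read-side pattern this implies, assuming the member names used
elsewhere in btrfs (fs_info->space_info as the head, ->list as the node):

        struct btrfs_space_info *space_info;

        rcu_read_lock();
        list_for_each_entry_rcu(space_info, &fs_info->space_info, list) {
                /* read-only inspection; must not block in the RCU section */
                if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA)
                        break;
        }
        rcu_read_unlock();

/*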
*/ struct list_head space_info; @@ -993,8 +998,8 @@ struct btrfs_fs_info { struct btrfs_balance_control *balance_ctl; wait_queue_head_t balance_wait_q; - unsigned data_chunk_allocations; - unsigned metadata_ratio; + u32 data_chunk_allocations; + u32 metadata_ratio; void *bdev_holder; @@ -1260,12 +1265,13 @@ struct btrfs_root { struct btrfs_subvolume_writers *subv_writers; atomic_t will_be_snapshotted; - /* For qgroup metadata space reserve */ - atomic64_t qgroup_meta_rsv; + /* For qgroup metadata reserved space */ + spinlock_t qgroup_meta_rsv_lock; + u64 qgroup_meta_rsv_pertrans; + u64 qgroup_meta_rsv_prealloc; }; struct btrfs_file_private { - struct btrfs_trans_handle *trans; void *filldir_buf; }; @@ -2554,6 +2560,20 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ btrfs_item_offset_nr(leaf, slot))) +static inline u64 btrfs_name_hash(const char *name, int len) +{ + return crc32c((u32)~1, name, len); +} + +/* + * Figure the key offset of an extended inode ref + */ +static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, + int len) +{ + return (u64) crc32c(parent_objectid, name, len); +} + static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) { return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && @@ -2608,7 +2628,7 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, unsigned long count); + unsigned long count); int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info, unsigned long count, u64 transid, int wait); int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); @@ -2628,7 +2648,6 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group( u64 bytenr); void btrfs_get_block_group(struct btrfs_block_group_cache *cache); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); -int get_block_group_index(struct btrfs_block_group_cache *cache); struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, @@ -2668,15 +2687,13 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info); -int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset); -int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, @@ -2688,6 +2705,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr); int btrfs_make_block_group(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytes_used, u64 type, u64 chunk_offset, u64 size); +void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info); struct btrfs_trans_handle 
*btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset); @@ -2697,8 +2715,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache); -void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans); u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info); u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info); u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info); @@ -2730,11 +2747,10 @@ int btrfs_check_data_free_space(struct inode *inode, void btrfs_free_reserved_data_space(struct inode *inode, struct extent_changeset *reserved, u64 start, u64 len); void btrfs_delalloc_release_space(struct inode *inode, - struct extent_changeset *reserved, u64 start, u64 len); + struct extent_changeset *reserved, + u64 start, u64 len, bool qgroup_free); void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, u64 len); -void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); @@ -2745,10 +2761,12 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, u64 *qgroup_reserved, bool use_global_rsv); void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); -void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, + bool qgroup_free); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); -void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes); +void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, + bool qgroup_free); int btrfs_delalloc_reserve_space(struct inode *inode, struct extent_changeset **reserved, u64 start, u64 len); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); @@ -2792,7 +2810,6 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -int __get_raid_index(u64 flags); int btrfs_start_write_no_snapshotting(struct btrfs_root *root); void btrfs_end_write_no_snapshotting(struct btrfs_root *root); void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); @@ -2974,7 +2991,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); security_free_mnt_opts(&fs_info->security_opts); - kfree(fs_info); + kvfree(fs_info); } /* tree mod log functions from ctree.c */ @@ -3095,7 +3112,10 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow); -int btrfs_find_name_in_ext_backref(struct btrfs_path *path, +int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, + const char *name, + int name_len, struct btrfs_inode_ref **ref_ret); +int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot, u64 
ref_objectid, const char *name, int name_len, struct btrfs_inode_extref **extref_ret); @@ -3192,8 +3212,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); int __init btrfs_init_cachep(void); -void btrfs_destroy_cachep(void); -long btrfs_ioctl_trans_end(struct file *file); +void __cold btrfs_destroy_cachep(void); struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, struct btrfs_root *root, int *was_new); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, @@ -3243,7 +3262,7 @@ ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, /* file.c */ int __init btrfs_auto_defrag_init(void); -void btrfs_auto_defrag_exit(void); +void __cold btrfs_auto_defrag_exit(void); int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); @@ -3278,25 +3297,23 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, /* sysfs.c */ int __init btrfs_init_sysfs(void); -void btrfs_exit_sysfs(void); +void __cold btrfs_exit_sysfs(void); int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info); void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info); -/* xattr.c */ -ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); - /* super.c */ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, unsigned long new_flags); int btrfs_sync_fs(struct super_block *sb, int wait); -static inline __printf(2, 3) +static inline __printf(2, 3) __cold void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { } #ifdef CONFIG_PRINTK __printf(2, 3) +__cold void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); #else #define btrfs_printk(fs_info, fmt, args...) 
\ diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0530f6f2e4ba..86ec2edc05e8 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -23,6 +23,7 @@ #include "disk-io.h" #include "transaction.h" #include "ctree.h" +#include "qgroup.h" #define BTRFS_DELAYED_WRITEBACK 512 #define BTRFS_DELAYED_BACKGROUND 128 @@ -42,7 +43,7 @@ int __init btrfs_delayed_inode_init(void) return 0; } -void btrfs_delayed_inode_exit(void) +void __cold btrfs_delayed_inode_exit(void) { kmem_cache_destroy(delayed_node_cache); } @@ -552,11 +553,12 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item( } static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, + struct btrfs_root *root, struct btrfs_delayed_item *item) { struct btrfs_block_rsv *src_rsv; struct btrfs_block_rsv *dst_rsv; + struct btrfs_fs_info *fs_info = root->fs_info; u64 num_bytes; int ret; @@ -578,15 +580,17 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, return ret; } -static void btrfs_delayed_item_release_metadata(struct btrfs_fs_info *fs_info, +static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, struct btrfs_delayed_item *item) { struct btrfs_block_rsv *rsv; + struct btrfs_fs_info *fs_info = root->fs_info; if (!item->bytes_reserved) return; rsv = &fs_info->delayed_block_rsv; + btrfs_qgroup_convert_reserved_meta(root, item->bytes_reserved); trace_btrfs_space_reservation(fs_info, "delayed_item", item->key.objectid, item->bytes_reserved, 0); @@ -611,6 +615,9 @@ static int btrfs_delayed_inode_reserve_metadata( num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); + ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); + if (ret < 0) + return ret; /* * btrfs_dirty_inode will update the inode under btrfs_join_transaction * which doesn't reserve space for speed. This is a problem since we @@ -630,8 +637,10 @@ static int btrfs_delayed_inode_reserve_metadata( * EAGAIN to make us stop the transaction we have, so return * ENOSPC instead so that btrfs_dirty_inode knows what to do. 
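*/

The new qgroup hunk above reserves prealloc metadata space before the
block-rsv migration is attempted, so the failure path must hand it back.
A condensed sketch of that ordering (btrfs_block_rsv_add() stands in for
the migration step, which in the real function also remaps -EAGAIN):

        ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
        if (ret < 0)
                return ret;

        ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
                                  BTRFS_RESERVE_NO_FLUSH);
        if (ret) {
                /* nothing was committed yet: undo the qgroup reservation */
                btrfs_qgroup_free_meta_prealloc(root, num_bytes);
                return ret;
        }

/*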
*/ - if (ret == -EAGAIN) + if (ret == -EAGAIN) { ret = -ENOSPC; + btrfs_qgroup_free_meta_prealloc(root, num_bytes); + } if (!ret) { node->bytes_reserved = num_bytes; trace_btrfs_space_reservation(fs_info, @@ -653,7 +662,8 @@ static int btrfs_delayed_inode_reserve_metadata( } static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_node *node) + struct btrfs_delayed_node *node, + bool qgroup_free) { struct btrfs_block_rsv *rsv; @@ -665,6 +675,12 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info, node->inode_id, node->bytes_reserved, 0); btrfs_block_rsv_release(fs_info, rsv, node->bytes_reserved); + if (qgroup_free) + btrfs_qgroup_free_meta_prealloc(node->root, + node->bytes_reserved); + else + btrfs_qgroup_convert_reserved_meta(node->root, + node->bytes_reserved); node->bytes_reserved = 0; } @@ -766,7 +782,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, curr->data_len); slot++; - btrfs_delayed_item_release_metadata(fs_info, curr); + btrfs_delayed_item_release_metadata(root, curr); list_del(&curr->tree_list); btrfs_release_delayed_item(curr); @@ -788,7 +804,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_item *delayed_item) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; char *ptr; int ret; @@ -806,7 +821,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, delayed_item->data_len); btrfs_mark_buffer_dirty(leaf); - btrfs_delayed_item_release_metadata(fs_info, delayed_item); + btrfs_delayed_item_release_metadata(root, delayed_item); return 0; } @@ -858,7 +873,6 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_item *item) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_delayed_item *curr, *next; struct extent_buffer *leaf; struct btrfs_key key; @@ -908,7 +922,7 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, goto out; list_for_each_entry_safe(curr, next, &head, tree_list) { - btrfs_delayed_item_release_metadata(fs_info, curr); + btrfs_delayed_item_release_metadata(root, curr); list_del(&curr->tree_list); btrfs_release_delayed_item(curr); } @@ -1051,7 +1065,7 @@ out: no_iref: btrfs_release_path(path); err_out: - btrfs_delayed_inode_release_metadata(fs_info, node); + btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0)); btrfs_release_delayed_inode(node); return ret; @@ -1115,9 +1129,9 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, * Returns < 0 on error and returns with an aborted transaction with any * outstanding delayed items cleaned up. 
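*/

The qgroup_free flag threaded through the release helpers above picks one
of two fates for metadata space that was preallocated earlier: on failure
it is returned to the qgroup entirely, on success it is reclassified as a
per-transaction reservation. Condensed from the hunk:

        if (qgroup_free)
                /* error path: give the preallocated bytes back */
                btrfs_qgroup_free_meta_prealloc(node->root,
                                                node->bytes_reserved);
        else
                /* success path: prealloc becomes a pertrans reservation */
                btrfs_qgroup_convert_reserved_meta(node->root,
                                                   node->bytes_reserved);

/*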
*/ -static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, int nr) +static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_root *delayed_root; struct btrfs_delayed_node *curr_node, *prev_node; struct btrfs_path *path; @@ -1162,16 +1176,14 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, return ret; } -int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_run_delayed_items(struct btrfs_trans_handle *trans) { - return __btrfs_run_delayed_items(trans, fs_info, -1); + return __btrfs_run_delayed_items(trans, -1); } -int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, int nr) +int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr) { - return __btrfs_run_delayed_items(trans, fs_info, nr); + return __btrfs_run_delayed_items(trans, nr); } int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, @@ -1443,7 +1455,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, btrfs_set_stack_dir_type(dir_item, type); memcpy((char *)(dir_item + 1), name, name_len); - ret = btrfs_delayed_item_reserve_metadata(trans, fs_info, delayed_item); + ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, delayed_item); /* * we have reserved enough space when we start a new transaction, * so reserving metadata failure is impossible @@ -1480,7 +1492,7 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info, return 1; } - btrfs_delayed_item_release_metadata(fs_info, item); + btrfs_delayed_item_release_metadata(node->root, item); btrfs_release_delayed_item(item); mutex_unlock(&node->mutex); return 0; @@ -1515,7 +1527,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, item->key = item_key; - ret = btrfs_delayed_item_reserve_metadata(trans, fs_info, item); + ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, item); /* * we have reserved enough space when we start a new transaction, * so reserving metadata failure is impossible. 
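*/

A recurring cleanup in this series: callees that already receive a
transaction handle derive fs_info from it instead of taking a second,
redundant parameter. The shape of the change in miniature (foo is a
placeholder name):

        static int foo(struct btrfs_trans_handle *trans, int nr)
        {
                struct btrfs_fs_info *fs_info = trans->fs_info;

                /* body unchanged, still uses fs_info */
                return 0;
        }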
@@ -1880,7 +1892,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) mutex_lock(&delayed_node->mutex); curr_item = __btrfs_first_delayed_insertion_item(delayed_node); while (curr_item) { - btrfs_delayed_item_release_metadata(fs_info, curr_item); + btrfs_delayed_item_release_metadata(root, curr_item); prev_item = curr_item; curr_item = __btrfs_next_delayed_item(prev_item); btrfs_release_delayed_item(prev_item); @@ -1888,7 +1900,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) curr_item = __btrfs_first_delayed_deletion_item(delayed_node); while (curr_item) { - btrfs_delayed_item_release_metadata(fs_info, curr_item); + btrfs_delayed_item_release_metadata(root, curr_item); prev_item = curr_item; curr_item = __btrfs_next_delayed_item(prev_item); btrfs_release_delayed_item(prev_item); @@ -1898,7 +1910,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) btrfs_release_delayed_iref(delayed_node); if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { - btrfs_delayed_inode_release_metadata(fs_info, delayed_node); + btrfs_delayed_inode_release_metadata(fs_info, delayed_node, false); btrfs_release_delayed_inode(delayed_node); } mutex_unlock(&delayed_node->mutex); diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index c4189d495934..100a91e26b55 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -111,10 +111,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode); -int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, int nr); +int btrfs_run_delayed_items(struct btrfs_trans_handle *trans); +int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr); void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info); @@ -151,7 +149,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, /* for init */ int __init btrfs_delayed_inode_init(void); -void btrfs_delayed_inode_exit(void); +void __cold btrfs_delayed_inode_exit(void); /* for debugging */ void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 7ab5e0128f0c..2677257c149d 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -216,7 +216,7 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs; delayed_refs = &trans->transaction->delayed_refs; - assert_spin_locked(&delayed_refs->lock); + lockdep_assert_held(&delayed_refs->lock); if (mutex_trylock(&head->mutex)) return 0; @@ -239,7 +239,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { - assert_spin_locked(&head->lock); + lockdep_assert_held(&head->lock); rb_erase(&ref->ref_node, &head->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) @@ -307,7 +307,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, struct rb_node *node; u64 seq = 0; - assert_spin_locked(&head->lock); + lockdep_assert_held(&head->lock); if (RB_EMPTY_ROOT(&head->ref_tree)) return; @@ -930,7 +930,7 @@ btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 byt return find_ref_head(&delayed_refs->href_root, bytenr, 0); } -void 
btrfs_delayed_ref_exit(void) +void __cold btrfs_delayed_ref_exit(void) { kmem_cache_destroy(btrfs_delayed_ref_head_cachep); kmem_cache_destroy(btrfs_delayed_tree_ref_cachep); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c4f625e5a691..9e3e5aff0937 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -204,7 +204,7 @@ extern struct kmem_cache *btrfs_delayed_data_ref_cachep; extern struct kmem_cache *btrfs_delayed_extent_op_cachep; int __init btrfs_delayed_ref_init(void); -void btrfs_delayed_ref_exit(void); +void __cold btrfs_delayed_ref_exit(void); static inline struct btrfs_delayed_extent_op * btrfs_alloc_delayed_extent_op(void) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 7efbc4d1128b..0d203633bb96 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -44,7 +44,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev, struct btrfs_device *tgtdev); -static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); static int btrfs_dev_replace_kthread(void *data); static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info); @@ -174,8 +173,14 @@ no_valid_dev_replace_entry_found: } set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev_replace->tgtdev->dev_state); - btrfs_init_dev_replace_tgtdev_for_resume(fs_info, - dev_replace->tgtdev); + + WARN_ON(fs_info->fs_devices->rw_devices == 0); + dev_replace->tgtdev->io_width = fs_info->sectorsize; + dev_replace->tgtdev->io_align = fs_info->sectorsize; + dev_replace->tgtdev->sector_size = fs_info->sectorsize; + dev_replace->tgtdev->fs_info = fs_info; + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &dev_replace->tgtdev->dev_state); } break; } @@ -200,13 +205,13 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, struct btrfs_dev_replace_item *ptr; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - btrfs_dev_replace_lock(dev_replace, 0); + btrfs_dev_replace_read_lock(dev_replace); if (!dev_replace->is_valid || !dev_replace->item_needs_writeback) { - btrfs_dev_replace_unlock(dev_replace, 0); + btrfs_dev_replace_read_unlock(dev_replace); return 0; } - btrfs_dev_replace_unlock(dev_replace, 0); + btrfs_dev_replace_read_unlock(dev_replace); key.objectid = 0; key.type = BTRFS_DEV_REPLACE_KEY; @@ -264,7 +269,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_replace_item); - btrfs_dev_replace_lock(dev_replace, 1); + btrfs_dev_replace_write_lock(dev_replace); if (dev_replace->srcdev) btrfs_set_dev_replace_src_devid(eb, ptr, dev_replace->srcdev->devid); @@ -287,7 +292,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, btrfs_set_dev_replace_cursor_right(eb, ptr, dev_replace->cursor_right); dev_replace->item_needs_writeback = 0; - btrfs_dev_replace_unlock(dev_replace, 1); + btrfs_dev_replace_write_unlock(dev_replace); btrfs_mark_buffer_dirty(eb); @@ -307,7 +312,7 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) static char* btrfs_dev_name(struct btrfs_device *device) { - if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) + if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) return "<missing disk>"; else return rcu_str_deref(device->name); @@ -352,7 +357,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return PTR_ERR(trans); } - btrfs_dev_replace_lock(dev_replace, 1); + btrfs_dev_replace_write_lock(dev_replace); switch (dev_replace->replace_state) { 
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -390,7 +395,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, dev_replace->item_needs_writeback = 1; atomic64_set(&dev_replace->num_write_errors, 0); atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); - btrfs_dev_replace_unlock(dev_replace, 1); + btrfs_dev_replace_write_unlock(dev_replace); ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); if (ret) @@ -402,7 +407,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_dev_replace_lock(dev_replace, 1); + btrfs_dev_replace_write_lock(dev_replace); goto leave; } @@ -426,7 +431,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, leave: dev_replace->srcdev = NULL; dev_replace->tgtdev = NULL; - btrfs_dev_replace_unlock(dev_replace, 1); + btrfs_dev_replace_write_unlock(dev_replace); btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); return ret; } @@ -493,18 +498,18 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, /* don't allow cancel or unmount to disturb the finishing procedure */ mutex_lock(&dev_replace->lock_finishing_cancel_unmount); - btrfs_dev_replace_lock(dev_replace, 0); + btrfs_dev_replace_read_lock(dev_replace); /* was the operation canceled, or is it finished? */ if (dev_replace->replace_state != BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { - btrfs_dev_replace_unlock(dev_replace, 0); + btrfs_dev_replace_read_unlock(dev_replace); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return 0; } tgt_device = dev_replace->tgtdev; src_device = dev_replace->srcdev; - btrfs_dev_replace_unlock(dev_replace, 0); + btrfs_dev_replace_read_unlock(dev_replace); /* * flush all outstanding I/O and inode extent mappings before the @@ -529,7 +534,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, /* keep away write_all_supers() during the finishing procedure */ mutex_lock(&fs_info->fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); - btrfs_dev_replace_lock(dev_replace, 1); + btrfs_dev_replace_write_lock(dev_replace); dev_replace->replace_state = scrub_ret ? 
BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; @@ -549,7 +554,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_dev_name(src_device), src_device->devid, rcu_str_deref(tgt_device->name), scrub_ret); - btrfs_dev_replace_unlock(dev_replace, 1); + btrfs_dev_replace_write_unlock(dev_replace); mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); mutex_unlock(&uuid_mutex); @@ -586,7 +591,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); fs_info->fs_devices->rw_devices++; - btrfs_dev_replace_unlock(dev_replace, 1); + btrfs_dev_replace_write_unlock(dev_replace); btrfs_rm_dev_replace_blocked(fs_info); @@ -679,7 +684,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - btrfs_dev_replace_lock(dev_replace, 0); + btrfs_dev_replace_read_lock(dev_replace); /* even if !dev_replace_is_valid, the values are good enough for * the replace_status ioctl */ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; @@ -691,41 +696,36 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, args->status.num_uncorrectable_read_errors = atomic64_read(&dev_replace->num_uncorrectable_read_errors); args->status.progress_1000 = btrfs_dev_replace_progress(fs_info); - btrfs_dev_replace_unlock(dev_replace, 0); + btrfs_dev_replace_read_unlock(dev_replace); } -int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, - struct btrfs_ioctl_dev_replace_args *args) -{ - args->result = __btrfs_dev_replace_cancel(fs_info); - return 0; -} - -static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) +int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; struct btrfs_device *tgt_device = NULL; + struct btrfs_device *src_device = NULL; struct btrfs_trans_handle *trans; struct btrfs_root *root = fs_info->tree_root; - u64 result; + int result; int ret; if (sb_rdonly(fs_info->sb)) return -EROFS; mutex_lock(&dev_replace->lock_finishing_cancel_unmount); - btrfs_dev_replace_lock(dev_replace, 1); + btrfs_dev_replace_write_lock(dev_replace); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; - btrfs_dev_replace_unlock(dev_replace, 1); + btrfs_dev_replace_write_unlock(dev_replace); goto leave; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; tgt_device = dev_replace->tgtdev; + src_device = dev_replace->srcdev; dev_replace->tgtdev = NULL; dev_replace->srcdev = NULL; break; @@ -733,7 +733,7 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; dev_replace->time_stopped = get_seconds(); dev_replace->item_needs_writeback = 1; - btrfs_dev_replace_unlock(dev_replace, 1); + btrfs_dev_replace_write_unlock(dev_replace); btrfs_scrub_cancel(fs_info); trans = btrfs_start_transaction(root, 0); @@ -743,6 +743,12 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) } ret = btrfs_commit_transaction(trans); WARN_ON(ret); + + btrfs_info_in_rcu(fs_info, + "dev_replace from %s (devid %llu) to %s canceled", + 
btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + if (tgt_device) btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); @@ -756,7 +762,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; mutex_lock(&dev_replace->lock_finishing_cancel_unmount); - btrfs_dev_replace_lock(dev_replace, 1); + btrfs_dev_replace_write_lock(dev_replace); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -772,7 +778,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) break; } - btrfs_dev_replace_unlock(dev_replace, 1); + btrfs_dev_replace_write_unlock(dev_replace); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); } @@ -782,12 +788,12 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) struct task_struct *task; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - btrfs_dev_replace_lock(dev_replace, 1); + btrfs_dev_replace_write_lock(dev_replace); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: - btrfs_dev_replace_unlock(dev_replace, 1); + btrfs_dev_replace_write_unlock(dev_replace); return 0; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: break; @@ -801,10 +807,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) "cannot continue dev_replace, tgtdev is missing"); btrfs_info(fs_info, "you may cancel the operation after 'mount -o degraded'"); - btrfs_dev_replace_unlock(dev_replace, 1); + btrfs_dev_replace_write_unlock(dev_replace); return 0; } - btrfs_dev_replace_unlock(dev_replace, 1); + btrfs_dev_replace_write_unlock(dev_replace); WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); @@ -873,37 +879,37 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) return 1; } -void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw) +void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace) { - if (rw == 1) { - /* write */ -again: - wait_event(dev_replace->read_lock_wq, - atomic_read(&dev_replace->blocking_readers) == 0); - write_lock(&dev_replace->lock); - if (atomic_read(&dev_replace->blocking_readers)) { - write_unlock(&dev_replace->lock); - goto again; - } - } else { - read_lock(&dev_replace->lock); - atomic_inc(&dev_replace->read_locks); - } + read_lock(&dev_replace->lock); + atomic_inc(&dev_replace->read_locks); +} + +void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace) +{ + ASSERT(atomic_read(&dev_replace->read_locks) > 0); + atomic_dec(&dev_replace->read_locks); + read_unlock(&dev_replace->lock); } -void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw) +void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace) { - if (rw == 1) { - /* write */ - ASSERT(atomic_read(&dev_replace->blocking_readers) == 0); +again: + wait_event(dev_replace->read_lock_wq, + atomic_read(&dev_replace->blocking_readers) == 0); + write_lock(&dev_replace->lock); + if (atomic_read(&dev_replace->blocking_readers)) { write_unlock(&dev_replace->lock); - } else { - ASSERT(atomic_read(&dev_replace->read_locks) > 0); - atomic_dec(&dev_replace->read_locks); - read_unlock(&dev_replace->lock); + goto again; } } +void btrfs_dev_replace_write_unlock(struct 
btrfs_dev_replace *dev_replace) +{ + ASSERT(atomic_read(&dev_replace->blocking_readers) == 0); + write_unlock(&dev_replace->lock); +} + /* inc blocking cnt and release read lock */ void btrfs_dev_replace_set_lock_blocking( struct btrfs_dev_replace *dev_replace) diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index f94a76844ae7..8566a02ef222 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -32,13 +32,14 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, int read_src); void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); -int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, - struct btrfs_ioctl_dev_replace_args *args); +int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw); -void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw); +void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace); +void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace); +void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace); +void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace); void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace); void btrfs_dev_replace_clear_lock_blocking( struct btrfs_dev_replace *dev_replace); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index cbe421605cd5..29e967b2c667 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -18,7 +18,6 @@ #include "ctree.h" #include "disk-io.h" -#include "hash.h" #include "transaction.h" /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 21f34ad0d411..07b5e6f7df67 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -31,10 +31,10 @@ #include <linux/uuid.h> #include <linux/semaphore.h> #include <linux/error-injection.h> +#include <linux/crc32c.h> #include <asm/unaligned.h> #include "ctree.h" #include "disk-io.h" -#include "hash.h" #include "transaction.h" #include "btrfs_inode.h" #include "volumes.h" @@ -110,7 +110,7 @@ int __init btrfs_end_io_wq_init(void) return 0; } -void btrfs_end_io_wq_exit(void) +void __cold btrfs_end_io_wq_exit(void) { kmem_cache_destroy(btrfs_end_io_wq_cache); } @@ -124,8 +124,8 @@ struct async_submit_bio { void *private_data; struct btrfs_fs_info *fs_info; struct bio *bio; - extent_submit_bio_hook_t *submit_bio_start; - extent_submit_bio_hook_t *submit_bio_done; + extent_submit_bio_start_t *submit_bio_start; + extent_submit_bio_done_t *submit_bio_done; int mirror_num; unsigned long bio_flags; /* @@ -270,7 +270,7 @@ out: u32 btrfs_csum_data(const char *data, u32 seed, size_t len) { - return btrfs_crc32c(seed, data, len); + return crc32c(seed, data, len); } void btrfs_csum_final(u32 crc, u8 *result) @@ -403,8 +403,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, if (csum_type == BTRFS_CSUM_TYPE_CRC32) { u32 crc = ~(u32)0; - const int csum_size = sizeof(crc); - char result[csum_size]; + char result[sizeof(crc)]; /* * The super_block structure does not span the whole @@ -415,7 +414,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); btrfs_csum_final(crc, result); - if (memcmp(raw_disk_sb, result, csum_size)) + 
if (memcmp(raw_disk_sb, result, sizeof(result))) ret = 1; } @@ -428,13 +427,59 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, return ret; } +static int verify_level_key(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int level, + struct btrfs_key *first_key) +{ + int found_level; + struct btrfs_key found_key; + int ret; + + found_level = btrfs_header_level(eb); + if (found_level != level) { +#ifdef CONFIG_BTRFS_DEBUG + WARN_ON(1); + btrfs_err(fs_info, +"tree level mismatch detected, bytenr=%llu level expected=%u has=%u", + eb->start, level, found_level); +#endif + return -EIO; + } + + if (!first_key) + return 0; + + if (found_level) + btrfs_node_key_to_cpu(eb, &found_key, 0); + else + btrfs_item_key_to_cpu(eb, &found_key, 0); + ret = btrfs_comp_cpu_keys(first_key, &found_key); + +#ifdef CONFIG_BTRFS_DEBUG + if (ret) { + WARN_ON(1); + btrfs_err(fs_info, +"tree first key mismatch detected, bytenr=%llu key expected=(%llu, %u, %llu) has=(%llu, %u, %llu)", + eb->start, first_key->objectid, first_key->type, + first_key->offset, found_key.objectid, + found_key.type, found_key.offset); + } +#endif + return ret; +} + /* * helper to read a given tree block, doing retries as required when * the checksums don't match and we have alternate mirrors to try. + * + * @parent_transid: expected transid, skip check if 0 + * @level: expected level, mandatory check + * @first_key: expected key of first slot, skip check if NULL */ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, - u64 parent_transid) + u64 parent_transid, int level, + struct btrfs_key *first_key) { struct extent_io_tree *io_tree; int failed = 0; @@ -449,11 +494,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE, mirror_num); if (!ret) { - if (!verify_parent_transid(io_tree, eb, + if (verify_parent_transid(io_tree, eb, parent_transid, 0)) - break; - else ret = -EIO; + else if (verify_level_key(fs_info, eb, level, + first_key)) + ret = -EUCLEAN; + else + break; } /* @@ -461,7 +509,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, * there is no reason to read the other copies, they won't be * any less wrong. */ - if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) + if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags) || + ret == -EUCLEAN) break; num_copies = btrfs_num_copies(fs_info, @@ -602,12 +651,12 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, * that we don't try and read the other copies of this block, just * return -EIO. 
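*/

With read_tree_block() now taking the expected level and first key, a
caller descending from a parent node can pass both straight from the
parent's pointer slot, so a corrupted or misplaced child is rejected at
read time instead of when it is later dereferenced. This is the same
pattern read_node_slot() uses earlier in this diff:

        struct btrfs_key first_key;
        struct extent_buffer *eb;

        btrfs_node_key_to_cpu(parent, &first_key, slot);
        eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
                             btrfs_node_ptr_generation(parent, slot),
                             btrfs_header_level(parent) - 1, &first_key);

/*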
*/ - if (found_level == 0 && btrfs_check_leaf_full(root, eb)) { + if (found_level == 0 && btrfs_check_leaf_full(fs_info, eb)) { set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = -EIO; } - if (found_level > 0 && btrfs_check_node(root, eb)) + if (found_level > 0 && btrfs_check_node(fs_info, eb)) ret = -EIO; if (!ret) @@ -710,14 +759,6 @@ blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, return 0; } -unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) -{ - unsigned long limit = min_t(unsigned long, - info->thread_pool_size, - info->fs_devices->open_devices); - return 256 * limit; -} - static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async; @@ -725,7 +766,6 @@ static void run_one_async_start(struct btrfs_work *work) async = container_of(work, struct async_submit_bio, work); ret = async->submit_bio_start(async->private_data, async->bio, - async->mirror_num, async->bio_flags, async->bio_offset); if (ret) async->status = ret; @@ -744,8 +784,7 @@ static void run_one_async_done(struct btrfs_work *work) return; } - async->submit_bio_done(async->private_data, async->bio, async->mirror_num, - async->bio_flags, async->bio_offset); + async->submit_bio_done(async->private_data, async->bio, async->mirror_num); } static void run_one_async_free(struct btrfs_work *work) @@ -759,8 +798,8 @@ static void run_one_async_free(struct btrfs_work *work) blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset, void *private_data, - extent_submit_bio_hook_t *submit_bio_start, - extent_submit_bio_hook_t *submit_bio_done) + extent_submit_bio_start_t *submit_bio_start, + extent_submit_bio_done_t *submit_bio_done) { struct async_submit_bio *async; @@ -807,8 +846,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio) return errno_to_blk_status(ret); } -static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio, - int mirror_num, unsigned long bio_flags, +static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio, u64 bio_offset) { /* @@ -818,9 +856,8 @@ static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio return btree_csum_one_bio(bio); } -static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t btree_submit_bio_done(void *private_data, struct bio *bio, + int mirror_num) { struct inode *inode = private_data; blk_status_t ret; @@ -879,8 +916,8 @@ static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio, */ ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0, bio_offset, private_data, - __btree_submit_bio_start, - __btree_submit_bio_done); + btree_submit_bio_start, + btree_submit_bio_done); } if (ret) @@ -1062,8 +1099,17 @@ void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) buf->start, buf->start + buf->len - 1); } +/* + * Read tree block at logical address @bytenr and do variant basic but critical + * verification. 
+ * + * @parent_transid: expected transid of this tree block, skip check if 0 + * @level: expected level, mandatory check + * @first_key: expected key in slot 0, skip check if NULL + */ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 parent_transid) + u64 parent_transid, int level, + struct btrfs_key *first_key) { struct extent_buffer *buf = NULL; int ret; @@ -1072,7 +1118,8 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, if (IS_ERR(buf)) return buf; - ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid); + ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid, + level, first_key); if (ret) { free_extent_buffer(buf); return ERR_PTR(ret); @@ -1108,7 +1155,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) if (!writers) return ERR_PTR(-ENOMEM); - ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL); + ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS); if (ret < 0) { kfree(writers); return ERR_PTR(ret); @@ -1160,6 +1207,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, spin_lock_init(&root->accounting_lock); spin_lock_init(&root->log_extents_lock[0]); spin_lock_init(&root->log_extents_lock[1]); + spin_lock_init(&root->qgroup_meta_rsv_lock); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); mutex_init(&root->ordered_extent_mutex); @@ -1176,7 +1224,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, atomic_set(&root->orphan_inodes, 0); refcount_set(&root->refs, 1); atomic_set(&root->will_be_snapshotted, 0); - atomic64_set(&root->qgroup_meta_rsv, 0); root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; @@ -1401,6 +1448,7 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, struct btrfs_path *path; u64 generation; int ret; + int level; path = btrfs_alloc_path(); if (!path) @@ -1423,9 +1471,10 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, } generation = btrfs_root_generation(&root->root_item); + level = btrfs_root_level(&root->root_item); root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item), - generation); + generation, level, NULL); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); goto find_fail; @@ -1808,12 +1857,10 @@ sleep: if (unlikely(test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))) btrfs_cleanup_transaction(fs_info); - set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop() && (!btrfs_transaction_blocked(fs_info) || cannot_commit)) - schedule_timeout(delay); - __set_current_state(TASK_RUNNING); + schedule_timeout_interruptible(delay); } while (!kthread_should_stop()); return 0; } @@ -2183,7 +2230,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices) { - int max_active = fs_info->thread_pool_size; + u32 max_active = fs_info->thread_pool_size; unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; fs_info->workers = @@ -2276,6 +2323,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, struct btrfs_root *log_tree_root; struct btrfs_super_block *disk_super = fs_info->super_copy; u64 bytenr = btrfs_super_log_root(disk_super); + int level = btrfs_super_log_root_level(disk_super); if (fs_devices->rw_devices == 0) { btrfs_warn(fs_info, "log replay required on RO media"); @@ -2289,7 +2337,8 @@ static int 
btrfs_replay_log(struct btrfs_fs_info *fs_info, __setup_root(log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); log_tree_root->node = read_tree_block(fs_info, bytenr, - fs_info->generation + 1); + fs_info->generation + 1, + level, NULL); if (IS_ERR(log_tree_root->node)) { btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); @@ -2334,23 +2383,29 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) location.offset = 0; root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) - return PTR_ERR(root); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->extent_root = root; location.objectid = BTRFS_DEV_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) - return PTR_ERR(root); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->dev_root = root; btrfs_init_devices_late(fs_info); location.objectid = BTRFS_CSUM_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) - return PTR_ERR(root); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->csum_root = root; @@ -2367,7 +2422,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) if (IS_ERR(root)) { ret = PTR_ERR(root); if (ret != -ENOENT) - return ret; + goto out; } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->uuid_root = root; @@ -2376,13 +2431,19 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) - return PTR_ERR(root); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->free_space_root = root; } return 0; +out: + btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d", + location.objectid, ret); + return ret; } int open_ctree(struct super_block *sb, @@ -2404,8 +2465,8 @@ int open_ctree(struct super_block *sb, int err = -EINVAL; int num_backups_tried = 0; int backup_index = 0; - int max_active; int clear_free_space_tree = 0; + int level; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL); @@ -2447,6 +2508,8 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->delayed_iputs); INIT_LIST_HEAD(&fs_info->delalloc_roots); INIT_LIST_HEAD(&fs_info->caching_block_groups); + INIT_LIST_HEAD(&fs_info->pending_raid_kobjs); + spin_lock_init(&fs_info->pending_raid_kobjs_lock); spin_lock_init(&fs_info->delalloc_root_lock); spin_lock_init(&fs_info->trans_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); @@ -2713,8 +2776,6 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } - max_active = fs_info->thread_pool_size; - ret = btrfs_init_workqueues(fs_info, fs_devices); if (ret) { err = ret; @@ -2741,12 +2802,13 @@ int open_ctree(struct super_block *sb, } generation = btrfs_super_chunk_root_generation(disk_super); + level = btrfs_super_chunk_root_level(disk_super); __setup_root(chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); chunk_root->node = read_tree_block(fs_info, btrfs_super_chunk_root(disk_super), - generation); + generation, level, NULL); if (IS_ERR(chunk_root->node) || !extent_buffer_uptodate(chunk_root->node)) { btrfs_err(fs_info, "failed to read chunk 
root"); @@ -2768,10 +2830,10 @@ int open_ctree(struct super_block *sb, } /* - * keep the device that is marked to be the target device for the - * dev_replace procedure + * Keep the devid that is marked to be the target device for the + * device replace procedure */ - btrfs_close_extra_devices(fs_devices, 0); + btrfs_free_extra_devids(fs_devices, 0); if (!fs_devices->latest_bdev) { btrfs_err(fs_info, "failed to read devices"); @@ -2780,10 +2842,11 @@ int open_ctree(struct super_block *sb, retry_root_backup: generation = btrfs_super_generation(disk_super); + level = btrfs_super_root_level(disk_super); tree_root->node = read_tree_block(fs_info, btrfs_super_root(disk_super), - generation); + generation, level, NULL); if (IS_ERR(tree_root->node) || !extent_buffer_uptodate(tree_root->node)) { btrfs_warn(fs_info, "failed to read tree root"); @@ -2834,7 +2897,7 @@ retry_root_backup: goto fail_block_groups; } - btrfs_close_extra_devices(fs_devices, 1); + btrfs_free_extra_devids(fs_devices, 1); ret = btrfs_sysfs_add_fsid(fs_devices, NULL); if (ret) { @@ -2953,6 +3016,7 @@ retry_root_backup: fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); if (IS_ERR(fs_info->fs_root)) { err = PTR_ERR(fs_info->fs_root); + btrfs_warn(fs_info, "failed to read fs tree: %d", err); goto fail_qgroup; } @@ -3290,6 +3354,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) struct buffer_head *bh; int i; int errors = 0; + bool primary_failed = false; u64 bytenr; if (max_mirrors == 0) @@ -3306,11 +3371,16 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) BTRFS_SUPER_INFO_SIZE); if (!bh) { errors++; + if (i == 0) + primary_failed = true; continue; } wait_on_buffer(bh); - if (!buffer_uptodate(bh)) + if (!buffer_uptodate(bh)) { errors++; + if (i == 0) + primary_failed = true; + } /* drop our reference */ brelse(bh); @@ -3319,6 +3389,13 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) brelse(bh); } + /* log error, force error return */ + if (primary_failed) { + btrfs_err(device->fs_info, "error writing primary super block to device %llu", + device->devid); + return -1; + } + return errors < i ? 0 : -1; } @@ -3851,7 +3928,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) * So here we should only check item pointers, not item data. 
*/ if (btrfs_header_level(buf) == 0 && - btrfs_check_leaf_relaxed(root, buf)) { + btrfs_check_leaf_relaxed(fs_info, buf)) { btrfs_print_leaf(buf); ASSERT(0); } @@ -3890,12 +3967,14 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info) __btrfs_btree_balance_dirty(fs_info, 0); } -int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) +int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, + struct btrfs_key *first_key) { struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; struct btrfs_fs_info *fs_info = root->fs_info; - return btree_read_extent_buffer_pages(fs_info, buf, parent_transid); + return btree_read_extent_buffer_pages(fs_info, buf, parent_transid, + level, first_key); } static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info) @@ -4314,11 +4393,6 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, cache = list_first_entry(&cur_trans->dirty_bgs, struct btrfs_block_group_cache, dirty_list); - if (!cache) { - btrfs_err(fs_info, "orphan block group dirty_bgs list"); - spin_unlock(&cur_trans->dirty_bgs_lock); - return; - } if (!list_empty(&cache->io_list)) { spin_unlock(&cur_trans->dirty_bgs_lock); @@ -4338,14 +4412,14 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, } spin_unlock(&cur_trans->dirty_bgs_lock); + /* + * Refer to the definition of io_bgs member for details why it's safe + * to use it without any locking + */ while (!list_empty(&cur_trans->io_bgs)) { cache = list_first_entry(&cur_trans->io_bgs, struct btrfs_block_group_cache, io_list); - if (!cache) { - btrfs_err(fs_info, "orphan block group on io_bgs list"); - return; - } list_del_init(&cache->io_list); spin_lock(&cache->lock); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 301151a50ac1..453ea9f5d4e9 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -52,8 +52,9 @@ static inline u64 btrfs_sb_offset(int mirror) struct btrfs_device; struct btrfs_fs_devices; -struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, - u64 bytenr, u64 parent_transid); +struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, + u64 parent_transid, int level, + struct btrfs_key *first_key); void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr); int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, int mirror_num, struct extent_buffer **eb); @@ -123,7 +124,8 @@ static inline void btrfs_put_fs_root(struct btrfs_root *root) void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); -int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); +int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, + struct btrfs_key *first_key); u32 btrfs_csum_data(const char *data, u32 seed, size_t len); void btrfs_csum_final(u32 crc, u8 *result); blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, @@ -131,9 +133,8 @@ blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset, void *private_data, - extent_submit_bio_hook_t *submit_bio_start, - extent_submit_bio_hook_t *submit_bio_done); -unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); + extent_submit_bio_start_t *submit_bio_start, + extent_submit_bio_done_t *submit_bio_done); int 
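Annotation: two of the hunks above delete `if (!cache)` checks that followed list_first_entry(). Those checks were dead code: list_first_entry() is just container_of() pointer arithmetic on the first node, so on a list already known to be non-empty it cannot yield NULL. A self-contained illustration with a minimal list implementation (not the kernel headers):

#include <assert.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define list_first_entry(head, type, member) \
	container_of((head)->next, type, member)

struct block_group {
	int id;
	struct list_head dirty_list;
};

int main(void)
{
	struct block_group bg = { .id = 42 };
	struct list_head dirty = { &bg.dirty_list, &bg.dirty_list };

	bg.dirty_list.next = &dirty;
	bg.dirty_list.prev = &dirty;

	/*
	 * While the list is non-empty, head->next points at a real node and
	 * the pointer math below always lands inside a real struct; a NULL
	 * check on the result can never fire.
	 */
	struct block_group *first =
		list_first_entry(&dirty, struct block_group, dirty_list);
	assert(first == &bg && first->id == 42);
	return 0;
}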
btrfs_write_tree_block(struct extent_buffer *buf); void btrfs_wait_tree_block_writeback(struct extent_buffer *buf); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, @@ -154,7 +155,7 @@ struct extent_map *btree_get_extent(struct btrfs_inode *inode, int create); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int __init btrfs_end_io_wq_init(void); -void btrfs_end_io_wq_exit(void); +void __cold btrfs_end_io_wq_exit(void); #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c1618ab9fecf..e08d0d45af4f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -27,7 +27,7 @@ #include <linux/ratelimit.h> #include <linux/percpu_counter.h> #include <linux/lockdep.h> -#include "hash.h" +#include <linux/crc32c.h> #include "tree-log.h" #include "disk-io.h" #include "print-tree.h" @@ -535,13 +535,11 @@ static noinline void caching_thread(struct btrfs_work *work) struct btrfs_block_group_cache *block_group; struct btrfs_fs_info *fs_info; struct btrfs_caching_control *caching_ctl; - struct btrfs_root *extent_root; int ret; caching_ctl = container_of(work, struct btrfs_caching_control, work); block_group = caching_ctl->block_group; fs_info = block_group->fs_info; - extent_root = fs_info->extent_root; mutex_lock(&caching_ctl->mutex); down_read(&fs_info->commit_root_sem); @@ -1203,11 +1201,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) __le64 lenum; lenum = cpu_to_le64(root_objectid); - high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); + high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(owner); - low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(offset); - low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); return ((u64)high_crc << 31) ^ (u64)low_crc; } @@ -2652,9 +2650,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, * Returns -ENOMEM or -EIO on failure and will abort the transaction. 
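Annotation: the extent-tree.c hunk above drops the private btrfs_crc32c() wrapper (see the hash.c/hash.h removal further down) in favor of the kernel-wide crc32c() from <linux/crc32c.h>; hash_extent_data_ref() keeps folding the three little-endian fields into two CRCs. A userspace sketch of that hash, with a bitwise software CRC32C standing in for the kernel helper; the ~0 seeds are an assumption, since the hunk does not show the initializers:

#include <stdint.h>
#include <stdio.h>

/* Software CRC32C (Castagnoli, reflected polynomial 0x82F63B78). */
static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc & 1) ? (crc >> 1) ^ 0x82F63B78 : crc >> 1;
	}
	return crc;
}

static uint64_t hash_extent_data_ref(uint64_t root_objectid, uint64_t owner,
				     uint64_t offset)
{
	uint32_t high_crc = ~0u, low_crc = ~0u;	/* assumed seeds */
	uint64_t lenum;

	/*
	 * The kernel hashes the cpu_to_le64() image of each field; hashing
	 * the u64 in memory models that on a little-endian host.
	 */
	lenum = root_objectid;
	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = owner;
	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = offset;
	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));

	return ((uint64_t)high_crc << 31) ^ (uint64_t)low_crc;
}

int main(void)
{
	printf("%llx\n",
	       (unsigned long long)hash_extent_data_ref(5, 257, 4096));
	return 0;
}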
*/ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, unsigned long nr) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *locked_ref = NULL; @@ -2994,7 +2992,7 @@ static void delayed_ref_async_start(struct btrfs_work *work) if (trans->transid > async->transid) goto end; - ret = btrfs_run_delayed_refs(trans, fs_info, async->count); + ret = btrfs_run_delayed_refs(trans, async->count); if (ret) async->error = ret; end: @@ -3053,8 +3051,9 @@ int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info, * Returns <0 on error and aborts the transaction */ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, unsigned long count) + unsigned long count) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *head; @@ -3078,7 +3077,7 @@ again: delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif trans->can_flush_pending_bgs = false; - ret = __btrfs_run_delayed_refs(trans, fs_info, count); + ret = __btrfs_run_delayed_refs(trans, count); if (ret < 0) { btrfs_abort_transaction(trans, ret); return ret; @@ -3086,7 +3085,7 @@ again: if (run_all) { if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans, fs_info); + btrfs_create_pending_block_groups(trans); spin_lock(&delayed_refs->lock); node = rb_first(&delayed_refs->href_root); @@ -3660,9 +3659,9 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, * the commit latency by getting rid of the easy block groups while * we're still allowing others to join the commit. 
*/ -int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group_cache *cache; struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; @@ -3686,7 +3685,7 @@ again: * make sure all the block groups on our dirty list actually * exist */ - btrfs_create_pending_block_groups(trans, fs_info); + btrfs_create_pending_block_groups(trans); if (!path) { path = btrfs_alloc_path(); @@ -3741,8 +3740,9 @@ again: should_put = 0; /* - * the cache_write_mutex is protecting - * the io_list + * The cache_write_mutex is protecting the + * io_list, also refer to the definition of + * btrfs_transaction::io_bgs for more details */ list_add_tail(&cache->io_list, io); } else { @@ -3800,7 +3800,7 @@ again: * go through delayed refs for all the stuff we've just kicked off * and then loop back (just once) */ - ret = btrfs_run_delayed_refs(trans, fs_info, 0); + ret = btrfs_run_delayed_refs(trans, 0); if (!ret && loops == 0) { loops++; spin_lock(&cur_trans->dirty_bgs_lock); @@ -3882,7 +3882,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, cache_save_setup(cache, trans, path); if (!ret) - ret = btrfs_run_delayed_refs(trans, fs_info, + ret = btrfs_run_delayed_refs(trans, (unsigned long) -1); if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { @@ -3934,6 +3934,10 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, } spin_unlock(&cur_trans->dirty_bgs_lock); + /* + * Refer to the definition of io_bgs member for details why it's safe + * to use it without any locking + */ while (!list_empty(io)) { cache = list_first_entry(io, struct btrfs_block_group_cache, io_list); @@ -3990,7 +3994,7 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) bg = btrfs_lookup_block_group(fs_info, bytenr); ASSERT(bg); if (atomic_dec_and_test(&bg->nocow_writers)) - wake_up_atomic_t(&bg->nocow_writers); + wake_up_var(&bg->nocow_writers); /* * Once for our lookup and once for the lookup done by a previous call * to btrfs_inc_nocow_writers() @@ -4001,8 +4005,7 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) { - wait_on_atomic_t(&bg->nocow_writers, atomic_t_wait, - TASK_UNINTERRUPTIBLE); + wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers)); } static const char *alloc_name(u64 flags) @@ -4333,8 +4336,7 @@ again: /* commit the current transaction and try again */ commit_trans: - if (need_commit && - !atomic_read(&fs_info->open_ioctl_trans)) { + if (need_commit) { need_commit--; if (need_commit > 0) { @@ -4542,7 +4544,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, * Needed because we can end up allocating a system chunk and for an * atomic and race free space reservation in the chunk block reserve. 
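Annotation: several hunks here and below move from the old wait_on_atomic_t()/wake_up_atomic_t() pair to the more general wait_var_event()/wake_up_var(), which lets the caller spell out the wake-up condition (`!atomic_read(&bg->nocow_writers)`) instead of hard-coding "atomic_t reached zero". A pthread model of the same wait-for-counter-to-drain idea; the kernel uses hashed waitqueues, not a condition variable:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int nocow_writers = 3;

/* wake_up_var(&nocow_writers) analogue: poke anyone waiting on the variable. */
static void put_writer(void)
{
	pthread_mutex_lock(&lock);
	if (--nocow_writers == 0)
		pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
}

/* wait_var_event(&nocow_writers, nocow_writers == 0) analogue. */
static void wait_writers(void)
{
	pthread_mutex_lock(&lock);
	while (nocow_writers != 0)
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}

static void *writer(void *arg)
{
	(void)arg;
	put_writer();
	return NULL;
}

int main(void)
{
	pthread_t t[3];

	for (int i = 0; i < 3; i++)
		pthread_create(&t[i], NULL, writer, NULL);
	wait_writers();
	for (int i = 0; i < 3; i++)
		pthread_join(t[i], NULL);
	puts("all nocow writers drained");
	return 0;
}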
*/ - ASSERT(mutex_is_locked(&fs_info->chunk_mutex)); + lockdep_assert_held(&fs_info->chunk_mutex); info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); spin_lock(&info->lock); @@ -4603,11 +4605,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, return -ENOSPC; space_info = __find_space_info(fs_info, flags); - if (!space_info) { - ret = create_space_info(fs_info, flags, &space_info); - if (ret) - return ret; - } + ASSERT(space_info); again: spin_lock(&space_info->lock); @@ -4706,7 +4704,7 @@ out: */ if (trans->can_flush_pending_bgs && trans->chunk_bytes_reserved >= (u64)SZ_2M) { - btrfs_create_pending_block_groups(trans, fs_info); + btrfs_create_pending_block_groups(trans); btrfs_trans_release_chunk_metadata(trans); } return ret; @@ -4827,7 +4825,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, long time_left; unsigned long nr_pages; int loops; - enum btrfs_reserve_flush_enum flush; /* Calc the number of the pages we need flush for space reservation */ items = calc_reclaim_items_nr(fs_info, to_reclaim); @@ -4868,10 +4865,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, atomic_read(&fs_info->async_delalloc_pages) <= (int)max_reclaim); skip_async: - if (!trans) - flush = BTRFS_RESERVE_FLUSH_ALL; - else - flush = BTRFS_RESERVE_NO_FLUSH; spin_lock(&space_info->lock); if (list_empty(&space_info->tickets) && list_empty(&space_info->priority_tickets)) { @@ -4994,7 +4987,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, ret = PTR_ERR(trans); break; } - ret = btrfs_run_delayed_items_nr(trans, fs_info, nr); + ret = btrfs_run_delayed_items_nr(trans, nr); btrfs_end_transaction(trans); break; case FLUSH_DELALLOC: @@ -5389,10 +5382,15 @@ static int reserve_metadata_bytes(struct btrfs_root *root, !block_rsv_use_bytes(global_rsv, orig_bytes)) ret = 0; } - if (ret == -ENOSPC) + if (ret == -ENOSPC) { trace_btrfs_space_reservation(fs_info, "space_info:enospc", block_rsv->space_info->flags, orig_bytes, 1); + + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + dump_space_info(fs_info, block_rsv->space_info, + orig_bytes, 0); + } return ret; } @@ -5761,6 +5759,9 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, if (num_bytes == 0) return 0; + ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); + if (ret) + return ret; ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); @@ -5773,11 +5774,15 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, /** * btrfs_inode_rsv_release - release any excessive reservation. * @inode - the inode we need to release from. + * @qgroup_free - free or convert qgroup meta. + * Unlike normal operation, qgroup meta reservation needs to know if we are + * freeing qgroup reservation or just converting it into per-trans. Normally + * @qgroup_free is true for error handling, and false for normal release. * * This is the same as btrfs_block_rsv_release, except that it handles the * tracepoint for the reservation. 
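Annotation: check_system_chunk() above swaps ASSERT(mutex_is_locked(&fs_info->chunk_mutex)) for lockdep_assert_held(). The old form only proves that *somebody* holds the mutex; lockdep verifies the *current* task does, and compiles away without CONFIG_LOCKDEP. A toy owner-tracking mutex showing the difference between the two checks; the real lockdep machinery is far more elaborate:

#include <assert.h>
#include <pthread.h>

struct dbg_mutex {
	pthread_mutex_t m;
	pthread_t owner;
	int locked;
};

static void dbg_lock(struct dbg_mutex *l)
{
	pthread_mutex_lock(&l->m);
	l->owner = pthread_self();
	l->locked = 1;
}

static void dbg_unlock(struct dbg_mutex *l)
{
	l->locked = 0;
	pthread_mutex_unlock(&l->m);
}

/* mutex_is_locked() analogue: true even if another thread took the lock. */
static int is_locked(struct dbg_mutex *l)
{
	return l->locked;
}

/* lockdep_assert_held() analogue: the caller itself must be the owner. */
static void assert_held(struct dbg_mutex *l)
{
	assert(l->locked && pthread_equal(l->owner, pthread_self()));
}

int main(void)
{
	struct dbg_mutex chunk_mutex = { .m = PTHREAD_MUTEX_INITIALIZER };

	dbg_lock(&chunk_mutex);
	assert(is_locked(&chunk_mutex));	/* weak check */
	assert_held(&chunk_mutex);		/* strong check */
	dbg_unlock(&chunk_mutex);
	return 0;
}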
*/ -static void btrfs_inode_rsv_release(struct btrfs_inode *inode) +static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; @@ -5793,6 +5798,10 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode) if (released > 0) trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), released, 0); + if (qgroup_free) + btrfs_qgroup_free_meta_prealloc(inode->root, released); + else + btrfs_qgroup_convert_reserved_meta(inode->root, released); } void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, @@ -5893,24 +5902,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) WARN_ON(fs_info->delayed_block_rsv.reserved > 0); } -void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - if (!trans->block_rsv) { - ASSERT(!trans->bytes_reserved); - return; - } - - if (!trans->bytes_reserved) - return; - - ASSERT(trans->block_rsv == &fs_info->trans_block_rsv); - trace_btrfs_space_reservation(fs_info, "transaction", - trans->transid, trans->bytes_reserved, 0); - btrfs_block_rsv_release(fs_info, trans->block_rsv, - trans->bytes_reserved); - trans->bytes_reserved = 0; -} /* * To be called after all the new block groups attached to the transaction @@ -5952,7 +5943,7 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, */ u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), + trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), num_bytes, 1); return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); } @@ -5996,7 +5987,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { /* One for parent inode, two for dir entries */ num_bytes = 3 * fs_info->nodesize; - ret = btrfs_qgroup_reserve_meta(root, num_bytes, true); + ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); if (ret) return ret; } else { @@ -6015,7 +6006,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1); if (ret && *qgroup_reserved) - btrfs_qgroup_free_meta(root, *qgroup_reserved); + btrfs_qgroup_free_meta_prealloc(root, *qgroup_reserved); return ret; } @@ -6052,7 +6043,6 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - struct btrfs_root *root = inode->root; unsigned nr_extents; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret = 0; @@ -6069,13 +6059,13 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) if (btrfs_is_free_space_inode(inode)) { flush = BTRFS_RESERVE_NO_FLUSH; delalloc_lock = false; - } else if (current->journal_info) { - flush = BTRFS_RESERVE_FLUSH_LIMIT; - } + } else { + if (current->journal_info) + flush = BTRFS_RESERVE_FLUSH_LIMIT; - if (flush != BTRFS_RESERVE_NO_FLUSH && - btrfs_transaction_in_commit(fs_info)) - schedule_timeout(1); + if (btrfs_transaction_in_commit(fs_info)) + schedule_timeout(1); + } if (delalloc_lock) mutex_lock(&inode->delalloc_mutex); @@ -6090,19 +6080,9 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) btrfs_calculate_inode_block_rsv_size(fs_info, inode); 
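Annotation: btrfs_inode_rsv_release() now takes the qgroup_free flag documented in the comment above. On error paths the prealloc'd qgroup metadata reservation is returned outright; on normal release it is converted into a per-transaction reservation that is only settled at commit. A sketch of the two-bucket accounting, with the bucket names borrowed from the qgroup API but everything else illustrative:

#include <assert.h>
#include <stdbool.h>

struct qgroup_rsv {
	long meta_prealloc;	/* reserved before the operation starts */
	long meta_pertrans;	/* owed until the current transaction commits */
};

static void rsv_release(struct qgroup_rsv *q, long bytes, bool qgroup_free)
{
	q->meta_prealloc -= bytes;
	if (!qgroup_free) {
		/*
		 * Normal completion: the metadata will really be written, so
		 * the reservation is converted rather than freed and only
		 * goes away at transaction commit.
		 */
		q->meta_pertrans += bytes;
	}
	/* qgroup_free == true (error path): the bytes simply come back. */
}

int main(void)
{
	struct qgroup_rsv q = { .meta_prealloc = 4096 };

	rsv_release(&q, 2048, false);	/* convert: success path */
	rsv_release(&q, 2048, true);	/* free: error path */
	assert(q.meta_prealloc == 0 && q.meta_pertrans == 2048);
	return 0;
}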
spin_unlock(&inode->lock); - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { - ret = btrfs_qgroup_reserve_meta(root, - nr_extents * fs_info->nodesize, true); - if (ret) - goto out_fail; - } - ret = btrfs_inode_rsv_refill(inode, flush); - if (unlikely(ret)) { - btrfs_qgroup_free_meta(root, - nr_extents * fs_info->nodesize); + if (unlikely(ret)) goto out_fail; - } if (delalloc_lock) mutex_unlock(&inode->delalloc_mutex); @@ -6116,7 +6096,7 @@ out_fail: btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); - btrfs_inode_rsv_release(inode); + btrfs_inode_rsv_release(inode, true); if (delalloc_lock) mutex_unlock(&inode->delalloc_mutex); return ret; @@ -6126,12 +6106,14 @@ out_fail: * btrfs_delalloc_release_metadata - release a metadata reservation for an inode * @inode: the inode to release the reservation for. * @num_bytes: the number of bytes we are releasing. + * @qgroup_free: free qgroup reservation or convert it to per-trans reservation * * This will release the metadata reservation for an inode. This can be called * once we complete IO for a given set of bytes to release their metadata * reservations, or on error for the same reason. */ -void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) +void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, + bool qgroup_free) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); @@ -6144,13 +6126,14 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) if (btrfs_is_testing(fs_info)) return; - btrfs_inode_rsv_release(inode); + btrfs_inode_rsv_release(inode, qgroup_free); } /** * btrfs_delalloc_release_extents - release our outstanding_extents * @inode: the inode to balance the reservation for. * @num_bytes: the number of bytes we originally reserved with + * @qgroup_free: do we need to free qgroup meta reservation or convert them. * * When we reserve space we increase outstanding_extents for the extents we may * add. Once we've set the range as delalloc or created our ordered extents we @@ -6158,7 +6141,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) * temporarily tracked outstanding_extents. This _must_ be used in conjunction * with btrfs_delalloc_reserve_metadata. 
*/ -void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, + bool qgroup_free) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); unsigned num_extents; @@ -6172,7 +6156,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) if (btrfs_is_testing(fs_info)) return; - btrfs_inode_rsv_release(inode); + btrfs_inode_rsv_release(inode, qgroup_free); } /** @@ -6228,9 +6212,9 @@ int btrfs_delalloc_reserve_space(struct inode *inode, */ void btrfs_delalloc_release_space(struct inode *inode, struct extent_changeset *reserved, - u64 start, u64 len) + u64 start, u64 len, bool qgroup_free) { - btrfs_delalloc_release_metadata(BTRFS_I(inode), len); + btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free); btrfs_free_reserved_data_space(inode, reserved, start, len); } @@ -6526,7 +6510,7 @@ void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, bg = btrfs_lookup_block_group(fs_info, start); ASSERT(bg); if (atomic_dec_and_test(&bg->reservations)) - wake_up_atomic_t(&bg->reservations); + wake_up_var(&bg->reservations); btrfs_put_block_group(bg); } @@ -6552,8 +6536,7 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) down_write(&space_info->groups_sem); up_write(&space_info->groups_sem); - wait_on_atomic_t(&bg->reservations, atomic_t_wait, - TASK_UNINTERRUPTIBLE); + wait_var_event(&bg->reservations, !atomic_read(&bg->reservations)); } /** @@ -6785,9 +6768,9 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, return 0; } -int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group_cache *block_group, *tmp; struct list_head *deleted_bgs; struct extent_io_tree *unpin; @@ -7353,29 +7336,6 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) return ret; } -int __get_raid_index(u64 flags) -{ - if (flags & BTRFS_BLOCK_GROUP_RAID10) - return BTRFS_RAID_RAID10; - else if (flags & BTRFS_BLOCK_GROUP_RAID1) - return BTRFS_RAID_RAID1; - else if (flags & BTRFS_BLOCK_GROUP_DUP) - return BTRFS_RAID_DUP; - else if (flags & BTRFS_BLOCK_GROUP_RAID0) - return BTRFS_RAID_RAID0; - else if (flags & BTRFS_BLOCK_GROUP_RAID5) - return BTRFS_RAID_RAID5; - else if (flags & BTRFS_BLOCK_GROUP_RAID6) - return BTRFS_RAID_RAID6; - - return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ -} - -int get_block_group_index(struct btrfs_block_group_cache *cache) -{ - return __get_raid_index(cache->flags); -} - static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = "raid10", [BTRFS_RAID_RAID1] = "raid1", @@ -7490,7 +7450,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, u64 empty_cluster = 0; struct btrfs_space_info *space_info; int loop = 0; - int index = __get_raid_index(flags); + int index = btrfs_bg_flags_to_raid_index(flags); bool failed_cluster_refill = false; bool failed_alloc = false; bool use_cluster = true; @@ -7576,7 +7536,8 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, btrfs_put_block_group(block_group); up_read(&space_info->groups_sem); } else { - index = get_block_group_index(block_group); + index = btrfs_bg_flags_to_raid_index( + block_group->flags); btrfs_lock_block_group(block_group, delalloc); goto have_block_group; } @@ -7586,7 +7547,7 @@ static 
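Annotation: the removed __get_raid_index()/get_block_group_index() pair is replaced everywhere by btrfs_bg_flags_to_raid_index(). Judging by the deleted body, the replacement performs the same flags-to-enum mapping; a standalone version of that mapping follows (the new helper lives elsewhere in the btrfs headers and may be written differently):

#include <assert.h>
#include <stdint.h>

/* Block group type bits and RAID indexes, mirroring the deleted mapping. */
#define BLOCK_GROUP_RAID0	(1ULL << 3)
#define BLOCK_GROUP_RAID1	(1ULL << 4)
#define BLOCK_GROUP_DUP		(1ULL << 5)
#define BLOCK_GROUP_RAID10	(1ULL << 6)
#define BLOCK_GROUP_RAID5	(1ULL << 7)
#define BLOCK_GROUP_RAID6	(1ULL << 8)

enum raid_index {
	RAID_RAID10, RAID_RAID1, RAID_DUP, RAID_RAID0,
	RAID_SINGLE, RAID_RAID5, RAID_RAID6, NR_RAID_TYPES,
};

static enum raid_index bg_flags_to_raid_index(uint64_t flags)
{
	if (flags & BLOCK_GROUP_RAID10)
		return RAID_RAID10;
	if (flags & BLOCK_GROUP_RAID1)
		return RAID_RAID1;
	if (flags & BLOCK_GROUP_DUP)
		return RAID_DUP;
	if (flags & BLOCK_GROUP_RAID0)
		return RAID_RAID0;
	if (flags & BLOCK_GROUP_RAID5)
		return RAID_RAID5;
	if (flags & BLOCK_GROUP_RAID6)
		return RAID_RAID6;
	return RAID_SINGLE;	/* no RAID bit set: plain single profile */
}

int main(void)
{
	assert(bg_flags_to_raid_index(BLOCK_GROUP_RAID10) == RAID_RAID10);
	assert(bg_flags_to_raid_index(0) == RAID_SINGLE);
	return 0;
}

Passing the flags directly also removes the need to keep a separate raid_type field in raid_kobject, which is what the sysfs hunks below switch to.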
noinline int find_free_extent(struct btrfs_fs_info *fs_info, } search: have_caching_bg = false; - if (index == 0 || index == __get_raid_index(flags)) + if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags)) full_search = true; down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups[index], @@ -7844,7 +7805,8 @@ checks: loop: failed_cluster_refill = false; failed_alloc = false; - BUG_ON(index != get_block_group_index(block_group)); + BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != + index); btrfs_release_block_group(block_group, delalloc); cond_resched(); } @@ -7998,6 +7960,51 @@ again: up_read(&info->groups_sem); } +/* + * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a + * hole that is at least as big as @num_bytes. + * + * @root - The root that will contain this extent + * + * @ram_bytes - The amount of space in ram that @num_bytes take. This + * is used for accounting purposes. This value differs + * from @num_bytes only in the case of compressed extents. + * + * @num_bytes - Number of bytes to allocate on-disk. + * + * @min_alloc_size - Indicates the minimum amount of space that the + * allocator should try to satisfy. In some cases + * @num_bytes may be larger than what is required and if + * the filesystem is fragmented then allocation fails. + * However, the presence of @min_alloc_size gives a + * chance to try and satisfy the smaller allocation. + * + * @empty_size - A hint that you plan on doing more COW. This is the + * size in bytes the allocator should try to find free + * next to the block it returns. This is just a hint and + * may be ignored by the allocator. + * + * @hint_byte - Hint to the allocator to start searching above the byte + * address passed. It might be ignored. + * + * @ins - This key is modified to record the found hole. It will + * have the following values: + * ins->objectid == start position + * ins->flags = BTRFS_EXTENT_ITEM_KEY + * ins->offset == the size of the hole. + * + * @is_data - Boolean flag indicating whether an extent is + * allocated for data (true) or metadata (false) + * + * @delalloc - Boolean flag indicating whether this allocation is for + * delalloc or not. If 'true' data_rwsem of block groups + * is going to be acquired. + * + * + * Returns 0 when an allocation succeeded or < 0 when an error occurred. In + * case -ENOSPC is returned then @ins->offset will contain the size of the + * largest available hole the allocator managed to find. 
+ */ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, @@ -8701,6 +8708,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, u64 parent; u32 blocksize; struct btrfs_key key; + struct btrfs_key first_key; struct extent_buffer *next; int level = wc->level; int reada = 0; @@ -8721,6 +8729,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, } bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); + btrfs_node_key_to_cpu(path->nodes[level], &first_key, + path->slots[level]); blocksize = fs_info->nodesize; next = find_extent_buffer(fs_info, bytenr); @@ -8785,7 +8795,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (!next) { if (reada && level == 1) reada_walk_down(trans, root, wc, path); - next = read_tree_block(fs_info, bytenr, generation); + next = read_tree_block(fs_info, bytenr, generation, level - 1, + &first_key); if (IS_ERR(next)) { return PTR_ERR(next); } else if (!extent_buffer_uptodate(next)) { @@ -9650,7 +9661,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) */ target = get_restripe_target(fs_info, block_group->flags); if (target) { - index = __get_raid_index(extended_to_chunk(target)); + index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target)); } else { /* * this is just a balance, so if we were marked as full @@ -9664,7 +9675,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) goto out; } - index = get_block_group_index(block_group); + index = btrfs_bg_flags_to_raid_index(block_group->flags); } if (index == BTRFS_RAID_RAID10) { @@ -9913,10 +9924,40 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) return 0; } +/* link_block_group will queue up kobjects to add when we're reclaim-safe */ +void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *space_info; + struct raid_kobject *rkobj; + LIST_HEAD(list); + int index; + int ret = 0; + + spin_lock(&fs_info->pending_raid_kobjs_lock); + list_splice_init(&fs_info->pending_raid_kobjs, &list); + spin_unlock(&fs_info->pending_raid_kobjs_lock); + + list_for_each_entry(rkobj, &list, list) { + space_info = __find_space_info(fs_info, rkobj->flags); + index = btrfs_bg_flags_to_raid_index(rkobj->flags); + + ret = kobject_add(&rkobj->kobj, &space_info->kobj, + "%s", get_raid_name(index)); + if (ret) { + kobject_put(&rkobj->kobj); + break; + } + } + if (ret) + btrfs_warn(fs_info, + "failed to add kobject for block cache, ignoring"); +} + static void link_block_group(struct btrfs_block_group_cache *cache) { struct btrfs_space_info *space_info = cache->space_info; - int index = get_block_group_index(cache); + struct btrfs_fs_info *fs_info = cache->fs_info; + int index = btrfs_bg_flags_to_raid_index(cache->flags); bool first = false; down_write(&space_info->groups_sem); @@ -9926,27 +9967,20 @@ static void link_block_group(struct btrfs_block_group_cache *cache) up_write(&space_info->groups_sem); if (first) { - struct raid_kobject *rkobj; - int ret; - - rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); - if (!rkobj) - goto out_err; - rkobj->raid_type = index; - kobject_init(&rkobj->kobj, &btrfs_raid_ktype); - ret = kobject_add(&rkobj->kobj, &space_info->kobj, - "%s", get_raid_name(index)); - if (ret) { - kobject_put(&rkobj->kobj); - goto out_err; + struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); + if (!rkobj) { + btrfs_warn(cache->fs_info, + "couldn't alloc memory for raid level kobject"); + return; 
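Annotation: the new documentation block above spells out the btrfs_reserve_extent() contract, including the overloaded meaning of @ins->offset on -ENOSPC. A userspace mock of a caller honoring that contract; the key layout matches the comment, the allocator body is entirely fake:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct btrfs_key {
	uint64_t objectid;	/* start of the allocated hole */
	uint8_t type;		/* BTRFS_EXTENT_ITEM_KEY per the contract */
	uint64_t offset;	/* size found; largest hole on -ENOSPC */
};

#define BTRFS_EXTENT_ITEM_KEY 168

/* Fake allocator: pretends only 64K is free. */
static int reserve_extent(uint64_t num_bytes, uint64_t min_alloc_size,
			  struct btrfs_key *ins)
{
	const uint64_t largest_hole = 64 * 1024;

	if (min_alloc_size > largest_hole) {
		ins->offset = largest_hole;	/* hint for the caller */
		return -ENOSPC;
	}
	ins->objectid = 1024 * 1024;		/* made-up start address */
	ins->type = BTRFS_EXTENT_ITEM_KEY;
	ins->offset = num_bytes < largest_hole ? num_bytes : largest_hole;
	return 0;
}

int main(void)
{
	struct btrfs_key ins;
	int ret = reserve_extent(128 * 1024, 128 * 1024, &ins);

	if (ret == -ENOSPC) {
		/* Retry with the largest hole the allocator reported. */
		fprintf(stderr, "ENOSPC, largest hole %llu\n",
			(unsigned long long)ins.offset);
		ret = reserve_extent(ins.offset, ins.offset, &ins);
	}
	if (!ret)
		printf("got [%llu, +%llu)\n",
		       (unsigned long long)ins.objectid,
		       (unsigned long long)ins.offset);
	return ret ? 1 : 0;
}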
} + rkobj->flags = cache->flags; + kobject_init(&rkobj->kobj, &btrfs_raid_ktype); + + spin_lock(&fs_info->pending_raid_kobjs_lock); + list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs); + spin_unlock(&fs_info->pending_raid_kobjs_lock); space_info->block_group_kobjs[index] = &rkobj->kobj; } - - return; -out_err: - btrfs_warn(cache->fs_info, - "failed to add kobject for block cache, ignoring"); } static struct btrfs_block_group_cache * @@ -10162,6 +10196,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) inc_block_group_ro(cache, 1); } + btrfs_add_raid_kobjects(info); init_global_block_rsv(info); ret = 0; error: @@ -10169,9 +10204,9 @@ error: return ret; } -void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group_cache *block_group, *tmp; struct btrfs_root *extent_root = fs_info->extent_root; struct btrfs_block_group_item item; @@ -10256,15 +10291,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, * with its ->space_info set. */ cache->space_info = __find_space_info(fs_info, cache->flags); - if (!cache->space_info) { - ret = create_space_info(fs_info, cache->flags, - &cache->space_info); - if (ret) { - btrfs_remove_free_space_cache(cache); - btrfs_put_block_group(cache); - return ret; - } - } + ASSERT(cache->space_info); ret = btrfs_add_block_group_cache(fs_info, cache); if (ret) { @@ -10336,7 +10363,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, block_group->key.offset); memcpy(&key, &block_group->key, sizeof(key)); - index = get_block_group_index(block_group); + index = btrfs_bg_flags_to_raid_index(block_group->flags); if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) @@ -11061,7 +11088,7 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) ret = btrfs_start_write_no_snapshotting(root); if (ret) break; - wait_on_atomic_t(&root->will_be_snapshotted, atomic_t_wait, - TASK_UNINTERRUPTIBLE); + wait_var_event(&root->will_be_snapshotted, + !atomic_read(&root->will_be_snapshotted)); } } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index dfeb74a0be77..47a8fe9d22e8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -76,8 +76,8 @@ void btrfs_leak_debug_check(void) while (!list_empty(&buffers)) { eb = list_entry(buffers.next, struct extent_buffer, leak_list); - pr_err("BTRFS: buffer leak start %llu len %lu refs %d\n", - eb->start, eb->len, atomic_read(&eb->refs)); + pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n", + eb->start, eb->len, atomic_read(&eb->refs), eb->bflags); list_del(&eb->leak_list); kmem_cache_free(extent_buffer_cache, eb); } @@ -119,23 +119,22 @@ struct extent_page_data { unsigned int sync_io:1; }; -static void add_extent_changeset(struct extent_state *state, unsigned bits, +static int add_extent_changeset(struct extent_state *state, unsigned bits, struct extent_changeset *changeset, int set) { int ret; if (!changeset) - return; + return 0; if (set && (state->state & bits) == bits) - return; + return 0; if (!set && (state->state & bits) == 0) - return; + return 0; changeset->bytes_changed += state->end - state->start + 1; ret = ulist_add(&changeset->range_changed, state->start, state->end, GFP_ATOMIC); - /* ENOMEM */ - BUG_ON(ret < 0); + return ret; } static void flush_write_bio(struct extent_page_data *epd); @@ -187,7 +186,7 
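Annotation: link_block_group() above stops calling kobject_add() inline and instead queues the raid kobject on fs_info->pending_raid_kobjs, with btrfs_add_raid_kobjects() flushing the queue later; per the new comment, the add happens once "we're reclaim-safe", i.e. outside contexts where a sysfs allocation could recurse into the filesystem. A compilable model of the queue-then-flush shape; registration is a printf here, and the kernel uses a locked list_head rather than this singly linked list:

#include <stdio.h>
#include <stdlib.h>

struct rkobj {
	unsigned long long flags;
	struct rkobj *next;
};

static struct rkobj *pending;	/* fs_info->pending_raid_kobjs analogue */

/* Called from a context where registering would be unsafe: just queue. */
static int queue_raid_kobj(unsigned long long flags)
{
	struct rkobj *r = calloc(1, sizeof(*r));

	if (!r)
		return -1;
	r->flags = flags;
	r->next = pending;
	pending = r;
	return 0;
}

/* Called later, from a safe context: drain the queue and register. */
static void add_raid_kobjects(void)
{
	struct rkobj *r;

	while ((r = pending)) {
		pending = r->next;
		printf("registering raid kobject, flags=0x%llx\n", r->flags);
		free(r);
	}
}

int main(void)
{
	queue_raid_kobj(1 << 4);	/* e.g. a RAID1 block group */
	queue_raid_kobj(1 << 6);	/* e.g. a RAID10 block group */
	add_raid_kobjects();
	return 0;
}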
@@ free_state_cache: return -ENOMEM; } -void extent_io_exit(void) +void __cold extent_io_exit(void) { btrfs_leak_debug_check(); @@ -527,6 +526,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, { struct extent_state *next; unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS; + int ret; if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; @@ -534,7 +534,8 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, tree->dirty_bytes -= range; } clear_state_cb(tree, state, bits); - add_extent_changeset(state, bits_to_clear, changeset, 0); + ret = add_extent_changeset(state, bits_to_clear, changeset, 0); + BUG_ON(ret < 0); state->state &= ~bits_to_clear; if (wake) wake_up(&state->wq); @@ -805,13 +806,15 @@ static void set_state_bits(struct extent_io_tree *tree, unsigned *bits, struct extent_changeset *changeset) { unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; + int ret; set_state_cb(tree, state, bits); if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; tree->dirty_bytes += range; } - add_extent_changeset(state, bits_to_set, changeset, 1); + ret = add_extent_changeset(state, bits_to_set, changeset, 1); + BUG_ON(ret < 0); state->state |= bits_to_set; } @@ -2744,20 +2747,21 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num, return blk_status_to_errno(ret); } -static int merge_bio(struct extent_io_tree *tree, struct page *page, - unsigned long offset, size_t size, struct bio *bio, - unsigned long bio_flags) -{ - int ret = 0; - if (tree->ops) - ret = tree->ops->merge_bio_hook(page, offset, size, bio, - bio_flags); - return ret; - -} - /* * @opf: bio REQ_OP_* and REQ_* flags as one value + * @tree: tree so we can call our merge_bio hook + * @wbc: optional writeback control for io accounting + * @page: page to add to the bio + * @pg_offset: offset of the new bio or to check whether we are adding + * a contiguous page to the previous one + * @size: portion of page that we want to write + * @offset: starting offset in the page + * @bdev: attach newly created bios to this bdev + * @bio_ret: must be valid pointer, newly allocated bio will be stored there + * @end_io_func: end_io callback for new bio + * @mirror_num: desired mirror to read/write + * @prev_bio_flags: flags of previous bio to see if we can merge the current one + * @bio_flags: flags of the current bio to see if we can merge them */ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, struct writeback_control *wbc, @@ -2773,21 +2777,27 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, { int ret = 0; struct bio *bio; - int contig = 0; - int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; size_t page_size = min_t(size_t, size, PAGE_SIZE); sector_t sector = offset >> 9; - if (bio_ret && *bio_ret) { + ASSERT(bio_ret); + + if (*bio_ret) { + bool contig; + bool can_merge = true; + bio = *bio_ret; - if (old_compressed) + if (prev_bio_flags & EXTENT_BIO_COMPRESSED) contig = bio->bi_iter.bi_sector == sector; else contig = bio_end_sector(bio) == sector; - if (prev_bio_flags != bio_flags || !contig || + if (tree->ops && tree->ops->merge_bio_hook(page, offset, + page_size, bio, bio_flags)) + can_merge = false; + + if (prev_bio_flags != bio_flags || !contig || !can_merge || force_bio_submit || - merge_bio(tree, page, pg_offset, page_size, bio, bio_flags) || bio_add_page(bio, page, page_size, pg_offset) < 
page_size) { ret = submit_one_bio(bio, mirror_num, prev_bio_flags); if (ret < 0) { @@ -2813,10 +2823,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, wbc_account_io(wbc, page, page_size); } - if (bio_ret) - *bio_ret = bio; - else - ret = submit_one_bio(bio, mirror_num, bio_flags); + *bio_ret = bio; return ret; } @@ -2886,8 +2893,7 @@ static int __do_readpage(struct extent_io_tree *tree, { struct inode *inode = page->mapping->host; u64 start = page_offset(page); - u64 page_end = start + PAGE_SIZE - 1; - u64 end; + const u64 end = start + PAGE_SIZE - 1; u64 cur = start; u64 extent_offset; u64 last_byte = i_size_read(inode); @@ -2905,7 +2911,6 @@ static int __do_readpage(struct extent_io_tree *tree, set_page_extent_mapped(page); - end = page_end; if (!PageUptodate(page)) { if (cleancache_get_page(page) == 0) { BUG_ON(blocksize != PAGE_SIZE); @@ -5230,11 +5235,6 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) } } -int extent_buffer_uptodate(struct extent_buffer *eb) -{ - return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); -} - int read_extent_buffer_pages(struct extent_io_tree *tree, struct extent_buffer *eb, int wait, int mirror_num) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a7a850abd600..b77d84909863 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -83,8 +83,8 @@ static inline int le_test_bit(int nr, const u8 *addr) return 1U & (addr[BIT_BYTE(nr)] >> (nr & (BITS_PER_BYTE-1))); } -extern void le_bitmap_set(u8 *map, unsigned int start, int len); -extern void le_bitmap_clear(u8 *map, unsigned int start, int len); +void le_bitmap_set(u8 *map, unsigned int start, int len); +void le_bitmap_clear(u8 *map, unsigned int start, int len); struct extent_state; struct btrfs_root; @@ -95,6 +95,13 @@ struct io_failure_record; typedef blk_status_t (extent_submit_bio_hook_t)(void *private_data, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset); + +typedef blk_status_t (extent_submit_bio_start_t)(void *private_data, + struct bio *bio, u64 bio_offset); + +typedef blk_status_t (extent_submit_bio_done_t)(void *private_data, + struct bio *bio, int mirror_num); + struct extent_io_ops { /* * The following callbacks must be allways defined, the function @@ -286,7 +293,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, int mirror_num); int __init extent_io_init(void); -void extent_io_exit(void); +void __cold extent_io_exit(void); u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, @@ -455,6 +462,11 @@ static inline void extent_buffer_get(struct extent_buffer *eb) atomic_inc(&eb->refs); } +static inline int extent_buffer_uptodate(struct extent_buffer *eb) +{ + return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); +} + int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, unsigned long start, unsigned long len); void read_extent_buffer(const struct extent_buffer *eb, void *dst, @@ -489,7 +501,6 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb); int set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); -int extent_buffer_uptodate(struct extent_buffer *eb); int extent_buffer_under_io(struct extent_buffer *eb); int map_private_extent_buffer(const struct extent_buffer *eb, unsigned long offset, unsigned long min_len, diff 
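Annotation: the submit_extent_page() rework above folds the old merge_bio() helper into its only caller. A page can join the pending bio only if it is physically contiguous (for compressed bios the compare is against the bio's starting sector, otherwise against its end), the bio flags match, and the optional merge_bio_hook does not veto. A small model of just that decision, with sectors instead of real bios:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct pending_bio {
	uint64_t start_sector;
	uint64_t end_sector;	/* one past the last sector queued */
	unsigned long flags;
	bool compressed;
};

static bool can_merge(const struct pending_bio *bio, uint64_t sector,
		      unsigned long flags, bool hook_vetoes)
{
	bool contig;

	if (bio->compressed)
		/* compressed pages all map back to the bio's start */
		contig = bio->start_sector == sector;
	else
		contig = bio->end_sector == sector;

	return contig && bio->flags == flags && !hook_vetoes;
}

int main(void)
{
	struct pending_bio bio = {
		.start_sector = 1000, .end_sector = 1008, .flags = 0,
	};

	assert(can_merge(&bio, 1008, 0, false));	/* back-to-back */
	assert(!can_merge(&bio, 2000, 0, false));	/* gap: submit first */
	assert(!can_merge(&bio, 1008, 1, false));	/* flag mismatch */
	return 0;
}

When any condition fails, the caller submits the pending bio and starts a new one, which is exactly the branch visible at the top of this chunk.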
--git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index d3bd02105d1c..53a0633c6ef7 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -2,7 +2,6 @@ #include <linux/err.h> #include <linux/slab.h> #include <linux/spinlock.h> -#include <linux/hardirq.h> #include "ctree.h" #include "extent_map.h" #include "compression.h" @@ -20,7 +19,7 @@ int __init extent_map_init(void) return 0; } -void extent_map_exit(void) +void __cold extent_map_exit(void) { kmem_cache_destroy(extent_map_cache); } @@ -552,6 +551,9 @@ int btrfs_add_extent_mapping(struct extent_map_tree *em_tree, ret = 0; existing = search_extent_mapping(em_tree, start, len); + + trace_btrfs_handle_em_exist(existing, em, start, len); + /* * existing will always be non-NULL, since there must be * extent causing the -EEXIST. diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index b29f77bc0732..f6f8ba114977 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -86,7 +86,7 @@ void replace_extent_mapping(struct extent_map_tree *tree, struct extent_map *alloc_extent_map(void); void free_extent_map(struct extent_map *em); int __init extent_map_init(void); -void extent_map_exit(void); +void __cold extent_map_exit(void); int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen); void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em); struct extent_map *search_extent_mapping(struct extent_map_tree *tree, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 41ab9073d1d4..f247300170e5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1691,7 +1691,7 @@ again: force_page_uptodate); if (ret) { btrfs_delalloc_release_extents(BTRFS_I(inode), - reserve_bytes); + reserve_bytes, true); break; } @@ -1703,7 +1703,7 @@ again: if (extents_locked == -EAGAIN) goto again; btrfs_delalloc_release_extents(BTRFS_I(inode), - reserve_bytes); + reserve_bytes, true); ret = extents_locked; break; } @@ -1738,7 +1738,7 @@ again: fs_info->sb->s_blocksize_bits; if (only_release_metadata) { btrfs_delalloc_release_metadata(BTRFS_I(inode), - release_bytes); + release_bytes, true); } else { u64 __pos; @@ -1747,7 +1747,7 @@ again: (dirty_pages << PAGE_SHIFT); btrfs_delalloc_release_space(inode, data_reserved, __pos, - release_bytes); + release_bytes, true); } } @@ -1760,7 +1760,8 @@ again: if (extents_locked) unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); - btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); + btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes, + (ret != 0)); if (ret) { btrfs_drop_pages(pages, num_pages); break; @@ -1800,11 +1801,11 @@ again: if (only_release_metadata) { btrfs_end_write_no_snapshotting(root); btrfs_delalloc_release_metadata(BTRFS_I(inode), - release_bytes); + release_bytes, true); } else { btrfs_delalloc_release_space(inode, data_reserved, round_down(pos, fs_info->sectorsize), - release_bytes); + release_bytes, true); } } @@ -1997,8 +1998,6 @@ int btrfs_release_file(struct inode *inode, struct file *filp) { struct btrfs_file_private *private = filp->private_data; - if (private && private->trans) - btrfs_ioctl_trans_end(filp); if (private && private->filldir_buf) kfree(private->filldir_buf); kfree(private); @@ -2190,12 +2189,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } /* - * ok we haven't committed the transaction yet, lets do a commit - */ - if (file->private_data) - btrfs_ioctl_trans_end(file); - - /* * We use start here because we will need to wait on the IO to 
complete * in btrfs_sync_log, which could require joining a transaction (for * example checking cross references in the nocow path). If we use join @@ -2214,7 +2207,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } trans->sync = true; - ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx); + ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ ret = 1; @@ -2482,7 +2475,8 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, if ((!ordered || (ordered->file_offset + ordered->len <= lockstart || ordered->file_offset > lockend)) && - !btrfs_page_exists_in_range(inode, lockstart, lockend)) { + !filemap_range_has_page(inode->i_mapping, + lockstart, lockend)) { if (ordered) btrfs_put_ordered_extent(ordered); break; @@ -3378,7 +3372,7 @@ const struct file_operations btrfs_file_operations = { .dedupe_file_range = btrfs_dedupe_file_range, }; -void btrfs_auto_defrag_exit(void) +void __cold btrfs_auto_defrag_exit(void) { kmem_cache_destroy(btrfs_inode_defrag_cachep); } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index a9f22ac50d6a..d0dde9e6afd7 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -3547,7 +3547,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, if (ret) { if (release_metadata) btrfs_delalloc_release_metadata(BTRFS_I(inode), - inode->i_size); + inode->i_size, true); #ifdef DEBUG btrfs_err(fs_info, "failed to write free ino cache for root %llu", diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index fe5e0324dca9..af36a6a971fe 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1071,7 +1071,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; path2 = btrfs_alloc_path(); if (!path2) { @@ -1573,7 +1573,7 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) */ path->skip_locking = 1; path->search_commit_root = 1; - path->reada = 1; + path->reada = READA_FORWARD; info = search_free_space_info(NULL, fs_info, block_group, path, 0); if (IS_ERR(info)) { diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c deleted file mode 100644 index baacc1866861..000000000000 --- a/fs/btrfs/hash.c +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
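Annotation: the free-space-tree hunks above replace the bare `path->reada = 1` with the named READA_FORWARD. The btrfs path readahead setting is, I believe, an enum along the lines of the sketch below (names taken from the diff, numeric values an assumption); either way, the point of the change is swapping a magic constant for its name:

/* Presumed shape of the btrfs path readahead setting. */
enum btrfs_path_reada {
	READA_NONE = 0,
	READA_BACK,
	READA_FORWARD,
};

struct btrfs_path_model {
	enum btrfs_path_reada reada;
};

int main(void)
{
	struct btrfs_path_model path = { .reada = READA_FORWARD };

	/*
	 * The named constant documents the access pattern (the free space
	 * tree is walked in key order); a bare "1" documented nothing and
	 * silently breaks if the enum is ever renumbered.
	 */
	return path.reada == READA_FORWARD ? 0 : 1;
}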
- */ - -#include <crypto/hash.h> -#include <linux/err.h> -#include "hash.h" - -static struct crypto_shash *tfm; - -int __init btrfs_hash_init(void) -{ - tfm = crypto_alloc_shash("crc32c", 0, 0); - - return PTR_ERR_OR_ZERO(tfm); -} - -const char* btrfs_crc32c_impl(void) -{ - return crypto_tfm_alg_driver_name(crypto_shash_tfm(tfm)); -} - -void btrfs_hash_exit(void) -{ - crypto_free_shash(tfm); -} - -u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length) -{ - SHASH_DESC_ON_STACK(shash, tfm); - u32 *ctx = (u32 *)shash_desc_ctx(shash); - u32 retval; - int err; - - shash->tfm = tfm; - shash->flags = 0; - *ctx = crc; - - err = crypto_shash_update(shash, address, length); - BUG_ON(err); - - retval = *ctx; - barrier_data(ctx); - return retval; -} diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h deleted file mode 100644 index c3a2ec554361..000000000000 --- a/fs/btrfs/hash.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef __HASH__ -#define __HASH__ - -int __init btrfs_hash_init(void); - -void btrfs_hash_exit(void); -const char* btrfs_crc32c_impl(void); - -u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length); - -static inline u64 btrfs_name_hash(const char *name, int len) -{ - return btrfs_crc32c((u32)~1, name, len); -} - -/* - * Figure the key offset of an extended inode ref - */ -static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, - int len) -{ - return (u64) btrfs_crc32c(parent_objectid, name, len); -} - -#endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 39c968f80157..1d5631ef2738 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -18,14 +18,13 @@ #include "ctree.h" #include "disk-io.h" -#include "hash.h" #include "transaction.h" #include "print-tree.h" -static int find_name_in_backref(struct btrfs_path *path, const char *name, - int name_len, struct btrfs_inode_ref **ref_ret) +int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, + const char *name, + int name_len, struct btrfs_inode_ref **ref_ret) { - struct extent_buffer *leaf; struct btrfs_inode_ref *ref; unsigned long ptr; unsigned long name_ptr; @@ -33,9 +32,8 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name, u32 cur_offset = 0; int len; - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + item_size = btrfs_item_size_nr(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); while (cur_offset < item_size) { ref = (struct btrfs_inode_ref *)(ptr + cur_offset); len = btrfs_inode_ref_name_len(leaf, ref); @@ -44,18 +42,19 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name, if (len != name_len) continue; if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) { - *ref_ret = ref; + if (ref_ret) + *ref_ret 
= ref; return 1; } } return 0; } -int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid, +int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot, + u64 ref_objectid, const char *name, int name_len, struct btrfs_inode_extref **extref_ret) { - struct extent_buffer *leaf; struct btrfs_inode_extref *extref; unsigned long ptr; unsigned long name_ptr; @@ -63,9 +62,8 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid, u32 cur_offset = 0; int ref_name_len; - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + item_size = btrfs_item_size_nr(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); /* * Search all extended backrefs in this item. We're only @@ -113,7 +111,9 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, return ERR_PTR(ret); if (ret > 0) return NULL; - if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref)) + if (!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], + ref_objectid, name, name_len, + &extref)) return NULL; return extref; } @@ -155,7 +155,8 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, * This should always succeed so error here will make the FS * readonly. */ - if (!btrfs_find_name_in_ext_backref(path, ref_objectid, + if (!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], + ref_objectid, name, name_len, &extref)) { btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL); ret = -EROFS; @@ -225,7 +226,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, } else if (ret < 0) { goto out; } - if (!find_name_in_backref(path, name, name_len, &ref)) { + if (!btrfs_find_name_in_backref(path->nodes[0], path->slots[0], + name, name_len, &ref)) { ret = -ENOENT; search_ext_refs = 1; goto out; @@ -293,7 +295,9 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, &key, ins_len); if (ret == -EEXIST) { - if (btrfs_find_name_in_ext_backref(path, ref_objectid, + if (btrfs_find_name_in_ext_backref(path->nodes[0], + path->slots[0], + ref_objectid, name, name_len, NULL)) goto out; @@ -351,7 +355,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (ret == -EEXIST) { u32 old_size; - if (find_name_in_backref(path, name, name_len, &ref)) + if (btrfs_find_name_in_backref(path->nodes[0], path->slots[0], + name, name_len, &ref)) goto out; old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); @@ -365,7 +370,9 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, ret = 0; } else if (ret < 0) { if (ret == -EOVERFLOW) { - if (find_name_in_backref(path, name, name_len, &ref)) + if (btrfs_find_name_in_backref(path->nodes[0], + path->slots[0], + name, name_len, &ref)) ret = -EEXIST; else ret = -EMLINK; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 022b19336fee..9409dcc7020d 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -500,12 +500,12 @@ again: ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); if (ret) { - btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); + btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, true); goto out_put; } ret = btrfs_write_out_ino_cache(root, trans, path, inode); - btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); + btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, false); out_put: iput(inode); out_release: diff --git 
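Annotation: btrfs_find_name_in_backref() above now takes an extent buffer and slot directly instead of a btrfs_path, so callers can run it against any leaf they hold. The scan itself is unchanged: inode ref items pack variable-length records back to back, and the loop walks them by offset. A userspace model of that walk over a flat byte array (a simplified record of 2-byte length plus name; the real btrfs_inode_ref has more fields, and little-endian layout is assumed, as on disk):

#include <assert.h>
#include <stdint.h>
#include <string.h>

static int find_name_in_item(const uint8_t *item, uint32_t item_size,
			     const char *name, uint32_t name_len)
{
	uint32_t cur = 0;

	while (cur < item_size) {
		uint16_t len;

		memcpy(&len, item + cur, sizeof(len));
		if (len == name_len &&
		    memcmp(item + cur + 2, name, name_len) == 0)
			return 1;
		/* skip header + name to reach the next packed record */
		cur += 2 + len;
	}
	return 0;
}

int main(void)
{
	/* two records: "a" then "dir" */
	const uint8_t item[] = { 1, 0, 'a', 3, 0, 'd', 'i', 'r' };

	assert(find_name_in_item(item, sizeof(item), "dir", 3));
	assert(!find_name_in_item(item, sizeof(item), "file", 4));
	return 0;
}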
a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a79299a89b7d..1f091c2358a4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -58,7 +58,6 @@ #include "free-space-cache.h" #include "inode-map.h" #include "backref.h" -#include "hash.h" #include "props.h" #include "qgroup.h" #include "dedupe.h" @@ -102,7 +101,7 @@ static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { }; static int btrfs_setsize(struct inode *inode, struct iattr *attr); -static int btrfs_truncate(struct inode *inode); +static int btrfs_truncate(struct inode *inode, bool skip_writeback); static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct inode *inode, struct page *locked_page, @@ -277,12 +276,12 @@ fail: * does the checks required to make sure the data is small enough * to fit as an inline extent. */ -static noinline int cow_file_range_inline(struct btrfs_root *root, - struct inode *inode, u64 start, +static noinline int cow_file_range_inline(struct inode *inode, u64 start, u64 end, size_t compressed_size, int compress_type, struct page **compressed_pages) { + struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; u64 isize = i_size_read(inode); @@ -458,7 +457,6 @@ static noinline void compress_file_range(struct inode *inode, int *num_added) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; u64 blocksize = fs_info->sectorsize; u64 actual_end; u64 isize = i_size_read(inode); @@ -580,11 +578,11 @@ cont: /* we didn't compress the entire range, try * to make an uncompressed inline extent. */ - ret = cow_file_range_inline(root, inode, start, end, - 0, BTRFS_COMPRESS_NONE, NULL); + ret = cow_file_range_inline(inode, start, end, 0, + BTRFS_COMPRESS_NONE, NULL); } else { /* try making a compressed inline extent */ - ret = cow_file_range_inline(root, inode, start, end, + ret = cow_file_range_inline(inode, start, end, total_compressed, compress_type, pages); } @@ -961,7 +959,6 @@ static noinline int cow_file_range(struct inode *inode, u64 alloc_hint = 0; u64 num_bytes; unsigned long ram_size; - u64 disk_num_bytes; u64 cur_alloc_size = 0; u64 blocksize = fs_info->sectorsize; struct btrfs_key ins; @@ -979,14 +976,14 @@ static noinline int cow_file_range(struct inode *inode, num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); - disk_num_bytes = num_bytes; + ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy)); inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K); if (start == 0) { /* lets try to make an inline extent */ - ret = cow_file_range_inline(root, inode, start, end, 0, - BTRFS_COMPRESS_NONE, NULL); + ret = cow_file_range_inline(inode, start, end, 0, + BTRFS_COMPRESS_NONE, NULL); if (ret == 0) { /* * We use DO_ACCOUNTING here because we need the @@ -1010,15 +1007,12 @@ static noinline int cow_file_range(struct inode *inode, } } - BUG_ON(disk_num_bytes > - btrfs_super_total_bytes(fs_info->super_copy)); - alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); btrfs_drop_extent_cache(BTRFS_I(inode), start, start + num_bytes - 1, 0); - while (disk_num_bytes > 0) { - cur_alloc_size = disk_num_bytes; + while (num_bytes > 0) { + cur_alloc_size = num_bytes; ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, fs_info->sectorsize, 0, alloc_hint, &ins, 1, 1); @@ -1082,11 +1076,10 @@ static noinline int cow_file_range(struct inode *inode, 
delalloc_end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); - if (disk_num_bytes < cur_alloc_size) - disk_num_bytes = 0; + if (num_bytes < cur_alloc_size) + num_bytes = 0; else - disk_num_bytes -= cur_alloc_size; - num_bytes -= cur_alloc_size; + num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; extent_reserved = false; @@ -1262,6 +1255,8 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, list_del(&sums->list); kfree(sums); } + if (ret < 0) + return ret; return 1; } @@ -1394,10 +1389,23 @@ next_slot: goto out_check; if (btrfs_extent_readonly(fs_info, disk_bytenr)) goto out_check; - if (btrfs_cross_ref_exist(root, ino, - found_key.offset - - extent_offset, disk_bytenr)) + ret = btrfs_cross_ref_exist(root, ino, + found_key.offset - + extent_offset, disk_bytenr); + if (ret) { + /* + * ret could be -EIO if the above fails to read + * metadata. + */ + if (ret < 0) { + if (cow_start != (u64)-1) + cur_offset = cow_start; + goto error; + } + + WARN_ON_ONCE(nolock); goto out_check; + } disk_bytenr += extent_offset; disk_bytenr += cur_offset - found_key.offset; num_bytes = min(end + 1, extent_end) - cur_offset; @@ -1415,10 +1423,22 @@ next_slot: * this ensure that csum for a given extent are * either valid or do not exist. */ - if (csum_exist_in_range(fs_info, disk_bytenr, - num_bytes)) { + ret = csum_exist_in_range(fs_info, disk_bytenr, + num_bytes); + if (ret) { if (!nolock) btrfs_end_write_no_snapshotting(root); + + /* + * ret could be -EIO if the above fails to read + * metadata. + */ + if (ret < 0) { + if (cow_start != (u64)-1) + cur_offset = cow_start; + goto error; + } + WARN_ON_ONCE(nolock); goto out_check; } if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) { @@ -1847,7 +1867,7 @@ static void btrfs_clear_bit_hook(void *private_data, */ if (*bits & EXTENT_CLEAR_META_RESV && root != fs_info->tree_root) - btrfs_delalloc_release_metadata(inode, len); + btrfs_delalloc_release_metadata(inode, len, false); /* For sanity tests. 
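Annotation: the run_delalloc_nocow hunks above stop treating btrfs_cross_ref_exist() and csum_exist_in_range() as plain booleans. Both can now fail with -EIO while reading metadata, so the caller distinguishes ret < 0 (bail out, restoring cow_start) from ret > 0 (a reference or csum exists, fall back to CoW). A compact model of consuming such a tri-state helper:

#include <errno.h>
#include <stdio.h>

/* <0: error reading metadata, 0: no csum found, 1: csum exists. */
static int csum_exist_in_range(int simulate)
{
	return simulate;
}

static int decide_nocow(int probe)
{
	int ret = csum_exist_in_range(probe);

	if (ret < 0)
		return ret;	/* propagate -EIO instead of eating it */
	if (ret > 0)
		return 0;	/* csums exist: must CoW this range */
	return 1;		/* safe to overwrite in place */
}

int main(void)
{
	printf("no csums -> nocow=%d\n", decide_nocow(0));
	printf("csums    -> nocow=%d\n", decide_nocow(1));
	printf("io error -> ret=%d\n", decide_nocow(-EIO));
	return 0;
}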
*/ if (btrfs_is_testing(fs_info)) @@ -1921,8 +1941,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio, - int mirror_num, unsigned long bio_flags, +static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio, u64 bio_offset) { struct inode *inode = private_data; @@ -1941,9 +1960,8 @@ static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, + int mirror_num) { struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2015,8 +2033,8 @@ static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio, /* we're doing a write, do the async checksumming */ ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags, bio_offset, inode, - __btrfs_submit_bio_start, - __btrfs_submit_bio_done); + btrfs_submit_bio_start, + btrfs_submit_bio_done); goto out; } else if (!skip_sum) { ret = btrfs_csum_one_bio(inode, bio, 0, 0); @@ -2043,12 +2061,15 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, struct inode *inode, struct list_head *list) { struct btrfs_ordered_sum *sum; + int ret; list_for_each_entry(sum, list, list) { trans->adding_csums = true; - btrfs_csum_file_blocks(trans, + ret = btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root->fs_info->csum_root, sum); trans->adding_csums = false; + if (ret) + return ret; } return 0; } @@ -2131,7 +2152,7 @@ again: ClearPageChecked(page); set_page_dirty(page); - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, false); out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state); @@ -2751,12 +2772,10 @@ static void relink_file_extents(struct new_sa_defrag_extent *new) struct sa_defrag_extent_backref *backref; struct sa_defrag_extent_backref *prev = NULL; struct inode *inode; - struct btrfs_root *root; struct rb_node *node; int ret; inode = new->inode; - root = BTRFS_I(inode)->root; path = btrfs_alloc_path(); if (!path) @@ -3062,7 +3081,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - add_pending_csums(trans, inode, &ordered_extent->list); + ret = add_pending_csums(trans, inode, &ordered_extent->list); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } btrfs_ordered_update_i_size(inode, 0, ordered_extent); ret = btrfs_update_inode_fallback(trans, root, inode); @@ -3240,6 +3263,16 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, start, (size_t)(end - start + 1)); } +/* + * btrfs_add_delayed_iput - perform a delayed iput on @inode + * + * @inode: The inode we want to perform iput on + * + * This function uses the generic vfs_inode::i_count to track whether we should + * just decrement it (in case it's > 1) or if this is the last iput then link + * the inode to the delayed iput machinery. Delayed iputs are processed at + * transaction commit time/superblock commit/cleaner kthread. 
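For orientation, the delayed-iput rework in the hunks here and below removes the delayed_iput_count bookkeeping entirely; a minimal sketch of the resulting flow (the atomic_add_unless() fast path is assumed from the surrounding kernel source, it is not visible in this diff):

static void add_delayed_iput_sketch(struct inode *inode)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_inode *binode = BTRFS_I(inode);

	/* i_count > 1: just drop one reference, nothing to defer */
	if (atomic_add_unless(&inode->i_count, -1, 1))
		return;

	/* last reference: queue exactly one entry, no counter needed */
	spin_lock(&fs_info->delayed_iput_lock);
	ASSERT(list_empty(&binode->delayed_iput));
	list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
	spin_unlock(&fs_info->delayed_iput_lock);
}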
+ */ void btrfs_add_delayed_iput(struct inode *inode) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -3249,12 +3282,8 @@ void btrfs_add_delayed_iput(struct inode *inode) return; spin_lock(&fs_info->delayed_iput_lock); - if (binode->delayed_iput_count == 0) { - ASSERT(list_empty(&binode->delayed_iput)); - list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); - } else { - binode->delayed_iput_count++; - } + ASSERT(list_empty(&binode->delayed_iput)); + list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); spin_unlock(&fs_info->delayed_iput_lock); } @@ -3267,13 +3296,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) inode = list_first_entry(&fs_info->delayed_iputs, struct btrfs_inode, delayed_iput); - if (inode->delayed_iput_count) { - inode->delayed_iput_count--; - list_move_tail(&inode->delayed_iput, - &fs_info->delayed_iputs); - } else { - list_del_init(&inode->delayed_iput); - } + list_del_init(&inode->delayed_iput); spin_unlock(&fs_info->delayed_iput_lock); iput(&inode->vfs_inode); spin_lock(&fs_info->delayed_iput_lock); @@ -3343,7 +3366,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_root *root = inode->root; struct btrfs_block_rsv *block_rsv = NULL; int reserve = 0; - int insert = 0; + bool insert = false; int ret; if (!root->orphan_block_rsv) { @@ -3353,7 +3376,16 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, return -ENOMEM; } + if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &inode->runtime_flags)) + insert = true; + + if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, + &inode->runtime_flags)) + reserve = 1; + spin_lock(&root->orphan_lock); + /* If someone has created ->orphan_block_rsv, be happy to use it. */ if (!root->orphan_block_rsv) { root->orphan_block_rsv = block_rsv; } else if (block_rsv) { @@ -3361,26 +3393,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, block_rsv = NULL; } - if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &inode->runtime_flags)) { -#if 0 - /* - * For proper ENOSPC handling, we should do orphan - * cleanup when mounting. But this introduces backward - * compatibility issue. 
- */ - if (!xchg(&root->orphan_item_inserted, 1)) - insert = 2; - else - insert = 1; -#endif - insert = 1; + if (insert) atomic_inc(&root->orphan_inodes); - } - - if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, - &inode->runtime_flags)) - reserve = 1; spin_unlock(&root->orphan_lock); /* grab metadata reservation from transaction handle */ @@ -3404,7 +3418,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, } /* insert an orphan item to track this unlinked/truncated file */ - if (insert >= 1) { + if (insert) { ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); if (ret) { if (reserve) { @@ -3428,15 +3442,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, ret = 0; } - /* insert an orphan item to track subvolume contains orphan files */ - if (insert >= 2) { - ret = btrfs_insert_orphan_item(trans, fs_info->tree_root, - root->root_key.objectid); - if (ret && ret != -EEXIST) { - btrfs_abort_transaction(trans, ret); - return ret; - } - } return 0; } @@ -3637,7 +3642,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) goto out; } - ret = btrfs_truncate(inode); + ret = btrfs_truncate(inode, false); if (ret) btrfs_orphan_del(NULL, BTRFS_I(inode)); } else { @@ -4704,7 +4709,6 @@ delete: if (updates) { trans->delayed_ref_updates = 0; ret = btrfs_run_delayed_refs(trans, - fs_info, updates * 2); if (ret && !err) err = ret; @@ -4744,8 +4748,7 @@ error: unsigned long updates = trans->delayed_ref_updates; if (updates) { trans->delayed_ref_updates = 0; - ret = btrfs_run_delayed_refs(trans, fs_info, - updates * 2); + ret = btrfs_run_delayed_refs(trans, updates * 2); if (ret && !err) err = ret; } @@ -4799,8 +4802,8 @@ again: page = find_or_create_page(mapping, index, mask); if (!page) { btrfs_delalloc_release_space(inode, data_reserved, - block_start, blocksize); - btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); + block_start, blocksize, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, true); ret = -ENOMEM; goto out; } @@ -4867,8 +4870,8 @@ again: out_unlock: if (ret) btrfs_delalloc_release_space(inode, data_reserved, block_start, - blocksize); - btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); + blocksize, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0)); unlock_page(page); put_page(page); out: @@ -5123,7 +5126,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) inode_dio_wait(inode); btrfs_inode_resume_unlocked_dio(BTRFS_I(inode)); - ret = btrfs_truncate(inode); + ret = btrfs_truncate(inode, newsize == oldsize); if (ret && inode->i_nlink) { int err; @@ -5459,7 +5462,8 @@ no_delete: /* * this returns the key found in the dir entry in the location pointer. - * If no dir entries were found, location->objectid is 0. + * If no dir entries were found, returns -ENOENT. + * If a corrupted location is found in the dir entry, returns -EUCLEAN.
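A caller-side sketch of the new btrfs_inode_by_name() error contract (illustrative only; it mirrors the btrfs_lookup_dentry() hunk below, which drops the old location.objectid == 0 convention):

	struct btrfs_key location;
	int ret;

	ret = btrfs_inode_by_name(dir, dentry, &location);
	if (ret == -ENOENT)	/* no matching dir entry */
		return ERR_PTR(-ENOENT);
	if (ret < 0)		/* -EUCLEAN on a corrupted DIR_ITEM, or other error */
		return ERR_PTR(ret);
	/* success: location holds a valid INODE_ITEM or ROOT_ITEM key */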
*/ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, struct btrfs_key *location) @@ -5477,27 +5481,27 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), name, namelen, 0); - if (IS_ERR(di)) + if (!di) { + ret = -ENOENT; + goto out; + } + if (IS_ERR(di)) { ret = PTR_ERR(di); - - if (IS_ERR_OR_NULL(di)) - goto out_err; + goto out; + } btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); if (location->type != BTRFS_INODE_ITEM_KEY && location->type != BTRFS_ROOT_ITEM_KEY) { + ret = -EUCLEAN; btrfs_warn(root->fs_info, "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", __func__, name, btrfs_ino(BTRFS_I(dir)), location->objectid, location->type, location->offset); - goto out_err; } out: btrfs_free_path(path); return ret; -out_err: - location->objectid = 0; - goto out; } /* @@ -5800,9 +5804,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (ret < 0) return ERR_PTR(ret); - if (location.objectid == 0) - return ERR_PTR(-ENOENT); - if (location.type == BTRFS_INODE_ITEM_KEY) { inode = btrfs_iget(dir->i_sb, &location, root, NULL); return inode; @@ -7436,76 +7437,6 @@ out: return ret; } -bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) -{ - struct radix_tree_root *root = &inode->i_mapping->page_tree; - bool found = false; - void **pagep = NULL; - struct page *page = NULL; - unsigned long start_idx; - unsigned long end_idx; - - start_idx = start >> PAGE_SHIFT; - - /* - * end is the last byte in the last page. end == start is legal - */ - end_idx = end >> PAGE_SHIFT; - - rcu_read_lock(); - - /* Most of the code in this while loop is lifted from - * find_get_page. It's been modified to begin searching from a - * page and return just the first page found in that range. If the - * found idx is less than or equal to the end idx then we know that - * a page exists. If no pages are found or if those pages are - * outside of the range then we're fine (yay!) */ - while (page == NULL && - radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) { - page = radix_tree_deref_slot(pagep); - if (unlikely(!page)) - break; - - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - page = NULL; - continue; - } - /* - * Otherwise, shmem/tmpfs must be storing a swap entry - * here as an exceptional entry: so return it without - * attempting to raise page count. - */ - page = NULL; - break; /* TODO: Is this relevant for this use case? */ - } - - if (!page_cache_get_speculative(page)) { - page = NULL; - continue; - } - - /* - * Has the page moved? - * This is part of the lockless pagecache protocol. See - * include/linux/pagemap.h for details. - */ - if (unlikely(page != *pagep)) { - put_page(page); - page = NULL; - } - } - - if (page) { - if (page->index <= end_idx) - found = true; - put_page(page); - } - - rcu_read_unlock(); - return found; -} - static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, struct extent_state **cached_state, int writing) { @@ -7531,8 +7462,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, * get stale data. 
*/ if (!ordered && - (!writing || - !btrfs_page_exists_in_range(inode, lockstart, lockend))) + (!writing || !filemap_range_has_page(inode->i_mapping, + lockstart, lockend))) break; unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, @@ -8263,9 +8194,8 @@ static void btrfs_endio_direct_write(struct bio *bio) bio_put(bio); } -static blk_status_t __btrfs_submit_bio_start_direct_io(void *private_data, - struct bio *bio, int mirror_num, - unsigned long bio_flags, u64 offset) +static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data, + struct bio *bio, u64 offset) { struct inode *inode = private_data; blk_status_t ret; @@ -8291,13 +8221,13 @@ static void btrfs_end_dio_bio(struct bio *bio) err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err); if (err) { - dip->errors = 1; - /* - * before atomic variable goto zero, we must make sure - * dip->errors is perceived to be set. + * We want to perceive the errors flag being set before + * decrementing the reference count. We don't need a barrier + * since atomic operations with a return value are fully + * ordered as per atomic_t.txt */ - smp_mb__before_atomic(); + dip->errors = 1; } /* if there are more bios still pending for this dio, just exit */ @@ -8345,9 +8275,8 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, return 0; } -static inline blk_status_t -__btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset, - int async_submit) +static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, + struct inode *inode, u64 file_offset, int async_submit) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; @@ -8370,8 +8299,8 @@ __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset, if (write && async_submit) { ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0, file_offset, inode, - __btrfs_submit_bio_start_direct_io, - __btrfs_submit_bio_done); + btrfs_submit_bio_start_direct_io, + btrfs_submit_bio_done); goto err; } else if (write) { /* @@ -8457,7 +8386,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) */ atomic_inc(&dip->pending_bios); - status = __btrfs_submit_dio_bio(bio, inode, file_offset, + status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); if (status) { bio_put(bio); @@ -8477,7 +8406,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) } while (submit_len > 0); submit: - status = __btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); + status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); if (!status) return 0; @@ -8485,10 +8414,11 @@ submit: out_err: dip->errors = 1; /* - * before atomic variable goto zero, we must - * make sure dip->errors is perceived to be set. + * Before atomic variable goto zero, we must make sure dip->errors is + * perceived to be set. 
This ordering is ensured by the fact that an + * atomic operations with a return value are fully ordered as per + * atomic_t.txt */ - smp_mb__before_atomic(); if (atomic_dec_and_test(&dip->pending_bios)) bio_io_error(dip->orig_bio); @@ -8706,7 +8636,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (ret < 0 && ret != -EIOCBQUEUED) { if (dio_data.reserve) btrfs_delalloc_release_space(inode, data_reserved, - offset, dio_data.reserve); + offset, dio_data.reserve, true); /* * On error we might have left some ordered extents * without submitting corresponding bios for them, so @@ -8722,8 +8652,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) false); } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, data_reserved, - offset, count - (size_t)ret); - btrfs_delalloc_release_extents(BTRFS_I(inode), count); + offset, count - (size_t)ret, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), count, false); } out: if (wakeup) @@ -9038,7 +8968,8 @@ again: if (reserved_space < PAGE_SIZE) { end = page_start + reserved_space - 1; btrfs_delalloc_release_space(inode, data_reserved, - page_start, PAGE_SIZE - reserved_space); + page_start, PAGE_SIZE - reserved_space, + true); } } @@ -9088,23 +9019,23 @@ again: out_unlock: if (!ret) { - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, true); sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; } unlock_page(page); out: - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, (ret != 0)); btrfs_delalloc_release_space(inode, data_reserved, page_start, - reserved_space); + reserved_space, (ret != 0)); out_noreserve: sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); return ret; } -static int btrfs_truncate(struct inode *inode) +static int btrfs_truncate(struct inode *inode, bool skip_writeback) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -9115,10 +9046,12 @@ static int btrfs_truncate(struct inode *inode) u64 mask = fs_info->sectorsize - 1; u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1); - ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), - (u64)-1); - if (ret) - return ret; + if (!skip_writeback) { + ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), + (u64)-1); + if (ret) + return ret; + } /* * Yes ladies and gentlemen, this is indeed ugly. The fact is we have @@ -9328,7 +9261,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->dir_index = 0; ei->last_unlink_trans = 0; ei->last_log_commit = 0; - ei->delayed_iput_count = 0; spin_lock_init(&ei->lock); ei->outstanding_extents = 0; @@ -9448,7 +9380,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -void btrfs_destroy_cachep(void) +void __cold btrfs_destroy_cachep(void) { /* * Make sure all delayed rcu free inodes are flushed before we diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 111ee282b777..b2db3988813f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -106,7 +106,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, int no_time_update); /* Mask out flags that are inappropriate for the given type of inode. 
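For reference, the masking policy behind that comment, as a sketch of the function body (unchanged by this patch and only partially visible in the context lines below):

	if (S_ISDIR(mode))
		return flags;			/* directories keep all flags */
	else if (S_ISREG(mode))
		return flags & ~FS_DIRSYNC_FL;	/* regular files drop DIRSYNC */
	else
		return flags & (FS_NODUMP_FL | FS_NOATIME_FL);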
*/ -static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) +static unsigned int btrfs_mask_flags(umode_t mode, unsigned int flags) { if (S_ISDIR(mode)) return flags; @@ -723,7 +723,7 @@ fail: btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); dec_and_free: if (atomic_dec_and_test(&root->will_be_snapshotted)) - wake_up_atomic_t(&root->will_be_snapshotted); + wake_up_var(&root->will_be_snapshotted); free_pending: kfree(pending_snapshot->root_item); btrfs_free_path(pending_snapshot->path); @@ -1197,7 +1197,7 @@ again: spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, - (page_cnt - i_done) << PAGE_SHIFT); + (page_cnt - i_done) << PAGE_SHIFT, true); } @@ -1215,7 +1215,8 @@ again: unlock_page(pages[i]); put_page(pages[i]); } - btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); + btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT, + false); extent_changeset_free(data_reserved); return i_done; out: @@ -1225,8 +1226,9 @@ out: } btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, - page_cnt << PAGE_SHIFT); - btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); + page_cnt << PAGE_SHIFT, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT, + true); extent_changeset_free(data_reserved); return ret; @@ -2600,7 +2602,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) range->len = (u64)-1; } ret = btrfs_defrag_file(file_inode(file), file, - range, 0, 0); + range, BTRFS_OLDEST_GENERATION, 0); if (ret > 0) ret = 0; kfree(range); @@ -3936,73 +3938,6 @@ int btrfs_clone_file_range(struct file *src_file, loff_t off, return btrfs_clone_files(dst_file, src_file, off, len, destoff); } -/* - * there are many ways the trans_start and trans_end ioctls can lead - * to deadlocks. They should only be used by applications that - * basically own the machine, and have a very in depth understanding - * of all the possible deadlocks and enospc problems. - */ -static long btrfs_ioctl_trans_start(struct file *file) -{ - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; - struct btrfs_file_private *private; - int ret; - static bool warned = false; - - ret = -EPERM; - if (!capable(CAP_SYS_ADMIN)) - goto out; - - if (!warned) { - btrfs_warn(fs_info, - "Userspace transaction mechanism is considered " - "deprecated and slated to be removed in 4.17. 
" - "If you have a valid use case please " - "speak up on the mailing list"); - WARN_ON(1); - warned = true; - } - - ret = -EINPROGRESS; - private = file->private_data; - if (private && private->trans) - goto out; - if (!private) { - private = kzalloc(sizeof(struct btrfs_file_private), - GFP_KERNEL); - if (!private) - return -ENOMEM; - file->private_data = private; - } - - ret = -EROFS; - if (btrfs_root_readonly(root)) - goto out; - - ret = mnt_want_write_file(file); - if (ret) - goto out; - - atomic_inc(&fs_info->open_ioctl_trans); - - ret = -ENOMEM; - trans = btrfs_start_ioctl_transaction(root); - if (IS_ERR(trans)) - goto out_drop; - - private->trans = trans; - return 0; - -out_drop: - atomic_dec(&fs_info->open_ioctl_trans); - mnt_drop_write_file(file); -out: - return ret; -} - static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) { struct inode *inode = file_inode(file); @@ -4244,30 +4179,6 @@ out: return ret; } -/* - * there are many ways the trans_start and trans_end ioctls can lead - * to deadlocks. They should only be used by applications that - * basically own the machine, and have a very in depth understanding - * of all the possible deadlocks and enospc problems. - */ -long btrfs_ioctl_trans_end(struct file *file) -{ - struct inode *inode = file_inode(file); - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_file_private *private = file->private_data; - - if (!private || !private->trans) - return -EINVAL; - - btrfs_end_transaction(private->trans); - private->trans = NULL; - - atomic_dec(&root->fs_info->open_ioctl_trans); - - mnt_drop_write_file(file); - return 0; -} - static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, void __user *argp) { @@ -4429,7 +4340,8 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, ret = 0; break; case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL: - ret = btrfs_dev_replace_cancel(fs_info, p); + p->result = btrfs_dev_replace_cancel(fs_info); + ret = 0; break; default: ret = -EINVAL; @@ -5138,10 +5050,17 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE); if (received_uuid_changed && - !btrfs_is_empty_uuid(root_item->received_uuid)) - btrfs_uuid_tree_rem(trans, fs_info, root_item->received_uuid, - BTRFS_UUID_KEY_RECEIVED_SUBVOL, - root->root_key.objectid); + !btrfs_is_empty_uuid(root_item->received_uuid)) { + ret = btrfs_uuid_tree_rem(trans, fs_info, + root_item->received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + root->root_key.objectid); + if (ret && ret != -ENOENT) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out; + } + } memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE); btrfs_set_root_stransid(root_item, sa->stransid); btrfs_set_root_rtransid(root_item, sa->rtransid); @@ -5574,10 +5493,6 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_dev_info(fs_info, argp); case BTRFS_IOC_BALANCE: return btrfs_ioctl_balance(file, NULL); - case BTRFS_IOC_TRANS_START: - return btrfs_ioctl_trans_start(file); - case BTRFS_IOC_TRANS_END: - return btrfs_ioctl_trans_end(file); case BTRFS_IOC_TREE_SEARCH: return btrfs_ioctl_tree_search(file, argp); case BTRFS_IOC_TREE_SEARCH_V2: diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index d13128c70ddd..621083f8932c 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -290,7 +290,7 @@ void btrfs_tree_unlock(struct extent_buffer *eb) /* * Make sure counter is updated before we wake up waiters. 
*/ - smp_mb(); + smp_mb__after_atomic(); if (waitqueue_active(&eb->write_lock_wq)) wake_up(&eb->write_lock_wq); } else { diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 6c7f18cd3b61..1c7f7f70caf4 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -382,14 +382,12 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in, struct workspace *workspace = list_entry(ws, struct workspace, list); size_t in_len; size_t out_len; - size_t tot_len; int ret = 0; char *kaddr; unsigned long bytes; BUG_ON(srclen < LZO_LEN); - tot_len = read_compress_length(data_in); data_in += LZO_LEN; in_len = read_compress_length(data_in); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 5b311aeddcc8..661cc3db0c7c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -610,7 +610,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, btrfs_mod_outstanding_extents(btrfs_inode, -1); spin_unlock(&btrfs_inode->lock); if (root != fs_info->tree_root) - btrfs_delalloc_release_metadata(btrfs_inode, entry->len); + btrfs_delalloc_release_metadata(btrfs_inode, entry->len, false); tree = &btrfs_inode->ordered_tree; spin_lock_irq(&tree->lock); @@ -1154,7 +1154,7 @@ int __init ordered_data_init(void) return 0; } -void ordered_data_exit(void) +void __cold ordered_data_exit(void) { kmem_cache_destroy(btrfs_ordered_extent_cache); } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 56c4c0ee6381..4a1672a13ba6 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -151,7 +151,9 @@ static inline int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes) { int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize); - return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32); + int csum_size = btrfs_super_csum_size(fs_info->super_copy); + + return sizeof(struct btrfs_ordered_sum) + num_sectors * csum_size; } static inline void @@ -215,5 +217,5 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, struct btrfs_root *log, u64 transid); void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); int __init ordered_data_init(void); -void ordered_data_exit(void); +void __cold ordered_data_exit(void); #endif diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 569205e651c7..4a8770485f77 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -365,9 +365,13 @@ void btrfs_print_tree(struct extent_buffer *c) btrfs_node_blockptr(c, i)); } for (i = 0; i < nr; i++) { - struct extent_buffer *next = read_tree_block(fs_info, - btrfs_node_blockptr(c, i), - btrfs_node_ptr_generation(c, i)); + struct btrfs_key first_key; + struct extent_buffer *next; + + btrfs_node_key_to_cpu(c, &first_key, i); + next = read_tree_block(fs_info, btrfs_node_blockptr(c, i), + btrfs_node_ptr_generation(c, i), + level - 1, &first_key); if (IS_ERR(next)) { continue; } else if (!extent_buffer_uptodate(next)) { diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index b30a056963ab..5859f7d3cf3e 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -19,8 +19,8 @@ #include <linux/hashtable.h> #include "props.h" #include "btrfs_inode.h" -#include "hash.h" #include "transaction.h" +#include "ctree.h" #include "xattr.h" #include "compression.h" @@ -116,7 +116,7 @@ static int __btrfs_set_prop(struct btrfs_trans_handle *trans, return -EINVAL; if (value_len == 0) { - ret = __btrfs_setxattr(trans, inode, handler->xattr_name, + ret = btrfs_setxattr(trans, inode, handler->xattr_name, NULL, 0, flags); if (ret) return ret; @@ 
-130,13 +130,13 @@ static int __btrfs_set_prop(struct btrfs_trans_handle *trans, ret = handler->validate(value, value_len); if (ret) return ret; - ret = __btrfs_setxattr(trans, inode, handler->xattr_name, + ret = btrfs_setxattr(trans, inode, handler->xattr_name, value, value_len, flags); if (ret) return ret; ret = handler->apply(inode, value, value_len); if (ret) { - __btrfs_setxattr(trans, inode, handler->xattr_name, + btrfs_setxattr(trans, inode, handler->xattr_name, NULL, 0, flags); return ret; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index aa259d6986e1..f583f13ff26e 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -47,6 +47,82 @@ * - check all ioctl parameters */ +/* + * Helpers to access qgroup reservation + * + * Callers should ensure the lock context and type are valid + */ + +static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup) +{ + u64 ret = 0; + int i; + + for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) + ret += qgroup->rsv.values[i]; + + return ret; +} + +#ifdef CONFIG_BTRFS_DEBUG +static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type) +{ + if (type == BTRFS_QGROUP_RSV_DATA) + return "data"; + if (type == BTRFS_QGROUP_RSV_META_PERTRANS) + return "meta_pertrans"; + if (type == BTRFS_QGROUP_RSV_META_PREALLOC) + return "meta_prealloc"; + return NULL; +} +#endif + +static void qgroup_rsv_add(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup, u64 num_bytes, + enum btrfs_qgroup_rsv_type type) +{ + trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type); + qgroup->rsv.values[type] += num_bytes; +} + +static void qgroup_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup, u64 num_bytes, + enum btrfs_qgroup_rsv_type type) +{ + trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type); + if (qgroup->rsv.values[type] >= num_bytes) { + qgroup->rsv.values[type] -= num_bytes; + return; + } +#ifdef CONFIG_BTRFS_DEBUG + WARN_RATELIMIT(1, + "qgroup %llu %s reserved space underflow, have %llu to free %llu", + qgroup->qgroupid, qgroup_rsv_type_str(type), + qgroup->rsv.values[type], num_bytes); +#endif + qgroup->rsv.values[type] = 0; +} + +static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *dest, + struct btrfs_qgroup *src) +{ + int i; + + for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) + qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i); +} + +static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *dest, + struct btrfs_qgroup *src) +{ + int i; + + for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) + qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i); +} + static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, int mod) { @@ -826,10 +902,8 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, int slot; mutex_lock(&fs_info->qgroup_ioctl_lock); - if (fs_info->quota_root) { - set_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags); + if (fs_info->quota_root) goto out; - } fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); if (!fs_info->qgroup_ulist) { @@ -923,8 +997,15 @@ out_add_root: } spin_lock(&fs_info->qgroup_lock); fs_info->quota_root = quota_root; - set_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags); + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); spin_unlock(&fs_info->qgroup_lock); + ret = qgroup_rescan_init(fs_info, 0, 1); + if (!ret) { + qgroup_rescan_zero_tracking(fs_info); + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); + } + out_free_path: btrfs_free_path(path); 
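The helpers above replace the old single qgroup->reserved counter with per-type buckets; a minimal usage sketch (type names come from the enum this patch adds to qgroup.h):

	/* account 16K of preallocated metadata reservation to a qgroup ... */
	qgroup_rsv_add(fs_info, qgroup, SZ_16K, BTRFS_QGROUP_RSV_META_PREALLOC);

	/* ... and release it again; an underflow is clamped to zero and warns
	 * (ratelimited) under CONFIG_BTRFS_DEBUG instead of wrapping the u64 */
	qgroup_rsv_release(fs_info, qgroup, SZ_16K, BTRFS_QGROUP_RSV_META_PREALLOC);

	/* limit checks now sum all buckets via qgroup_rsv_total() */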
out_free_root: @@ -991,33 +1072,29 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info, list_add(&qgroup->dirty, &fs_info->dirty_qgroups); } -static void report_reserved_underflow(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup *qgroup, - u64 num_bytes) -{ -#ifdef CONFIG_BTRFS_DEBUG - WARN_ON(qgroup->reserved < num_bytes); - btrfs_debug(fs_info, - "qgroup %llu reserved space underflow, have: %llu, to free: %llu", - qgroup->qgroupid, qgroup->reserved, num_bytes); -#endif - qgroup->reserved = 0; -} /* - * The easy accounting, if we are adding/removing the only ref for an extent - * then this qgroup and all of the parent qgroups get their reference and - * exclusive counts adjusted. + * The easy accounting, we're updating qgroup relationship whose child qgroup + * only has exclusive extents. + * + * In this case, all exclusive extents will also be exclusive for parent, so + * excl/rfer just get added/removed. + * + * So is qgroup reservation space, which should also be added/removed to + * parent. + * Or when child tries to release reservation space, parent will underflow its + * reservation (for relationship adding case). * * Caller should hold fs_info->qgroup_lock. */ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, struct ulist *tmp, u64 ref_root, - u64 num_bytes, int sign) + struct btrfs_qgroup *src, int sign) { struct btrfs_qgroup *qgroup; struct btrfs_qgroup_list *glist; struct ulist_node *unode; struct ulist_iterator uiter; + u64 num_bytes = src->excl; int ret = 0; qgroup = find_qgroup_rb(fs_info, ref_root); @@ -1030,13 +1107,11 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, WARN_ON(sign < 0 && qgroup->excl < num_bytes); qgroup->excl += sign * num_bytes; qgroup->excl_cmpr += sign * num_bytes; - if (sign > 0) { - trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes); - if (qgroup->reserved < num_bytes) - report_reserved_underflow(fs_info, qgroup, num_bytes); - else - qgroup->reserved -= num_bytes; - } + + if (sign > 0) + qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); + else + qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); qgroup_dirty(fs_info, qgroup); @@ -1056,15 +1131,10 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, qgroup->rfer_cmpr += sign * num_bytes; WARN_ON(sign < 0 && qgroup->excl < num_bytes); qgroup->excl += sign * num_bytes; - if (sign > 0) { - trace_qgroup_update_reserve(fs_info, qgroup, - -(s64)num_bytes); - if (qgroup->reserved < num_bytes) - report_reserved_underflow(fs_info, qgroup, - num_bytes); - else - qgroup->reserved -= num_bytes; - } + if (sign > 0) + qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); + else + qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); qgroup->excl_cmpr += sign * num_bytes; qgroup_dirty(fs_info, qgroup); @@ -1107,7 +1177,7 @@ static int quick_update_accounting(struct btrfs_fs_info *fs_info, if (qgroup->excl == qgroup->rfer) { ret = 0; err = __qgroup_excl_accounting(fs_info, tmp, dst, - qgroup->excl, sign); + qgroup, sign); if (err < 0) { ret = err; goto out; @@ -1414,7 +1484,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_extent_record *entry; u64 bytenr = record->bytenr; - assert_spin_locked(&delayed_refs->lock); + lockdep_assert_held(&delayed_refs->lock); trace_btrfs_qgroup_trace_extent(fs_info, record); while (*p) { @@ -1614,7 +1684,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, return 0; if (!extent_buffer_uptodate(root_eb)) { - ret = btrfs_read_buffer(root_eb, root_gen); + ret =
btrfs_read_buffer(root_eb, root_gen, root_level, NULL); if (ret) goto out; } @@ -1645,6 +1715,7 @@ walk_down: level = root_level; while (level >= 0) { if (path->nodes[level] == NULL) { + struct btrfs_key first_key; int parent_slot; u64 child_gen; u64 child_bytenr; @@ -1657,8 +1728,10 @@ walk_down: parent_slot = path->slots[level + 1]; child_bytenr = btrfs_node_blockptr(eb, parent_slot); child_gen = btrfs_node_ptr_generation(eb, parent_slot); + btrfs_node_key_to_cpu(eb, &first_key, parent_slot); - eb = read_tree_block(fs_info, child_bytenr, child_gen); + eb = read_tree_block(fs_info, child_bytenr, child_gen, + level, &first_key); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; @@ -2009,9 +2082,9 @@ out_free: return ret; } -int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup_extent_record *record; struct btrfs_delayed_ref_root *delayed_refs; struct ulist *new_roots = NULL; @@ -2080,17 +2153,9 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, { struct btrfs_root *quota_root = fs_info->quota_root; int ret = 0; - int start_rescan_worker = 0; if (!quota_root) - goto out; - - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && - test_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags)) - start_rescan_worker = 1; - - if (test_and_clear_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags)) - set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + return ret; spin_lock(&fs_info->qgroup_lock); while (!list_empty(&fs_info->dirty_qgroups)) { @@ -2119,18 +2184,6 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, if (ret) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - if (!ret && start_rescan_worker) { - ret = qgroup_rescan_init(fs_info, 0, 1); - if (!ret) { - qgroup_rescan_zero_tracking(fs_info); - btrfs_queue_work(fs_info->qgroup_rescan_workers, - &fs_info->qgroup_rescan_work); - } - ret = 0; - } - -out: - return ret; } @@ -2338,24 +2391,24 @@ out: static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) { if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && - qg->reserved + (s64)qg->rfer + num_bytes > qg->max_rfer) + qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer) return false; if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && - qg->reserved + (s64)qg->excl + num_bytes > qg->max_excl) + qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl) return false; return true; } -static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce) +static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, + enum btrfs_qgroup_rsv_type type) { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; struct btrfs_fs_info *fs_info = root->fs_info; u64 ref_root = root->root_key.objectid; int ret = 0; - int retried = 0; struct ulist_node *unode; struct ulist_iterator uiter; @@ -2369,7 +2422,6 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce) capable(CAP_SYS_RESOURCE)) enforce = false; -retry: spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; if (!quota_root) @@ -2385,7 +2437,7 @@ retry: */ ulist_reinit(fs_info->qgroup_ulist); ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, - (uintptr_t)qgroup, GFP_ATOMIC); + qgroup_to_aux(qgroup), GFP_ATOMIC); if (ret < 0) goto out; ULIST_ITER_INIT(&uiter); @@ -2396,27 +2448,6 @@ retry: qg = unode_aux_to_qgroup(unode); if (enforce 
&& !qgroup_check_limits(qg, num_bytes)) { - /* - * Commit the tree and retry, since we may have - * deletions which would free up space. - */ - if (!retried && qg->reserved > 0) { - struct btrfs_trans_handle *trans; - - spin_unlock(&fs_info->qgroup_lock); - ret = btrfs_start_delalloc_inodes(root, 0); - if (ret) - return ret; - btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - retried++; - goto retry; - } ret = -EDQUOT; goto out; } @@ -2424,7 +2455,7 @@ retry: list_for_each_entry(glist, &qg->groups, next_group) { ret = ulist_add(fs_info->qgroup_ulist, glist->group->qgroupid, - (uintptr_t)glist->group, GFP_ATOMIC); + qgroup_to_aux(glist->group), GFP_ATOMIC); if (ret < 0) goto out; } @@ -2439,8 +2470,8 @@ retry: qg = unode_aux_to_qgroup(unode); - trace_qgroup_update_reserve(fs_info, qg, num_bytes); - qg->reserved += num_bytes; + trace_qgroup_update_reserve(fs_info, qg, num_bytes, type); + qgroup_rsv_add(fs_info, qg, num_bytes, type); } out: @@ -2448,8 +2479,18 @@ out: return ret; } +/* + * Free @num_bytes of reserved space with @type for qgroup. (Normally level 0 + * qgroup). + * + * Will handle all higher level qgroup too. + * + * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup. + * This special case is only used for META_PERTRANS type. + */ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, - u64 ref_root, u64 num_bytes) + u64 ref_root, u64 num_bytes, + enum btrfs_qgroup_rsv_type type) { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; @@ -2463,6 +2504,10 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, if (num_bytes == 0) return; + if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) { + WARN(1, "%s: Invalid type to free", __func__); + return; + } spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; @@ -2473,9 +2518,16 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, if (!qgroup) goto out; + if (num_bytes == (u64)-1) + /* + * We're freeing all pertrans rsv, get reserved value from + * level 0 qgroup as real num_bytes to free. 
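In sketch form, the two call shapes this special case permits:

	/* free an exact number of reserved data bytes */
	btrfs_qgroup_free_refroot(fs_info, ref_root, SZ_1M,
				  BTRFS_QGROUP_RSV_DATA);

	/* free every per-transaction metadata byte at commit time; the
	 * (u64)-1 wildcard is only valid for META_PERTRANS */
	btrfs_qgroup_free_refroot(fs_info, ref_root, (u64)-1,
				  BTRFS_QGROUP_RSV_META_PERTRANS);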
+ */ + num_bytes = qgroup->rsv.values[type]; + ulist_reinit(fs_info->qgroup_ulist); ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, - (uintptr_t)qgroup, GFP_ATOMIC); + qgroup_to_aux(qgroup), GFP_ATOMIC); if (ret < 0) goto out; ULIST_ITER_INIT(&uiter); @@ -2485,16 +2537,13 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, qg = unode_aux_to_qgroup(unode); - trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes); - if (qg->reserved < num_bytes) - report_reserved_underflow(fs_info, qg, num_bytes); - else - qg->reserved -= num_bytes; + trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes, type); + qgroup_rsv_release(fs_info, qg, num_bytes, type); list_for_each_entry(glist, &qg->groups, next_group) { ret = ulist_add(fs_info->qgroup_ulist, glist->group->qgroupid, - (uintptr_t)glist->group, GFP_ATOMIC); + qgroup_to_aux(glist->group), GFP_ATOMIC); if (ret < 0) goto out; } @@ -2877,7 +2926,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode, to_reserve, QGROUP_RESERVE); if (ret < 0) goto cleanup; - ret = qgroup_reserve(root, to_reserve, true); + ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA); if (ret < 0) goto cleanup; @@ -2940,7 +2989,8 @@ static int qgroup_free_reserved_data(struct inode *inode, goto out; freed += changeset.bytes_changed; } - btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed); + btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed, + BTRFS_QGROUP_RSV_DATA); ret = freed; out: extent_changeset_release(&changeset); @@ -2972,7 +3022,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, if (free) btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, BTRFS_I(inode)->root->objectid, - changeset.bytes_changed); + changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); ret = changeset.bytes_changed; out: extent_changeset_release(&changeset); @@ -3017,8 +3067,48 @@ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len) return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); } -int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, - bool enforce) +static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type) +{ + if (type != BTRFS_QGROUP_RSV_META_PREALLOC && + type != BTRFS_QGROUP_RSV_META_PERTRANS) + return; + if (num_bytes == 0) + return; + + spin_lock(&root->qgroup_meta_rsv_lock); + if (type == BTRFS_QGROUP_RSV_META_PREALLOC) + root->qgroup_meta_rsv_prealloc += num_bytes; + else + root->qgroup_meta_rsv_pertrans += num_bytes; + spin_unlock(&root->qgroup_meta_rsv_lock); +} + +static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type) +{ + if (type != BTRFS_QGROUP_RSV_META_PREALLOC && + type != BTRFS_QGROUP_RSV_META_PERTRANS) + return 0; + if (num_bytes == 0) + return 0; + + spin_lock(&root->qgroup_meta_rsv_lock); + if (type == BTRFS_QGROUP_RSV_META_PREALLOC) { + num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc, + num_bytes); + root->qgroup_meta_rsv_prealloc -= num_bytes; + } else { + num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans, + num_bytes); + root->qgroup_meta_rsv_pertrans -= num_bytes; + } + spin_unlock(&root->qgroup_meta_rsv_lock); + return num_bytes; +} + +int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type, bool enforce) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; @@ -3028,31 +3118,39 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return 0; BUG_ON(num_bytes != 
round_down(num_bytes, fs_info->nodesize)); - trace_qgroup_meta_reserve(root, (s64)num_bytes); - ret = qgroup_reserve(root, num_bytes, enforce); + trace_qgroup_meta_reserve(root, type, (s64)num_bytes); + ret = qgroup_reserve(root, num_bytes, enforce, type); if (ret < 0) return ret; - atomic64_add(num_bytes, &root->qgroup_meta_rsv); + /* + * Record what we have reserved into root. + * + * To avoid quota disabled->enabled underflow. + * In that case, we may try to free space we haven't reserved + * (since quota was disabled), so record what we reserved into root. + * And ensure later release won't underflow this number. + */ + add_root_meta_rsv(root, num_bytes, type); return ret; } -void btrfs_qgroup_free_meta_all(struct btrfs_root *root) +void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - u64 reserved; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || !is_fstree(root->objectid)) return; - reserved = atomic64_xchg(&root->qgroup_meta_rsv, 0); - if (reserved == 0) - return; - trace_qgroup_meta_reserve(root, -(s64)reserved); - btrfs_qgroup_free_refroot(fs_info, root->objectid, reserved); + /* TODO: Update trace point to handle such free */ + trace_qgroup_meta_free_all_pertrans(root); + /* Special value -1 means to free all reserved space */ + btrfs_qgroup_free_refroot(fs_info, root->objectid, (u64)-1, + BTRFS_QGROUP_RSV_META_PERTRANS); } -void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) +void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -3060,11 +3158,75 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) !is_fstree(root->objectid)) return; + /* + * reservation for META_PREALLOC can happen before quota is enabled, + * which can lead to underflow. + * Here ensure we will only free what we really have reserved. 
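A worked example of that clamp, with hypothetical numbers:

	/* quota was enabled mid-flight: the root recorded only 4K of prealloc
	 * reservation, but the caller asks to free 16K */
	num_bytes = sub_root_meta_rsv(root, SZ_16K,
				      BTRFS_QGROUP_RSV_META_PREALLOC);
	/* num_bytes comes back clamped to SZ_4K, so the qgroup counters are
	 * never decremented below what was actually reserved */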
+ */ + num_bytes = sub_root_meta_rsv(root, num_bytes, type); BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); - WARN_ON(atomic64_read(&root->qgroup_meta_rsv) < num_bytes); - atomic64_sub(num_bytes, &root->qgroup_meta_rsv); - trace_qgroup_meta_reserve(root, -(s64)num_bytes); - btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes); + trace_qgroup_meta_reserve(root, type, -(s64)num_bytes); + btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes, type); +} + +static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, + int num_bytes) +{ + struct btrfs_root *quota_root = fs_info->quota_root; + struct btrfs_qgroup *qgroup; + struct ulist_node *unode; + struct ulist_iterator uiter; + int ret = 0; + + if (num_bytes == 0) + return; + if (!quota_root) + return; + + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, ref_root); + if (!qgroup) + goto out; + ulist_reinit(fs_info->qgroup_ulist); + ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, + qgroup_to_aux(qgroup), GFP_ATOMIC); + if (ret < 0) + goto out; + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { + struct btrfs_qgroup *qg; + struct btrfs_qgroup_list *glist; + + qg = unode_aux_to_qgroup(unode); + + qgroup_rsv_release(fs_info, qg, num_bytes, + BTRFS_QGROUP_RSV_META_PREALLOC); + qgroup_rsv_add(fs_info, qg, num_bytes, + BTRFS_QGROUP_RSV_META_PERTRANS); + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(fs_info->qgroup_ulist, + glist->group->qgroupid, + qgroup_to_aux(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + } +out: + spin_unlock(&fs_info->qgroup_lock); +} + +void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + !is_fstree(root->objectid)) + return; + /* Same as btrfs_qgroup_free_meta_prealloc() */ + num_bytes = sub_root_meta_rsv(root, num_bytes, + BTRFS_QGROUP_RSV_META_PREALLOC); + trace_qgroup_meta_convert(root, num_bytes); + qgroup_convert_meta(fs_info, root->objectid, num_bytes); } /* @@ -3092,7 +3254,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) } btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, BTRFS_I(inode)->root->objectid, - changeset.bytes_changed); + changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); } extent_changeset_release(&changeset); diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index d9984e87cddf..e63e2d497a8e 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -62,6 +62,48 @@ struct btrfs_qgroup_extent_record { }; /* + * Qgroup reservation types: + * + * DATA: + * space reserved for data + * + * META_PERTRANS: + * Space reserved for metadata (per-transaction) + * Due to the fact that qgroup data is only updated at transaction commit + * time, reserved space for metadata must be kept until transaction + * commits. + * Any metadata reserved that are used in btrfs_start_transaction() should + * be of this type. + * + * META_PREALLOC: + * There are cases where metadata space is reserved before starting + * transaction, and then btrfs_join_transaction() to get a trans handle. + * Any metadata reserved for such usage should be of this type. + * And after join_transaction() part (or all) of such reservation should + * be converted into META_PERTRANS. 
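Put together, the intended lifecycle is roughly the following (function names are the ones introduced by this patch; error handling omitted):

	/* before owning a transaction handle: reserve as PREALLOC */
	ret = btrfs_qgroup_reserve_meta_prealloc(root, nbytes, true);

	trans = btrfs_join_transaction(root);

	/* the handle exists and the space must now live until commit:
	 * convert PREALLOC -> PERTRANS */
	btrfs_qgroup_convert_reserved_meta(root, nbytes);

	/* at transaction commit, all PERTRANS bytes are dropped in one go */
	btrfs_qgroup_free_meta_all_pertrans(root);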
+ */ +enum btrfs_qgroup_rsv_type { + BTRFS_QGROUP_RSV_DATA = 0, + BTRFS_QGROUP_RSV_META_PERTRANS, + BTRFS_QGROUP_RSV_META_PREALLOC, + BTRFS_QGROUP_RSV_LAST, +}; + +/* + * Represents how many bytes we have reserved for this qgroup. + * + * Each type should have different reservation behavior. + * E.g., data follows its io_tree flag modification, while + * *currently* meta is just reserve-and-clear during transaction. + * + * TODO: Add new type for reservation which can survive transaction commit. + * Current metadata reservation behavior is not suitable for such a case. + */ +struct btrfs_qgroup_rsv { + u64 values[BTRFS_QGROUP_RSV_LAST]; +}; + +/* * one struct for each qgroup, organized in fs_info->qgroup_tree. */ struct btrfs_qgroup { @@ -87,7 +129,7 @@ struct btrfs_qgroup { /* * reservation tracking */ - u64 reserved; + struct btrfs_qgroup_rsv rsv; /* * lists @@ -220,20 +262,21 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, struct ulist *old_roots, struct ulist *new_roots); -int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans); int btrfs_run_qgroups(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, struct btrfs_qgroup_inherit *inherit); void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, - u64 ref_root, u64 num_bytes); + u64 ref_root, u64 num_bytes, + enum btrfs_qgroup_rsv_type type); static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes) { trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes); - btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); + btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes, + BTRFS_QGROUP_RSV_DATA); } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -248,9 +291,54 @@ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len); int btrfs_qgroup_free_data(struct inode *inode, struct extent_changeset *reserved, u64 start, u64 len); -int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, - bool enforce); -void btrfs_qgroup_free_meta_all(struct btrfs_root *root); -void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes); +int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type, bool enforce); +/* Reserve metadata space for pertrans and prealloc type */ +static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root, + int num_bytes, bool enforce) +{ + return __btrfs_qgroup_reserve_meta(root, num_bytes, + BTRFS_QGROUP_RSV_META_PERTRANS, enforce); +} +static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root, + int num_bytes, bool enforce) +{ + return __btrfs_qgroup_reserve_meta(root, num_bytes, + BTRFS_QGROUP_RSV_META_PREALLOC, enforce); +} + +void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type); + +/* Free per-transaction meta reservation for error handling */ +static inline void btrfs_qgroup_free_meta_pertrans(struct btrfs_root *root, + int num_bytes) +{ + __btrfs_qgroup_free_meta(root, num_bytes, + BTRFS_QGROUP_RSV_META_PERTRANS); +} + +/* Pre-allocated meta reservation can be freed at need */ +static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root, + int num_bytes) +{ +
__btrfs_qgroup_free_meta(root, num_bytes, + BTRFS_QGROUP_RSV_META_PREALLOC); +} + +/* + * Per-transaction meta reservation should be all freed at transaction commit + * time + */ +void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root); + +/* + * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS. + * + * This is called when preallocated meta reservation needs to be used. + * Normally after btrfs_join_transaction() call. + */ +void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes); + void btrfs_qgroup_check_reserved_leak(struct inode *inode); #endif /* __BTRFS_QGROUP__ */ diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index dec0907dfb8a..c3a2bc8af675 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1370,6 +1370,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, stripe_start = stripe->physical; if (physical >= stripe_start && physical < stripe_start + rbio->stripe_len && + stripe->dev->bdev && bio->bi_disk == stripe->dev->bdev->bd_disk && bio->bi_partno == stripe->dev->bdev->bd_partno) { return i; @@ -1986,7 +1987,13 @@ cleanup: kfree(pointers); cleanup_io: - if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { + /* + * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a + * valid rbio which is consistent with ondisk content, thus such a + * valid rbio can be cached to avoid further disk reads. + */ + if (rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { /* * - In case of two failures, where rbio->failb != -1: * @@ -2008,8 +2015,6 @@ cleanup_io: clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); rbio_orig_end_io(rbio, err); - } else if (rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { - rbio_orig_end_io(rbio, err); } else if (err == BLK_STS_OK) { rbio->faila = -1; rbio->failb = -1; @@ -2767,24 +2772,8 @@ raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, return rbio; } -static void missing_raid56_work(struct btrfs_work *work) -{ - struct btrfs_raid_bio *rbio; - - rbio = container_of(work, struct btrfs_raid_bio, work); - __raid56_parity_recover(rbio); -} - -static void async_missing_raid56(struct btrfs_raid_bio *rbio) -{ - btrfs_init_work(&rbio->work, btrfs_rmw_helper, - missing_raid56_work, NULL, NULL); - - btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); -} - void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) { if (!lock_stripe_add(rbio)) - async_missing_raid56(rbio); + async_read_rebuild(rbio); } diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index ab852b8e3e37..a52dd12af648 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -395,20 +395,20 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, goto error; /* insert extent in reada_tree + all per-device trees, all or nothing */ - btrfs_dev_replace_lock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_lock(&fs_info->dev_replace); spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&fs_info->reada_tree, index, re); if (ret == -EEXIST) { re_exist = radix_tree_lookup(&fs_info->reada_tree, index); re_exist->refcnt++; spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_unlock(&fs_info->dev_replace); radix_tree_preload_end(); goto error; } if (ret) { spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_unlock(&fs_info->dev_replace); radix_tree_preload_end(); goto error; } @@ -451,13 +451,13 @@ static struct 
reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, } radix_tree_delete(&fs_info->reada_tree, index); spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_unlock(&fs_info->dev_replace); goto error; } have_zone = 1; } spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_unlock(&fs_info->dev_replace); if (!have_zone) goto error; diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 171f3cce30e6..35fab67dcbe8 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -579,11 +579,16 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, while (level >= 0) { if (level) { + struct btrfs_key first_key; + block_bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); gen = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); - eb = read_tree_block(fs_info, block_bytenr, gen); + btrfs_node_key_to_cpu(path->nodes[level], &first_key, + path->slots[level]); + eb = read_tree_block(fs_info, block_bytenr, gen, + level - 1, &first_key); if (IS_ERR(eb)) return PTR_ERR(eb); if (!extent_buffer_uptodate(eb)) { diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f0c3f00e97cb..4874c09f6d3c 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1839,6 +1839,8 @@ again: parent = eb; while (1) { + struct btrfs_key first_key; + level = btrfs_header_level(parent); BUG_ON(level < lowest_level); @@ -1852,6 +1854,7 @@ again: old_bytenr = btrfs_node_blockptr(parent, slot); blocksize = fs_info->nodesize; old_ptr_gen = btrfs_node_ptr_generation(parent, slot); + btrfs_node_key_to_cpu(parent, &first_key, slot); if (level <= max_level) { eb = path->nodes[level]; @@ -1876,7 +1879,8 @@ again: break; } - eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen); + eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen, + level - 1, &first_key); if (IS_ERR(eb)) { ret = PTR_ERR(eb); break; @@ -2036,6 +2040,8 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, last_snapshot = btrfs_root_last_snapshot(&root->root_item); for (i = *level; i > 0; i--) { + struct btrfs_key first_key; + eb = path->nodes[i]; nritems = btrfs_header_nritems(eb); while (path->slots[i] < nritems) { @@ -2056,7 +2062,9 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, } bytenr = btrfs_node_blockptr(eb, path->slots[i]); - eb = read_tree_block(fs_info, bytenr, ptr_gen); + btrfs_node_key_to_cpu(eb, &first_key, path->slots[i]); + eb = read_tree_block(fs_info, bytenr, ptr_gen, i - 1, + &first_key); if (IS_ERR(eb)) { return PTR_ERR(eb); } else if (!extent_buffer_uptodate(eb)) { @@ -2714,6 +2722,8 @@ static int do_relocation(struct btrfs_trans_handle *trans, path->lowest_level = node->level + 1; rc->backref_cache.path[node->level] = node; list_for_each_entry(edge, &node->upper, list[LOWER]) { + struct btrfs_key first_key; + cond_resched(); upper = edge->node[UPPER]; @@ -2779,7 +2789,9 @@ static int do_relocation(struct btrfs_trans_handle *trans, blocksize = root->fs_info->nodesize; generation = btrfs_node_ptr_generation(upper->eb, slot); - eb = read_tree_block(fs_info, bytenr, generation); + btrfs_node_key_to_cpu(upper->eb, &first_key, slot); + eb = read_tree_block(fs_info, bytenr, generation, + upper->level - 1, &first_key); if (IS_ERR(eb)) { err = PTR_ERR(eb); goto next; @@ -2944,7 +2956,8 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb;
BUG_ON(block->key_ready); - eb = read_tree_block(fs_info, block->bytenr, block->key.offset); + eb = read_tree_block(fs_info, block->bytenr, block->key.offset, + block->level, NULL); if (IS_ERR(eb)) { return PTR_ERR(eb); } else if (!extent_buffer_uptodate(eb)) { @@ -3226,7 +3239,7 @@ static int relocate_file_extent_cluster(struct inode *inode, mask); if (!page) { btrfs_delalloc_release_metadata(BTRFS_I(inode), - PAGE_SIZE); + PAGE_SIZE, true); ret = -ENOMEM; goto out; } @@ -3245,9 +3258,9 @@ static int relocate_file_extent_cluster(struct inode *inode, unlock_page(page); put_page(page); btrfs_delalloc_release_metadata(BTRFS_I(inode), - PAGE_SIZE); + PAGE_SIZE, true); btrfs_delalloc_release_extents(BTRFS_I(inode), - PAGE_SIZE); + PAGE_SIZE, true); ret = -EIO; goto out; } @@ -3268,8 +3281,22 @@ static int relocate_file_extent_cluster(struct inode *inode, nr++; } - btrfs_set_extent_delalloc(inode, page_start, page_end, 0, NULL, - 0); + ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, + NULL, 0); + if (ret) { + unlock_page(page); + put_page(page); + btrfs_delalloc_release_metadata(BTRFS_I(inode), + PAGE_SIZE, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), + PAGE_SIZE, true); + + clear_extent_bits(&BTRFS_I(inode)->io_tree, + page_start, page_end, + EXTENT_LOCKED | EXTENT_BOUNDARY); + goto out; + + } set_page_dirty(page); unlock_extent(&BTRFS_I(inode)->io_tree, @@ -3278,7 +3305,8 @@ static int relocate_file_extent_cluster(struct inode *inode, put_page(page); index++; - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, + false); balance_dirty_pages_ratelimited(inode->i_mapping); btrfs_throttle(fs_info); } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ec56f33feea9..1a2066ac6fe7 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -371,7 +371,7 @@ static struct full_stripe_lock *insert_full_stripe_lock( struct full_stripe_lock *entry; struct full_stripe_lock *ret; - WARN_ON(!mutex_is_locked(&locks_root->lock)); + lockdep_assert_held(&locks_root->lock); p = &locks_root->root.rb_node; while (*p) { @@ -413,7 +413,7 @@ static struct full_stripe_lock *search_full_stripe_lock( struct rb_node *node; struct full_stripe_lock *entry; - WARN_ON(!mutex_is_locked(&locks_root->lock)); + lockdep_assert_held(&locks_root->lock); node = locks_root->root.rb_node; while (node) { @@ -1111,7 +1111,6 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) struct scrub_ctx *sctx = sblock_to_check->sctx; struct btrfs_device *dev; struct btrfs_fs_info *fs_info; - u64 length; u64 logical; unsigned int failed_mirror_index; unsigned int is_metadata; @@ -1139,7 +1138,6 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) spin_unlock(&sctx->stat_lock); return 0; } - length = sblock_to_check->page_count * PAGE_SIZE; logical = sblock_to_check->pagev[0]->logical; BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1); failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1; @@ -1412,8 +1410,17 @@ nodatasum_case: if (!page_bad->io_error && !sctx->is_dev_replace) continue; - /* try to find no-io-error page in mirrors */ - if (page_bad->io_error) { + if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) { + /* + * In case of dev replace, if raid56 rebuild process + * didn't work out correct data, then copy the content + * in sblock_bad to make sure target device is identical + * to source device, instead of writing garbage data in + * sblock_for_recheck array to target device. 
+ */ + sblock_other = NULL; + } else if (page_bad->io_error) { + /* try to find no-io-error page in mirrors */ for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS && sblocks_for_recheck[mirror_index].page_count > 0; @@ -1718,6 +1725,45 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, return blk_status_to_errno(bio->bi_status); } +static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock) +{ + struct scrub_page *first_page = sblock->pagev[0]; + struct bio *bio; + int page_num; + + /* All pages in sblock belong to the same stripe on the same device. */ + ASSERT(first_page->dev); + if (!first_page->dev->bdev) + goto out; + + bio = btrfs_io_bio_alloc(BIO_MAX_PAGES); + bio_set_dev(bio, first_page->dev->bdev); + + for (page_num = 0; page_num < sblock->page_count; page_num++) { + struct scrub_page *page = sblock->pagev[page_num]; + + WARN_ON(!page->page); + bio_add_page(bio, page->page, PAGE_SIZE, 0); + } + + if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) { + bio_put(bio); + goto out; + } + + bio_put(bio); + + scrub_recheck_block_checksum(sblock); + + return; +out: + for (page_num = 0; page_num < sblock->page_count; page_num++) + sblock->pagev[page_num]->io_error = 1; + + sblock->no_io_error_seen = 0; +} + /* * this function will check the on disk data for checksum errors, header * errors and read I/O errors. If any I/O errors happen, the exact pages @@ -1733,6 +1779,10 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, sblock->no_io_error_seen = 1; + /* short cut for raid56 */ + if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0])) + return scrub_recheck_block_on_raid56(fs_info, sblock); + for (page_num = 0; page_num < sblock->page_count; page_num++) { struct bio *bio; struct scrub_page *page = sblock->pagev[page_num]; @@ -1748,19 +1798,12 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, bio_set_dev(bio, page->dev->bdev); bio_add_page(bio, page->page, PAGE_SIZE, 0); - if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) { - if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) { - page->io_error = 1; - sblock->no_io_error_seen = 0; - } - } else { - bio->bi_iter.bi_sector = page->physical >> 9; - bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio->bi_iter.bi_sector = page->physical >> 9; + bio->bi_opf = REQ_OP_READ; - if (btrfsic_submit_bio_wait(bio)) { - page->io_error = 1; - sblock->no_io_error_seen = 0; - } + if (btrfsic_submit_bio_wait(bio)) { + page->io_error = 1; + sblock->no_io_error_seen = 0; } bio_put(bio); @@ -2728,7 +2771,8 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) } /* scrub extent tries to collect up to 64 kB for each bio */ -static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, +static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, + u64 logical, u64 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u64 physical_for_dev_replace) { @@ -2737,13 +2781,19 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, u32 blocksize; if (flags & BTRFS_EXTENT_FLAG_DATA) { - blocksize = sctx->fs_info->sectorsize; + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) + blocksize = map->stripe_len; + else + blocksize = sctx->fs_info->sectorsize; spin_lock(&sctx->stat_lock); sctx->stat.data_extents_scrubbed++; sctx->stat.data_bytes_scrubbed += len; spin_unlock(&sctx->stat_lock); } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - blocksize = 
sctx->fs_info->nodesize; + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) + blocksize = map->stripe_len; + else + blocksize = sctx->fs_info->nodesize; spin_lock(&sctx->stat_lock); sctx->stat.tree_extents_scrubbed++; sctx->stat.tree_bytes_scrubbed += len; @@ -2883,9 +2933,9 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity, } if (flags & BTRFS_EXTENT_FLAG_DATA) { - blocksize = sctx->fs_info->sectorsize; + blocksize = sparity->stripe_len; } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - blocksize = sctx->fs_info->nodesize; + blocksize = sparity->stripe_len; } else { blocksize = sctx->fs_info->sectorsize; WARN_ON(1); @@ -3595,7 +3645,7 @@ again: if (ret) goto out; - ret = scrub_extent(sctx, extent_logical, extent_len, + ret = scrub_extent(sctx, map, extent_logical, extent_len, extent_physical, extent_dev, flags, generation, extent_mirror_num, extent_logical - logical + physical); @@ -3885,11 +3935,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, break; } - btrfs_dev_replace_lock(&fs_info->dev_replace, 1); + btrfs_dev_replace_write_lock(&fs_info->dev_replace); dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; - btrfs_dev_replace_unlock(&fs_info->dev_replace, 1); + btrfs_dev_replace_write_unlock(&fs_info->dev_replace); ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length, found_key.offset, cache, is_dev_replace); @@ -3925,10 +3975,10 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, scrub_pause_off(fs_info); - btrfs_dev_replace_lock(&fs_info->dev_replace, 1); + btrfs_dev_replace_write_lock(&fs_info->dev_replace); dev_replace->cursor_left = dev_replace->cursor_right; dev_replace->item_needs_writeback = 1; - btrfs_dev_replace_unlock(&fs_info->dev_replace, 1); + btrfs_dev_replace_write_unlock(&fs_info->dev_replace); if (ro_set) btrfs_dec_block_group_ro(cache); @@ -4144,16 +4194,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -EIO; } - btrfs_dev_replace_lock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_lock(&fs_info->dev_replace); if (dev->scrub_ctx || (!is_dev_replace && btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { - btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_unlock(&fs_info->dev_replace); mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); return -EINPROGRESS; } - btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_unlock(&fs_info->dev_replace); ret = scrub_workers_get(fs_info, is_dev_replace); if (ret) { @@ -4480,7 +4530,8 @@ static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 len, * move on to the next inode. 
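The scrub_extent() and scrub_extent_for_parity() hunks above change the unit scrub works in: on RAID5/6 block groups both data and tree extents are now processed in stripe_len units rather than sectorsize or nodesize, so a full stripe is read and verified together. Condensed into one helper (hypothetical name, for illustration only), the selection reads:

static u32 scrub_block_size(struct scrub_ctx *sctx, struct map_lookup *map,
			    u64 extent_flags)
{
	/* parity RAID is checked one full stripe at a time */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
		return map->stripe_len;
	if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
		return sctx->fs_info->nodesize;		/* metadata */
	return sctx->fs_info->sectorsize;		/* data */
}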
*/ if (em->block_start > logical || - em->block_start + em->block_len < logical + len) { + em->block_start + em->block_len < logical + len || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { free_extent_map(em); ret = 1; goto out_unlock; @@ -4620,7 +4671,6 @@ static int write_page_nocow(struct scrub_ctx *sctx, { struct bio *bio; struct btrfs_device *dev; - int ret; dev = sctx->wr_tgtdev; if (!dev) @@ -4635,17 +4685,15 @@ static int write_page_nocow(struct scrub_ctx *sctx, bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; bio_set_dev(bio, dev->bdev); bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; - ret = bio_add_page(bio, page, PAGE_SIZE, 0); - if (ret != PAGE_SIZE) { -leave_with_eio: + /* bio_add_page won't fail on a freshly allocated bio */ + bio_add_page(bio, page, PAGE_SIZE, 0); + + if (btrfsic_submit_bio_wait(bio)) { bio_put(bio); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); return -EIO; } - if (btrfsic_submit_bio_wait(bio)) - goto leave_with_eio; - bio_put(bio); return 0; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f306c608dc28..1f5748c7d1c7 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -27,10 +27,10 @@ #include <linux/vmalloc.h> #include <linux/string.h> #include <linux/compat.h> +#include <linux/crc32c.h> #include "send.h" #include "backref.h" -#include "hash.h" #include "locking.h" #include "disk-io.h" #include "btrfs_inode.h" @@ -112,6 +112,7 @@ struct send_ctx { u64 cur_inode_mode; u64 cur_inode_rdev; u64 cur_inode_last_extent; + u64 cur_inode_next_write_offset; u64 send_progress; @@ -270,6 +271,7 @@ struct name_cache_entry { char name[]; }; +__cold static void inconsistent_snapshot_error(struct send_ctx *sctx, enum btrfs_compare_tree_result result, const char *what) @@ -611,9 +613,9 @@ static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, } -#define TLV_PUT(sctx, attrtype, attrlen, data) \ +#define TLV_PUT(sctx, attrtype, data, attrlen) \ do { \ - ret = tlv_put(sctx, attrtype, attrlen, data); \ + ret = tlv_put(sctx, attrtype, data, attrlen); \ if (ret < 0) \ goto tlv_put_failure; \ } while (0) @@ -695,7 +697,7 @@ static int send_cmd(struct send_ctx *sctx) hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); hdr->crc = 0; - crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); + crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); hdr->crc = cpu_to_le32(crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, @@ -5005,6 +5007,9 @@ static int send_hole(struct send_ctx *sctx, u64 end) u64 len; int ret = 0; + if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) + return send_update_extent(sctx, offset, end - offset); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -5026,6 +5031,7 @@ static int send_hole(struct send_ctx *sctx, u64 end) break; offset += len; } + sctx->cur_inode_next_write_offset = offset; tlv_put_failure: fs_path_free(p); return ret; @@ -5261,6 +5267,7 @@ static int send_write_or_clone(struct send_ctx *sctx, } else { ret = send_extent_data(sctx, offset, len); } + sctx->cur_inode_next_write_offset = offset + len; out: return ret; } @@ -5785,6 +5792,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) u64 right_gid; int need_chmod = 0; int need_chown = 0; + int need_truncate = 1; int pending_move = 0; int refs_processed = 0; @@ -5822,9 +5830,13 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) need_chown = 1; if (!S_ISLNK(sctx->cur_inode_mode)) need_chmod = 1; + if (sctx->cur_inode_next_write_offset == sctx->cur_inode_size) + 
need_truncate = 0; } else { + u64 old_size; + ret = get_inode_info(sctx->parent_root, sctx->cur_ino, - NULL, NULL, &right_mode, &right_uid, + &old_size, NULL, &right_mode, &right_uid, &right_gid, NULL); if (ret < 0) goto out; @@ -5833,6 +5845,10 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) need_chown = 1; if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode) need_chmod = 1; + if ((old_size == sctx->cur_inode_size) || + (sctx->cur_inode_size > old_size && + sctx->cur_inode_next_write_offset == sctx->cur_inode_size)) + need_truncate = 0; } if (S_ISREG(sctx->cur_inode_mode)) { @@ -5851,10 +5867,13 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) goto out; } } - ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen, - sctx->cur_inode_size); - if (ret < 0) - goto out; + if (need_truncate) { + ret = send_truncate(sctx, sctx->cur_ino, + sctx->cur_inode_gen, + sctx->cur_inode_size); + if (ret < 0) + goto out; + } } if (need_chown) { @@ -5908,6 +5927,7 @@ static int changed_inode(struct send_ctx *sctx, sctx->cur_ino = key->objectid; sctx->cur_inode_new_gen = 0; sctx->cur_inode_last_extent = (u64)-1; + sctx->cur_inode_next_write_offset = 0; /* * Set send_progress to current inode. This will tell all get_cur_xxx diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6e71a2a78363..170baef49fae 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -41,6 +41,7 @@ #include <linux/slab.h> #include <linux/cleancache.h> #include <linux/ratelimit.h> +#include <linux/crc32c.h> #include <linux/btrfs.h> #include "delayed-inode.h" #include "ctree.h" @@ -48,7 +49,6 @@ #include "transaction.h" #include "btrfs_inode.h" #include "print-tree.h" -#include "hash.h" #include "props.h" #include "xattr.h" #include "volumes.h" @@ -308,21 +308,50 @@ static void btrfs_put_super(struct super_block *sb) } enum { - Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, - Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, - Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, - Opt_compress_type, Opt_compress_force, Opt_compress_force_type, - Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, - Opt_space_cache, Opt_space_cache_version, Opt_clear_cache, - Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid, - Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery, - Opt_skip_balance, Opt_check_integrity, + Opt_acl, Opt_noacl, + Opt_clear_cache, + Opt_commit_interval, + Opt_compress, + Opt_compress_force, + Opt_compress_force_type, + Opt_compress_type, + Opt_degraded, + Opt_device, + Opt_fatal_errors, + Opt_flushoncommit, Opt_noflushoncommit, + Opt_inode_cache, Opt_noinode_cache, + Opt_max_inline, + Opt_barrier, Opt_nobarrier, + Opt_datacow, Opt_nodatacow, + Opt_datasum, Opt_nodatasum, + Opt_defrag, Opt_nodefrag, + Opt_discard, Opt_nodiscard, + Opt_nologreplay, + Opt_norecovery, + Opt_ratio, + Opt_rescan_uuid_tree, + Opt_skip_balance, + Opt_space_cache, Opt_no_space_cache, + Opt_space_cache_version, + Opt_ssd, Opt_nossd, + Opt_ssd_spread, Opt_nossd_spread, + Opt_subvol, + Opt_subvolid, + Opt_thread_pool, + Opt_treelog, Opt_notreelog, + Opt_usebackuproot, + Opt_user_subvol_rm_allowed, + + /* Deprecated options */ + Opt_alloc_start, + Opt_recovery, + Opt_subvolrootid, + + /* Debugging options */ + Opt_check_integrity, Opt_check_integrity_including_extent_data, - Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, - Opt_commit_interval, Opt_barrier, Opt_nodefrag, 
Opt_nodiscard, - Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, - Opt_datasum, Opt_treelog, Opt_noinode_cache, Opt_usebackuproot, - Opt_nologreplay, Opt_norecovery, + Opt_check_integrity_print_mask, + Opt_enospc_debug, Opt_noenospc_debug, #ifdef CONFIG_BTRFS_DEBUG Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, #endif @@ -333,58 +362,63 @@ enum { }; static const match_table_t tokens = { - {Opt_degraded, "degraded"}, - {Opt_subvol, "subvol=%s"}, - {Opt_subvolid, "subvolid=%s"}, - {Opt_device, "device=%s"}, - {Opt_nodatasum, "nodatasum"}, - {Opt_datasum, "datasum"}, - {Opt_nodatacow, "nodatacow"}, - {Opt_datacow, "datacow"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_barrier, "barrier"}, - {Opt_max_inline, "max_inline=%s"}, - {Opt_alloc_start, "alloc_start=%s"}, - {Opt_thread_pool, "thread_pool=%d"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_clear_cache, "clear_cache"}, + {Opt_commit_interval, "commit=%u"}, {Opt_compress, "compress"}, {Opt_compress_type, "compress=%s"}, {Opt_compress_force, "compress-force"}, {Opt_compress_force_type, "compress-force=%s"}, - {Opt_ssd, "ssd"}, - {Opt_ssd_spread, "ssd_spread"}, - {Opt_nossd, "nossd"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_notreelog, "notreelog"}, - {Opt_treelog, "treelog"}, - {Opt_nologreplay, "nologreplay"}, - {Opt_norecovery, "norecovery"}, + {Opt_degraded, "degraded"}, + {Opt_device, "device=%s"}, + {Opt_fatal_errors, "fatal_errors=%s"}, {Opt_flushoncommit, "flushoncommit"}, {Opt_noflushoncommit, "noflushoncommit"}, - {Opt_ratio, "metadata_ratio=%d"}, + {Opt_inode_cache, "inode_cache"}, + {Opt_noinode_cache, "noinode_cache"}, + {Opt_max_inline, "max_inline=%s"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_datacow, "datacow"}, + {Opt_nodatacow, "nodatacow"}, + {Opt_datasum, "datasum"}, + {Opt_nodatasum, "nodatasum"}, + {Opt_defrag, "autodefrag"}, + {Opt_nodefrag, "noautodefrag"}, {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, + {Opt_nologreplay, "nologreplay"}, + {Opt_norecovery, "norecovery"}, + {Opt_ratio, "metadata_ratio=%u"}, + {Opt_rescan_uuid_tree, "rescan_uuid_tree"}, + {Opt_skip_balance, "skip_balance"}, {Opt_space_cache, "space_cache"}, + {Opt_no_space_cache, "nospace_cache"}, {Opt_space_cache_version, "space_cache=%s"}, - {Opt_clear_cache, "clear_cache"}, + {Opt_ssd, "ssd"}, + {Opt_nossd, "nossd"}, + {Opt_ssd_spread, "ssd_spread"}, + {Opt_nossd_spread, "nossd_spread"}, + {Opt_subvol, "subvol=%s"}, + {Opt_subvolid, "subvolid=%s"}, + {Opt_thread_pool, "thread_pool=%u"}, + {Opt_treelog, "treelog"}, + {Opt_notreelog, "notreelog"}, + {Opt_usebackuproot, "usebackuproot"}, {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, - {Opt_enospc_debug, "enospc_debug"}, - {Opt_noenospc_debug, "noenospc_debug"}, + + /* Deprecated options */ + {Opt_alloc_start, "alloc_start=%s"}, + {Opt_recovery, "recovery"}, {Opt_subvolrootid, "subvolrootid=%d"}, - {Opt_defrag, "autodefrag"}, - {Opt_nodefrag, "noautodefrag"}, - {Opt_inode_cache, "inode_cache"}, - {Opt_noinode_cache, "noinode_cache"}, - {Opt_no_space_cache, "nospace_cache"}, - {Opt_recovery, "recovery"}, /* deprecated */ - {Opt_usebackuproot, "usebackuproot"}, - {Opt_skip_balance, "skip_balance"}, + + /* Debugging options */ {Opt_check_integrity, "check_int"}, {Opt_check_integrity_including_extent_data, "check_int_data"}, - {Opt_check_integrity_print_mask, "check_int_print_mask=%d"}, - {Opt_rescan_uuid_tree, "rescan_uuid_tree"}, - {Opt_fatal_errors, "fatal_errors=%s"}, - {Opt_commit_interval, "commit=%d"}, + 
{Opt_check_integrity_print_mask, "check_int_print_mask=%u"}, + {Opt_enospc_debug, "enospc_debug"}, + {Opt_noenospc_debug, "noenospc_debug"}, #ifdef CONFIG_BTRFS_DEBUG {Opt_fragment_data, "fragment=data"}, {Opt_fragment_metadata, "fragment=metadata"}, @@ -579,6 +613,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_opt(info->mount_opt, NOSSD); btrfs_clear_and_info(info, SSD, "not using ssd optimizations"); + /* Fallthrough */ + case Opt_nossd_spread: btrfs_clear_and_info(info, SSD_SPREAD, "not using spread ssd allocation scheme"); break; @@ -594,12 +630,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, ret = match_int(&args[0], &intarg); if (ret) { goto out; - } else if (intarg > 0) { - info->thread_pool_size = intarg; - } else { + } else if (intarg == 0) { ret = -EINVAL; goto out; } + info->thread_pool_size = intarg; break; case Opt_max_inline: num = match_strdup(&args[0]); @@ -658,16 +693,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_ratio: ret = match_int(&args[0], &intarg); - if (ret) { + if (ret) goto out; - } else if (intarg >= 0) { - info->metadata_ratio = intarg; - btrfs_info(info, "metadata ratio %d", - info->metadata_ratio); - } else { - ret = -EINVAL; - goto out; - } + info->metadata_ratio = intarg; + btrfs_info(info, "metadata ratio %u", + info->metadata_ratio); break; case Opt_discard: btrfs_set_and_info(info, DISCARD, @@ -762,17 +792,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_check_integrity_print_mask: ret = match_int(&args[0], &intarg); - if (ret) { + if (ret) goto out; - } else if (intarg >= 0) { - info->check_integrity_print_mask = intarg; - btrfs_info(info, - "check_integrity_print_mask 0x%x", - info->check_integrity_print_mask); - } else { - ret = -EINVAL; - goto out; - } + info->check_integrity_print_mask = intarg; + btrfs_info(info, "check_integrity_print_mask 0x%x", + info->check_integrity_print_mask); break; #else case Opt_check_integrity_including_extent_data: @@ -798,24 +822,18 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_commit_interval: intarg = 0; ret = match_int(&args[0], &intarg); - if (ret < 0) { - btrfs_err(info, "invalid commit interval"); - ret = -EINVAL; + if (ret) goto out; - } - if (intarg > 0) { - if (intarg > 300) { - btrfs_warn(info, - "excessive commit interval %d", - intarg); - } - info->commit_interval = intarg; - } else { + if (intarg == 0) { btrfs_info(info, - "using default commit interval %ds", + "using default commit interval %us", BTRFS_DEFAULT_COMMIT_INTERVAL); - info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; + intarg = BTRFS_DEFAULT_COMMIT_INTERVAL; + } else if (intarg > 300) { + btrfs_warn(info, "excessive commit interval %d", + intarg); } + info->commit_interval = intarg; break; #ifdef CONFIG_BTRFS_DEBUG case Opt_fragment_all: @@ -932,8 +950,8 @@ static int btrfs_parse_subvol_options(const char *options, fmode_t flags, { substring_t args[MAX_OPT_ARGS]; char *opts, *orig, *p; - char *num = NULL; int error = 0; + u64 subvolid; if (!options) return 0; @@ -963,18 +981,15 @@ static int btrfs_parse_subvol_options(const char *options, fmode_t flags, } break; case Opt_subvolid: - num = match_strdup(&args[0]); - if (num) { - *subvol_objectid = memparse(num, NULL); - kfree(num); - /* we want the original fs_tree */ - if (!*subvol_objectid) - *subvol_objectid = - BTRFS_FS_TREE_OBJECTID; - } else { - error = -EINVAL; + error = match_u64(&args[0], &subvolid); + if 
(error) goto out; - } + + /* we want the original fs_tree */ + if (subvolid == 0) + subvolid = BTRFS_FS_TREE_OBJECTID; + + *subvol_objectid = subvolid; break; case Opt_subvolrootid: pr_warn("BTRFS: 'subvolrootid' mount option is deprecated and has no effect\n"); @@ -1284,7 +1299,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_printf(seq, ",max_inline=%llu", info->max_inline); if (info->thread_pool_size != min_t(unsigned long, num_online_cpus() + 2, 8)) - seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); + seq_printf(seq, ",thread_pool=%u", info->thread_pool_size); if (btrfs_test_opt(info, COMPRESS)) { compress_type = btrfs_compress_type2str(info->compress_type); if (btrfs_test_opt(info, FORCE_COMPRESS)) @@ -1340,12 +1355,11 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) info->check_integrity_print_mask); #endif if (info->metadata_ratio) - seq_printf(seq, ",metadata_ratio=%d", - info->metadata_ratio); + seq_printf(seq, ",metadata_ratio=%u", info->metadata_ratio); if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR)) seq_puts(seq, ",fatal_errors=panic"); if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) - seq_printf(seq, ",commit=%d", info->commit_interval); + seq_printf(seq, ",commit=%u", info->commit_interval); #ifdef CONFIG_BTRFS_DEBUG if (btrfs_test_opt(info, FRAGMENT_DATA)) seq_puts(seq, ",fragment=data"); @@ -1545,7 +1559,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, * it for searching for existing supers, so this lets us do that and * then open_ctree will properly initialize everything later. */ - fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL); + fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL); if (!fs_info) { error = -ENOMEM; goto error_sec_opts; } @@ -1690,7 +1704,7 @@ out: } static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, - int new_pool_size, int old_pool_size) + u32 new_pool_size, u32 old_pool_size) { if (new_pool_size == old_pool_size) return; @@ -1758,8 +1772,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) unsigned long old_opts = fs_info->mount_opt; unsigned long old_compress_type = fs_info->compress_type; u64 old_max_inline = fs_info->max_inline; - int old_thread_pool_size = fs_info->thread_pool_size; - unsigned int old_metadata_ratio = fs_info->metadata_ratio; + u32 old_thread_pool_size = fs_info->thread_pool_size; + u32 old_metadata_ratio = fs_info->metadata_ratio; int ret; sync_filesystem(sb); @@ -2290,11 +2304,18 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) struct list_head *head; struct rcu_string *name; - mutex_lock(&fs_info->fs_devices->device_list_mutex); + /* + * Lightweight locking of the devices. We should not need + * device_list_mutex here as we only read the device data and the list + * is protected by RCU. Even if a device is deleted during the list + * traversals, we'll get valid data; the freeing callback will wait at + * least until the rcu_read_unlock.
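The comment above is the whole justification for the locking change in btrfs_show_devname(): readers take rcu_read_lock() instead of device_list_mutex, and writers must defer freeing past the grace period. A minimal sketch of that standard RCU list-read pattern, using the generic list/rcupdate API rather than the exact btrfs code:

static void show_devices_rcu(struct list_head *head)
{
	struct btrfs_device *dev;

	rcu_read_lock();
	list_for_each_entry_rcu(dev, head, dev_list) {
		/* dev and anything reached from it stay valid until
		 * rcu_read_unlock(), because writers free via call_rcu() */
		struct rcu_string *name = rcu_dereference(dev->name);

		if (name)
			pr_info("device %s\n", name->str);
	}
	rcu_read_unlock();
}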
+ */ + rcu_read_lock(); cur_devices = fs_info->fs_devices; while (cur_devices) { head = &cur_devices->devices; - list_for_each_entry(dev, head, dev_list) { + list_for_each_entry_rcu(dev, head, dev_list) { if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) continue; if (!dev->name) @@ -2306,14 +2327,12 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) } if (first_dev) { - rcu_read_lock(); name = rcu_dereference(first_dev->name); seq_escape(m, name->str, " \t\n\\"); - rcu_read_unlock(); } else { WARN_ON(1); } - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + rcu_read_unlock(); return 0; } @@ -2355,7 +2374,7 @@ static int __init btrfs_interface_init(void) return misc_register(&btrfs_misc); } -static void btrfs_interface_exit(void) +static __cold void btrfs_interface_exit(void) { misc_deregister(&btrfs_misc); } @@ -2376,22 +2395,18 @@ static void __init btrfs_print_mod_info(void) ", ref-verify=on" #endif "\n", - btrfs_crc32c_impl()); + crc32c_impl()); } static int __init init_btrfs_fs(void) { int err; - err = btrfs_hash_init(); - if (err) - return err; - btrfs_props_init(); err = btrfs_init_sysfs(); if (err) - goto free_hash; + return err; btrfs_init_compress(); @@ -2472,8 +2487,7 @@ free_cachep: free_compress: btrfs_exit_compress(); btrfs_exit_sysfs(); -free_hash: - btrfs_hash_exit(); + return err; } @@ -2493,7 +2507,6 @@ static void __exit exit_btrfs_fs(void) btrfs_exit_sysfs(); btrfs_cleanup_fs_uuids(); btrfs_exit_compress(); - btrfs_hash_exit(); } late_initcall(init_btrfs_fs); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index a8bafed931f4..ca067471cd46 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -272,7 +272,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj, { struct btrfs_space_info *sinfo = to_space_info(kobj->parent); struct btrfs_block_group_cache *block_group; - int index = to_raid_kobj(kobj)->raid_type; + int index = btrfs_bg_flags_to_raid_index(to_raid_kobj(kobj)->flags); u64 val = 0; down_read(&sinfo->groups_sem); @@ -923,7 +923,7 @@ out1: return ret; } -void btrfs_exit_sysfs(void) +void __cold btrfs_exit_sysfs(void) { sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); kset_unregister(btrfs_kset); diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 9786d8cd0aa6..e74278170806 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -278,8 +278,7 @@ int btrfs_run_sanity_tests(void) } } ret = btrfs_test_extent_map(); - if (ret) - goto out; + out: btrfs_destroy_test_fs(); return ret; diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 70c993f01670..c23bd00bdd92 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -343,7 +343,7 @@ static void test_case_4(struct extent_map_tree *em_tree) __test_case_4(em_tree, SZ_4K); } -int btrfs_test_extent_map() +int btrfs_test_extent_map(void) { struct extent_map_tree *em_tree; diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 90204b166643..160eb2fba726 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -63,7 +63,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, btrfs_set_extent_generation(leaf, item, 1); btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_TREE_BLOCK); block_info = (struct btrfs_tree_block_info *)(item + 1); - btrfs_set_tree_block_level(leaf, block_info, 1); + btrfs_set_tree_block_level(leaf, block_info, 0); iref = (struct btrfs_extent_inline_ref 
*)(block_info + 1); if (parent > 0) { btrfs_set_extent_inline_ref_type(leaf, iref, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 04f07144b45c..5c4cf0f9146b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -37,22 +37,16 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { [TRANS_STATE_RUNNING] = 0U, - [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE | - __TRANS_START), - [TRANS_STATE_COMMIT_START] = (__TRANS_USERSPACE | - __TRANS_START | - __TRANS_ATTACH), - [TRANS_STATE_COMMIT_DOING] = (__TRANS_USERSPACE | - __TRANS_START | + [TRANS_STATE_BLOCKED] = __TRANS_START, + [TRANS_STATE_COMMIT_START] = (__TRANS_START | __TRANS_ATTACH), + [TRANS_STATE_COMMIT_DOING] = (__TRANS_START | __TRANS_ATTACH | __TRANS_JOIN), - [TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE | - __TRANS_START | + [TRANS_STATE_UNBLOCKED] = (__TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | __TRANS_JOIN_NOLOCK), - [TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE | - __TRANS_START | + [TRANS_STATE_COMPLETED] = (__TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | __TRANS_JOIN_NOLOCK), @@ -126,9 +120,9 @@ static void clear_btree_io_tree(struct extent_io_tree *tree) spin_unlock(&tree->lock); } -static noinline void switch_commit_roots(struct btrfs_transaction *trans, - struct btrfs_fs_info *fs_info) +static noinline void switch_commit_roots(struct btrfs_transaction *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root, *tmp; down_write(&fs_info->commit_root_sem); @@ -319,7 +313,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, if ((test_bit(BTRFS_ROOT_REF_COWS, &root->state) && root->last_trans < trans->transid) || force) { WARN_ON(root == fs_info->extent_root); - WARN_ON(root->commit_root != root->node); + WARN_ON(!force && root->commit_root != root->node); /* * see below for IN_TRANS_SETUP usage rules @@ -449,11 +443,7 @@ static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type) if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) return 0; - if (type == TRANS_USERSPACE) - return 1; - - if (type == TRANS_START && - !atomic_read(&fs_info->open_ioctl_trans)) + if (type == TRANS_START) return 1; return 0; @@ -508,8 +498,8 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, */ if (num_items && root != fs_info->chunk_root) { qgroup_reserved = num_items * fs_info->nodesize; - ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved, - enforce_qgroups); + ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved, + enforce_qgroups); if (ret) return ERR_PTR(ret); @@ -593,7 +583,7 @@ again: got_it: btrfs_record_root_in_trans(h, root); - if (!current->journal_info && type != TRANS_USERSPACE) + if (!current->journal_info) current->journal_info = h; return h; @@ -606,7 +596,7 @@ alloc_fail: btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv, num_bytes); reserve_fail: - btrfs_qgroup_free_meta(root, qgroup_reserved); + btrfs_qgroup_free_meta_pertrans(root, qgroup_reserved); return ERR_PTR(ret); } @@ -658,14 +648,6 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( return trans; } -struct btrfs_trans_handle *btrfs_start_transaction_lflush( - struct btrfs_root *root, - unsigned int num_items) -{ - return start_transaction(root, num_items, TRANS_START, - BTRFS_RESERVE_FLUSH_LIMIT, true); -} - struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_JOIN, BTRFS_RESERVE_NO_FLUSH, @@ -678,12 +660,6 @@ struct btrfs_trans_handle 
*btrfs_join_transaction_nolock(struct btrfs_root *root BTRFS_RESERVE_NO_FLUSH, true); } -struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root) -{ - return start_transaction(root, 0, TRANS_USERSPACE, - BTRFS_RESERVE_NO_FLUSH, true); -} - /* * btrfs_attach_transaction() - catch the running transaction * @@ -789,8 +765,7 @@ out: void btrfs_throttle(struct btrfs_fs_info *fs_info) { - if (!atomic_read(&fs_info->open_ioctl_trans)) - wait_current_trans(fs_info); + wait_current_trans(fs_info); } static int should_end_transaction(struct btrfs_trans_handle *trans) @@ -806,7 +781,6 @@ static int should_end_transaction(struct btrfs_trans_handle *trans) int btrfs_should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_transaction *cur_trans = trans->transaction; - struct btrfs_fs_info *fs_info = trans->fs_info; int updates; int err; @@ -818,7 +792,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans) updates = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (updates) { - err = btrfs_run_delayed_refs(trans, fs_info, updates * 2); + err = btrfs_run_delayed_refs(trans, updates * 2); if (err) /* Error code will also eval true */ return err; } @@ -826,6 +800,27 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans) return should_end_transaction(trans); } +static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans) + +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + + if (!trans->block_rsv) { + ASSERT(!trans->bytes_reserved); + return; + } + + if (!trans->bytes_reserved) + return; + + ASSERT(trans->block_rsv == &fs_info->trans_block_rsv); + trace_btrfs_space_reservation(fs_info, "transaction", + trans->transid, trans->bytes_reserved, 0); + btrfs_block_rsv_release(fs_info, trans->block_rsv, + trans->bytes_reserved); + trans->bytes_reserved = 0; +} + static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, int throttle) { @@ -843,11 +838,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, return 0; } - btrfs_trans_release_metadata(trans, info); + btrfs_trans_release_metadata(trans); trans->block_rsv = NULL; if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans, info); + btrfs_create_pending_block_groups(trans); trans->delayed_ref_updates = 0; if (!trans->sync) { @@ -864,16 +859,15 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, must_run_delayed_refs = 2; } - btrfs_trans_release_metadata(trans, info); + btrfs_trans_release_metadata(trans); trans->block_rsv = NULL; if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans, info); + btrfs_create_pending_block_groups(trans); btrfs_trans_release_chunk_metadata(trans); - if (lock && !atomic_read(&info->open_ioctl_trans) && - should_end_transaction(trans) && + if (lock && should_end_transaction(trans) && READ_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) { spin_lock(&info->trans_lock); if (cur_trans->state == TRANS_STATE_RUNNING) @@ -1072,40 +1066,33 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark) } /* - * when btree blocks are allocated, they have some corresponding bits set for - * them in one of two extent_io trees. This is used to make sure all of - * those extents are on disk for transaction or log commit + * When btree blocks are allocated the corresponding extents are marked dirty. + * This function ensures such extents are persisted on disk for transaction or + * log commit. 
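Tying this back to the qgroup.h comments at the top of this section and to the btrfs_qgroup_reserve_meta_pertrans() call in start_transaction() above: the meta reservation now has two flavors, PERTRANS (taken at transaction start, dropped wholesale at commit) and PREALLOC (taken before a transaction exists, then converted once one is joined). A hedged sketch of the prealloc side follows; the reserve and free helper names are assumptions, since this diff only shows the convert and free-all entry points.

static int use_prealloc_meta(struct btrfs_root *root, int nbytes)
{
	struct btrfs_trans_handle *trans;
	int ret;

	ret = btrfs_qgroup_reserve_meta_prealloc(root, nbytes, true);
	if (ret)
		return ret;	/* would exceed the qgroup limit */

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		btrfs_qgroup_free_meta_prealloc(root, nbytes);
		return PTR_ERR(trans);
	}
	/* the reservation is now in use: account it to this transaction;
	 * commit later drops it via btrfs_qgroup_free_meta_all_pertrans() */
	btrfs_qgroup_convert_reserved_meta(root, nbytes);

	return btrfs_end_transaction(trans);
}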
+ * + * @trans: transaction whose dirty pages we'd like to write */ -static int btrfs_write_and_wait_marked_extents(struct btrfs_fs_info *fs_info, - struct extent_io_tree *dirty_pages, int mark) +static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans) { int ret; int ret2; + struct extent_io_tree *dirty_pages = &trans->transaction->dirty_pages; + struct btrfs_fs_info *fs_info = trans->fs_info; struct blk_plug plug; blk_start_plug(&plug); - ret = btrfs_write_marked_extents(fs_info, dirty_pages, mark); + ret = btrfs_write_marked_extents(fs_info, dirty_pages, EXTENT_DIRTY); blk_finish_plug(&plug); ret2 = btrfs_wait_extents(fs_info, dirty_pages); + clear_btree_io_tree(&trans->transaction->dirty_pages); + if (ret) return ret; - if (ret2) + else if (ret2) return ret2; - return 0; -} - -static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - int ret; - - ret = btrfs_write_and_wait_marked_extents(fs_info, - &trans->transaction->dirty_pages, - EXTENT_DIRTY); - clear_btree_io_tree(&trans->transaction->dirty_pages); - - return ret; + else + return 0; } /* @@ -1155,9 +1142,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, * failures will cause the file system to go offline. We still need * to clean up the delayed refs. */ -static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct list_head *dirty_bgs = &trans->transaction->dirty_bgs; struct list_head *io_bgs = &trans->transaction->io_bgs; struct list_head *next; @@ -1173,7 +1160,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) return ret; @@ -1192,7 +1179,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, return ret; /* run_qgroups might have added some more refs */ - ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) return ret; again: @@ -1209,7 +1196,7 @@ again: ret = update_cowonly_root(trans, root); if (ret) return ret; - ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) return ret; } @@ -1218,7 +1205,7 @@ again: ret = btrfs_write_dirty_block_groups(trans, fs_info); if (ret) return ret; - ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) return ret; } @@ -1251,9 +1238,9 @@ void btrfs_add_dead_root(struct btrfs_root *root) /* * update all the cowonly tree roots on disk */ -static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *gang[8]; int i; int ret; @@ -1297,7 +1284,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, spin_lock(&fs_info->fs_roots_radix_lock); if (err) break; - btrfs_qgroup_free_meta_all(root); + btrfs_qgroup_free_meta_all_pertrans(root); } } spin_unlock(&fs_info->fs_roots_radix_lock); @@ -1366,15 +1353,23 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, return 0; /* 
+ * Ensure dirty @src will be committed. Otherwise, after the upcoming + * commit_fs_roots() and switch_commit_roots(), any dirty but not + * recorded root will never be updated again, causing an outdated root + * item. + */ + record_root_in_trans(trans, src, 1); + + /* * We are going to commit transaction, see btrfs_commit_transaction() * comment for reason locking tree_log_mutex */ mutex_lock(&fs_info->tree_log_mutex); - ret = commit_fs_roots(trans, fs_info); + ret = commit_fs_roots(trans); if (ret) goto out; - ret = btrfs_qgroup_account_extents(trans, fs_info); + ret = btrfs_qgroup_account_extents(trans); if (ret < 0) goto out; @@ -1397,11 +1392,11 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, * like chunk and root tree, as they won't affect qgroup. * And we don't write super to avoid half committed status. */ - ret = commit_cowonly_roots(trans, fs_info); + ret = commit_cowonly_roots(trans); if (ret) goto out; - switch_commit_roots(trans->transaction, fs_info); - ret = btrfs_write_and_wait_transaction(trans, fs_info); + switch_commit_roots(trans->transaction); + ret = btrfs_write_and_wait_transaction(trans); if (ret) btrfs_handle_fs_error(fs_info, ret, "Error while writing out transaction for qgroup"); @@ -1430,9 +1425,10 @@ out: * the creation of the pending snapshots, just return 0. */ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_pending_snapshot *pending) { + + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_key key; struct btrfs_root_item *new_root_item; struct btrfs_root *tree_root = fs_info->tree_root; @@ -1524,7 +1520,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * otherwise we corrupt the FS during * snapshot */ - ret = btrfs_run_delayed_items(trans, fs_info); + ret = btrfs_run_delayed_items(trans); if (ret) { /* Transaction aborted */ btrfs_abort_transaction(trans, ret); goto fail; @@ -1620,7 +1616,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1674,7 +1670,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } } - ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1699,8 +1695,7 @@ no_free_objectid: /* * create all the snapshots we've scheduled for creation */ -static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans) { struct btrfs_pending_snapshot *pending, *next; struct list_head *head = &trans->transaction->pending_snapshots; @@ -1708,7 +1703,7 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, list_for_each_entry_safe(pending, next, head, list) { list_del(&pending->list); - ret = create_pending_snapshot(trans, fs_info, pending); + ret = create_pending_snapshot(trans, pending); if (ret) break; } @@ -1861,10 +1856,9 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, } -static void cleanup_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int err) +static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) { - struct
btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; DEFINE_WAIT(wait); @@ -1904,7 +1898,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, btrfs_put_transaction(cur_trans); btrfs_put_transaction(cur_trans); - trace_btrfs_transaction_commit(root); + trace_btrfs_transaction_commit(trans->root); if (current->journal_info == trans) current->journal_info = NULL; @@ -1959,13 +1953,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) /* make a pass through all the delayed refs we have so far * any runnings procs may add more while we are here */ - ret = btrfs_run_delayed_refs(trans, fs_info, 0); + ret = btrfs_run_delayed_refs(trans, 0); if (ret) { btrfs_end_transaction(trans); return ret; } - btrfs_trans_release_metadata(trans, fs_info); + btrfs_trans_release_metadata(trans); trans->block_rsv = NULL; cur_trans = trans->transaction; @@ -1978,9 +1972,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) smp_wmb(); if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans, fs_info); + btrfs_create_pending_block_groups(trans); - ret = btrfs_run_delayed_refs(trans, fs_info, 0); + ret = btrfs_run_delayed_refs(trans, 0); if (ret) { btrfs_end_transaction(trans); return ret; @@ -2008,12 +2002,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) run_it = 1; mutex_unlock(&fs_info->ro_block_group_mutex); - if (run_it) - ret = btrfs_start_dirty_block_groups(trans, fs_info); - } - if (ret) { - btrfs_end_transaction(trans); - return ret; + if (run_it) { + ret = btrfs_start_dirty_block_groups(trans); + if (ret) { + btrfs_end_transaction(trans); + return ret; + } + } } spin_lock(&fs_info->trans_lock); @@ -2061,7 +2056,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto cleanup_transaction; - ret = btrfs_run_delayed_items(trans, fs_info); + ret = btrfs_run_delayed_items(trans); if (ret) goto cleanup_transaction; @@ -2069,7 +2064,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) extwriter_counter_read(cur_trans) == 0); /* some pending stuffs might be added after the previous flush. */ - ret = btrfs_run_delayed_items(trans, fs_info); + ret = btrfs_run_delayed_items(trans); if (ret) goto cleanup_transaction; @@ -2106,7 +2101,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * deal with them in create_pending_snapshot(), which is the * core function of the snapshot creation. */ - ret = create_pending_snapshots(trans, fs_info); + ret = create_pending_snapshots(trans); if (ret) { mutex_unlock(&fs_info->reloc_mutex); goto scrub_continue; @@ -2122,13 +2117,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * because all the tree which are snapshoted will be forced to COW * the nodes and leaves. 
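One refactoring pattern runs through this whole file: helpers that used to take a (trans, fs_info) pair now take only the handle and derive the fs_info from it, since btrfs_trans_handle already records which filesystem it belongs to. Shown as a before/after sketch with placeholder names:

/* before: a redundant parameter that call sites could mismatch */
static int do_work_old(struct btrfs_trans_handle *trans,
		       struct btrfs_fs_info *fs_info);

/* after: the handle is the single source of truth */
static int do_work(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;

	/* ... body unchanged, fs_info used exactly as before ... */
	return 0;
}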
*/ - ret = btrfs_run_delayed_items(trans, fs_info); + ret = btrfs_run_delayed_items(trans); if (ret) { mutex_unlock(&fs_info->reloc_mutex); goto scrub_continue; } - ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) { mutex_unlock(&fs_info->reloc_mutex); goto scrub_continue; @@ -2157,7 +2152,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ mutex_lock(&fs_info->tree_log_mutex); - ret = commit_fs_roots(trans, fs_info); + ret = commit_fs_roots(trans); if (ret) { mutex_unlock(&fs_info->tree_log_mutex); mutex_unlock(&fs_info->reloc_mutex); @@ -2179,7 +2174,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * commit_fs_roots() can call btrfs_save_ino_cache(), which generates * new delayed refs. Must handle them or qgroup can be wrong. */ - ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) { mutex_unlock(&fs_info->tree_log_mutex); mutex_unlock(&fs_info->reloc_mutex); @@ -2190,14 +2185,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * Since fs roots are all committed, we can get a quite accurate * new_roots. So let's do quota accounting. */ - ret = btrfs_qgroup_account_extents(trans, fs_info); + ret = btrfs_qgroup_account_extents(trans); if (ret < 0) { mutex_unlock(&fs_info->tree_log_mutex); mutex_unlock(&fs_info->reloc_mutex); goto scrub_continue; } - ret = commit_cowonly_roots(trans, fs_info); + ret = commit_cowonly_roots(trans); if (ret) { mutex_unlock(&fs_info->tree_log_mutex); mutex_unlock(&fs_info->reloc_mutex); @@ -2229,7 +2224,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) list_add_tail(&fs_info->chunk_root->dirty_list, &cur_trans->switch_commits); - switch_commit_roots(cur_trans, fs_info); + switch_commit_roots(cur_trans); ASSERT(list_empty(&cur_trans->dirty_bgs)); ASSERT(list_empty(&cur_trans->io_bgs)); @@ -2241,7 +2236,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) sizeof(*fs_info->super_copy)); btrfs_update_commit_device_size(fs_info); - btrfs_update_commit_device_bytes_used(fs_info, cur_trans); + btrfs_update_commit_device_bytes_used(cur_trans); clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags); clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags); @@ -2256,7 +2251,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wake_up(&fs_info->transaction_wait); - ret = btrfs_write_and_wait_transaction(trans, fs_info); + ret = btrfs_write_and_wait_transaction(trans); if (ret) { btrfs_handle_fs_error(fs_info, ret, "Error while writing out transaction"); @@ -2273,7 +2268,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto scrub_continue; - btrfs_finish_extent_commit(trans, fs_info); + btrfs_finish_extent_commit(trans); if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) btrfs_clear_space_info_full(fs_info); @@ -2319,13 +2314,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) scrub_continue: btrfs_scrub_continue(fs_info); cleanup_transaction: - btrfs_trans_release_metadata(trans, fs_info); + btrfs_trans_release_metadata(trans); btrfs_trans_release_chunk_metadata(trans); trans->block_rsv = NULL; btrfs_warn(fs_info, "Skipping commit of aborted transaction."); if (current->journal_info == trans) current->journal_info = NULL; - cleanup_transaction(trans, trans->root, ret); + cleanup_transaction(trans, ret); return ret; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h 
index 6beee072b1bd..b6c94ce33503 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -69,6 +69,22 @@ struct btrfs_transaction { struct list_head pending_chunks; struct list_head switch_commits; struct list_head dirty_bgs; + + /* + * There is no explicit lock which protects io_bgs, rather its + * consistency is implied by the fact that all the sites which modify + * it do so under some form of transaction critical section, namely: + * + * - btrfs_start_dirty_block_groups - This function can only ever be + * run by one of the transaction committers. Refer to + * BTRFS_TRANS_DIRTY_BG_RUN usage in btrfs_commit_transaction + * + * - btrfs_write_dirty_blockgroups - this is called by + * commit_cowonly_roots from transaction critical section + * (TRANS_STATE_COMMIT_DOING) + * + * - btrfs_cleanup_dirty_bgs - called on transaction abort + */ struct list_head io_bgs; struct list_head dropped_roots; @@ -89,21 +105,18 @@ struct btrfs_transaction { #define __TRANS_FREEZABLE (1U << 0) -#define __TRANS_USERSPACE (1U << 8) #define __TRANS_START (1U << 9) #define __TRANS_ATTACH (1U << 10) #define __TRANS_JOIN (1U << 11) #define __TRANS_JOIN_NOLOCK (1U << 12) #define __TRANS_DUMMY (1U << 13) -#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE) #define TRANS_START (__TRANS_START | __TRANS_FREEZABLE) #define TRANS_ATTACH (__TRANS_ATTACH) #define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE) #define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK) -#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \ - __TRANS_ATTACH) +#define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH) #define BTRFS_SEND_TRANS_STUB ((void *)1) @@ -186,15 +199,11 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( struct btrfs_root *root, unsigned int num_items, int min_factor); -struct btrfs_trans_handle *btrfs_start_transaction_lflush( - struct btrfs_root *root, - unsigned int num_items); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_attach_transaction_barrier( struct btrfs_root *root); -struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid); void btrfs_add_dead_root(struct btrfs_root *root); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index c3c8d48f6618..8871286c1a91 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -30,7 +30,6 @@ #include "tree-checker.h" #include "disk-io.h" #include "compression.h" -#include "hash.h" /* * Error message should follow the following format: @@ -53,7 +52,8 @@ * Allows callers to customize the output. */ __printf(4, 5) -static void generic_err(const struct btrfs_root *root, +__cold +static void generic_err(const struct btrfs_fs_info *fs_info, const struct extent_buffer *eb, int slot, const char *fmt, ...) { @@ -65,10 +65,10 @@ static void generic_err(const struct btrfs_root *root, vaf.fmt = fmt; vaf.va = &args; - btrfs_crit(root->fs_info, + btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", - root->objectid, btrfs_header_bytenr(eb), slot, &vaf); + btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, &vaf); va_end(args); } @@ -77,7 +77,8 @@ static void generic_err(const struct btrfs_root *root, * offset has its own meaning. 
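These tree-checker hunks tag the error-reporting helpers with __cold and keep their __printf annotations. Outside kernel headers the same attributes can be spelled out directly; a small self-contained illustration (demo_err is a made-up name) that builds with GCC or Clang:

#include <stdarg.h>
#include <stdio.h>

/* format(printf, m, n): argument m is the format string and argument n is
 * the first variadic argument, so the compiler type-checks every call site.
 * cold: hints that this path is rarely taken, keeping it off the hot path. */
#define __printf(a, b) __attribute__((format(printf, a, b)))
#define __cold         __attribute__((cold))

__printf(2, 3) __cold
static void demo_err(int slot, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	fprintf(stderr, "corrupt leaf: slot=%d, ", slot);
	vfprintf(stderr, fmt, args);
	fputc('\n', stderr);
	va_end(args);
}

int main(void)
{
	demo_err(3, "invalid item size, have %u expect %zu", 40u, sizeof(int));
	/* demo_err(3, "bad %s", 42); would now warn under -Wformat */
	return 0;
}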
*/ __printf(4, 5) -static void file_extent_err(const struct btrfs_root *root, +__cold +static void file_extent_err(const struct btrfs_fs_info *fs_info, const struct extent_buffer *eb, int slot, const char *fmt, ...) { @@ -91,10 +92,11 @@ static void file_extent_err(const struct btrfs_root *root, vaf.fmt = fmt; vaf.va = &args; - btrfs_crit(root->fs_info, + btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV", - btrfs_header_level(eb) == 0 ? "leaf" : "node", root->objectid, - btrfs_header_bytenr(eb), slot, key.objectid, key.offset, &vaf); + btrfs_header_level(eb) == 0 ? "leaf" : "node", + btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, + key.objectid, key.offset, &vaf); va_end(args); } @@ -102,26 +104,26 @@ static void file_extent_err(const struct btrfs_root *root, * Return 0 if the btrfs_file_extent_##name is aligned to @alignment * Else return 1 */ -#define CHECK_FE_ALIGNED(root, leaf, slot, fi, name, alignment) \ +#define CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, name, alignment) \ ({ \ if (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))) \ - file_extent_err((root), (leaf), (slot), \ + file_extent_err((fs_info), (leaf), (slot), \ "invalid %s for file extent, have %llu, should be aligned to %u", \ (#name), btrfs_file_extent_##name((leaf), (fi)), \ (alignment)); \ (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))); \ }) -static int check_extent_data_item(struct btrfs_root *root, +static int check_extent_data_item(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_file_extent_item *fi; - u32 sectorsize = root->fs_info->sectorsize; + u32 sectorsize = fs_info->sectorsize; u32 item_size = btrfs_item_size_nr(leaf, slot); if (!IS_ALIGNED(key->offset, sectorsize)) { - file_extent_err(root, leaf, slot, + file_extent_err(fs_info, leaf, slot, "unaligned file_offset for file extent, have %llu should be aligned to %u", key->offset, sectorsize); return -EUCLEAN; @@ -130,7 +132,7 @@ static int check_extent_data_item(struct btrfs_root *root, fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) { - file_extent_err(root, leaf, slot, + file_extent_err(fs_info, leaf, slot, "invalid type for file extent, have %u expect range [0, %u]", btrfs_file_extent_type(leaf, fi), BTRFS_FILE_EXTENT_TYPES); @@ -142,14 +144,14 @@ static int check_extent_data_item(struct btrfs_root *root, * and must be caught in open_ctree(). 
*/ if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) { - file_extent_err(root, leaf, slot, + file_extent_err(fs_info, leaf, slot, "invalid compression for file extent, have %u expect range [0, %u]", btrfs_file_extent_compression(leaf, fi), BTRFS_COMPRESS_TYPES); return -EUCLEAN; } if (btrfs_file_extent_encryption(leaf, fi)) { - file_extent_err(root, leaf, slot, + file_extent_err(fs_info, leaf, slot, "invalid encryption for file extent, have %u expect 0", btrfs_file_extent_encryption(leaf, fi)); return -EUCLEAN; @@ -157,7 +159,7 @@ static int check_extent_data_item(struct btrfs_root *root, if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { /* Inline extent must have 0 as key offset */ if (key->offset) { - file_extent_err(root, leaf, slot, + file_extent_err(fs_info, leaf, slot, "invalid file_offset for inline file extent, have %llu expect 0", key->offset); return -EUCLEAN; @@ -171,7 +173,7 @@ static int check_extent_data_item(struct btrfs_root *root, /* Uncompressed inline extent size must match item size */ if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + btrfs_file_extent_ram_bytes(leaf, fi)) { - file_extent_err(root, leaf, slot, + file_extent_err(fs_info, leaf, slot, "invalid ram_bytes for uncompressed inline extent, have %u expect %llu", item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START + btrfs_file_extent_ram_bytes(leaf, fi)); @@ -182,40 +184,41 @@ static int check_extent_data_item(struct btrfs_root *root, /* Regular or preallocated extent has fixed item size */ if (item_size != sizeof(*fi)) { - file_extent_err(root, leaf, slot, + file_extent_err(fs_info, leaf, slot, "invalid item size for reg/prealloc file extent, have %u expect %zu", item_size, sizeof(*fi)); return -EUCLEAN; } - if (CHECK_FE_ALIGNED(root, leaf, slot, fi, ram_bytes, sectorsize) || - CHECK_FE_ALIGNED(root, leaf, slot, fi, disk_bytenr, sectorsize) || - CHECK_FE_ALIGNED(root, leaf, slot, fi, disk_num_bytes, sectorsize) || - CHECK_FE_ALIGNED(root, leaf, slot, fi, offset, sectorsize) || - CHECK_FE_ALIGNED(root, leaf, slot, fi, num_bytes, sectorsize)) + if (CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, ram_bytes, sectorsize) || + CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, disk_bytenr, sectorsize) || + CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, disk_num_bytes, sectorsize) || + CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, offset, sectorsize) || + CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, num_bytes, sectorsize)) return -EUCLEAN; return 0; } -static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf, - struct btrfs_key *key, int slot) +static int check_csum_item(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, struct btrfs_key *key, + int slot) { - u32 sectorsize = root->fs_info->sectorsize; - u32 csumsize = btrfs_super_csum_size(root->fs_info->super_copy); + u32 sectorsize = fs_info->sectorsize; + u32 csumsize = btrfs_super_csum_size(fs_info->super_copy); if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) { - generic_err(root, leaf, slot, + generic_err(fs_info, leaf, slot, "invalid key objectid for csum item, have %llu expect %llu", key->objectid, BTRFS_EXTENT_CSUM_OBJECTID); return -EUCLEAN; } if (!IS_ALIGNED(key->offset, sectorsize)) { - generic_err(root, leaf, slot, + generic_err(fs_info, leaf, slot, "unaligned key offset for csum item, have %llu should be aligned to %u", key->offset, sectorsize); return -EUCLEAN; } if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) { - generic_err(root, leaf, slot, + generic_err(fs_info, leaf, slot, "unaligned item size for 
csum item, have %u should be aligned to %u", btrfs_item_size_nr(leaf, slot), csumsize); return -EUCLEAN; @@ -228,7 +231,8 @@ static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf, * which represents inode number */ __printf(4, 5) -static void dir_item_err(const struct btrfs_root *root, +__cold +static void dir_item_err(const struct btrfs_fs_info *fs_info, const struct extent_buffer *eb, int slot, const char *fmt, ...) { @@ -242,14 +246,15 @@ static void dir_item_err(const struct btrfs_root *root, vaf.fmt = fmt; vaf.va = &args; - btrfs_crit(root->fs_info, + btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV", - btrfs_header_level(eb) == 0 ? "leaf" : "node", root->objectid, - btrfs_header_bytenr(eb), slot, key.objectid, &vaf); + btrfs_header_level(eb) == 0 ? "leaf" : "node", + btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, + key.objectid, &vaf); va_end(args); } -static int check_dir_item(struct btrfs_root *root, +static int check_dir_item(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, struct btrfs_key *key, int slot) { @@ -268,7 +273,7 @@ static int check_dir_item(struct btrfs_root *root, /* header itself should not cross item boundary */ if (cur + sizeof(*di) > item_size) { - dir_item_err(root, leaf, slot, + dir_item_err(fs_info, leaf, slot, "dir item header crosses item boundary, have %zu boundary %u", cur + sizeof(*di), item_size); return -EUCLEAN; @@ -277,7 +282,7 @@ static int check_dir_item(struct btrfs_root *root, /* dir type check */ dir_type = btrfs_dir_type(leaf, di); if (dir_type >= BTRFS_FT_MAX) { - dir_item_err(root, leaf, slot, + dir_item_err(fs_info, leaf, slot, "invalid dir item type, have %u expect [0, %u)", dir_type, BTRFS_FT_MAX); return -EUCLEAN; @@ -285,14 +290,14 @@ static int check_dir_item(struct btrfs_root *root, if (key->type == BTRFS_XATTR_ITEM_KEY && dir_type != BTRFS_FT_XATTR) { - dir_item_err(root, leaf, slot, + dir_item_err(fs_info, leaf, slot, "invalid dir item type for XATTR key, have %u expect %u", dir_type, BTRFS_FT_XATTR); return -EUCLEAN; } if (dir_type == BTRFS_FT_XATTR && key->type != BTRFS_XATTR_ITEM_KEY) { - dir_item_err(root, leaf, slot, + dir_item_err(fs_info, leaf, slot, "xattr dir type found for non-XATTR key"); return -EUCLEAN; } @@ -305,21 +310,21 @@ static int check_dir_item(struct btrfs_root *root, name_len = btrfs_dir_name_len(leaf, di); data_len = btrfs_dir_data_len(leaf, di); if (name_len > max_name_len) { - dir_item_err(root, leaf, slot, + dir_item_err(fs_info, leaf, slot, "dir item name len too long, have %u max %u", name_len, max_name_len); return -EUCLEAN; } - if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info)) { - dir_item_err(root, leaf, slot, + if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(fs_info)) { + dir_item_err(fs_info, leaf, slot, "dir item name and data len too long, have %u max %u", name_len + data_len, - BTRFS_MAX_XATTR_SIZE(root->fs_info)); + BTRFS_MAX_XATTR_SIZE(fs_info)); return -EUCLEAN; } if (data_len && dir_type != BTRFS_FT_XATTR) { - dir_item_err(root, leaf, slot, + dir_item_err(fs_info, leaf, slot, "dir item with invalid data len, have %u expect 0", data_len); return -EUCLEAN; @@ -329,7 +334,7 @@ static int check_dir_item(struct btrfs_root *root, /* header and name/data should not cross item boundary */ if (cur + total_size > item_size) { - dir_item_err(root, leaf, slot, + dir_item_err(fs_info, leaf, slot, "dir item data crosses item boundary, have %u boundary %u", cur + total_size, item_size); return -EUCLEAN; @@ -347,7 +352,7 @@ 
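
The dir-item checks above all follow one pattern: walk variable-size records packed inside a single item, proving that each header and its name/data payload lie inside the item before touching them. A self-contained sketch of that pattern, with simplified stand-in types (struct dir_rec and walk_records are hypothetical names, not kernel code):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct dir_rec {		/* stand-in for the btrfs_dir_item header */
	uint16_t name_len;
	uint16_t data_len;
};

static int walk_records(const uint8_t *item, uint32_t item_size)
{
	uint32_t cur = 0;

	while (cur < item_size) {
		struct dir_rec rec;
		uint32_t total;

		/* the header itself must not cross the item boundary */
		if (cur + sizeof(rec) > item_size)
			return -1;
		memcpy(&rec, item + cur, sizeof(rec));

		/* nor may the header plus its name/data payload */
		total = sizeof(rec) + rec.name_len + rec.data_len;
		if (cur + total > item_size)
			return -1;

		cur += total;
	}
	return 0;
}

int main(void)
{
	uint8_t item[8] = { 0 };	/* two empty records */

	printf("%d\n", walk_records(item, sizeof(item)));
	return 0;
}
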
static int check_dir_item(struct btrfs_root *root, (unsigned long)(di + 1), name_len); name_hash = btrfs_name_hash(namebuf, name_len); if (key->offset != name_hash) { - dir_item_err(root, leaf, slot, + dir_item_err(fs_info, leaf, slot, "name hash mismatch with key, have 0x%016x expect 0x%016llx", name_hash, key->offset); return -EUCLEAN; @@ -362,7 +367,7 @@ static int check_dir_item(struct btrfs_root *root, /* * Common point to switch the item-specific validation. */ -static int check_leaf_item(struct btrfs_root *root, +static int check_leaf_item(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, struct btrfs_key *key, int slot) { @@ -370,24 +375,23 @@ static int check_leaf_item(struct btrfs_root *root, switch (key->type) { case BTRFS_EXTENT_DATA_KEY: - ret = check_extent_data_item(root, leaf, key, slot); + ret = check_extent_data_item(fs_info, leaf, key, slot); break; case BTRFS_EXTENT_CSUM_KEY: - ret = check_csum_item(root, leaf, key, slot); + ret = check_csum_item(fs_info, leaf, key, slot); break; case BTRFS_DIR_ITEM_KEY: case BTRFS_DIR_INDEX_KEY: case BTRFS_XATTR_ITEM_KEY: - ret = check_dir_item(root, leaf, key, slot); + ret = check_dir_item(fs_info, leaf, key, slot); break; } return ret; } -static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf, +static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, bool check_item_data) { - struct btrfs_fs_info *fs_info = root->fs_info; /* No valid key type is 0, so all key should be larger than this key */ struct btrfs_key prev_key = {0, 0, 0}; struct btrfs_key key; @@ -420,7 +424,7 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf, eb = btrfs_root_node(check_root); /* if leaf is the root, then it's fine */ if (leaf != eb) { - generic_err(check_root, leaf, 0, + generic_err(fs_info, leaf, 0, "invalid nritems, have %u should not be 0 for non-root leaf", nritems); free_extent_buffer(eb); @@ -453,7 +457,7 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf, /* Make sure the keys are in the right order */ if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) { - generic_err(root, leaf, slot, + generic_err(fs_info, leaf, slot, "bad key order, prev (%llu %u %llu) current (%llu %u %llu)", prev_key.objectid, prev_key.type, prev_key.offset, key.objectid, key.type, @@ -472,7 +476,7 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf, item_end_expected = btrfs_item_offset_nr(leaf, slot - 1); if (btrfs_item_end_nr(leaf, slot) != item_end_expected) { - generic_err(root, leaf, slot, + generic_err(fs_info, leaf, slot, "unexpected item end, have %u expect %u", btrfs_item_end_nr(leaf, slot), item_end_expected); @@ -486,7 +490,7 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf, */ if (btrfs_item_end_nr(leaf, slot) > BTRFS_LEAF_DATA_SIZE(fs_info)) { - generic_err(root, leaf, slot, + generic_err(fs_info, leaf, slot, "slot end outside of leaf, have %u expect range [0, %u]", btrfs_item_end_nr(leaf, slot), BTRFS_LEAF_DATA_SIZE(fs_info)); @@ -496,7 +500,7 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf, /* Also check if the item pointer overlaps with btrfs item. 
*/ if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) > btrfs_item_ptr_offset(leaf, slot)) { - generic_err(root, leaf, slot, + generic_err(fs_info, leaf, slot, "slot overlaps with its data, item end %lu data start %lu", btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item), @@ -509,7 +513,7 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf, * Check if the item size and content meet other * criteria */ - ret = check_leaf_item(root, leaf, &key, slot); + ret = check_leaf_item(fs_info, leaf, &key, slot); if (ret < 0) return ret; } @@ -522,18 +526,19 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf, return 0; } -int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf) +int btrfs_check_leaf_full(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf) { - return check_leaf(root, leaf, true); + return check_leaf(fs_info, leaf, true); } -int btrfs_check_leaf_relaxed(struct btrfs_root *root, +int btrfs_check_leaf_relaxed(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf) { - return check_leaf(root, leaf, false); + return check_leaf(fs_info, leaf, false); } -int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node) +int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node) { unsigned long nr = btrfs_header_nritems(node); struct btrfs_key key, next_key; @@ -541,12 +546,12 @@ int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node) u64 bytenr; int ret = 0; - if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) { - btrfs_crit(root->fs_info, + if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info)) { + btrfs_crit(fs_info, "corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]", - root->objectid, node->start, + btrfs_header_owner(node), node->start, nr == 0 ? "small" : "large", nr, - BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)); + BTRFS_NODEPTRS_PER_BLOCK(fs_info)); return -EUCLEAN; } @@ -556,21 +561,21 @@ int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node) btrfs_node_key_to_cpu(node, &next_key, slot + 1); if (!bytenr) { - generic_err(root, node, slot, + generic_err(fs_info, node, slot, "invalid NULL node pointer"); ret = -EUCLEAN; goto out; } - if (!IS_ALIGNED(bytenr, root->fs_info->sectorsize)) { - generic_err(root, node, slot, + if (!IS_ALIGNED(bytenr, fs_info->sectorsize)) { + generic_err(fs_info, node, slot, "unaligned pointer, have %llu should be aligned to %u", - bytenr, root->fs_info->sectorsize); + bytenr, fs_info->sectorsize); ret = -EUCLEAN; goto out; } if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) { - generic_err(root, node, slot, + generic_err(fs_info, node, slot, "bad key order, current (%llu %u %llu) next (%llu %u %llu)", key.objectid, key.type, key.offset, next_key.objectid, next_key.type, diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 3d53e8d6fda0..aba542755710 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -25,14 +25,15 @@ * Will check not only the item pointers, but also every possible member * in item data. */ -int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf); +int btrfs_check_leaf_full(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf); /* * Less strict leaf checker. * Will only check item pointers, not reading item data. 
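
A compact model, with simplified types, of what btrfs_check_node() above enforces per slot: a non-NULL, sector-aligned child pointer and strictly ascending keys (btrfs_comp_cpu_keys() orders by objectid, then type, then offset). This is a hypothetical userspace sketch, not kernel code:

#include <stdint.h>
#include <stdio.h>

struct key { uint64_t objectid; uint8_t type; uint64_t offset; };

static int comp_keys(const struct key *a, const struct key *b)
{
	if (a->objectid != b->objectid)
		return a->objectid < b->objectid ? -1 : 1;
	if (a->type != b->type)
		return a->type < b->type ? -1 : 1;
	if (a->offset != b->offset)
		return a->offset < b->offset ? -1 : 1;
	return 0;
}

static int check_node(const struct key *keys, const uint64_t *ptrs,
		      unsigned long nr, uint32_t sectorsize)
{
	unsigned long slot;

	for (slot = 0; slot < nr; slot++) {
		if (!ptrs[slot])
			return -1;	/* invalid NULL node pointer */
		/* alignment test assumes a power-of-two sectorsize */
		if (ptrs[slot] & (sectorsize - 1))
			return -1;	/* unaligned pointer */
		if (slot + 1 < nr &&
		    comp_keys(&keys[slot], &keys[slot + 1]) >= 0)
			return -1;	/* bad key order */
	}
	return 0;
}

int main(void)
{
	struct key keys[2] = { { 256, 1, 0 }, { 257, 1, 0 } };
	uint64_t ptrs[2] = { 4096, 8192 };

	printf("%d\n", check_node(keys, ptrs, 2, 4096));
	return 0;
}
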
*/ -int btrfs_check_leaf_relaxed(struct btrfs_root *root, +int btrfs_check_leaf_relaxed(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf); -int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node); +int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node); #endif diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index cb65089127cc..c09dbe4bd6e7 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -39,7 +39,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, int level; int next_key_ret = 0; u64 last_ret = 0; - u64 min_trans = 0; if (root->fs_info->extent_root == root) { /* @@ -81,7 +80,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, path->keep_locks = 1; - ret = btrfs_search_forward(root, &key, path, min_trans); + ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); if (ret < 0) goto out; if (ret > 0) { @@ -130,7 +129,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, */ path->slots[1] = btrfs_header_nritems(path->nodes[1]); next_key_ret = btrfs_find_next_key(root, path, &key, 1, - min_trans); + BTRFS_OLDEST_GENERATION); if (next_key_ret == 0) { memcpy(&root->defrag_progress, &key, sizeof(key)); ret = -EAGAIN; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4fd19b4d6675..c91babc6aa4b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -21,12 +21,12 @@ #include <linux/blkdev.h> #include <linux/list_sort.h> #include <linux/iversion.h> +#include "ctree.h" #include "tree-log.h" #include "disk-io.h" #include "locking.h" #include "print-tree.h" #include "backref.h" -#include "hash.h" #include "compression.h" #include "qgroup.h" #include "inode-map.h" @@ -286,7 +286,7 @@ struct walk_control { * inside it */ int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, - struct walk_control *wc, u64 gen); + struct walk_control *wc, u64 gen, int level); }; /* @@ -294,7 +294,7 @@ struct walk_control { */ static int process_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, - struct walk_control *wc, u64 gen) + struct walk_control *wc, u64 gen, int level) { struct btrfs_fs_info *fs_info = log->fs_info; int ret = 0; @@ -304,7 +304,7 @@ static int process_one_buffer(struct btrfs_root *log, * pin down any logged extents, so we have to read the block. 
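
btrfs_read_buffer() now takes the expected level and, optionally, the expected first key of the block being read; a NULL key, as in the call below, skips that half of the check. A simplified standalone model of the kind of verification these extra arguments enable (verify_child and its types are hypothetical, assumed semantics):

#include <stdint.h>
#include <stdio.h>

struct key { uint64_t objectid; uint8_t type; uint64_t offset; };

static int key_eq(const struct key *a, const struct key *b)
{
	return a->objectid == b->objectid && a->type == b->type &&
	       a->offset == b->offset;
}

/*
 * Reject a child block whose level or first key does not match what the
 * parent recorded for it; a NULL expected key skips that part.
 */
static int verify_child(int found_level, int expect_level,
			const struct key *found_first,
			const struct key *expect_first)
{
	if (found_level != expect_level)
		return -1;		/* wrong level: stale or corrupt */
	if (expect_first && !key_eq(found_first, expect_first))
		return -1;		/* first key mismatch */
	return 0;
}

int main(void)
{
	struct key first = { 256, 1, 0 };

	printf("%d\n", verify_child(0, 0, &first, &first));	/* ok */
	printf("%d\n", verify_child(1, 0, &first, NULL));	/* bad level */
	return 0;
}
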
*/ if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { - ret = btrfs_read_buffer(eb, gen); + ret = btrfs_read_buffer(eb, gen, level, NULL); if (ret) return ret; } @@ -853,7 +853,6 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_dir_item *di) { - struct btrfs_fs_info *fs_info = root->fs_info; struct inode *inode; char *name; int name_len; @@ -887,7 +886,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; else - ret = btrfs_run_delayed_items(trans, fs_info); + ret = btrfs_run_delayed_items(trans); out: kfree(name); iput(inode); @@ -967,7 +966,9 @@ static noinline int backref_in_log(struct btrfs_root *log, ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); if (key->type == BTRFS_INODE_EXTREF_KEY) { - if (btrfs_find_name_in_ext_backref(path, ref_objectid, + if (btrfs_find_name_in_ext_backref(path->nodes[0], + path->slots[0], + ref_objectid, name, namelen, NULL)) match = 1; @@ -1005,7 +1006,6 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, u64 ref_index, char *name, int namelen, int *search_done) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret; char *victim_name; int victim_name_len; @@ -1063,7 +1063,7 @@ again: kfree(victim_name); if (ret) return ret; - ret = btrfs_run_delayed_items(trans, fs_info); + ret = btrfs_run_delayed_items(trans); if (ret) return ret; *search_done = 1; @@ -1134,8 +1134,7 @@ again: victim_name_len); if (!ret) ret = btrfs_run_delayed_items( - trans, - fs_info); + trans); } iput(victim_parent); kfree(victim_name); @@ -1191,7 +1190,8 @@ static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, read_extent_buffer(eb, *name, (unsigned long)&extref->name, *namelen); - *index = btrfs_inode_extref_index(eb, extref); + if (index) + *index = btrfs_inode_extref_index(eb, extref); if (parent_objectid) *parent_objectid = btrfs_inode_extref_parent(eb, extref); @@ -1212,12 +1212,102 @@ static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); - *index = btrfs_inode_ref_index(eb, ref); + if (index) + *index = btrfs_inode_ref_index(eb, ref); return 0; } /* + * Take an inode reference item from the log tree and iterate all names from the + * inode reference item in the subvolume tree with the same key (if it exists). + * For any name that is not in the inode reference item from the log tree, do a + * proper unlink of that name (that is, remove its entry from the inode + * reference item and both dir index keys). 
+ */ +static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_inode *inode, + struct extent_buffer *log_eb, + int log_slot, + struct btrfs_key *key) +{ + int ret; + unsigned long ref_ptr; + unsigned long ref_end; + struct extent_buffer *eb; + +again: + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret > 0) { + ret = 0; + goto out; + } + if (ret < 0) + goto out; + + eb = path->nodes[0]; + ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); + ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]); + while (ref_ptr < ref_end) { + char *name = NULL; + int namelen; + u64 parent_id; + + if (key->type == BTRFS_INODE_EXTREF_KEY) { + ret = extref_get_fields(eb, ref_ptr, &namelen, &name, + NULL, &parent_id); + } else { + parent_id = key->offset; + ret = ref_get_fields(eb, ref_ptr, &namelen, &name, + NULL); + } + if (ret) + goto out; + + if (key->type == BTRFS_INODE_EXTREF_KEY) + ret = btrfs_find_name_in_ext_backref(log_eb, log_slot, + parent_id, name, + namelen, NULL); + else + ret = btrfs_find_name_in_backref(log_eb, log_slot, name, + namelen, NULL); + + if (!ret) { + struct inode *dir; + + btrfs_release_path(path); + dir = read_one_inode(root, parent_id); + if (!dir) { + ret = -ENOENT; + kfree(name); + goto out; + } + ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), + inode, name, namelen); + kfree(name); + iput(dir); + if (ret) + goto out; + goto again; + } + + kfree(name); + ref_ptr += namelen; + if (key->type == BTRFS_INODE_EXTREF_KEY) + ref_ptr += sizeof(struct btrfs_inode_extref); + else + ref_ptr += sizeof(struct btrfs_inode_ref); + } + ret = 0; + out: + btrfs_release_path(path); + return ret; +} + +/* * replay one inode back reference item found in the log tree. * eb, slot and key refer to the buffer and key found in the log tree. * root is the destination we are replaying into, and path is for temp @@ -1345,6 +1435,19 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, } } + /* + * Before we overwrite the inode reference item in the subvolume tree + * with the item from the log tree, we must unlink all names from the + * parent directory that are in the subvolume's tree inode reference + * item, otherwise we end up with an inconsistent subvolume tree where + * dir index entries exist for a name but there is no inode reference + * item with the same name. + */ + ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot, + key); + if (ret) + goto out; + /* finally write the back reference in the inode */ ret = overwrite_item(trans, root, path, eb, slot, key); out: @@ -1992,7 +2095,6 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct inode *dir, struct btrfs_key *dir_key) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret; struct extent_buffer *eb; int slot; @@ -2056,7 +2158,7 @@ again: ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(inode), name, name_len); if (!ret) - ret = btrfs_run_delayed_items(trans, fs_info); + ret = btrfs_run_delayed_items(trans); kfree(name); iput(inode); if (ret) @@ -2304,17 +2406,16 @@ out: * back refs). 
*/ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, - struct walk_control *wc, u64 gen) + struct walk_control *wc, u64 gen, int level) { int nritems; struct btrfs_path *path; struct btrfs_root *root = wc->replay_dest; struct btrfs_key key; - int level; int i; int ret; - ret = btrfs_read_buffer(eb, gen); + ret = btrfs_read_buffer(eb, gen, level, NULL); if (ret) return ret; @@ -2431,6 +2532,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, WARN_ON(*level >= BTRFS_MAX_LEVEL); while (*level > 0) { + struct btrfs_key first_key; + WARN_ON(*level < 0); WARN_ON(*level >= BTRFS_MAX_LEVEL); cur = path->nodes[*level]; @@ -2443,6 +2546,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(cur, path->slots[*level]); ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); + btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); blocksize = fs_info->nodesize; parent = path->nodes[*level]; @@ -2453,7 +2557,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return PTR_ERR(next); if (*level == 1) { - ret = wc->process_func(root, next, wc, ptr_gen); + ret = wc->process_func(root, next, wc, ptr_gen, + *level - 1); if (ret) { free_extent_buffer(next); return ret; @@ -2461,7 +2566,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, path->slots[*level]++; if (wc->free) { - ret = btrfs_read_buffer(next, ptr_gen); + ret = btrfs_read_buffer(next, ptr_gen, + *level - 1, &first_key); if (ret) { free_extent_buffer(next); return ret; @@ -2491,7 +2597,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, free_extent_buffer(next); continue; } - ret = btrfs_read_buffer(next, ptr_gen); + ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key); if (ret) { free_extent_buffer(next); return ret; @@ -2541,7 +2647,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, root_owner = btrfs_header_owner(parent); ret = wc->process_func(root, path->nodes[*level], wc, - btrfs_header_generation(path->nodes[*level])); + btrfs_header_generation(path->nodes[*level]), + *level); if (ret) return ret; @@ -2623,7 +2730,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, /* was the root node processed? if not, catch it here */ if (path->nodes[orig_level]) { ret = wc->process_func(log, path->nodes[orig_level], wc, - btrfs_header_generation(path->nodes[orig_level])); + btrfs_header_generation(path->nodes[orig_level]), + orig_level); if (ret) goto out; if (wc->free) { @@ -3866,6 +3974,7 @@ fill_holes: ASSERT(ret == 0); src = src_path->nodes[0]; i = 0; + need_find_last_extent = true; } btrfs_item_key_to_cpu(src, &key, i); @@ -3900,6 +4009,36 @@ fill_holes: break; *last_extent = extent_end; } + + /* + * Check if there is a hole between the last extent found in our leaf + * and the first extent in the next leaf. If there is one, we need to + * log an explicit hole so that at replay time we can punch the hole. 
+ */ + if (ret == 0 && + key.objectid == btrfs_ino(inode) && + key.type == BTRFS_EXTENT_DATA_KEY && + i == btrfs_header_nritems(src_path->nodes[0])) { + ret = btrfs_next_leaf(inode->root, src_path); + need_find_last_extent = true; + if (ret > 0) { + ret = 0; + } else if (ret == 0) { + btrfs_item_key_to_cpu(src_path->nodes[0], &key, + src_path->slots[0]); + if (key.objectid == btrfs_ino(inode) && + key.type == BTRFS_EXTENT_DATA_KEY && + *last_extent < key.offset) { + const u64 len = key.offset - *last_extent; + + ret = btrfs_insert_file_extent(trans, log, + btrfs_ino(inode), + *last_extent, 0, + 0, len, 0, len, + 0, 0, 0); + } + } + } /* * Need to let the callers know we dropped the path so they should * re-search. @@ -5411,7 +5550,6 @@ out: * the last committed transaction */ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, struct dentry *parent, const loff_t start, @@ -5419,6 +5557,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, int inode_only, struct btrfs_log_ctx *ctx) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct super_block *sb; struct dentry *old_parent = NULL; @@ -5444,7 +5583,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } - if (root != inode->root || btrfs_root_refs(&root->root_item) == 0) { + if (btrfs_root_refs(&root->root_item) == 0) { ret = 1; goto end_no_trans; } @@ -5576,7 +5715,7 @@ end_no_trans: * data on disk. */ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry, + struct dentry *dentry, const loff_t start, const loff_t end, struct btrfs_log_ctx *ctx) @@ -5584,8 +5723,8 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *parent = dget_parent(dentry); int ret; - ret = btrfs_log_inode_parent(trans, root, BTRFS_I(d_inode(dentry)), - parent, start, end, LOG_INODE_ALL, ctx); + ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, + start, end, LOG_INODE_ALL, ctx); dput(parent); return ret; @@ -5847,13 +5986,12 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, struct dentry *parent) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - struct btrfs_root *root = inode->root; /* * this will force the logging code to walk the dentry chain * up for the file */ - if (S_ISREG(inode->vfs_inode.i_mode)) + if (!S_ISDIR(inode->vfs_inode.i_mode)) inode->last_unlink_trans = trans->transid; /* @@ -5864,7 +6002,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) return 0; - return btrfs_log_inode_parent(trans, root, inode, parent, 0, - LLONG_MAX, LOG_INODE_EXISTS, NULL); + return btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, + LOG_INODE_EXISTS, NULL); } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 483027f9a7f4..88abc43312a1 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -65,7 +65,7 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry, + struct dentry *dentry, const loff_t start, const loff_t end, struct btrfs_log_ctx *ctx); diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 726f928238d0..9916f03430bc 100644 --- a/fs/btrfs/uuid-tree.c 
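
The hole computation in the hunk above, reduced to its arithmetic: if the first extent key in the next leaf starts past the end of the last extent we logged, the difference is the hole that must be logged explicitly. The values below are made up for illustration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t last_extent = 8192;	/* end of the last logged extent */
	uint64_t next_offset = 16384;	/* key.offset of the first extent
					   item in the next leaf */

	if (last_extent < next_offset) {
		uint64_t len = next_offset - last_extent;

		/* this is the explicit hole the log must carry */
		printf("hole: offset=%llu len=%llu\n",
		       (unsigned long long)last_extent,
		       (unsigned long long)len);
	}
	return 0;
}
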
+++ b/fs/btrfs/uuid-tree.c @@ -282,7 +282,7 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, key.offset = 0; again_search_slot: - ret = btrfs_search_forward(root, &key, path, 0); + ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); if (ret) { if (ret > 0) ret = 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2ceb924ca0d6..93f8f17cacca 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -27,6 +27,7 @@ #include <linux/raid/pq.h> #include <linux/semaphore.h> #include <linux/uuid.h> +#include <linux/list_sort.h> #include <asm/div64.h> #include "ctree.h" #include "extent_map.h" @@ -278,7 +279,7 @@ static void btrfs_kobject_uevent(struct block_device *bdev, &disk_to_dev(bdev->bd_disk)->kobj); } -void btrfs_cleanup_fs_uuids(void) +void __exit btrfs_cleanup_fs_uuids(void) { struct btrfs_fs_devices *fs_devices; @@ -708,7 +709,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { fs_devices->rw_devices++; - list_add(&device->dev_alloc_list, &fs_devices->alloc_list); + list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list); } brelse(bh); @@ -895,7 +896,11 @@ error: return ERR_PTR(-ENOMEM); } -void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step) +/* + * After we have read the system tree and know devids belonging to + * this filesystem, remove the device which does not belong there. + */ +void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) { struct btrfs_device *device, *next; struct btrfs_device *latest_dev = NULL; @@ -1103,6 +1108,20 @@ out: return ret; } +static int devid_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct btrfs_device *dev1, *dev2; + + dev1 = list_entry(a, struct btrfs_device, dev_list); + dev2 = list_entry(b, struct btrfs_device, dev_list); + + if (dev1->devid < dev2->devid) + return -1; + else if (dev1->devid > dev2->devid) + return 1; + return 0; +} + int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder) { @@ -1113,6 +1132,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fs_devices->opened++; ret = 0; } else { + list_sort(NULL, &fs_devices->devices, devid_cmp); ret = __btrfs_open_devices(fs_devices, flags, holder); } mutex_unlock(&uuid_mutex); @@ -1916,12 +1936,12 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, mutex_lock(&uuid_mutex); num_devices = fs_info->fs_devices->num_devices; - btrfs_dev_replace_lock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_lock(&fs_info->dev_replace); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { WARN_ON(num_devices < 1); num_devices--; } - btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_unlock(&fs_info->dev_replace); ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); if (ret) @@ -2047,7 +2067,7 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devices; - WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); + lockdep_assert_held(&fs_info->fs_devices->device_list_mutex); /* * in case of fs with no seed, srcdev->fs_devices will point @@ -2237,7 +2257,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) struct btrfs_device *device; u64 super_flags; - BUG_ON(!mutex_is_locked(&uuid_mutex)); + lockdep_assert_held(&uuid_mutex); if (!fs_devices->seeding) return -EINVAL; @@ -2642,7 
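
devid_cmp() above follows the usual negative/zero/positive comparator contract that list_sort() expects. The same contract shown with plain qsort() over an array of devids; this userspace analogue is illustration only:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* array analogue of the devid_cmp() comparator above */
static int devid_cmp(const void *a, const void *b)
{
	uint64_t d1 = *(const uint64_t *)a;
	uint64_t d2 = *(const uint64_t *)b;

	if (d1 < d2)
		return -1;
	if (d1 > d2)
		return 1;
	return 0;
}

int main(void)
{
	uint64_t devids[] = { 3, 1, 2 };
	int i;

	qsort(devids, 3, sizeof(devids[0]), devid_cmp);
	for (i = 0; i < 3; i++)
		printf("%llu\n", (unsigned long long)devids[i]);
	return 0;
}
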
+2662,6 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->total_bytes = btrfs_device_get_total_bytes(srcdev); device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); device->bytes_used = btrfs_device_get_bytes_used(srcdev); - ASSERT(list_empty(&srcdev->resized_list)); device->commit_total_bytes = srcdev->commit_total_bytes; device->commit_bytes_used = device->bytes_used; device->fs_info = fs_info; @@ -2666,19 +2685,6 @@ error: return ret; } -void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, - struct btrfs_device *tgtdev) -{ - u32 sectorsize = fs_info->sectorsize; - - WARN_ON(fs_info->fs_devices->rw_devices == 0); - tgtdev->io_width = sectorsize; - tgtdev->io_align = sectorsize; - tgtdev->sector_size = sectorsize; - tgtdev->fs_info = fs_info; - set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &tgtdev->dev_state); -} - static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device) { @@ -2984,7 +2990,7 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) * we release the path used to search the chunk/dev tree and before * the current task acquires this mutex and calls us. */ - ASSERT(mutex_is_locked(&fs_info->delete_unused_bgs_mutex)); + lockdep_assert_held(&fs_info->delete_unused_bgs_mutex); ret = btrfs_can_relocate(fs_info, chunk_offset); if (ret) @@ -2997,6 +3003,16 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) if (ret) return ret; + /* + * We add the kobjects here (and after forcing data chunk creation) + * since relocation is the only place we'll create chunks of a new + * type at runtime. The only place where we'll remove the last + * chunk of a type is the call immediately below this one. Even + * so, we're protected against races with the cleaner thread since + * we're covered by the delete_unused_bgs_mutex. 
+ */ + btrfs_add_raid_kobjects(fs_info); + trans = btrfs_start_trans_remove_block_group(root->fs_info, chunk_offset); if (IS_ERR(trans)) { @@ -3124,6 +3140,8 @@ static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, if (ret < 0) return ret; + btrfs_add_raid_kobjects(fs_info); + return 1; } } @@ -3892,12 +3910,12 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } num_devices = fs_info->fs_devices->num_devices; - btrfs_dev_replace_lock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_lock(&fs_info->dev_replace); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { BUG_ON(num_devices < 1); num_devices--; } - btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_unlock(&fs_info->dev_replace); allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; if (num_devices > 1) allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); @@ -4202,7 +4220,8 @@ static int btrfs_uuid_scan_kthread(void *data) key.offset = 0; while (1) { - ret = btrfs_search_forward(root, &key, path, 0); + ret = btrfs_search_forward(root, &key, path, + BTRFS_OLDEST_GENERATION); if (ret) { if (ret > 0) ret = 0; @@ -4672,7 +4691,7 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) btrfs_set_fs_incompat(info, RAID56); } -#define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE(r->fs_info) \ +#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ - sizeof(struct btrfs_chunk)) \ / sizeof(struct btrfs_stripe) + 1) @@ -4713,10 +4732,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, BUG_ON(!alloc_profile_is_valid(type, 0)); - if (list_empty(&fs_devices->alloc_list)) + if (list_empty(&fs_devices->alloc_list)) { + if (btrfs_test_opt(info, ENOSPC_DEBUG)) + btrfs_debug(info, "%s: no writable device", __func__); return -ENOSPC; + } - index = __get_raid_index(type); + index = btrfs_bg_flags_to_raid_index(type); sub_stripes = btrfs_raid_array[index].sub_stripes; dev_stripes = btrfs_raid_array[index].dev_stripes; @@ -4729,7 +4751,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_stripe_size = SZ_1G; max_chunk_size = 10 * max_stripe_size; if (!devs_max) - devs_max = BTRFS_MAX_DEVS(info->chunk_root); + devs_max = BTRFS_MAX_DEVS(info); } else if (type & BTRFS_BLOCK_GROUP_METADATA) { /* for larger filesystems, use larger metadata chunks */ if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) @@ -4738,7 +4760,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_stripe_size = SZ_256M; max_chunk_size = max_stripe_size; if (!devs_max) - devs_max = BTRFS_MAX_DEVS(info->chunk_root); + devs_max = BTRFS_MAX_DEVS(info); } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { max_stripe_size = SZ_32M; max_chunk_size = 2 * max_stripe_size; @@ -4797,8 +4819,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (ret == 0) max_avail = max_stripe_size * dev_stripes; - if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) + if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) { + if (btrfs_test_opt(info, ENOSPC_DEBUG)) + btrfs_debug(info, + "%s: devid %llu has no free space, have=%llu want=%u", + __func__, device->devid, max_avail, + BTRFS_STRIPE_LEN * dev_stripes); continue; + } if (ndevs == fs_devices->rw_devices) { WARN(1, "%s: found more than %llu devices\n", @@ -4821,18 +4849,26 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, /* round down to number of usable stripes */ ndevs = round_down(ndevs, devs_increment); - if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { + if (ndevs 
< devs_min) { ret = -ENOSPC; + if (btrfs_test_opt(info, ENOSPC_DEBUG)) { + btrfs_debug(info, + "%s: not enough devices with free space: have=%d minimum required=%d", + __func__, ndevs, devs_min); + } goto error; } ndevs = min(ndevs, devs_max); /* - * the primary goal is to maximize the number of stripes, so use as many - * devices as possible, even if the stripes are not maximum sized. + * The primary goal is to maximize the number of stripes, so use as + * many devices as possible, even if the stripes are not maximum sized. + * + * The DUP profile stores more than one stripe per device, the + * max_avail is the total size so we have to adjust. */ - stripe_size = devices_info[ndevs-1].max_avail; + stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes); num_stripes = ndevs * dev_stripes; /* @@ -4853,22 +4889,19 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, * and compare that answer with the max chunk size */ if (stripe_size * data_stripes > max_chunk_size) { - u64 mask = (1ULL << 24) - 1; - stripe_size = div_u64(max_chunk_size, data_stripes); /* bump the answer up to a 16MB boundary */ - stripe_size = (stripe_size + mask) & ~mask; + stripe_size = round_up(stripe_size, SZ_16M); - /* but don't go higher than the limits we found - * while searching for free extents + /* + * But don't go higher than the limits we found while searching + * for free extents */ - if (stripe_size > devices_info[ndevs-1].max_avail) - stripe_size = devices_info[ndevs-1].max_avail; + stripe_size = min(devices_info[ndevs - 1].max_avail, + stripe_size); } - stripe_size = div_u64(stripe_size, dev_stripes); - /* align to BTRFS_STRIPE_LEN */ stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN); @@ -5067,7 +5100,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, { u64 chunk_offset; - ASSERT(mutex_is_locked(&fs_info->chunk_mutex)); + lockdep_assert_held(&fs_info->chunk_mutex); chunk_offset = find_next_chunk(fs_info); return __btrfs_alloc_chunk(trans, chunk_offset, type); } @@ -5208,11 +5241,11 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) ret = 1; free_extent_map(em); - btrfs_dev_replace_lock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_lock(&fs_info->dev_replace); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && fs_info->dev_replace.tgtdev) ret++; - btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_unlock(&fs_info->dev_replace); return ret; } @@ -5253,13 +5286,25 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) } static int find_live_mirror(struct btrfs_fs_info *fs_info, - struct map_lookup *map, int first, int num, - int optimal, int dev_replace_is_ongoing) + struct map_lookup *map, int first, + int dev_replace_is_ongoing) { int i; + int num_stripes; + int preferred_mirror; int tolerance; struct btrfs_device *srcdev; + ASSERT((map->type & + (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))); + + if (map->type & BTRFS_BLOCK_GROUP_RAID10) + num_stripes = map->sub_stripes; + else + num_stripes = map->num_stripes; + + preferred_mirror = first + current->pid % num_stripes; + if (dev_replace_is_ongoing && fs_info->dev_replace.cont_reading_from_srcdev_mode == BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) @@ -5273,10 +5318,10 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, * mirror is available */ for (tolerance = 0; tolerance < 2; tolerance++) { - if (map->stripes[optimal].dev->bdev && - (tolerance || map->stripes[optimal].dev != srcdev)) - 
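
The chunk-sizing change above in isolation: for profiles like DUP, max_avail covers all dev_stripes copies on one device, so it is divided first; the result is then capped by the chunk limit, bumped to a 16 MiB boundary, and clamped to the free space found. A standalone sketch with invented numbers (note the kernel's round_up() additionally requires a power-of-two step):

#include <stdint.h>
#include <stdio.h>

#define SZ_1G	(1024ULL * 1024 * 1024)
#define SZ_16M	(16ULL * 1024 * 1024)
/* generic version; the kernel macro wants a power-of-two step */
#define ROUND_UP(x, y)	((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	uint64_t max_avail = 3 * SZ_1G;	/* free bytes on fullest device */
	uint64_t dev_stripes = 2;	/* e.g. DUP: two stripes/device */
	uint64_t data_stripes = 1;
	uint64_t max_chunk_size = SZ_1G;
	uint64_t stripe_size;

	/* max_avail covers all copies, so divide per stripe first */
	stripe_size = max_avail / dev_stripes;

	if (stripe_size * data_stripes > max_chunk_size) {
		stripe_size = max_chunk_size / data_stripes;
		stripe_size = ROUND_UP(stripe_size, SZ_16M);
		/* but never above the free space actually found */
		if (stripe_size > max_avail)
			stripe_size = max_avail;
	}
	printf("stripe_size=%llu\n", (unsigned long long)stripe_size);
	return 0;
}
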
return optimal; - for (i = first; i < first + num; i++) { + if (map->stripes[preferred_mirror].dev->bdev && + (tolerance || map->stripes[preferred_mirror].dev != srcdev)) + return preferred_mirror; + for (i = first; i < first + num_stripes; i++) { if (map->stripes[i].dev->bdev && (tolerance || map->stripes[i].dev != srcdev)) return i; @@ -5286,7 +5331,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, /* we couldn't find one that doesn't fail. Just return something * and the io error handling code will clean up eventually */ - return optimal; + return preferred_mirror; } static inline int parity_smaller(u64 a, u64 b) @@ -5778,10 +5823,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (!bbio_ret) goto out; - btrfs_dev_replace_lock(dev_replace, 0); + btrfs_dev_replace_read_lock(dev_replace); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); if (!dev_replace_is_ongoing) - btrfs_dev_replace_unlock(dev_replace, 0); + btrfs_dev_replace_read_unlock(dev_replace); else btrfs_dev_replace_set_lock_blocking(dev_replace); @@ -5813,8 +5858,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, stripe_index = mirror_num - 1; else { stripe_index = find_live_mirror(fs_info, map, 0, - map->num_stripes, - current->pid % map->num_stripes, dev_replace_is_ongoing); mirror_num = stripe_index + 1; } @@ -5842,8 +5885,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int old_stripe_index = stripe_index; stripe_index = find_live_mirror(fs_info, map, stripe_index, - map->sub_stripes, stripe_index + - current->pid % map->sub_stripes, dev_replace_is_ongoing); mirror_num = stripe_index - old_stripe_index + 1; } @@ -5983,7 +6024,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, out: if (dev_replace_is_ongoing) { btrfs_dev_replace_clear_lock_blocking(dev_replace); - btrfs_dev_replace_unlock(dev_replace, 0); + btrfs_dev_replace_read_unlock(dev_replace); } free_extent_map(em); return ret; @@ -6617,7 +6658,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices; int ret; - BUG_ON(!mutex_is_locked(&uuid_mutex)); + lockdep_assert_held(&uuid_mutex); ASSERT(fsid); fs_devices = fs_info->fs_devices->seed; @@ -7357,20 +7398,20 @@ void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info) } /* Must be invoked during the transaction commit */ -void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info, - struct btrfs_transaction *transaction) +void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct extent_map *em; struct map_lookup *map; struct btrfs_device *dev; int i; - if (list_empty(&transaction->pending_chunks)) + if (list_empty(&trans->pending_chunks)) return; /* In order to kick the device replace finish process */ mutex_lock(&fs_info->chunk_mutex); - list_for_each_entry(em, &transaction->pending_chunks, list) { + list_for_each_entry(em, &trans->pending_chunks, list) { map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 28c28eeadff3..d1fcaea9fef5 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -422,7 +422,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); -void btrfs_close_extra_devices(struct 
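
What the refactored find_live_mirror() above now encapsulates: derive a preferred stripe from the caller's pid, use it if its device is live, otherwise scan the remaining stripes, and as a last resort return the preference and let IO error handling cope. Simplified stand-in types; the srcdev-avoidance tolerance pass is deliberately omitted here:

#include <stdio.h>

struct stripe { int live; };	/* live == device has a bdev */

static int pick_mirror(const struct stripe *stripes, int first,
		       int num_stripes, int pid)
{
	int preferred = first + pid % num_stripes;
	int i;

	if (stripes[preferred].live)
		return preferred;
	for (i = first; i < first + num_stripes; i++)
		if (stripes[i].live)
			return i;
	/* nothing live: return something and let the IO error
	 * handling code clean up eventually */
	return preferred;
}

int main(void)
{
	struct stripe map[3] = { { 0 }, { 1 }, { 1 } };

	printf("mirror=%d\n", pick_mirror(map, 0, 3, 1234));
	return 0;
}
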
btrfs_fs_devices *fs_devices, int step); +void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info, struct btrfs_device *device, struct btrfs_device *this_dev); int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, @@ -436,7 +436,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u8 *uuid); int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, u64 devid); -void btrfs_cleanup_fs_uuids(void); +void __exit btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size); @@ -476,8 +476,6 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev); void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); -void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, - struct btrfs_device *tgtdev); void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path); int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len); @@ -546,9 +544,30 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, btrfs_dev_stat_set(dev, index, 0); } +/* + * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which + * can be used as index to access btrfs_raid_array[]. + */ +static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags) +{ + if (flags & BTRFS_BLOCK_GROUP_RAID10) + return BTRFS_RAID_RAID10; + else if (flags & BTRFS_BLOCK_GROUP_RAID1) + return BTRFS_RAID_RAID1; + else if (flags & BTRFS_BLOCK_GROUP_DUP) + return BTRFS_RAID_DUP; + else if (flags & BTRFS_BLOCK_GROUP_RAID0) + return BTRFS_RAID_RAID0; + else if (flags & BTRFS_BLOCK_GROUP_RAID5) + return BTRFS_RAID_RAID5; + else if (flags & BTRFS_BLOCK_GROUP_RAID6) + return BTRFS_RAID_RAID6; + + return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ +} + void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); -void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info, - struct btrfs_transaction *transaction); +void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans); struct list_head *btrfs_get_fs_uuids(void); void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index de7d072c78ef..e1e8177deb5e 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -33,7 +33,7 @@ #include "locking.h" -ssize_t __btrfs_getxattr(struct inode *inode, const char *name, +int btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size) { struct btrfs_dir_item *di; @@ -233,7 +233,7 @@ out: /* * @value: "" makes the attribute to empty, NULL removes it */ -int __btrfs_setxattr(struct btrfs_trans_handle *trans, +int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const void *value, size_t size, int flags) { @@ -374,7 +374,7 @@ static int btrfs_xattr_handler_get(const struct xattr_handler *handler, const char *name, void *buffer, size_t size) { name = xattr_full_name(handler, name); - return __btrfs_getxattr(inode, name, buffer, size); + return btrfs_getxattr(inode, name, buffer, size); } static int btrfs_xattr_handler_set(const struct xattr_handler *handler, @@ -383,7 +383,7 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler, 
size_t size, int flags) { name = xattr_full_name(handler, name); - return __btrfs_setxattr(NULL, inode, name, buffer, size, flags); + return btrfs_setxattr(NULL, inode, name, buffer, size, flags); } static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, @@ -448,8 +448,8 @@ static int btrfs_initxattrs(struct inode *inode, } strcpy(name, XATTR_SECURITY_PREFIX); strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); - err = __btrfs_setxattr(trans, inode, name, - xattr->value, xattr->value_len, 0); + err = btrfs_setxattr(trans, inode, name, xattr->value, + xattr->value_len, 0); kfree(name); if (err < 0) break; diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 15fc4743dc70..e215a3212a2a 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -23,13 +23,14 @@ extern const struct xattr_handler *btrfs_xattr_handlers[]; -extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, +int btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size); -extern int __btrfs_setxattr(struct btrfs_trans_handle *trans, +int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const void *value, size_t size, int flags); +ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); -extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, +int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, const struct qstr *qstr); diff --git a/fs/buffer.c b/fs/buffer.c index 9a73924db22f..ec5dd39071e6 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1511,7 +1511,7 @@ void block_invalidatepage(struct page *page, unsigned int offset, * The get_block cached value has been unconditionally invalidated, * so real IO is not possible anymore. 
*/ - if (offset == 0) + if (length == PAGE_SIZE) try_to_release_page(page, 0); out: return; diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index e7f16a77a22a..222bc5d8b62c 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -32,7 +32,7 @@ static struct fscache_object *cachefiles_alloc_object( struct cachefiles_cache *cache; struct cachefiles_xattr *auxdata; unsigned keylen, auxlen; - void *buffer; + void *buffer, *p; char *key; cache = container_of(_cache, struct cachefiles_cache, cache); @@ -65,8 +65,12 @@ static struct fscache_object *cachefiles_alloc_object( if (!buffer) goto nomem_buffer; - keylen = cookie->def->get_key(cookie->netfs_data, buffer + 2, 512); - ASSERTCMP(keylen, <, 512); + keylen = cookie->key_len; + if (keylen <= sizeof(cookie->inline_key)) + p = cookie->inline_key; + else + p = cookie->key; + memcpy(buffer + 2, p, keylen); *(uint16_t *)buffer = keylen; ((char *)buffer)[keylen + 2] = 0; @@ -80,15 +84,17 @@ static struct fscache_object *cachefiles_alloc_object( /* get hold of the auxiliary data and prepend the object type */ auxdata = buffer; - auxlen = 0; - if (cookie->def->get_aux) { - auxlen = cookie->def->get_aux(cookie->netfs_data, - auxdata->data, 511); - ASSERTCMP(auxlen, <, 511); + auxlen = cookie->aux_len; + if (auxlen) { + if (auxlen <= sizeof(cookie->inline_aux)) + p = cookie->inline_aux; + else + p = cookie->aux; + memcpy(auxdata->data, p, auxlen); } auxdata->len = auxlen + 1; - auxdata->type = cookie->def->type; + auxdata->type = cookie->type; lookup_data->auxdata = auxdata; lookup_data->key = key; @@ -177,10 +183,12 @@ static void cachefiles_lookup_complete(struct fscache_object *_object) * increment the usage count on an inode object (may fail if unmounting) */ static -struct fscache_object *cachefiles_grab_object(struct fscache_object *_object) +struct fscache_object *cachefiles_grab_object(struct fscache_object *_object, + enum fscache_obj_ref_trace why) { struct cachefiles_object *object = container_of(_object, struct cachefiles_object, fscache); + int u; _enter("{OBJ%x,%d}", _object->debug_id, atomic_read(&object->usage)); @@ -188,7 +196,9 @@ struct fscache_object *cachefiles_grab_object(struct fscache_object *_object) ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); #endif - atomic_inc(&object->usage); + u = atomic_inc_return(&object->usage); + trace_cachefiles_ref(object, _object->cookie, + (enum cachefiles_obj_ref_trace)why, u); return &object->fscache; } @@ -202,6 +212,7 @@ static void cachefiles_update_object(struct fscache_object *_object) struct cachefiles_cache *cache; struct fscache_cookie *cookie; const struct cred *saved_cred; + const void *aux; unsigned auxlen; _enter("{OBJ%x}", _object->debug_id); @@ -216,26 +227,29 @@ static void cachefiles_update_object(struct fscache_object *_object) } cookie = object->fscache.cookie; + auxlen = cookie->aux_len; - if (!cookie->def->get_aux) { + if (!auxlen) { fscache_unuse_cookie(_object); _leave(" [no aux]"); return; } - auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp); + auxdata = kmalloc(2 + auxlen + 3, cachefiles_gfp); if (!auxdata) { fscache_unuse_cookie(_object); _leave(" [nomem]"); return; } - auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511); + aux = (auxlen <= sizeof(cookie->inline_aux)) ? 
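
The one-line buffer.c fix above, expressed as a predicate: buffers may only be dropped when the invalidated range spans the whole page; the old offset == 0 test also fired for partial invalidations that merely started at the head of the page. Sketch assuming a fixed 4 KiB page:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096u

static bool may_release(unsigned int offset, unsigned int length)
{
	/* old test was offset == 0, which also holds for a partial
	 * invalidation that merely starts at the head of the page */
	return length == PAGE_SIZE;
}

int main(void)
{
	printf("partial [0,2048): %d\n", may_release(0, 2048));	     /* 0 */
	printf("full    [0,4096): %d\n", may_release(0, PAGE_SIZE)); /* 1 */
	return 0;
}
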
+ cookie->inline_aux : cookie->aux; + + memcpy(auxdata->data, aux, auxlen); fscache_unuse_cookie(_object); - ASSERTCMP(auxlen, <, 511); auxdata->len = auxlen + 1; - auxdata->type = cookie->def->type; + auxdata->type = cookie->type; cachefiles_begin_secure(cache, &saved_cred); cachefiles_update_object_xattr(object, auxdata); @@ -309,10 +323,12 @@ static void cachefiles_drop_object(struct fscache_object *_object) /* * dispose of a reference to an object */ -static void cachefiles_put_object(struct fscache_object *_object) +static void cachefiles_put_object(struct fscache_object *_object, + enum fscache_obj_ref_trace why) { struct cachefiles_object *object; struct fscache_cache *cache; + int u; ASSERT(_object); @@ -328,7 +344,11 @@ static void cachefiles_put_object(struct fscache_object *_object) ASSERTIFCMP(object->fscache.parent, object->fscache.parent->n_children, >, 0); - if (atomic_dec_and_test(&object->usage)) { + u = atomic_dec_return(&object->usage); + trace_cachefiles_ref(object, _object->cookie, + (enum cachefiles_obj_ref_trace)why, u); + ASSERTCMP(u, !=, -1); + if (u == 0) { _debug("- kill object OBJ%x", object->fscache.debug_id); ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); @@ -421,7 +441,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object) loff_t oi_size; int ret; - _object->cookie->def->get_attr(_object->cookie->netfs_data, &ni_size); + ni_size = _object->store_limit_l; _enter("{OBJ%x},[%llu]", _object->debug_id, (unsigned long long) ni_size); @@ -493,8 +513,7 @@ static void cachefiles_invalidate_object(struct fscache_operation *op) cache = container_of(object->fscache.cache, struct cachefiles_cache, cache); - op->object->cookie->def->get_attr(op->object->cookie->netfs_data, - &ni_size); + ni_size = op->object->store_limit_l; _enter("{OBJ%x},[%llu]", op->object->debug_id, (unsigned long long)ni_size); diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index bb3a02ca9da4..d2f6f996e65a 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -124,6 +124,8 @@ struct cachefiles_xattr { uint8_t data[]; }; +#include <trace/events/cachefiles.h> + /* * note change of state for daemon */ diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c index 711f13d8c2de..f54d3f5b2e40 100644 --- a/fs/cachefiles/main.c +++ b/fs/cachefiles/main.c @@ -22,6 +22,7 @@ #include <linux/statfs.h> #include <linux/sysctl.h> #include <linux/miscdevice.h> +#define CREATE_TRACE_POINTS #include "internal.h" unsigned cachefiles_debug; diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 3978b324cbca..0daa1e3fe0df 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -30,11 +30,11 @@ */ static noinline void __cachefiles_printk_object(struct cachefiles_object *object, - const char *prefix, - u8 *keybuf) + const char *prefix) { struct fscache_cookie *cookie; - unsigned keylen, loop; + const u8 *k; + unsigned loop; pr_err("%sobject: OBJ%x\n", prefix, object->fscache.debug_id); pr_err("%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", @@ -56,23 +56,16 @@ void __cachefiles_printk_object(struct cachefiles_object *object, object->fscache.cookie->parent, object->fscache.cookie->netfs_data, object->fscache.cookie->flags); - if (keybuf && cookie->def) - keylen = cookie->def->get_key(cookie->netfs_data, keybuf, - CACHEFILES_KEYBUF_SIZE); - else - keylen = 0; + pr_err("%skey=[%u] '", prefix, cookie->key_len); + k = (cookie->key_len <= sizeof(cookie->inline_key)) ? 
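
The cookie->inline_aux / cookie->aux and inline_key / key selections above are the same small-buffer pattern: short blobs live in a fixed inline array inside the struct, long ones on the heap, and every reader picks the right pointer by comparing the stored length against the inline capacity. A minimal generic version; names and sizes are illustrative, not the fscache layout:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct blob {
	uint8_t  inline_buf[16];	/* like cookie->inline_key/aux */
	void	*heap_buf;		/* like cookie->key/aux */
	uint16_t len;
};

static const void *blob_data(const struct blob *b)
{
	return b->len <= sizeof(b->inline_buf) ?
		(const void *)b->inline_buf : (const void *)b->heap_buf;
}

static int blob_set(struct blob *b, const void *data, uint16_t len)
{
	b->len = len;
	if (len <= sizeof(b->inline_buf)) {
		memcpy(b->inline_buf, data, len);
		return 0;
	}
	b->heap_buf = malloc(len);
	if (!b->heap_buf)
		return -1;
	memcpy(b->heap_buf, data, len);
	return 0;
}

int main(void)
{
	struct blob b;

	blob_set(&b, "short", 5);
	return blob_data(&b) == (const void *)b.inline_buf ? 0 : 1;
}
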
+ cookie->inline_key : cookie->key; + for (loop = 0; loop < cookie->key_len; loop++) + pr_cont("%02x", k[loop]); + pr_cont("'\n"); } else { pr_err("%scookie=NULL\n", prefix); - keylen = 0; } spin_unlock(&object->fscache.lock); - - if (keylen) { - pr_err("%skey=[%u] '", prefix, keylen); - for (loop = 0; loop < keylen; loop++) - pr_cont("%02x", keybuf[loop]); - pr_cont("'\n"); - } } /* @@ -81,14 +74,10 @@ void __cachefiles_printk_object(struct cachefiles_object *object, static noinline void cachefiles_printk_object(struct cachefiles_object *object, struct cachefiles_object *xobject) { - u8 *keybuf; - - keybuf = kmalloc(CACHEFILES_KEYBUF_SIZE, GFP_NOIO); if (object) - __cachefiles_printk_object(object, "", keybuf); + __cachefiles_printk_object(object, ""); if (xobject) - __cachefiles_printk_object(xobject, "x", keybuf); - kfree(keybuf); + __cachefiles_printk_object(xobject, "x"); } /* @@ -120,6 +109,7 @@ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, } write_unlock(&cache->active_lock); + trace_cachefiles_mark_buried(NULL, dentry, why); _leave(" [no owner]"); return; @@ -130,6 +120,8 @@ found_dentry: object->fscache.state->name, dentry); + trace_cachefiles_mark_buried(object, dentry, why); + if (fscache_object_is_live(&object->fscache)) { pr_err("\n"); pr_err("Error: Can't preemptively bury live object\n"); @@ -158,13 +150,15 @@ static int cachefiles_mark_object_active(struct cachefiles_cache *cache, try_again: write_lock(&cache->active_lock); + dentry = object->dentry; + trace_cachefiles_mark_active(object, dentry); + if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { pr_err("Error: Object already active\n"); cachefiles_printk_object(object, NULL); BUG(); } - dentry = object->dentry; _p = &cache->active_nodes.rb_node; while (*_p) { _parent = *_p; @@ -191,6 +185,8 @@ try_again: /* an old object from a previous incarnation is hogging the slot - we * need to wait for it to be destroyed */ wait_for_old_object: + trace_cachefiles_wait_active(object, dentry, xobject); + if (fscache_object_is_live(&xobject->fscache)) { pr_err("\n"); pr_err("Error: Unexpected object collision\n"); @@ -248,12 +244,12 @@ wait_for_old_object: ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)); - cache->cache.ops->put_object(&xobject->fscache); + cache->cache.ops->put_object(&xobject->fscache, cachefiles_obj_put_wait_retry); goto try_again; requeue: clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); - cache->cache.ops->put_object(&xobject->fscache); + cache->cache.ops->put_object(&xobject->fscache, cachefiles_obj_put_wait_timeo); _leave(" = -ETIMEDOUT"); return -ETIMEDOUT; } @@ -265,6 +261,11 @@ void cachefiles_mark_object_inactive(struct cachefiles_cache *cache, struct cachefiles_object *object, blkcnt_t i_blocks) { + struct dentry *dentry = object->dentry; + struct inode *inode = d_backing_inode(dentry); + + trace_cachefiles_mark_inactive(object, dentry, inode); + write_lock(&cache->active_lock); rb_erase(&object->active_node, &cache->active_nodes); clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); @@ -288,6 +289,7 @@ void cachefiles_mark_object_inactive(struct cachefiles_cache *cache, * - unlocks the directory mutex */ static int cachefiles_bury_object(struct cachefiles_cache *cache, + struct cachefiles_object *object, struct dentry *dir, struct dentry *rep, bool preemptive, @@ -312,6 +314,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, if (ret < 0) { cachefiles_io_error(cache, "Unlink security error"); } else { + 
trace_cachefiles_unlink(object, rep, why); ret = vfs_unlink(d_inode(dir), rep, NULL); if (preemptive) @@ -413,6 +416,7 @@ try_again: if (ret < 0) { cachefiles_io_error(cache, "Rename security error %d", ret); } else { + trace_cachefiles_rename(object, rep, grave, why); ret = vfs_rename(d_inode(dir), rep, d_inode(cache->graveyard), grave, NULL, 0); if (ret != 0 && ret != -ENOMEM) @@ -458,7 +462,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, /* we need to check that our parent is _still_ our parent - it * may have been renamed */ if (dir == object->dentry->d_parent) { - ret = cachefiles_bury_object(cache, dir, + ret = cachefiles_bury_object(cache, object, dir, object->dentry, false, FSCACHE_OBJECT_WAS_RETIRED); } else { @@ -486,6 +490,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent, { struct cachefiles_cache *cache; struct dentry *dir, *next = NULL; + struct inode *inode; struct path path; unsigned long start; const char *name; @@ -529,13 +534,17 @@ lookup_again: start = jiffies; next = lookup_one_len(name, dir, nlen); cachefiles_hist(cachefiles_lookup_histogram, start); - if (IS_ERR(next)) + if (IS_ERR(next)) { + trace_cachefiles_lookup(object, next, NULL); goto lookup_error; + } - _debug("next -> %p %s", next, d_backing_inode(next) ? "positive" : "negative"); + inode = d_backing_inode(next); + trace_cachefiles_lookup(object, next, inode); + _debug("next -> %p %s", next, inode ? "positive" : "negative"); if (!key) - object->new = !d_backing_inode(next); + object->new = !inode; /* if this element of the path doesn't exist, then the lookup phase * failed, and we can release any readers in the certain knowledge that @@ -558,6 +567,8 @@ lookup_again: start = jiffies; ret = vfs_mkdir(d_inode(dir), next, 0); cachefiles_hist(cachefiles_mkdir_histogram, start); + if (!key) + trace_cachefiles_mkdir(object, next, ret); if (ret < 0) goto create_error; @@ -587,6 +598,7 @@ lookup_again: start = jiffies; ret = vfs_create(d_inode(dir), next, S_IFREG, true); cachefiles_hist(cachefiles_create_histogram, start); + trace_cachefiles_create(object, next, ret); if (ret < 0) goto create_error; @@ -629,7 +641,8 @@ lookup_again: * mutex) */ object->dentry = NULL; - ret = cachefiles_bury_object(cache, dir, next, true, + ret = cachefiles_bury_object(cache, object, dir, next, + true, FSCACHE_OBJECT_IS_STALE); dput(next); next = NULL; @@ -955,7 +968,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, /* actually remove the victim (drops the dir mutex) */ _debug("bury"); - ret = cachefiles_bury_object(cache, dir, victim, false, + ret = cachefiles_bury_object(cache, NULL, dir, victim, false, FSCACHE_OBJECT_WAS_CULLED); if (ret < 0) goto error; diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 883bc7bb12c5..5082c8a49686 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -952,6 +952,7 @@ error: * - cache withdrawal is prevented by the caller */ void cachefiles_uncache_page(struct fscache_object *_object, struct page *page) + __releases(&object->fscache.cookie->lock) { struct cachefiles_object *object; struct cachefiles_cache *cache; diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index d31c1a72d8a5..0a29a00aed2e 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -113,6 +113,7 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object, /* attempt to install the cache metadata directly */ _debug("SET #%u", auxdata->len); + clear_bit(FSCACHE_COOKIE_AUX_UPDATED, &object->fscache.cookie->flags); ret = 
vfs_setxattr(dentry, cachefiles_xattr_cache, &auxdata->type, auxdata->len, XATTR_CREATE); @@ -141,6 +142,7 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object, /* attempt to install the cache metadata directly */ _debug("SET #%u", auxdata->len); + clear_bit(FSCACHE_COOKIE_AUX_UPDATED, &object->fscache.cookie->flags); ret = vfs_setxattr(dentry, cachefiles_xattr_cache, &auxdata->type, auxdata->len, XATTR_REPLACE); @@ -180,7 +182,8 @@ int cachefiles_check_auxdata(struct cachefiles_object *object) goto error; xlen--; - validity = fscache_check_aux(&object->fscache, &auxbuf->data, xlen); + validity = fscache_check_aux(&object->fscache, &auxbuf->data, xlen, + i_size_read(d_backing_inode(dentry))); if (validity != FSCACHE_CHECKAUX_OKAY) goto error; @@ -249,7 +252,8 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object, object->fscache.cookie->def->name, dlen); result = fscache_check_aux(&object->fscache, - &auxbuf->data, dlen); + &auxbuf->data, dlen, + i_size_read(d_backing_inode(dentry))); switch (result) { /* entry okay as is */ diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index a3ab265d3215..33a211b364ed 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -27,7 +27,6 @@ struct ceph_aux_inode { u64 version; struct timespec mtime; - loff_t size; }; struct fscache_netfs ceph_cache_netfs = { @@ -41,34 +40,15 @@ static LIST_HEAD(ceph_fscache_list); struct ceph_fscache_entry { struct list_head list; struct fscache_cookie *fscache; - struct ceph_fsid fsid; size_t uniq_len; + /* The following members must be last */ + struct ceph_fsid fsid; char uniquifier[0]; }; -static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t maxbuf) -{ - const struct ceph_fs_client* fsc = cookie_netfs_data; - const char *fscache_uniq = fsc->mount_options->fscache_uniq; - uint16_t fsid_len, uniq_len; - - fsid_len = sizeof(fsc->client->fsid); - uniq_len = fscache_uniq ? 
strlen(fscache_uniq) : 0; - if (fsid_len + uniq_len > maxbuf) - return 0; - - memcpy(buffer, &fsc->client->fsid, fsid_len); - if (uniq_len) - memcpy(buffer + fsid_len, fscache_uniq, uniq_len); - - return fsid_len + uniq_len; -} - static const struct fscache_cookie_def ceph_fscache_fsid_object_def = { .name = "CEPH.fsid", .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = ceph_fscache_session_get_key, }; int ceph_fscache_register(void) @@ -110,16 +90,19 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc) goto out_unlock; } + memcpy(&ent->fsid, fsid, sizeof(*fsid)); + if (uniq_len > 0) { + memcpy(&ent->uniquifier, fscache_uniq, uniq_len); + ent->uniq_len = uniq_len; + } + fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index, &ceph_fscache_fsid_object_def, - fsc, true); + &ent->fsid, sizeof(ent->fsid) + uniq_len, + NULL, 0, + fsc, 0, true); if (fsc->fscache) { - memcpy(&ent->fsid, fsid, sizeof(*fsid)); - if (uniq_len > 0) { - memcpy(&ent->uniquifier, fscache_uniq, uniq_len); - ent->uniq_len = uniq_len; - } ent->fscache = fsc->fscache; list_add_tail(&ent->list, &ceph_fscache_list); } else { @@ -133,59 +116,21 @@ out_unlock: return err; } -static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t maxbuf) -{ - const struct ceph_inode_info* ci = cookie_netfs_data; - uint16_t klen; - - /* use ceph virtual inode (id + snapshot) */ - klen = sizeof(ci->i_vino); - if (klen > maxbuf) - return 0; - - memcpy(buffer, &ci->i_vino, klen); - return klen; -} - -static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - struct ceph_aux_inode aux; - const struct ceph_inode_info* ci = cookie_netfs_data; - const struct inode* inode = &ci->vfs_inode; - - memset(&aux, 0, sizeof(aux)); - aux.version = ci->i_version; - aux.mtime = inode->i_mtime; - aux.size = i_size_read(inode); - - memcpy(buffer, &aux, sizeof(aux)); - - return sizeof(aux); -} - -static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data, - uint64_t *size) -{ - const struct ceph_inode_info* ci = cookie_netfs_data; - *size = i_size_read(&ci->vfs_inode); -} - static enum fscache_checkaux ceph_fscache_inode_check_aux( - void *cookie_netfs_data, const void *data, uint16_t dlen) + void *cookie_netfs_data, const void *data, uint16_t dlen, + loff_t object_size) { struct ceph_aux_inode aux; struct ceph_inode_info* ci = cookie_netfs_data; struct inode* inode = &ci->vfs_inode; - if (dlen != sizeof(aux)) + if (dlen != sizeof(aux) || + i_size_read(inode) != object_size) return FSCACHE_CHECKAUX_OBSOLETE; memset(&aux, 0, sizeof(aux)); aux.version = ci->i_version; aux.mtime = inode->i_mtime; - aux.size = i_size_read(inode); if (memcmp(data, &aux, sizeof(aux)) != 0) return FSCACHE_CHECKAUX_OBSOLETE; @@ -197,9 +142,6 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux( static const struct fscache_cookie_def ceph_fscache_inode_object_def = { .name = "CEPH.inode", .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .get_key = ceph_fscache_inode_get_key, - .get_attr = ceph_fscache_inode_get_attr, - .get_aux = ceph_fscache_inode_get_aux, .check_aux = ceph_fscache_inode_check_aux, }; @@ -207,6 +149,7 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_aux_inode aux; /* No caching for filesystem */ if (!fsc->fscache) @@ -218,9 +161,14 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) 
inode_lock_nested(inode, I_MUTEX_CHILD); if (!ci->fscache) { + memset(&aux, 0, sizeof(aux)); + aux.version = ci->i_version; + aux.mtime = inode->i_mtime; ci->fscache = fscache_acquire_cookie(fsc->fscache, - &ceph_fscache_inode_object_def, - ci, false); + &ceph_fscache_inode_object_def, + &ci->i_vino, sizeof(ci->i_vino), + &aux, sizeof(aux), + ci, i_size_read(inode), false); } inode_unlock(inode); } @@ -235,7 +183,7 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) ci->fscache = NULL; fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode); - fscache_relinquish_cookie(cookie, 0); + fscache_relinquish_cookie(cookie, &ci->i_vino, false); } static bool ceph_fscache_can_enable(void *data) @@ -254,11 +202,11 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) if (inode_is_open_for_write(inode)) { dout("fscache_file_set_cookie %p %p disabling cache\n", inode, filp); - fscache_disable_cookie(ci->fscache, false); + fscache_disable_cookie(ci->fscache, &ci->i_vino, false); fscache_uncache_all_inode_pages(ci->fscache, inode); } else { - fscache_enable_cookie(ci->fscache, ceph_fscache_can_enable, - inode); + fscache_enable_cookie(ci->fscache, &ci->i_vino, i_size_read(inode), + ceph_fscache_can_enable, inode); if (fscache_cookie_enabled(ci->fscache)) { dout("fscache_file_set_cookie %p %p enabling cache\n", inode, filp); @@ -351,7 +299,8 @@ void ceph_readpage_to_fscache(struct inode *inode, struct page *page) if (!cache_valid(ci)) return; - ret = fscache_write_page(ci->fscache, page, GFP_KERNEL); + ret = fscache_write_page(ci->fscache, page, i_size_read(inode), + GFP_KERNEL); if (ret) fscache_uncache_page(ci->fscache, page); } @@ -385,7 +334,7 @@ void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) WARN_ON_ONCE(!found); mutex_unlock(&ceph_fscache_lock); - __fscache_relinquish_cookie(fsc->fscache, 0); + __fscache_relinquish_cookie(fsc->fscache, NULL, false); } fsc->fscache = NULL; } @@ -402,7 +351,7 @@ void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci) * truncate while the caller holds CEPH_CAP_FILE_RD */ mutex_lock(&ci->i_truncate_mutex); if (!cache_valid(ci)) { - if (fscache_check_consistency(ci->fscache)) + if (fscache_check_consistency(ci->fscache, &ci->i_vino)) fscache_invalidate(ci->fscache); spin_lock(&ci->i_ceph_lock); ci->i_fscache_gen = ci->i_rdcache_gen; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 6582c4507e6c..0e5bd3e3344e 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3965,6 +3965,32 @@ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) } /* + * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it + * looks like the link count will hit 0, drop any other caps (other + * than PIN) we don't specifically want (due to the file still being + * open). + */ +int ceph_drop_caps_for_unlink(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; + + spin_lock(&ci->i_ceph_lock); + if (inode->i_nlink == 1) { + drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); + + ci->i_ceph_flags |= CEPH_I_NODELAY; + if (__ceph_caps_dirty(ci)) { + struct ceph_mds_client *mdsc = + ceph_inode_to_client(inode)->mdsc; + __cap_delay_requeue_front(mdsc, ci); + } + } + spin_unlock(&ci->i_ceph_lock); + return drop; +} + +/* * Helpers for embedding cap and dentry lease releases into mds * requests. 
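For context on ceph_drop_caps_for_unlink() above: drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN) drops every capability the client does not still want while always keeping PIN. A standalone arithmetic sketch, using hypothetical bit values rather than the real CEPH_CAP_* encoding:

#include <stdio.h>

/* Hypothetical cap bits, for illustration only. */
#define CAP_PIN          0x01
#define CAP_LINK_SHARED  0x02
#define CAP_LINK_EXCL    0x04
#define CAP_FILE_CACHE   0x08
#define CAP_FILE_WR      0x10

int main(void)
{
	int wanted = CAP_FILE_CACHE;                /* file still open for read */
	int drop = CAP_LINK_SHARED | CAP_LINK_EXCL; /* always drop link caps */

	/* link count would hit 0: drop everything except wanted caps and PIN */
	drop |= ~(wanted | CAP_PIN);

	/* prints 0x16: LINK_SHARED | LINK_EXCL | FILE_WR, but not CACHE or PIN */
	printf("drop mask: %#x\n", drop & 0x1f);
	return 0;
}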
* diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 0c4346806e17..2bdd561c4c68 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -2,7 +2,6 @@ #include <linux/ceph/ceph_debug.h> #include <linux/spinlock.h> -#include <linux/fs_struct.h> #include <linux/namei.h> #include <linux/slab.h> #include <linux/sched.h> @@ -1003,26 +1002,6 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, } /* - * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it - * looks like the link count will hit 0, drop any other caps (other - * than PIN) we don't specifically want (due to the file still being - * open). - */ -static int drop_caps_for_unlink(struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; - - spin_lock(&ci->i_ceph_lock); - if (inode->i_nlink == 1) { - drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); - ci->i_ceph_flags |= CEPH_I_NODELAY; - } - spin_unlock(&ci->i_ceph_lock); - return drop; -} - -/* * rmdir and unlink are differ only by the metadata op code */ static int ceph_unlink(struct inode *dir, struct dentry *dentry) @@ -1056,7 +1035,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; - req->r_inode_drop = drop_caps_for_unlink(inode); + req->r_inode_drop = ceph_drop_caps_for_unlink(inode); err = ceph_mdsc_do_request(mdsc, dir, req); if (!err && !req->r_reply_info.head->is_dentry) d_delete(dentry); @@ -1104,8 +1083,10 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, req->r_dentry_unless = CEPH_CAP_FILE_EXCL; /* release LINK_RDCACHE on source inode (mds will lock it) */ req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; - if (d_really_is_positive(new_dentry)) - req->r_inode_drop = drop_caps_for_unlink(d_inode(new_dentry)); + if (d_really_is_positive(new_dentry)) { + req->r_inode_drop = + ceph_drop_caps_for_unlink(d_inode(new_dentry)); + } err = ceph_mdsc_do_request(mdsc, old_dir, req); if (!err && !req->r_reply_info.head->is_dentry) { /* diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 6639926eed4e..b67eec3532a1 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -640,7 +640,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, struct ceph_aio_request { struct kiocb *iocb; size_t total_len; - int write; + bool write; + bool should_dirty; int error; struct list_head osd_reqs; unsigned num_reqs; @@ -750,7 +751,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) } } - ceph_put_page_vector(osd_data->pages, num_pages, !aio_req->write); + ceph_put_page_vector(osd_data->pages, num_pages, aio_req->should_dirty); ceph_osdc_put_request(req); if (rc < 0) @@ -847,6 +848,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, size_t count = iov_iter_count(iter); loff_t pos = iocb->ki_pos; bool write = iov_iter_rw(iter) == WRITE; + bool should_dirty = !write && iter_is_iovec(iter); if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; @@ -914,6 +916,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (aio_req) { aio_req->iocb = iocb; aio_req->write = write; + aio_req->should_dirty = should_dirty; INIT_LIST_HEAD(&aio_req->osd_reqs); if (write) { aio_req->mtime = mtime; @@ -971,7 +974,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, len = ret; } - ceph_put_page_vector(pages, num_pages, !write); + 
ceph_put_page_vector(pages, num_pages, should_dirty); ceph_osdc_put_request(req); if (ret < 0) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index a62d2a9841dc..fb2bc9c15a23 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -225,6 +225,7 @@ static int parse_fsopt_token(char *c, void *private) return -ENOMEM; break; case Opt_mds_namespace: + kfree(fsopt->mds_namespace); fsopt->mds_namespace = kstrndup(argstr[0].from, argstr[0].to-argstr[0].from, GFP_KERNEL); @@ -232,6 +233,7 @@ static int parse_fsopt_token(char *c, void *private) return -ENOMEM; break; case Opt_fscache_uniq: + kfree(fsopt->fscache_uniq); fsopt->fscache_uniq = kstrndup(argstr[0].from, argstr[0].to-argstr[0].from, GFP_KERNEL); @@ -711,14 +713,17 @@ static int __init init_caches(void) goto bad_dentry; ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD); - if (!ceph_file_cachep) goto bad_file; - if ((error = ceph_fscache_register())) - goto bad_file; + error = ceph_fscache_register(); + if (error) + goto bad_fscache; return 0; + +bad_fscache: + kmem_cache_destroy(ceph_file_cachep); bad_file: kmem_cache_destroy(ceph_dentry_cachep); bad_dentry: @@ -836,7 +841,6 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) int err; unsigned long started = jiffies; /* note the start time */ struct dentry *root; - int first = 0; /* first vfsmount for this super_block */ dout("mount start %p\n", fsc); mutex_lock(&fsc->client->mount_mutex); @@ -861,17 +865,17 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) path = fsc->mount_options->server_path + 1; dout("mount opening path %s\n", path); } + + err = ceph_fs_debugfs_init(fsc); + if (err < 0) + goto out; + root = open_root_dentry(fsc, path, started); if (IS_ERR(root)) { err = PTR_ERR(root); goto out; } fsc->sb->s_root = dget(root); - first = 1; - - err = ceph_fs_debugfs_init(fsc); - if (err < 0) - goto fail; } else { root = dget(fsc->sb->s_root); } @@ -881,11 +885,6 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) mutex_unlock(&fsc->client->mount_mutex); return root; -fail: - if (first) { - dput(fsc->sb->s_root); - fsc->sb->s_root = NULL; - } out: mutex_unlock(&fsc->client->mount_mutex); return ERR_PTR(err); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 21b2e5b004eb..1c2086e0fec2 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -987,7 +987,7 @@ extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_session *session); extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc); extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc); - +extern int ceph_drop_caps_for_unlink(struct inode *inode); extern int ceph_encode_inode_release(void **p, struct inode *inode, int mds, int drop, int unless, int force); extern int ceph_encode_dentry_release(void **p, struct dentry *dn, diff --git a/fs/char_dev.c b/fs/char_dev.c index a65e4a56318c..a279c58fe360 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -67,18 +67,18 @@ static int find_dynamic_major(void) int i; struct char_device_struct *cd; - for (i = ARRAY_SIZE(chrdevs)-1; i > CHRDEV_MAJOR_DYN_END; i--) { + for (i = ARRAY_SIZE(chrdevs)-1; i >= CHRDEV_MAJOR_DYN_END; i--) { if (chrdevs[i] == NULL) return i; } for (i = CHRDEV_MAJOR_DYN_EXT_START; - i > CHRDEV_MAJOR_DYN_EXT_END; i--) { + i >= CHRDEV_MAJOR_DYN_EXT_END; i--) { for (cd = chrdevs[major_to_index(i)]; cd; cd = cd->next) if (cd->major == i) break; - if (cd == NULL || cd->major != i) + if (cd == NULL) return i; } @@ -121,8 +121,8 @@ __register_chrdev_region(unsigned int 
major, unsigned int baseminor, } if (major >= CHRDEV_MAJOR_MAX) { - pr_err("CHRDEV \"%s\" major requested (%d) is greater than the maximum (%d)\n", - name, major, CHRDEV_MAJOR_MAX); + pr_err("CHRDEV \"%s\" major requested (%u) is greater than the maximum (%u)\n", + name, major, CHRDEV_MAJOR_MAX-1); ret = -EINVAL; goto out; } diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 687da62daf4e..741749a98614 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -187,13 +187,13 @@ config CIFS_NFSD_EXPORT Allows NFS server to export a CIFS mounted share (nfsd over cifs) config CIFS_SMB311 - bool "SMB3.1.1 network file system support (Experimental)" + bool "SMB3.1.1 network file system support" depends on CIFS + select CRYPTO_SHA512 help - This enables experimental support for the newest, SMB3.1.1, dialect. - This dialect includes improved security negotiation features. - If unsure, say N + This enables support for the newest, and most secure dialect, SMB3.11. + If unsure, say Y config CIFS_SMB_DIRECT bool "SMB Direct support (Experimental)" diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c index 2c14020e5e1d..edf5f40898bf 100644 --- a/fs/cifs/cache.c +++ b/fs/cifs/cache.c @@ -46,67 +46,11 @@ void cifs_fscache_unregister(void) } /* - * Key layout of CIFS server cache index object - */ -struct cifs_server_key { - uint16_t family; /* address family */ - __be16 port; /* IP port */ - union { - struct in_addr ipv4_addr; - struct in6_addr ipv6_addr; - } addr[0]; -}; - -/* - * Server object keyed by {IPaddress,port,family} tuple - */ -static uint16_t cifs_server_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t maxbuf) -{ - const struct TCP_Server_Info *server = cookie_netfs_data; - const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr; - const struct sockaddr_in *addr = (struct sockaddr_in *) sa; - const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa; - struct cifs_server_key *key = buffer; - uint16_t key_len = sizeof(struct cifs_server_key); - - memset(key, 0, key_len); - - /* - * Should not be a problem as sin_family/sin6_family overlays - * sa_family field - */ - switch (sa->sa_family) { - case AF_INET: - key->family = sa->sa_family; - key->port = addr->sin_port; - key->addr[0].ipv4_addr = addr->sin_addr; - key_len += sizeof(key->addr[0].ipv4_addr); - break; - - case AF_INET6: - key->family = sa->sa_family; - key->port = addr6->sin6_port; - key->addr[0].ipv6_addr = addr6->sin6_addr; - key_len += sizeof(key->addr[0].ipv6_addr); - break; - - default: - cifs_dbg(VFS, "Unknown network family '%d'\n", sa->sa_family); - key_len = 0; - break; - } - - return key_len; -} - -/* * Server object for FS-Cache */ const struct fscache_cookie_def cifs_fscache_server_index_def = { .name = "CIFS.server", .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = cifs_server_get_key, }; /* @@ -116,7 +60,7 @@ struct cifs_fscache_super_auxdata { u64 resource_id; /* unique server resource id */ }; -static char *extract_sharename(const char *treename) +char *extract_sharename(const char *treename) { const char *src; char *delim, *dst; @@ -140,56 +84,11 @@ static char *extract_sharename(const char *treename) return dst; } -/* - * Superblock object currently keyed by share name - */ -static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer, - uint16_t maxbuf) -{ - const struct cifs_tcon *tcon = cookie_netfs_data; - char *sharename; - uint16_t len; - - sharename = extract_sharename(tcon->treeName); - if (IS_ERR(sharename)) { - cifs_dbg(FYI, "%s: couldn't extract sharename\n", 
__func__); - sharename = NULL; - return 0; - } - - len = strlen(sharename); - if (len > maxbuf) - return 0; - - memcpy(buffer, sharename, len); - - kfree(sharename); - - return len; -} - -static uint16_t -cifs_fscache_super_get_aux(const void *cookie_netfs_data, void *buffer, - uint16_t maxbuf) -{ - struct cifs_fscache_super_auxdata auxdata; - const struct cifs_tcon *tcon = cookie_netfs_data; - - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.resource_id = tcon->resource_id; - - if (maxbuf > sizeof(auxdata)) - maxbuf = sizeof(auxdata); - - memcpy(buffer, &auxdata, maxbuf); - - return maxbuf; -} - static enum fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data, const void *data, - uint16_t datalen) + uint16_t datalen, + loff_t object_size) { struct cifs_fscache_super_auxdata auxdata; const struct cifs_tcon *tcon = cookie_netfs_data; @@ -212,68 +111,14 @@ fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data, const struct fscache_cookie_def cifs_fscache_super_index_def = { .name = "CIFS.super", .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = cifs_super_get_key, - .get_aux = cifs_fscache_super_get_aux, .check_aux = cifs_fscache_super_check_aux, }; -/* - * Auxiliary data attached to CIFS inode within the cache - */ -struct cifs_fscache_inode_auxdata { - struct timespec last_write_time; - struct timespec last_change_time; - u64 eof; -}; - -static uint16_t cifs_fscache_inode_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t maxbuf) -{ - const struct cifsInodeInfo *cifsi = cookie_netfs_data; - uint16_t keylen; - - /* use the UniqueId as the key */ - keylen = sizeof(cifsi->uniqueid); - if (keylen > maxbuf) - keylen = 0; - else - memcpy(buffer, &cifsi->uniqueid, keylen); - - return keylen; -} - -static void -cifs_fscache_inode_get_attr(const void *cookie_netfs_data, uint64_t *size) -{ - const struct cifsInodeInfo *cifsi = cookie_netfs_data; - - *size = cifsi->vfs_inode.i_size; -} - -static uint16_t -cifs_fscache_inode_get_aux(const void *cookie_netfs_data, void *buffer, - uint16_t maxbuf) -{ - struct cifs_fscache_inode_auxdata auxdata; - const struct cifsInodeInfo *cifsi = cookie_netfs_data; - - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.eof = cifsi->server_eof; - auxdata.last_write_time = cifsi->vfs_inode.i_mtime; - auxdata.last_change_time = cifsi->vfs_inode.i_ctime; - - if (maxbuf > sizeof(auxdata)) - maxbuf = sizeof(auxdata); - - memcpy(buffer, &auxdata, maxbuf); - - return maxbuf; -} - static enum fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data, const void *data, - uint16_t datalen) + uint16_t datalen, + loff_t object_size) { struct cifs_fscache_inode_auxdata auxdata; struct cifsInodeInfo *cifsi = cookie_netfs_data; @@ -295,8 +140,5 @@ fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data, const struct fscache_cookie_def cifs_fscache_inode_object_def = { .name = "CIFS.uniqueid", .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .get_key = cifs_fscache_inode_get_key, - .get_attr = cifs_fscache_inode_get_attr, - .get_aux = cifs_fscache_inode_get_aux, .check_aux = cifs_fscache_inode_check_aux, }; diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index f2b0a7f124da..a6ef088e057b 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -36,37 +36,6 @@ #include <crypto/skcipher.h> #include <crypto/aead.h> -static int -cifs_crypto_shash_md5_allocate(struct TCP_Server_Info *server) -{ - int rc; - unsigned int size; - - if (server->secmech.sdescmd5 != NULL) - return 0; /* already allocated */ - - 
server->secmech.md5 = crypto_alloc_shash("md5", 0, 0); - if (IS_ERR(server->secmech.md5)) { - cifs_dbg(VFS, "could not allocate crypto md5\n"); - rc = PTR_ERR(server->secmech.md5); - server->secmech.md5 = NULL; - return rc; - } - - size = sizeof(struct shash_desc) + - crypto_shash_descsize(server->secmech.md5); - server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL); - if (!server->secmech.sdescmd5) { - crypto_free_shash(server->secmech.md5); - server->secmech.md5 = NULL; - return -ENOMEM; - } - server->secmech.sdescmd5->shash.tfm = server->secmech.md5; - server->secmech.sdescmd5->shash.flags = 0x0; - - return 0; -} - int __cifs_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, char *signature, struct shash_desc *shash) @@ -132,13 +101,10 @@ static int cifs_calc_signature(struct smb_rqst *rqst, if (!rqst->rq_iov || !signature || !server) return -EINVAL; - if (!server->secmech.sdescmd5) { - rc = cifs_crypto_shash_md5_allocate(server); - if (rc) { - cifs_dbg(VFS, "%s: Can't alloc md5 crypto\n", __func__); - return -1; - } - } + rc = cifs_alloc_hash("md5", &server->secmech.md5, + &server->secmech.sdescmd5); + if (rc) + return -1; rc = crypto_shash_init(&server->secmech.sdescmd5->shash); if (rc) { @@ -663,37 +629,6 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) return rc; } -static int crypto_hmacmd5_alloc(struct TCP_Server_Info *server) -{ - int rc; - unsigned int size; - - /* check if already allocated */ - if (server->secmech.sdeschmacmd5) - return 0; - - server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0); - if (IS_ERR(server->secmech.hmacmd5)) { - cifs_dbg(VFS, "could not allocate crypto hmacmd5\n"); - rc = PTR_ERR(server->secmech.hmacmd5); - server->secmech.hmacmd5 = NULL; - return rc; - } - - size = sizeof(struct shash_desc) + - crypto_shash_descsize(server->secmech.hmacmd5); - server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL); - if (!server->secmech.sdeschmacmd5) { - crypto_free_shash(server->secmech.hmacmd5); - server->secmech.hmacmd5 = NULL; - return -ENOMEM; - } - server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5; - server->secmech.sdeschmacmd5->shash.flags = 0x0; - - return 0; -} - int setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) { @@ -757,9 +692,10 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) mutex_lock(&ses->server->srv_mutex); - rc = crypto_hmacmd5_alloc(ses->server); + rc = cifs_alloc_hash("hmac(md5)", + &ses->server->secmech.hmacmd5, + &ses->server->secmech.sdeschmacmd5); if (rc) { - cifs_dbg(VFS, "could not crypto alloc hmacmd5 rc %d\n", rc); goto unlock; } @@ -893,6 +829,11 @@ cifs_crypto_secmech_release(struct TCP_Server_Info *server) server->secmech.md5 = NULL; } + if (server->secmech.sha512) { + crypto_free_shash(server->secmech.sha512); + server->secmech.sha512 = NULL; + } + if (server->secmech.hmacmd5) { crypto_free_shash(server->secmech.hmacmd5); server->secmech.hmacmd5 = NULL; @@ -916,4 +857,6 @@ cifs_crypto_secmech_release(struct TCP_Server_Info *server) server->secmech.sdeschmacmd5 = NULL; kfree(server->secmech.sdescmd5); server->secmech.sdescmd5 = NULL; + kfree(server->secmech.sdescsha512); + server->secmech.sdescsha512 = NULL; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 32cdea67bbfd..f715609b13f3 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1486,6 +1486,7 @@ MODULE_SOFTDEP("pre: nls"); MODULE_SOFTDEP("pre: aes"); MODULE_SOFTDEP("pre: cmac"); MODULE_SOFTDEP("pre: sha256"); +MODULE_SOFTDEP("pre: sha512"); 
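The sha512 soft dependency just added, together with the secmech.sha512/sdescsha512 fields, serves the SMB 3.1.1 preauthentication integrity check, which chains a SHA-512 digest across the negotiate/session-setup exchange: H(n) = SHA-512(H(n-1) || packet). A minimal sketch of that chaining with the kernel shash API (the helper name and shape are illustrative, not cifs.ko symbols):

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/slab.h>

/* hash is SMB2_PREAUTH_HASH_SIZE (64) bytes, zeroed before negprot */
static int preauth_hash_chain(u8 hash[64], const void *pkt, size_t len)
{
	struct crypto_shash *tfm;
	struct shash_desc *desc;
	int rc;

	tfm = crypto_alloc_shash("sha512", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
	if (!desc) {
		crypto_free_shash(tfm);
		return -ENOMEM;
	}
	desc->tfm = tfm;

	rc = crypto_shash_init(desc);
	if (!rc)
		rc = crypto_shash_update(desc, hash, 64);  /* H(n-1) */
	if (!rc)
		rc = crypto_shash_update(desc, pkt, len);  /* this packet */
	if (!rc)
		rc = crypto_shash_final(desc, hash);       /* becomes H(n) */

	kfree(desc);
	crypto_free_shash(tfm);
	return rc;
}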
MODULE_SOFTDEP("pre: aead2"); MODULE_SOFTDEP("pre: ccm"); module_init(init_cifs) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 48f7c197cd2d..2282562e78a1 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -130,10 +130,12 @@ struct cifs_secmech { struct crypto_shash *md5; /* md5 hash function */ struct crypto_shash *hmacsha256; /* hmac-sha256 hash function */ struct crypto_shash *cmacaes; /* block-cipher based MAC function */ + struct crypto_shash *sha512; /* sha512 hash function */ struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */ struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */ struct sdesc *sdeschmacsha256; /* ctxt to generate smb2 signature */ struct sdesc *sdesccmacaes; /* ctxt to generate smb3 signature */ + struct sdesc *sdescsha512; /* ctxt to generate smb3.11 signing key */ struct crypto_aead *ccmaesencrypt; /* smb3 encryption aead */ struct crypto_aead *ccmaesdecrypt; /* smb3 decryption aead */ }; @@ -466,6 +468,7 @@ struct smb_version_values { __u32 exclusive_lock_type; __u32 shared_lock_type; __u32 unlock_lock_type; + size_t header_preamble_size; size_t header_size; size_t max_header_size; size_t read_rsp_size; @@ -673,7 +676,8 @@ struct TCP_Server_Info { unsigned int max_read; unsigned int max_write; #ifdef CONFIG_CIFS_SMB311 - __u8 preauth_sha_hash[64]; /* save initital negprot hash */ + /* save initital negprot hash */ + __u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE]; #endif /* 3.1.1 */ struct delayed_work reconnect; /* reconnect workqueue job */ struct mutex reconnect_mutex; /* prevent simultaneous reconnects */ @@ -862,7 +866,7 @@ struct cifs_ses { __u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE]; __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE]; #ifdef CONFIG_CIFS_SMB311 - __u8 preauth_sha_hash[64]; + __u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE]; #endif /* 3.1.1 */ }; @@ -1466,6 +1470,7 @@ struct dfs_info3_param { #define CIFS_FATTR_NEED_REVAL 0x4 #define CIFS_FATTR_INO_COLLISION 0x8 #define CIFS_FATTR_UNKNOWN_NLINK 0x10 +#define CIFS_FATTR_FAKE_ROOT_INO 0x20 struct cifs_fattr { u32 cf_flags; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 93d565186698..365a414a75e9 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -542,4 +542,9 @@ enum securityEnum cifs_select_sectype(struct TCP_Server_Info *, struct cifs_aio_ctx *cifs_aio_ctx_alloc(void); void cifs_aio_ctx_release(struct kref *refcount); int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw); + +int cifs_alloc_hash(const char *name, struct crypto_shash **shash, + struct sdesc **sdesc); +void cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc); + #endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 9ceebf30eb22..59c09a596c0a 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1454,7 +1454,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) unsigned int data_offset, data_len; struct cifs_readdata *rdata = mid->callback_data; char *buf = server->smallbuf; - unsigned int buflen = get_rfc1002_length(buf) + 4; + unsigned int buflen = get_rfc1002_length(buf) + + server->vals->header_preamble_size; bool use_rdma_mr = false; cifs_dbg(FYI, "%s: mid=%llu offset=%llu bytes=%u\n", @@ -1504,7 +1505,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) return cifs_readv_discard(server, mid); } - data_offset = server->ops->read_data_offset(buf) + 4; + data_offset = server->ops->read_data_offset(buf) + + server->vals->header_preamble_size; 
if (data_offset < server->total_read) { /* * win2k8 sometimes sends an offset of 0 when the read diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index a726f524fb84..4e0808f40195 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -775,7 +775,8 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) unsigned int pdu_length = get_rfc1002_length(buf); /* make sure this will fit in a large buffer */ - if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server) - 4) { + if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server) - + server->vals->header_preamble_size) { cifs_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length); cifs_reconnect(server); wake_up(&server->response_q); @@ -791,7 +792,9 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) /* now read the rest */ length = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1, - pdu_length - HEADER_SIZE(server) + 1 + 4); + pdu_length - HEADER_SIZE(server) + 1 + + server->vals->header_preamble_size); + if (length < 0) return length; server->total_read += length; @@ -884,7 +887,8 @@ cifs_demultiplex_thread(void *p) continue; /* make sure we have enough to get to the MID */ - if (pdu_length < HEADER_SIZE(server) - 1 - 4) { + if (pdu_length < HEADER_SIZE(server) - 1 - + server->vals->header_preamble_size) { cifs_dbg(VFS, "SMB response too short (%u bytes)\n", pdu_length); cifs_reconnect(server); @@ -893,8 +897,10 @@ cifs_demultiplex_thread(void *p) } /* read down to the MID */ - length = cifs_read_from_socket(server, buf + 4, - HEADER_SIZE(server) - 1 - 4); + length = cifs_read_from_socket(server, + buf + server->vals->header_preamble_size, + HEADER_SIZE(server) - 1 + - server->vals->header_preamble_size); if (length < 0) continue; server->total_read += length; @@ -4306,7 +4312,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, server->sec_mode, server->capabilities, server->timeAdj); if (ses->auth_key.response) { - cifs_dbg(VFS, "Free previous auth_key.response = %p\n", + cifs_dbg(FYI, "Free previous auth_key.response = %p\n", ses->auth_key.response); kfree(ses->auth_key.response); ses->auth_key.response = NULL; diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index 8d4b7bc8ae91..25d3f66b2d50 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -23,11 +23,63 @@ #include "cifs_debug.h" #include "cifs_fs_sb.h" +/* + * Key layout of CIFS server cache index object + */ +struct cifs_server_key { + struct { + uint16_t family; /* address family */ + __be16 port; /* IP port */ + } hdr; + union { + struct in_addr ipv4_addr; + struct in6_addr ipv6_addr; + }; +} __packed; + +/* + * Get a cookie for a server object keyed by {IPaddress,port,family} tuple + */ void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) { + const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr; + const struct sockaddr_in *addr = (struct sockaddr_in *) sa; + const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa; + struct cifs_server_key key; + uint16_t key_len = sizeof(key.hdr); + + memset(&key, 0, sizeof(key)); + + /* + * Should not be a problem as sin_family/sin6_family overlays + * sa_family field + */ + key.hdr.family = sa->sa_family; + switch (sa->sa_family) { + case AF_INET: + key.hdr.port = addr->sin_port; + key.ipv4_addr = addr->sin_addr; + key_len += sizeof(key.ipv4_addr); + break; + + case AF_INET6: + key.hdr.port = addr6->sin6_port; + key.ipv6_addr = addr6->sin6_addr; + key_len += sizeof(key.ipv6_addr); + break; + + default: + cifs_dbg(VFS, 
"Unknown network family '%d'\n", sa->sa_family); + server->fscache = NULL; + return; + } + server->fscache = fscache_acquire_cookie(cifs_fscache_netfs.primary_index, - &cifs_fscache_server_index_def, server, true); + &cifs_fscache_server_index_def, + &key, key_len, + NULL, 0, + server, 0, true); cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, server, server->fscache); } @@ -36,17 +88,29 @@ void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) { cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, server, server->fscache); - fscache_relinquish_cookie(server->fscache, 0); + fscache_relinquish_cookie(server->fscache, NULL, false); server->fscache = NULL; } void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) { struct TCP_Server_Info *server = tcon->ses->server; + char *sharename; + + sharename = extract_sharename(tcon->treeName); + if (IS_ERR(sharename)) { + cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__); + tcon->fscache = NULL; + return; + } tcon->fscache = fscache_acquire_cookie(server->fscache, - &cifs_fscache_super_index_def, tcon, true); + &cifs_fscache_super_index_def, + sharename, strlen(sharename), + &tcon->resource_id, sizeof(tcon->resource_id), + tcon, 0, true); + kfree(sharename); cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, server->fscache, tcon->fscache); } @@ -54,10 +118,28 @@ void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) { cifs_dbg(FYI, "%s: (0x%p)\n", __func__, tcon->fscache); - fscache_relinquish_cookie(tcon->fscache, 0); + fscache_relinquish_cookie(tcon->fscache, &tcon->resource_id, false); tcon->fscache = NULL; } +static void cifs_fscache_acquire_inode_cookie(struct cifsInodeInfo *cifsi, + struct cifs_tcon *tcon) +{ + struct cifs_fscache_inode_auxdata auxdata; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.eof = cifsi->server_eof; + auxdata.last_write_time = cifsi->vfs_inode.i_mtime; + auxdata.last_change_time = cifsi->vfs_inode.i_ctime; + + cifsi->fscache = + fscache_acquire_cookie(tcon->fscache, + &cifs_fscache_inode_object_def, + &cifsi->uniqueid, sizeof(cifsi->uniqueid), + &auxdata, sizeof(auxdata), + cifsi, cifsi->vfs_inode.i_size, true); +} + static void cifs_fscache_enable_inode_cookie(struct inode *inode) { struct cifsInodeInfo *cifsi = CIFS_I(inode); @@ -67,21 +149,28 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode) if (cifsi->fscache) return; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) { - cifsi->fscache = fscache_acquire_cookie(tcon->fscache, - &cifs_fscache_inode_object_def, cifsi, true); - cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n", - __func__, tcon->fscache, cifsi->fscache); - } + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)) + return; + + cifs_fscache_acquire_inode_cookie(cifsi, tcon); + + cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n", + __func__, tcon->fscache, cifsi->fscache); } void cifs_fscache_release_inode_cookie(struct inode *inode) { + struct cifs_fscache_inode_auxdata auxdata; struct cifsInodeInfo *cifsi = CIFS_I(inode); if (cifsi->fscache) { + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.eof = cifsi->server_eof; + auxdata.last_write_time = cifsi->vfs_inode.i_mtime; + auxdata.last_change_time = cifsi->vfs_inode.i_ctime; + cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache); - fscache_relinquish_cookie(cifsi->fscache, 0); + fscache_relinquish_cookie(cifsi->fscache, &auxdata, false); cifsi->fscache = NULL; } } @@ -93,7 +182,7 @@ static void cifs_fscache_disable_inode_cookie(struct inode *inode) if 
(cifsi->fscache) { cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache); fscache_uncache_all_inode_pages(cifsi->fscache, inode); - fscache_relinquish_cookie(cifsi->fscache, 1); + fscache_relinquish_cookie(cifsi->fscache, NULL, true); cifsi->fscache = NULL; } } @@ -110,16 +199,14 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode) { struct cifsInodeInfo *cifsi = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); struct fscache_cookie *old = cifsi->fscache; if (cifsi->fscache) { /* retire the current fscache cache and get a new one */ - fscache_relinquish_cookie(cifsi->fscache, 1); + fscache_relinquish_cookie(cifsi->fscache, NULL, true); - cifsi->fscache = fscache_acquire_cookie( - cifs_sb_master_tcon(cifs_sb)->fscache, - &cifs_fscache_inode_object_def, - cifsi, true); + cifs_fscache_acquire_inode_cookie(cifsi, tcon); cifs_dbg(FYI, "%s: new cookie 0x%p oldcookie 0x%p\n", __func__, cifsi->fscache, old); } @@ -214,13 +301,15 @@ int __cifs_readpages_from_fscache(struct inode *inode, void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) { + struct cifsInodeInfo *cifsi = CIFS_I(inode); int ret; cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n", - __func__, CIFS_I(inode)->fscache, page, inode); - ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL); + __func__, cifsi->fscache, page, inode); + ret = fscache_write_page(cifsi->fscache, page, + cifsi->vfs_inode.i_size, GFP_KERNEL); if (ret != 0) - fscache_uncache_page(CIFS_I(inode)->fscache, page); + fscache_uncache_page(cifsi->fscache, page); } void __cifs_fscache_readpages_cancel(struct inode *inode, struct list_head *pages) @@ -239,4 +328,3 @@ void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode) fscache_wait_on_page_write(cookie, page); fscache_uncache_page(cookie, page); } - diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h index 24794b6cd8ec..c7e3ac251e16 100644 --- a/fs/cifs/fscache.h +++ b/fs/cifs/fscache.h @@ -27,6 +27,18 @@ #ifdef CONFIG_CIFS_FSCACHE +/* + * Auxiliary data attached to CIFS inode within the cache + */ +struct cifs_fscache_inode_auxdata { + struct timespec last_write_time; + struct timespec last_change_time; + u64 eof; +}; + +/* + * cache.c + */ extern struct fscache_netfs cifs_fscache_netfs; extern const struct fscache_cookie_def cifs_fscache_server_index_def; extern const struct fscache_cookie_def cifs_fscache_super_index_def; @@ -34,6 +46,7 @@ extern const struct fscache_cookie_def cifs_fscache_inode_object_def; extern int cifs_fscache_register(void); extern void cifs_fscache_unregister(void); +extern char *extract_sharename(const char *); /* * fscache.c diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 8f9a8cc7cc62..f856df4adae3 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -707,6 +707,18 @@ cgfi_exit: return rc; } +/* Simple function to return a 64 bit hash of string. 
Rarely called */ +static __u64 simple_hashstr(const char *str) +{ + const __u64 hash_mult = 1125899906842597L; /* a big enough prime */ + __u64 hash = 0; + + while (*str) + hash = (hash + (__u64) *str++) * hash_mult; + + return hash; +} + int cifs_get_inode_info(struct inode **inode, const char *full_path, FILE_ALL_INFO *data, struct super_block *sb, int xid, @@ -816,6 +828,14 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, tmprc); fattr.cf_uniqueid = iunique(sb, ROOT_I); cifs_autodisable_serverino(cifs_sb); + } else if ((fattr.cf_uniqueid == 0) && + strlen(full_path) == 0) { + /* some servers ret bad root ino ie 0 */ + cifs_dbg(FYI, "Invalid (0) inodenum\n"); + fattr.cf_flags |= + CIFS_FATTR_FAKE_ROOT_INO; + fattr.cf_uniqueid = + simple_hashstr(tcon->treeName); } } } else @@ -832,6 +852,16 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, &fattr.cf_uniqueid, data); if (tmprc) fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; + else if ((fattr.cf_uniqueid == 0) && + strlen(full_path) == 0) { + /* + * Reuse existing root inode num since + * inum zero for root causes ls of . and .. to + * not be returned + */ + cifs_dbg(FYI, "Srv ret 0 inode num for root\n"); + fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; + } } else fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; } @@ -893,6 +923,9 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, } cgii_exit: + if ((*inode) && ((*inode)->i_ino == 0)) + cifs_dbg(FYI, "inode number of zero returned\n"); + kfree(buf); cifs_put_tlink(tlink); return rc; @@ -1066,10 +1099,7 @@ iget_no_retry: out: kfree(path); - /* can not call macro free_xid here since in a void func - * TODO: This is no longer true - */ - _free_xid(xid); + free_xid(xid); return inode; } diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 60b5a11ee11b..889a840172eb 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -50,25 +50,12 @@ static int symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) { int rc; - unsigned int size; - struct crypto_shash *md5; - struct sdesc *sdescmd5; - - md5 = crypto_alloc_shash("md5", 0, 0); - if (IS_ERR(md5)) { - rc = PTR_ERR(md5); - cifs_dbg(VFS, "%s: Crypto md5 allocation error %d\n", - __func__, rc); - return rc; - } - size = sizeof(struct shash_desc) + crypto_shash_descsize(md5); - sdescmd5 = kmalloc(size, GFP_KERNEL); - if (!sdescmd5) { - rc = -ENOMEM; + struct crypto_shash *md5 = NULL; + struct sdesc *sdescmd5 = NULL; + + rc = cifs_alloc_hash("md5", &md5, &sdescmd5); + if (rc) goto symlink_hash_err; - } - sdescmd5->shash.tfm = md5; - sdescmd5->shash.flags = 0x0; rc = crypto_shash_init(&sdescmd5->shash); if (rc) { @@ -85,9 +72,7 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); symlink_hash_err: - crypto_free_shash(md5); - kfree(sdescmd5); - + cifs_free_hash(&md5, &sdescmd5); return rc; } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index a0dbced4a45c..460084a8eac5 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -848,3 +848,57 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) iov_iter_bvec(&ctx->iter, ITER_BVEC | rw, ctx->bv, npages, ctx->len); return 0; } + +/** + * cifs_alloc_hash - allocate hash and hash context together + * + * The caller has to make sure @sdesc is initialized to either NULL or + * a valid context. Both can be freed via cifs_free_hash(). 
+ */ +int +cifs_alloc_hash(const char *name, + struct crypto_shash **shash, struct sdesc **sdesc) +{ + int rc = 0; + size_t size; + + if (*sdesc != NULL) + return 0; + + *shash = crypto_alloc_shash(name, 0, 0); + if (IS_ERR(*shash)) { + cifs_dbg(VFS, "could not allocate crypto %s\n", name); + rc = PTR_ERR(*shash); + *shash = NULL; + *sdesc = NULL; + return rc; + } + + size = sizeof(struct shash_desc) + crypto_shash_descsize(*shash); + *sdesc = kmalloc(size, GFP_KERNEL); + if (*sdesc == NULL) { + cifs_dbg(VFS, "no memory left to allocate crypto %s\n", name); + crypto_free_shash(*shash); + *shash = NULL; + return -ENOMEM; + } + + (*sdesc)->shash.tfm = *shash; + (*sdesc)->shash.flags = 0x0; + return 0; +} + +/** + * cifs_free_hash - free hash and hash context together + * + * Freeing a NULL hash or context is safe. + */ +void +cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc) +{ + kfree(*sdesc); + *sdesc = NULL; + if (*shash) + crypto_free_shash(*shash); + *shash = NULL; +} diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 3d495e440c87..aff8ce8ba34d 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -1122,6 +1122,7 @@ struct smb_version_values smb1_values = { .exclusive_lock_type = 0, .shared_lock_type = LOCKING_ANDX_SHARED_LOCK, .unlock_lock_type = 0, + .header_preamble_size = 4, .header_size = sizeof(struct smb_hdr), .max_header_size = MAX_CIFS_HDR_SIZE, .read_rsp_size = sizeof(READ_RSP), diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 62c88dfed57b..3bfc9c990724 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -745,7 +745,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { "STATUS_NOLOGON_SERVER_TRUST_ACCOUNT"}, {STATUS_DOMAIN_TRUST_INCONSISTENT, -EIO, "STATUS_DOMAIN_TRUST_INCONSISTENT"}, - {STATUS_FS_DRIVER_REQUIRED, -EIO, "STATUS_FS_DRIVER_REQUIRED"}, + {STATUS_FS_DRIVER_REQUIRED, -EOPNOTSUPP, "STATUS_FS_DRIVER_REQUIRED"}, {STATUS_IMAGE_ALREADY_LOADED_AS_DLL, -EIO, "STATUS_IMAGE_ALREADY_LOADED_AS_DLL"}, {STATUS_NETWORK_OPEN_RESTRICTION, -EIO, diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 76d03abaa38c..5406e95f5d92 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -150,7 +150,8 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) } return 1; } - if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE - 4) { + if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE - + srvr->vals->header_preamble_size) { cifs_dbg(VFS, "SMB length greater than maximum, mid=%llu\n", mid); return 1; @@ -189,26 +190,26 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) } } - if (4 + len != length) { - cifs_dbg(VFS, "Total length %u RFC1002 length %u mismatch mid %llu\n", - length, 4 + len, mid); + if (srvr->vals->header_preamble_size + len != length) { + cifs_dbg(VFS, "Total length %u RFC1002 length %zu mismatch mid %llu\n", + length, srvr->vals->header_preamble_size + len, mid); return 1; } clc_len = smb2_calc_size(hdr); - if (4 + len != clc_len) { - cifs_dbg(FYI, "Calculated size %u length %u mismatch mid %llu\n", - clc_len, 4 + len, mid); + if (srvr->vals->header_preamble_size + len != clc_len) { + cifs_dbg(FYI, "Calculated size %u length %zu mismatch mid %llu\n", + clc_len, srvr->vals->header_preamble_size + len, mid); /* create failed on symlink */ if (command == SMB2_CREATE_HE && shdr->Status == STATUS_STOPPED_ON_SYMLINK) return 0; /* Windows 7 server returns 24 bytes more */ - if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE) + if (clc_len + 
24 - srvr->vals->header_preamble_size == len && command == SMB2_OPLOCK_BREAK_HE) return 0; /* server can return one byte more due to implied bcc[0] */ - if (clc_len == 4 + len + 1) + if (clc_len == srvr->vals->header_preamble_size + len + 1) return 0; /* @@ -218,10 +219,10 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) * Log the server error (once), but allow it and continue * since the frame is parseable. */ - if (clc_len < 4 /* RFC1001 header size */ + len) { + if (clc_len < srvr->vals->header_preamble_size /* RFC1001 header size */ + len) { printk_once(KERN_WARNING - "SMB2 server sent bad RFC1001 len %d not %d\n", - len, clc_len - 4); + "SMB2 server sent bad RFC1001 len %d not %zu\n", + len, clc_len - srvr->vals->header_preamble_size); return 0; } @@ -706,3 +707,67 @@ smb2_handle_cancelled_mid(char *buffer, struct TCP_Server_Info *server) return 0; } + +#ifdef CONFIG_CIFS_SMB311 +/** + * smb311_update_preauth_hash - update @ses hash with the packet data in @iov + * + * Assumes @iov does not contain the rfc1002 length and iov[0] has the + * SMB2 header. + */ +int +smb311_update_preauth_hash(struct cifs_ses *ses, struct kvec *iov, int nvec) +{ + int i, rc; + struct sdesc *d; + struct smb2_sync_hdr *hdr; + + if (ses->server->tcpStatus == CifsGood) { + /* skip non smb311 connections */ + if (ses->server->dialect != SMB311_PROT_ID) + return 0; + + /* skip last sess setup response */ + hdr = (struct smb2_sync_hdr *)iov[0].iov_base; + if (hdr->Flags & SMB2_FLAGS_SIGNED) + return 0; + } + + rc = smb311_crypto_shash_allocate(ses->server); + if (rc) + return rc; + + d = ses->server->secmech.sdescsha512; + rc = crypto_shash_init(&d->shash); + if (rc) { + cifs_dbg(VFS, "%s: could not init sha512 shash\n", __func__); + return rc; + } + + rc = crypto_shash_update(&d->shash, ses->preauth_sha_hash, + SMB2_PREAUTH_HASH_SIZE); + if (rc) { + cifs_dbg(VFS, "%s: could not update sha512 shash\n", __func__); + return rc; + } + + for (i = 0; i < nvec; i++) { + rc = crypto_shash_update(&d->shash, + iov[i].iov_base, iov[i].iov_len); + if (rc) { + cifs_dbg(VFS, "%s: could not update sha512 shash\n", + __func__); + return rc; + } + } + + rc = crypto_shash_final(&d->shash, ses->preauth_sha_hash); + if (rc) { + cifs_dbg(VFS, "%s: could not finalize sha512 shash\n", + __func__); + return rc; + } + + return 0; +} +#endif diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index eb68e2fcc500..968b1d43a1ea 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1412,7 +1412,7 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, } while (rc == -EAGAIN); if (rc) { - if (rc != -ENOENT) + if ((rc != -ENOENT) && (rc != -EOPNOTSUPP)) cifs_dbg(VFS, "ioctl error in smb2_get_dfs_refer rc=%d\n", rc); goto out; } @@ -1457,6 +1457,8 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, unsigned int sub_offset; unsigned int print_len; unsigned int print_offset; + struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = ses->server; cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); @@ -1479,7 +1481,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, } if (le32_to_cpu(err_buf->ByteCount) < sizeof(struct smb2_symlink_err_rsp) || - get_rfc1002_length(err_buf) + 4 < SMB2_SYMLINK_STRUCT_SIZE) { + get_rfc1002_length(err_buf) + server->vals->header_preamble_size < SMB2_SYMLINK_STRUCT_SIZE) { kfree(utf16_path); return -ENOENT; } @@ -1492,13 +1494,13 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, print_len = 
le16_to_cpu(symlink->PrintNameLength); print_offset = le16_to_cpu(symlink->PrintNameOffset); - if (get_rfc1002_length(err_buf) + 4 < + if (get_rfc1002_length(err_buf) + server->vals->header_preamble_size < SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) { kfree(utf16_path); return -ENOENT; } - if (get_rfc1002_length(err_buf) + 4 < + if (get_rfc1002_length(err_buf) + server->vals->header_preamble_size < SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) { kfree(utf16_path); return -ENOENT; @@ -2050,7 +2052,8 @@ smb2_dir_needs_close(struct cifsFileInfo *cfile) } static void -fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, struct smb_rqst *old_rq) +fill_transform_hdr(struct TCP_Server_Info *server, + struct smb2_transform_hdr *tr_hdr, struct smb_rqst *old_rq) { struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)old_rq->rq_iov[1].iov_base; @@ -2062,10 +2065,19 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, struct smb_rqst *old_rq) tr_hdr->Flags = cpu_to_le16(0x01); get_random_bytes(&tr_hdr->Nonce, SMB3_AES128CMM_NONCE); memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8); - inc_rfc1001_len(tr_hdr, sizeof(struct smb2_transform_hdr) - 4); + inc_rfc1001_len(tr_hdr, sizeof(struct smb2_transform_hdr) - server->vals->header_preamble_size); inc_rfc1001_len(tr_hdr, orig_len); } +/* We can not use the normal sg_set_buf() as we will sometimes pass a + * stack object as buf. + */ +static inline void smb2_sg_set_buf(struct scatterlist *sg, const void *buf, + unsigned int buflen) +{ + sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf)); +} + static struct scatterlist * init_sg(struct smb_rqst *rqst, u8 *sign) { @@ -2080,16 +2092,16 @@ init_sg(struct smb_rqst *rqst, u8 *sign) return NULL; sg_init_table(sg, sg_len); - sg_set_buf(&sg[0], rqst->rq_iov[0].iov_base + 24, assoc_data_len); + smb2_sg_set_buf(&sg[0], rqst->rq_iov[0].iov_base + 24, assoc_data_len); for (i = 1; i < rqst->rq_nvec; i++) - sg_set_buf(&sg[i], rqst->rq_iov[i].iov_base, + smb2_sg_set_buf(&sg[i], rqst->rq_iov[i].iov_base, rqst->rq_iov[i].iov_len); for (j = 0; i < sg_len - 1; i++, j++) { unsigned int len = (j < rqst->rq_npages - 1) ? 
rqst->rq_pagesz : rqst->rq_tailsz; sg_set_page(&sg[i], rqst->rq_pages[j], len, 0); } - sg_set_buf(&sg[sg_len - 1], sign, SMB2_SIGNATURE_SIZE); + smb2_sg_set_buf(&sg[sg_len - 1], sign, SMB2_SIGNATURE_SIZE); return sg; } @@ -2125,7 +2137,7 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc) { struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)rqst->rq_iov[0].iov_base; - unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 24; + unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20 - server->vals->header_preamble_size; int rc = 0; struct scatterlist *sg; u8 sign[SMB2_SIGNATURE_SIZE] = {}; @@ -2253,7 +2265,7 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq, goto err_free_iov; /* fill the 1st iov with a transform header */ - fill_transform_hdr(tr_hdr, old_rq); + fill_transform_hdr(server, tr_hdr, old_rq); new_rq->rq_iov[0].iov_base = tr_hdr; new_rq->rq_iov[0].iov_len = sizeof(struct smb2_transform_hdr); @@ -2335,10 +2347,10 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, if (rc) return rc; - memmove(buf + 4, iov[1].iov_base, buf_data_size); + memmove(buf + server->vals->header_preamble_size, iov[1].iov_base, buf_data_size); hdr = (struct smb2_hdr *)buf; hdr->smb2_buf_length = cpu_to_be32(buf_data_size + page_data_size); - server->total_read = buf_data_size + page_data_size + 4; + server->total_read = buf_data_size + page_data_size + server->vals->header_preamble_size; return rc; } @@ -2442,7 +2454,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - data_offset = server->ops->read_data_offset(buf) + 4; + data_offset = server->ops->read_data_offset(buf) + server->vals->header_preamble_size; #ifdef CONFIG_CIFS_SMB_DIRECT use_rdma_mr = rdata->mr; #endif @@ -2538,11 +2550,12 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid) unsigned int npages; struct page **pages; unsigned int len; - unsigned int buflen = get_rfc1002_length(buf) + 4; + unsigned int buflen = get_rfc1002_length(buf) + server->vals->header_preamble_size; int rc; int i = 0; - len = min_t(unsigned int, buflen, server->vals->read_rsp_size - 4 + + len = min_t(unsigned int, buflen, server->vals->read_rsp_size - + server->vals->header_preamble_size + sizeof(struct smb2_transform_hdr)) - HEADER_SIZE(server) + 1; rc = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1, len); @@ -2550,8 +2563,9 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid) return rc; server->total_read += rc; - len = le32_to_cpu(tr_hdr->OriginalMessageSize) + 4 - - server->vals->read_rsp_size; + len = le32_to_cpu(tr_hdr->OriginalMessageSize) + + server->vals->header_preamble_size - + server->vals->read_rsp_size; npages = DIV_ROUND_UP(len, PAGE_SIZE); pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); @@ -2577,7 +2591,8 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid) if (rc) goto free_pages; - rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size - 4, + rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size - + server->vals->header_preamble_size, pages, npages, len); if (rc) goto free_pages; @@ -2614,7 +2629,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid_entry; /* switch to large buffer if too big for a small one */ - if (pdu_length + 4 > MAX_CIFS_SMALL_BUFFER_SIZE) { + if (pdu_length + server->vals->header_preamble_size > 
MAX_CIFS_SMALL_BUFFER_SIZE) { server->large_buf = true; memcpy(server->bigbuf, buf, server->total_read); buf = server->bigbuf; @@ -2622,12 +2637,13 @@ receive_encrypted_standard(struct TCP_Server_Info *server, /* now read the rest */ length = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1, - pdu_length - HEADER_SIZE(server) + 1 + 4); + pdu_length - HEADER_SIZE(server) + 1 + + server->vals->header_preamble_size); if (length < 0) return length; server->total_read += length; - buf_size = pdu_length + 4 - sizeof(struct smb2_transform_hdr); + buf_size = pdu_length + server->vals->header_preamble_size - sizeof(struct smb2_transform_hdr); length = decrypt_raw_data(server, buf, buf_size, NULL, 0, 0); if (length) return length; @@ -2656,7 +2672,7 @@ smb3_receive_transform(struct TCP_Server_Info *server, struct mid_q_entry **mid) struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf; unsigned int orig_len = le32_to_cpu(tr_hdr->OriginalMessageSize); - if (pdu_length + 4 < sizeof(struct smb2_transform_hdr) + + if (pdu_length + server->vals->header_preamble_size < sizeof(struct smb2_transform_hdr) + sizeof(struct smb2_sync_hdr)) { cifs_dbg(VFS, "Transform message is too small (%u)\n", pdu_length); @@ -2665,14 +2681,14 @@ smb3_receive_transform(struct TCP_Server_Info *server, struct mid_q_entry **mid) return -ECONNABORTED; } - if (pdu_length + 4 < orig_len + sizeof(struct smb2_transform_hdr)) { + if (pdu_length + server->vals->header_preamble_size < orig_len + sizeof(struct smb2_transform_hdr)) { cifs_dbg(VFS, "Transform message is broken\n"); cifs_reconnect(server); wake_up(&server->response_q); return -ECONNABORTED; } - if (pdu_length + 4 > CIFSMaxBufSize + MAX_HEADER_SIZE(server)) + if (pdu_length + server->vals->header_preamble_size > CIFSMaxBufSize + MAX_HEADER_SIZE(server)) return receive_encrypted_read(server, mid); return receive_encrypted_standard(server, mid); @@ -2683,7 +2699,8 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid) { char *buf = server->large_buf ? 
server->bigbuf : server->smallbuf; - return handle_read_data(server, mid, buf, get_rfc1002_length(buf) + 4, + return handle_read_data(server, mid, buf, get_rfc1002_length(buf) + + server->vals->header_preamble_size, NULL, 0, 0); } @@ -3088,6 +3105,7 @@ struct smb_version_values smb20_values = { .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), + .header_preamble_size = 4, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, .lock_cmd = SMB2_LOCK, @@ -3108,6 +3126,7 @@ struct smb_version_values smb21_values = { .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), + .header_preamble_size = 4, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, .lock_cmd = SMB2_LOCK, @@ -3128,6 +3147,7 @@ struct smb_version_values smb3any_values = { .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), + .header_preamble_size = 4, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, .lock_cmd = SMB2_LOCK, @@ -3148,6 +3168,7 @@ struct smb_version_values smbdefault_values = { .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), + .header_preamble_size = 4, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, .lock_cmd = SMB2_LOCK, @@ -3168,6 +3189,7 @@ struct smb_version_values smb30_values = { .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), + .header_preamble_size = 4, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, .lock_cmd = SMB2_LOCK, @@ -3188,6 +3210,7 @@ struct smb_version_values smb302_values = { .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), + .header_preamble_size = 4, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, .lock_cmd = SMB2_LOCK, @@ -3209,6 +3232,7 @@ struct smb_version_values smb311_values = { .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), + .header_preamble_size = 4, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, .lock_cmd = SMB2_LOCK, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 63778ac22fd9..f7741cee2a4c 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -453,6 +453,10 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) return rc; req->sync_hdr.SessionId = 0; +#ifdef CONFIG_CIFS_SMB311 + memset(server->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE); + memset(ses->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE); +#endif if (strcmp(ses->server->vals->version_string, SMB3ANY_VERSION_STRING) == 0) { @@ -564,6 +568,15 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) /* BB: add check that dialect was valid given dialect(s) we asked for */ +#ifdef CONFIG_CIFS_SMB311 + /* + * Keep a copy of the hash after negprot. This hash will be + * the starting hash value for all sessions made from this + * server. 
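+ *
+ * The update step is, roughly (see MS-SMB2 3.2.5.2), a chain over the
+ * raw packet bytes starting from a zeroed buffer:
+ *
+ *	u8 hash[SMB2_PREAUTH_HASH_SIZE] = { 0 };
+ *	for each negotiate/session-setup packet p:
+ *		hash = sha512(hash || p);
+ *
+ * which lets both ends detect tampering with the negotiation.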
+ */ + memcpy(server->preauth_sha_hash, ses->preauth_sha_hash, + SMB2_PREAUTH_HASH_SIZE); +#endif /* SMB2 only has an extended negflavor */ server->negflavor = CIFS_NEGFLAVOR_EXTENDED; /* set it to the maximum buffer size value we can send with 1 credit */ @@ -571,8 +584,10 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) SMB2_MAX_BUFFER_SIZE); server->max_read = le32_to_cpu(rsp->MaxReadSize); server->max_write = le32_to_cpu(rsp->MaxWriteSize); - /* BB Do we need to validate the SecurityMode? */ server->sec_mode = le16_to_cpu(rsp->SecurityMode); + if ((server->sec_mode & SMB2_SEC_MODE_FLAGS_ALL) != server->sec_mode) + cifs_dbg(FYI, "Server returned unexpected security mode 0x%x\n", + server->sec_mode); server->capabilities = le32_to_cpu(rsp->Capabilities); /* Internal types */ server->capabilities |= SMB2_NT_FIND | SMB2_LARGE_FILES; @@ -621,6 +636,10 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) return 0; #endif + /* In SMB3.11 preauth integrity supersedes validate negotiate */ + if (tcon->ses->server->dialect == SMB311_PROT_ID) + return 0; + /* * validation ioctl must be signed, so no point sending this if we * can not sign it (ie are not known user). Even if signing is not @@ -1148,6 +1167,14 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, sess_data->buf0_type = CIFS_NO_BUFFER; sess_data->nls_cp = (struct nls_table *) nls_cp; +#ifdef CONFIG_CIFS_SMB311 + /* + * Initialize the session hash with the server one. + */ + memcpy(ses->preauth_sha_hash, ses->server->preauth_sha_hash, + SMB2_PREAUTH_HASH_SIZE); +#endif + while (sess_data->func) sess_data->func(sess_data); @@ -1280,6 +1307,11 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, iov[1].iov_base = unc_path; iov[1].iov_len = unc_path_len; + /* 3.11 tcon req must be signed if not encrypted. 
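Encryption already provides integrity protection, so a signature would be redundant there.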
See MS-SMB2 3.2.4.1.1 */ + if ((ses->server->dialect == SMB311_PROT_ID) && + !encryption_required(tcon)) + req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + rc = smb2_send_recv(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_tree_connect_rsp *)rsp_iov.iov_base; @@ -1441,7 +1473,7 @@ parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp, unsigned int remaining; char *name; - data_offset = (char *)rsp + 4 + le32_to_cpu(rsp->CreateContextsOffset); + data_offset = (char *)rsp + server->vals->header_preamble_size + le32_to_cpu(rsp->CreateContextsOffset); remaining = le32_to_cpu(rsp->CreateContextsLength); cc = (struct create_context *)data_offset; while (remaining >= sizeof(struct create_context)) { @@ -1738,8 +1770,10 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, rc = alloc_path_with_tree_prefix(©_path, ©_size, &name_len, tcon->treeName, path); - if (rc) + if (rc) { + cifs_small_buf_release(req); return rc; + } req->NameLength = cpu_to_le16(name_len * 2); uni_path_len = copy_size; path = copy_path; @@ -1750,8 +1784,10 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, if (uni_path_len % 8 != 0) { copy_size = roundup(uni_path_len, 8); copy_path = kzalloc(copy_size, GFP_KERNEL); - if (!copy_path) + if (!copy_path) { + cifs_small_buf_release(req); return -ENOMEM; + } memcpy((char *)copy_path, (const char *)path, uni_path_len); uni_path_len = copy_size; @@ -3418,6 +3454,7 @@ static int build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level, int outbuf_len, u64 persistent_fid, u64 volatile_fid) { + struct TCP_Server_Info *server = tcon->ses->server; int rc; struct smb2_query_info_req *req; unsigned int total_len; @@ -3440,7 +3477,7 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level, req->InputBufferOffset = cpu_to_le16(sizeof(struct smb2_query_info_req) - 1); req->OutputBufferLength = cpu_to_le32( - outbuf_len + sizeof(struct smb2_query_info_rsp) - 1 - 4); + outbuf_len + sizeof(struct smb2_query_info_rsp) - 1 - server->vals->header_preamble_size); iov->iov_base = (char *)req; iov->iov_len = total_len; @@ -3457,6 +3494,7 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int resp_buftype; struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = ses->server; struct smb2_fs_full_size_info *info = NULL; int flags = 0; @@ -3477,7 +3515,7 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, } rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; - info = (struct smb2_fs_full_size_info *)(4 /* RFC1001 len */ + + info = (struct smb2_fs_full_size_info *)(server->vals->header_preamble_size + le16_to_cpu(rsp->OutputBufferOffset) + (char *)&rsp->hdr); rc = validate_buf(le16_to_cpu(rsp->OutputBufferOffset), le32_to_cpu(rsp->OutputBufferLength), &rsp->hdr, @@ -3500,6 +3538,7 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int resp_buftype, max_len, min_len; struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = ses->server; unsigned int rsp_len, offset; int flags = 0; @@ -3540,15 +3579,15 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, goto qfsattr_exit; if (level == FS_ATTRIBUTE_INFORMATION) - memcpy(&tcon->fsAttrInfo, 4 /* RFC1001 len */ + offset + memcpy(&tcon->fsAttrInfo, server->vals->header_preamble_size + offset + (char *)&rsp->hdr, min_t(unsigned int, rsp_len, max_len)); else if (level == FS_DEVICE_INFORMATION) - 
memcpy(&tcon->fsDevInfo, 4 /* RFC1001 len */ + offset + memcpy(&tcon->fsDevInfo, server->vals->header_preamble_size + offset + (char *)&rsp->hdr, sizeof(FILE_SYSTEM_DEVICE_INFO)); else if (level == FS_SECTOR_SIZE_INFORMATION) { struct smb3_fs_ss_info *ss_info = (struct smb3_fs_ss_info *) - (4 /* RFC1001 len */ + offset + (char *)&rsp->hdr); + (server->vals->header_preamble_size + offset + (char *)&rsp->hdr); tcon->ss_flags = le32_to_cpu(ss_info->Flags); tcon->perf_sector_size = le32_to_cpu(ss_info->PhysicalBytesPerSectorForPerf); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 2a2b34ccaf49..253e2c7c952f 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -249,6 +249,8 @@ struct smb2_negotiate_req { /* SecurityMode flags */ #define SMB2_NEGOTIATE_SIGNING_ENABLED 0x0001 #define SMB2_NEGOTIATE_SIGNING_REQUIRED 0x0002 +#define SMB2_SEC_MODE_FLAGS_ALL 0x0003 + /* Capabilities flags */ #define SMB2_GLOBAL_CAP_DFS 0x00000001 #define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */ @@ -264,6 +266,7 @@ struct smb2_negotiate_req { #define SMB311_SALT_SIZE 32 /* Hash Algorithm Types */ #define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001) +#define SMB2_PREAUTH_HASH_SIZE 64 struct smb2_preauth_neg_context { __le16 ContextType; /* 1 */ diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 05287b01f596..cbcce3f7e86f 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -202,4 +202,9 @@ extern int smb3_validate_negotiate(const unsigned int, struct cifs_tcon *); extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *, enum securityEnum); +#ifdef CONFIG_CIFS_SMB311 +extern int smb311_crypto_shash_allocate(struct TCP_Server_Info *server); +extern int smb311_update_preauth_hash(struct cifs_ses *ses, + struct kvec *iov, int nvec); +#endif #endif /* _SMB2PROTO_H */ diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 99493946e2f9..bf49cb73b9e6 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -43,77 +43,62 @@ static int smb2_crypto_shash_allocate(struct TCP_Server_Info *server) { - int rc; - unsigned int size; + return cifs_alloc_hash("hmac(sha256)", + &server->secmech.hmacsha256, + &server->secmech.sdeschmacsha256); +} - if (server->secmech.sdeschmacsha256 != NULL) - return 0; /* already allocated */ +static int +smb3_crypto_shash_allocate(struct TCP_Server_Info *server) +{ + struct cifs_secmech *p = &server->secmech; + int rc; - server->secmech.hmacsha256 = crypto_alloc_shash("hmac(sha256)", 0, 0); - if (IS_ERR(server->secmech.hmacsha256)) { - cifs_dbg(VFS, "could not allocate crypto hmacsha256\n"); - rc = PTR_ERR(server->secmech.hmacsha256); - server->secmech.hmacsha256 = NULL; - return rc; - } + rc = cifs_alloc_hash("hmac(sha256)", + &p->hmacsha256, + &p->sdeschmacsha256); + if (rc) + goto err; - size = sizeof(struct shash_desc) + - crypto_shash_descsize(server->secmech.hmacsha256); - server->secmech.sdeschmacsha256 = kmalloc(size, GFP_KERNEL); - if (!server->secmech.sdeschmacsha256) { - crypto_free_shash(server->secmech.hmacsha256); - server->secmech.hmacsha256 = NULL; - return -ENOMEM; - } - server->secmech.sdeschmacsha256->shash.tfm = server->secmech.hmacsha256; - server->secmech.sdeschmacsha256->shash.flags = 0x0; + rc = cifs_alloc_hash("cmac(aes)", &p->cmacaes, &p->sdesccmacaes); + if (rc) + goto err; return 0; +err: + cifs_free_hash(&p->hmacsha256, &p->sdeschmacsha256); + return rc; } -static int -smb3_crypto_shash_allocate(struct TCP_Server_Info *server) +#ifdef CONFIG_CIFS_SMB311 +int 
+smb311_crypto_shash_allocate(struct TCP_Server_Info *server) { - unsigned int size; - int rc; - - if (server->secmech.sdesccmacaes != NULL) - return 0; /* already allocated */ + struct cifs_secmech *p = &server->secmech; + int rc = 0; - rc = smb2_crypto_shash_allocate(server); + rc = cifs_alloc_hash("hmac(sha256)", + &p->hmacsha256, + &p->sdeschmacsha256); if (rc) return rc; - server->secmech.cmacaes = crypto_alloc_shash("cmac(aes)", 0, 0); - if (IS_ERR(server->secmech.cmacaes)) { - cifs_dbg(VFS, "could not allocate crypto cmac-aes"); - kfree(server->secmech.sdeschmacsha256); - server->secmech.sdeschmacsha256 = NULL; - crypto_free_shash(server->secmech.hmacsha256); - server->secmech.hmacsha256 = NULL; - rc = PTR_ERR(server->secmech.cmacaes); - server->secmech.cmacaes = NULL; - return rc; - } + rc = cifs_alloc_hash("cmac(aes)", &p->cmacaes, &p->sdesccmacaes); + if (rc) + goto err; - size = sizeof(struct shash_desc) + - crypto_shash_descsize(server->secmech.cmacaes); - server->secmech.sdesccmacaes = kmalloc(size, GFP_KERNEL); - if (!server->secmech.sdesccmacaes) { - cifs_dbg(VFS, "%s: Can't alloc cmacaes\n", __func__); - kfree(server->secmech.sdeschmacsha256); - server->secmech.sdeschmacsha256 = NULL; - crypto_free_shash(server->secmech.hmacsha256); - crypto_free_shash(server->secmech.cmacaes); - server->secmech.hmacsha256 = NULL; - server->secmech.cmacaes = NULL; - return -ENOMEM; - } - server->secmech.sdesccmacaes->shash.tfm = server->secmech.cmacaes; - server->secmech.sdesccmacaes->shash.flags = 0x0; + rc = cifs_alloc_hash("sha512", &p->sha512, &p->sdescsha512); + if (rc) + goto err; return 0; + +err: + cifs_free_hash(&p->cmacaes, &p->sdesccmacaes); + cifs_free_hash(&p->hmacsha256, &p->sdeschmacsha256); + return rc; } +#endif static struct cifs_ses * smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id) @@ -457,7 +442,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) cifs_dbg(VFS, "%s: Could not init cmac aes\n", __func__); return rc; } - + rc = __cifs_calc_signature(rqst, server, sigptr, &server->secmech.sdesccmacaes->shash); diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 91710eb571fb..5008af546dd1 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -862,6 +862,8 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info) ib_dma_unmap_single(info->id->device, request->sge[0].addr, request->sge[0].length, DMA_TO_DEVICE); + smbd_disconnect_rdma_connection(info); + dma_mapping_failed: mempool_free(request, info->request_mempool); return rc; @@ -1025,7 +1027,7 @@ static int smbd_post_send(struct smbd_connection *info, for (i = 0; i < request->num_sge; i++) { log_rdma_send(INFO, - "rdma_request sge[%d] addr=%llu legnth=%u\n", + "rdma_request sge[%d] addr=%llu length=%u\n", i, request->sge[0].addr, request->sge[0].length); ib_dma_sync_single_for_device( info->id->device, @@ -1061,6 +1063,7 @@ static int smbd_post_send(struct smbd_connection *info, if (atomic_dec_and_test(&info->send_pending)) wake_up(&info->wait_send_pending); } + smbd_disconnect_rdma_connection(info); } else /* Reset timer for idle connection after packet is sent */ mod_delayed_work(info->workqueue, &info->idle_timer_work, @@ -1202,7 +1205,7 @@ static int smbd_post_recv( if (rc) { ib_dma_unmap_single(info->id->device, response->sge.addr, response->sge.length, DMA_FROM_DEVICE); - + smbd_disconnect_rdma_connection(info); log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc); } @@ -1498,8 +1501,8 @@ int smbd_reconnect(struct TCP_Server_Info 
*server) log_rdma_event(INFO, "reconnecting rdma session\n"); if (!server->smbd_conn) { - log_rdma_event(ERR, "rdma session already destroyed\n"); - return -EINVAL; + log_rdma_event(INFO, "rdma session already destroyed\n"); + goto create_conn; } /* @@ -1512,15 +1515,19 @@ int smbd_reconnect(struct TCP_Server_Info *server) } /* wait until the transport is destroyed */ - wait_event(server->smbd_conn->wait_destroy, - server->smbd_conn->transport_status == SMBD_DESTROYED); + if (!wait_event_timeout(server->smbd_conn->wait_destroy, + server->smbd_conn->transport_status == SMBD_DESTROYED, 5*HZ)) + return -EAGAIN; destroy_workqueue(server->smbd_conn->workqueue); kfree(server->smbd_conn); +create_conn: log_rdma_event(INFO, "creating rdma session\n"); server->smbd_conn = smbd_get_connection( server, (struct sockaddr *) &server->dstaddr); + log_rdma_event(INFO, "created rdma session info=%p\n", + server->smbd_conn); return server->smbd_conn ? 0 : -ENOENT; } @@ -2295,7 +2302,7 @@ static void smbd_mr_recovery_work(struct work_struct *work) rc = ib_dereg_mr(smbdirect_mr->mr); if (rc) { log_rdma_mr(ERR, - "ib_dereg_mr faield rc=%x\n", + "ib_dereg_mr failed rc=%x\n", rc); smbd_disconnect_rdma_connection(info); } @@ -2542,6 +2549,8 @@ dma_map_error: if (atomic_dec_and_test(&info->mr_used_count)) wake_up(&info->wait_for_mr_cleanup); + smbd_disconnect_rdma_connection(info); + return NULL; } diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index c12bffefa3c9..a0b80ac651a6 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -121,25 +121,12 @@ int mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len) { int rc; - unsigned int size; - struct crypto_shash *md4; - struct sdesc *sdescmd4; - - md4 = crypto_alloc_shash("md4", 0, 0); - if (IS_ERR(md4)) { - rc = PTR_ERR(md4); - cifs_dbg(VFS, "%s: Crypto md4 allocation error %d\n", - __func__, rc); - return rc; - } - size = sizeof(struct shash_desc) + crypto_shash_descsize(md4); - sdescmd4 = kmalloc(size, GFP_KERNEL); - if (!sdescmd4) { - rc = -ENOMEM; + struct crypto_shash *md4 = NULL; + struct sdesc *sdescmd4 = NULL; + + rc = cifs_alloc_hash("md4", &md4, &sdescmd4); + if (rc) goto mdfour_err; - } - sdescmd4->shash.tfm = md4; - sdescmd4->shash.flags = 0x0; rc = crypto_shash_init(&sdescmd4->shash); if (rc) { @@ -156,9 +143,7 @@ mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len) cifs_dbg(VFS, "%s: Could not generate md4 hash\n", __func__); mdfour_err: - crypto_free_shash(md4); - kfree(sdescmd4); - + cifs_free_hash(&md4, &sdescmd4); return rc; } diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 9779b3292d8e..279718dcb2ed 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -37,6 +37,7 @@ #include "cifsglob.h" #include "cifsproto.h" #include "cifs_debug.h" +#include "smb2proto.h" #include "smbdirect.h" /* Max number of iovectors we can use off the stack when sending requests. 
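Larger requests fall back to a heap-allocated vector.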
*/ @@ -751,6 +752,12 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, if (rc < 0) goto out; +#ifdef CONFIG_CIFS_SMB311 + if (ses->status == CifsNew) + smb311_update_preauth_hash(ses, rqst->rq_iov+1, + rqst->rq_nvec-1); +#endif + if (timeout == CIFS_ASYNC_OP) goto out; @@ -783,12 +790,23 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, buf = (char *)midQ->resp_buf; resp_iov->iov_base = buf; - resp_iov->iov_len = get_rfc1002_length(buf) + 4; + resp_iov->iov_len = get_rfc1002_length(buf) + + ses->server->vals->header_preamble_size; if (midQ->large_buf) *resp_buf_type = CIFS_LARGE_BUFFER; else *resp_buf_type = CIFS_SMALL_BUFFER; +#ifdef CONFIG_CIFS_SMB311 + if (ses->status == CifsNew) { + struct kvec iov = { + .iov_base = buf + 4, + .iov_len = get_rfc1002_length(buf) + }; + smb311_update_preauth_hash(ses, &iov, 1); + } +#endif + credits = ses->server->ops->get_credits(midQ); rc = ses->server->ops->check_receive(midQ, ses->server, diff --git a/fs/d_path.c b/fs/d_path.c new file mode 100644 index 000000000000..e8fce6b1174f --- /dev/null +++ b/fs/d_path.c @@ -0,0 +1,470 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/syscalls.h> +#include <linux/export.h> +#include <linux/uaccess.h> +#include <linux/fs_struct.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/prefetch.h> +#include "mount.h" + +static int prepend(char **buffer, int *buflen, const char *str, int namelen) +{ + *buflen -= namelen; + if (*buflen < 0) + return -ENAMETOOLONG; + *buffer -= namelen; + memcpy(*buffer, str, namelen); + return 0; +} + +/** + * prepend_name - prepend a pathname in front of current buffer pointer + * @buffer: buffer pointer + * @buflen: allocated length of the buffer + * @name: name string and length qstr structure + * + * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to + * make sure that either the old or the new name pointer and length are + * fetched. However, there may be mismatch between length and pointer. + * The length cannot be trusted, we need to copy it byte-by-byte until + * the length is reached or a null byte is found. It also prepends "/" at + * the beginning of the name. The sequence number check at the caller will + * retry it again when a d_move() does happen. So any garbage in the buffer + * due to mismatched pointer and length will be discarded. + * + * Load acquire is needed to make sure that we see that terminating NUL. + */ +static int prepend_name(char **buffer, int *buflen, const struct qstr *name) +{ + const char *dname = smp_load_acquire(&name->name); /* ^^^ */ + u32 dlen = READ_ONCE(name->len); + char *p; + + *buflen -= dlen + 1; + if (*buflen < 0) + return -ENAMETOOLONG; + p = *buffer -= dlen + 1; + *p++ = '/'; + while (dlen--) { + char c = *dname++; + if (!c) + break; + *p++ = c; + } + return 0; +} + +/** + * prepend_path - Prepend path string to a buffer + * @path: the dentry/vfsmount to report + * @root: root vfsmnt/dentry + * @buffer: pointer to the end of the buffer + * @buflen: pointer to buffer length + * + * The function will first try to write out the pathname without taking any + * lock other than the RCU read lock to make sure that dentries won't go away. + * It only checks the sequence number of the global rename_lock as any change + * in the dentry's d_seq will be preceded by changes in the rename_lock + * sequence number. If the sequence number had been changed, it will restart + * the whole pathname back-tracing sequence again by taking the rename_lock. 
+ * In this case, there is no need to take the RCU read lock as the recursive + * parent pointer references will keep the dentry chain alive as long as no + * rename operation is performed. + */ +static int prepend_path(const struct path *path, + const struct path *root, + char **buffer, int *buflen) +{ + struct dentry *dentry; + struct vfsmount *vfsmnt; + struct mount *mnt; + int error = 0; + unsigned seq, m_seq = 0; + char *bptr; + int blen; + + rcu_read_lock(); +restart_mnt: + read_seqbegin_or_lock(&mount_lock, &m_seq); + seq = 0; + rcu_read_lock(); +restart: + bptr = *buffer; + blen = *buflen; + error = 0; + dentry = path->dentry; + vfsmnt = path->mnt; + mnt = real_mount(vfsmnt); + read_seqbegin_or_lock(&rename_lock, &seq); + while (dentry != root->dentry || vfsmnt != root->mnt) { + struct dentry * parent; + + if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { + struct mount *parent = READ_ONCE(mnt->mnt_parent); + /* Escaped? */ + if (dentry != vfsmnt->mnt_root) { + bptr = *buffer; + blen = *buflen; + error = 3; + break; + } + /* Global root? */ + if (mnt != parent) { + dentry = READ_ONCE(mnt->mnt_mountpoint); + mnt = parent; + vfsmnt = &mnt->mnt; + continue; + } + if (!error) + error = is_mounted(vfsmnt) ? 1 : 2; + break; + } + parent = dentry->d_parent; + prefetch(parent); + error = prepend_name(&bptr, &blen, &dentry->d_name); + if (error) + break; + + dentry = parent; + } + if (!(seq & 1)) + rcu_read_unlock(); + if (need_seqretry(&rename_lock, seq)) { + seq = 1; + goto restart; + } + done_seqretry(&rename_lock, seq); + + if (!(m_seq & 1)) + rcu_read_unlock(); + if (need_seqretry(&mount_lock, m_seq)) { + m_seq = 1; + goto restart_mnt; + } + done_seqretry(&mount_lock, m_seq); + + if (error >= 0 && bptr == *buffer) { + if (--blen < 0) + error = -ENAMETOOLONG; + else + *--bptr = '/'; + } + *buffer = bptr; + *buflen = blen; + return error; +} + +/** + * __d_path - return the path of a dentry + * @path: the dentry/vfsmount to report + * @root: root vfsmnt/dentry + * @buf: buffer to return value in + * @buflen: buffer length + * + * Convert a dentry into an ASCII path name. + * + * Returns a pointer into the buffer or an error code if the + * path was too long. + * + * "buflen" should be positive. + * + * If the path is not reachable from the supplied root, return %NULL. + */ +char *__d_path(const struct path *path, + const struct path *root, + char *buf, int buflen) +{ + char *res = buf + buflen; + int error; + + prepend(&res, &buflen, "\0", 1); + error = prepend_path(path, root, &res, &buflen); + + if (error < 0) + return ERR_PTR(error); + if (error > 0) + return NULL; + return res; +} + +char *d_absolute_path(const struct path *path, + char *buf, int buflen) +{ + struct path root = {}; + char *res = buf + buflen; + int error; + + prepend(&res, &buflen, "\0", 1); + error = prepend_path(path, &root, &res, &buflen); + + if (error > 1) + error = -EINVAL; + if (error < 0) + return ERR_PTR(error); + return res; +} + +/* + * same as __d_path but appends "(deleted)" for unlinked files. 
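+ * Note the suffix makes the result ambiguous: a live file whose name
+ * really ends in " (deleted)" produces identical output.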
+ */ +static int path_with_deleted(const struct path *path, + const struct path *root, + char **buf, int *buflen) +{ + prepend(buf, buflen, "\0", 1); + if (d_unlinked(path->dentry)) { + int error = prepend(buf, buflen, " (deleted)", 10); + if (error) + return error; + } + + return prepend_path(path, root, buf, buflen); +} + +static int prepend_unreachable(char **buffer, int *buflen) +{ + return prepend(buffer, buflen, "(unreachable)", 13); +} + +static void get_fs_root_rcu(struct fs_struct *fs, struct path *root) +{ + unsigned seq; + + do { + seq = read_seqcount_begin(&fs->seq); + *root = fs->root; + } while (read_seqcount_retry(&fs->seq, seq)); +} + +/** + * d_path - return the path of a dentry + * @path: path to report + * @buf: buffer to return value in + * @buflen: buffer length + * + * Convert a dentry into an ASCII path name. If the entry has been deleted + * the string " (deleted)" is appended. Note that this is ambiguous. + * + * Returns a pointer into the buffer or an error code if the path was + * too long. Note: Callers should use the returned pointer, not the passed + * in buffer, to use the name! The implementation often starts at an offset + * into the buffer, and may leave 0 bytes at the start. + * + * "buflen" should be positive. + */ +char *d_path(const struct path *path, char *buf, int buflen) +{ + char *res = buf + buflen; + struct path root; + int error; + + /* + * We have various synthetic filesystems that never get mounted. On + * these filesystems dentries are never used for lookup purposes, and + * thus don't need to be hashed. They also don't need a name until a + * user wants to identify the object in /proc/pid/fd/. The little hack + * below allows us to generate a name for these objects on demand: + * + * Some pseudo inodes are mountable. When they are mounted + * path->dentry == path->mnt->mnt_root. In that case don't call d_dname + * and instead have d_path return the mounted path. + */ + if (path->dentry->d_op && path->dentry->d_op->d_dname && + (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root)) + return path->dentry->d_op->d_dname(path->dentry, buf, buflen); + + rcu_read_lock(); + get_fs_root_rcu(current->fs, &root); + error = path_with_deleted(path, &root, &res, &buflen); + rcu_read_unlock(); + + if (error < 0) + res = ERR_PTR(error); + return res; +} +EXPORT_SYMBOL(d_path); + +/* + * Helper function for dentry_operations.d_dname() members + */ +char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen, + const char *fmt, ...) +{ + va_list args; + char temp[64]; + int sz; + + va_start(args, fmt); + sz = vsnprintf(temp, sizeof(temp), fmt, args) + 1; + va_end(args); + + if (sz > sizeof(temp) || sz > buflen) + return ERR_PTR(-ENAMETOOLONG); + + buffer += buflen - sz; + return memcpy(buffer, temp, sz); +} + +char *simple_dname(struct dentry *dentry, char *buffer, int buflen) +{ + char *end = buffer + buflen; + /* these dentries are never renamed, so d_lock is not needed */ + if (prepend(&end, &buflen, " (deleted)", 11) || + prepend(&end, &buflen, dentry->d_name.name, dentry->d_name.len) || + prepend(&end, &buflen, "/", 1)) + end = ERR_PTR(-ENAMETOOLONG); + return end; +} +EXPORT_SYMBOL(simple_dname); + +/* + * Write full pathname from the root of the filesystem into the buffer. 
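+ *
+ * Unlike prepend_path() this walks d_parent only and never crosses a
+ * mount point, so the result is relative to the root of the dentry's
+ * own filesystem: a dentry three levels deep comes out as "/a/b/c" no
+ * matter where that filesystem is mounted.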
+ */ +static char *__dentry_path(struct dentry *d, char *buf, int buflen) +{ + struct dentry *dentry; + char *end, *retval; + int len, seq = 0; + int error = 0; + + if (buflen < 2) + goto Elong; + + rcu_read_lock(); +restart: + dentry = d; + end = buf + buflen; + len = buflen; + prepend(&end, &len, "\0", 1); + /* Get '/' right */ + retval = end-1; + *retval = '/'; + read_seqbegin_or_lock(&rename_lock, &seq); + while (!IS_ROOT(dentry)) { + struct dentry *parent = dentry->d_parent; + + prefetch(parent); + error = prepend_name(&end, &len, &dentry->d_name); + if (error) + break; + + retval = end; + dentry = parent; + } + if (!(seq & 1)) + rcu_read_unlock(); + if (need_seqretry(&rename_lock, seq)) { + seq = 1; + goto restart; + } + done_seqretry(&rename_lock, seq); + if (error) + goto Elong; + return retval; +Elong: + return ERR_PTR(-ENAMETOOLONG); +} + +char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen) +{ + return __dentry_path(dentry, buf, buflen); +} +EXPORT_SYMBOL(dentry_path_raw); + +char *dentry_path(struct dentry *dentry, char *buf, int buflen) +{ + char *p = NULL; + char *retval; + + if (d_unlinked(dentry)) { + p = buf + buflen; + if (prepend(&p, &buflen, "//deleted", 10) != 0) + goto Elong; + buflen++; + } + retval = __dentry_path(dentry, buf, buflen); + if (!IS_ERR(retval) && p) + *p = '/'; /* restore '/' overridden with '\0' */ + return retval; +Elong: + return ERR_PTR(-ENAMETOOLONG); +} + +static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root, + struct path *pwd) +{ + unsigned seq; + + do { + seq = read_seqcount_begin(&fs->seq); + *root = fs->root; + *pwd = fs->pwd; + } while (read_seqcount_retry(&fs->seq, seq)); +} + +/* + * NOTE! The user-level library version returns a + * character pointer. The kernel system call just + * returns the length of the buffer filled (which + * includes the ending '\0' character), or a negative + * error value. So libc would do something like + * + * char *getcwd(char * buf, size_t size) + * { + * int retval; + * + * retval = sys_getcwd(buf, size); + * if (retval >= 0) + * return buf; + * errno = -retval; + * return NULL; + * } + */ +SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) +{ + int error; + struct path pwd, root; + char *page = __getname(); + + if (!page) + return -ENOMEM; + + rcu_read_lock(); + get_fs_root_and_pwd_rcu(current->fs, &root, &pwd); + + error = -ENOENT; + if (!d_unlinked(pwd.dentry)) { + unsigned long len; + char *cwd = page + PATH_MAX; + int buflen = PATH_MAX; + + prepend(&cwd, &buflen, "\0", 1); + error = prepend_path(&pwd, &root, &cwd, &buflen); + rcu_read_unlock(); + + if (error < 0) + goto out; + + /* Unreachable from current root */ + if (error > 0) { + error = prepend_unreachable(&cwd, &buflen); + if (error) + goto out; + } + + error = -ERANGE; + len = PATH_MAX + page - cwd; + if (len <= size) { + error = len; + if (copy_to_user(buf, cwd, len)) + error = -EFAULT; + } + } else { + rcu_read_unlock(); + } + +out: + __putname(page); + return error; +} diff --git a/fs/dcache.c b/fs/dcache.c index 7c38f39958bc..593079176123 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -14,7 +14,7 @@ * the dcache entry is deleted or garbage collected.
*/ -#include <linux/syscalls.h> +#include <linux/ratelimit.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/fs.h> @@ -24,18 +24,11 @@ #include <linux/hash.h> #include <linux/cache.h> #include <linux/export.h> -#include <linux/mount.h> -#include <linux/file.h> -#include <linux/uaccess.h> #include <linux/security.h> #include <linux/seqlock.h> -#include <linux/swap.h> #include <linux/bootmem.h> -#include <linux/fs_struct.h> #include <linux/bit_spinlock.h> #include <linux/rculist_bl.h> -#include <linux/prefetch.h> -#include <linux/ratelimit.h> #include <linux/list_lru.h> #include "internal.h" #include "mount.h" @@ -74,9 +67,7 @@ * dentry->d_lock * * If no ancestor relationship: - * if (dentry1 < dentry2) - * dentry1->d_lock - * dentry2->d_lock + * arbitrary, since it's serialized on rename_lock */ int sysctl_vfs_cache_pressure __read_mostly = 100; EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); @@ -440,17 +431,6 @@ static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, list_lru_isolate_move(lru, &dentry->d_lru, list); } -/* - * dentry_lru_(add|del)_list) must be called with d_lock held. - */ -static void dentry_lru_add(struct dentry *dentry) -{ - if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) - d_lru_add(dentry); - else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED))) - dentry->d_flags |= DCACHE_REFERENCED; -} - /** * d_drop - drop a dentry * @dentry: dentry to drop @@ -470,30 +450,29 @@ static void dentry_lru_add(struct dentry *dentry) */ static void ___d_drop(struct dentry *dentry) { - if (!d_unhashed(dentry)) { - struct hlist_bl_head *b; - /* - * Hashed dentries are normally on the dentry hashtable, - * with the exception of those newly allocated by - * d_obtain_root, which are always IS_ROOT: - */ - if (unlikely(IS_ROOT(dentry))) - b = &dentry->d_sb->s_roots; - else - b = d_hash(dentry->d_name.hash); + struct hlist_bl_head *b; + /* + * Hashed dentries are normally on the dentry hashtable, + * with the exception of those newly allocated by + * d_obtain_root, which are always IS_ROOT: + */ + if (unlikely(IS_ROOT(dentry))) + b = &dentry->d_sb->s_roots; + else + b = d_hash(dentry->d_name.hash); - hlist_bl_lock(b); - __hlist_bl_del(&dentry->d_hash); - hlist_bl_unlock(b); - /* After this call, in-progress rcu-walk path lookup will fail. */ - write_seqcount_invalidate(&dentry->d_seq); - } + hlist_bl_lock(b); + __hlist_bl_del(&dentry->d_hash); + hlist_bl_unlock(b); } void __d_drop(struct dentry *dentry) { - ___d_drop(dentry); - dentry->d_hash.pprev = NULL; + if (!d_unhashed(dentry)) { + ___d_drop(dentry); + dentry->d_hash.pprev = NULL; + write_seqcount_invalidate(&dentry->d_seq); + } } EXPORT_SYMBOL(__d_drop); @@ -589,47 +568,9 @@ static void __dentry_kill(struct dentry *dentry) dentry_free(dentry); } -/* - * Finish off a dentry we've decided to kill. - * dentry->d_lock must be held, returns with it unlocked. - * If ref is non-zero, then decrement the refcount too. - * Returns dentry requiring refcount drop, or NULL if we're done. 
- */ -static struct dentry *dentry_kill(struct dentry *dentry) - __releases(dentry->d_lock) -{ - struct inode *inode = dentry->d_inode; - struct dentry *parent = NULL; - - if (inode && unlikely(!spin_trylock(&inode->i_lock))) - goto failed; - - if (!IS_ROOT(dentry)) { - parent = dentry->d_parent; - if (unlikely(!spin_trylock(&parent->d_lock))) { - if (inode) - spin_unlock(&inode->i_lock); - goto failed; - } - } - - __dentry_kill(dentry); - return parent; - -failed: - spin_unlock(&dentry->d_lock); - return dentry; /* try again with same dentry */ -} - -static inline struct dentry *lock_parent(struct dentry *dentry) +static struct dentry *__lock_parent(struct dentry *dentry) { - struct dentry *parent = dentry->d_parent; - if (IS_ROOT(dentry)) - return NULL; - if (unlikely(dentry->d_lockref.count < 0)) - return NULL; - if (likely(spin_trylock(&parent->d_lock))) - return parent; + struct dentry *parent; rcu_read_lock(); spin_unlock(&dentry->d_lock); again: @@ -655,6 +596,91 @@ again: return parent; } +static inline struct dentry *lock_parent(struct dentry *dentry) +{ + struct dentry *parent = dentry->d_parent; + if (IS_ROOT(dentry)) + return NULL; + if (likely(spin_trylock(&parent->d_lock))) + return parent; + return __lock_parent(dentry); +} + +static inline bool retain_dentry(struct dentry *dentry) +{ + WARN_ON(d_in_lookup(dentry)); + + /* Unreachable? Get rid of it */ + if (unlikely(d_unhashed(dentry))) + return false; + + if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) + return false; + + if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) { + if (dentry->d_op->d_delete(dentry)) + return false; + } + /* retain; LRU fodder */ + dentry->d_lockref.count--; + if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) + d_lru_add(dentry); + else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED))) + dentry->d_flags |= DCACHE_REFERENCED; + return true; +} + +/* + * Finish off a dentry we've decided to kill. + * dentry->d_lock must be held, returns with it unlocked. + * Returns dentry requiring refcount drop, or NULL if we're done. + */ +static struct dentry *dentry_kill(struct dentry *dentry) + __releases(dentry->d_lock) +{ + struct inode *inode = dentry->d_inode; + struct dentry *parent = NULL; + + if (inode && unlikely(!spin_trylock(&inode->i_lock))) + goto slow_positive; + + if (!IS_ROOT(dentry)) { + parent = dentry->d_parent; + if (unlikely(!spin_trylock(&parent->d_lock))) { + parent = __lock_parent(dentry); + if (likely(inode || !dentry->d_inode)) + goto got_locks; + /* negative that became positive */ + if (parent) + spin_unlock(&parent->d_lock); + inode = dentry->d_inode; + goto slow_positive; + } + } + __dentry_kill(dentry); + return parent; + +slow_positive: + spin_unlock(&dentry->d_lock); + spin_lock(&inode->i_lock); + spin_lock(&dentry->d_lock); + parent = lock_parent(dentry); +got_locks: + if (unlikely(dentry->d_lockref.count != 1)) { + dentry->d_lockref.count--; + } else if (likely(!retain_dentry(dentry))) { + __dentry_kill(dentry); + return parent; + } + /* we are keeping it, after all */ + if (inode) + spin_unlock(&inode->i_lock); + if (parent) + spin_unlock(&parent->d_lock); + spin_unlock(&dentry->d_lock); + return NULL; +} + /* * Try to do a lockless dput(), and return whether that was successful. * @@ -802,27 +828,11 @@ repeat: /* Slow case: now with the dentry lock held */ rcu_read_unlock(); - WARN_ON(d_in_lookup(dentry)); - - /* Unreachable? 
Get rid of it */ - if (unlikely(d_unhashed(dentry))) - goto kill_it; - - if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) - goto kill_it; - - if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) { - if (dentry->d_op->d_delete(dentry)) - goto kill_it; + if (likely(retain_dentry(dentry))) { + spin_unlock(&dentry->d_lock); + return; } - dentry_lru_add(dentry); - - dentry->d_lockref.count--; - spin_unlock(&dentry->d_lock); - return; - -kill_it: dentry = dentry_kill(dentry); if (dentry) { cond_resched(); @@ -971,56 +981,83 @@ restart: } EXPORT_SYMBOL(d_prune_aliases); -static void shrink_dentry_list(struct list_head *list) +/* + * Lock a dentry from shrink list. + * Called under rcu_read_lock() and dentry->d_lock; the former + * guarantees that nothing we access will be freed under us. + * Note that dentry is *not* protected from concurrent dentry_kill(), + * d_delete(), etc. + * + * Return false if dentry has been disrupted or grabbed, leaving + * the caller to kick it off-list. Otherwise, return true and have + * that dentry's inode and parent both locked. + */ +static bool shrink_lock_dentry(struct dentry *dentry) { - struct dentry *dentry, *parent; + struct inode *inode; + struct dentry *parent; - while (!list_empty(list)) { - struct inode *inode; - dentry = list_entry(list->prev, struct dentry, d_lru); + if (dentry->d_lockref.count) + return false; + + inode = dentry->d_inode; + if (inode && unlikely(!spin_trylock(&inode->i_lock))) { + spin_unlock(&dentry->d_lock); + spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); - parent = lock_parent(dentry); + if (unlikely(dentry->d_lockref.count)) + goto out; + /* changed inode means that somebody had grabbed it */ + if (unlikely(inode != dentry->d_inode)) + goto out; + } - /* - * The dispose list is isolated and dentries are not accounted - * to the LRU here, so we can simply remove it from the list - * here regardless of whether it is referenced or not. - */ - d_shrink_del(dentry); + parent = dentry->d_parent; + if (IS_ROOT(dentry) || likely(spin_trylock(&parent->d_lock))) + return true; - /* - * We found an inuse dentry which was not removed from - * the LRU because of laziness during lookup. Do not free it. 
- */ - if (dentry->d_lockref.count > 0) { - spin_unlock(&dentry->d_lock); - if (parent) - spin_unlock(&parent->d_lock); - continue; - } + spin_unlock(&dentry->d_lock); + spin_lock(&parent->d_lock); + if (unlikely(parent != dentry->d_parent)) { + spin_unlock(&parent->d_lock); + spin_lock(&dentry->d_lock); + goto out; + } + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + if (likely(!dentry->d_lockref.count)) + return true; + spin_unlock(&parent->d_lock); +out: + if (inode) + spin_unlock(&inode->i_lock); + return false; +} +static void shrink_dentry_list(struct list_head *list) +{ + while (!list_empty(list)) { + struct dentry *dentry, *parent; - if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) { - bool can_free = dentry->d_flags & DCACHE_MAY_FREE; + dentry = list_entry(list->prev, struct dentry, d_lru); + spin_lock(&dentry->d_lock); + rcu_read_lock(); + if (!shrink_lock_dentry(dentry)) { + bool can_free = false; + rcu_read_unlock(); + d_shrink_del(dentry); + if (dentry->d_lockref.count < 0) + can_free = dentry->d_flags & DCACHE_MAY_FREE; spin_unlock(&dentry->d_lock); - if (parent) - spin_unlock(&parent->d_lock); if (can_free) dentry_free(dentry); continue; } - - inode = dentry->d_inode; - if (inode && unlikely(!spin_trylock(&inode->i_lock))) { - d_shrink_add(dentry, list); - spin_unlock(&dentry->d_lock); - if (parent) - spin_unlock(&parent->d_lock); - continue; - } - + rcu_read_unlock(); + d_shrink_del(dentry); + parent = dentry->d_parent; __dentry_kill(dentry); - + if (parent == dentry) + continue; /* * We need to prune ancestors too. This is necessary to prevent * quadratic behavior of shrink_dcache_parent(), but is also @@ -1028,26 +1065,8 @@ static void shrink_dentry_list(struct list_head *list) * fragmentation. */ dentry = parent; - while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) { - parent = lock_parent(dentry); - if (dentry->d_lockref.count != 1) { - dentry->d_lockref.count--; - spin_unlock(&dentry->d_lock); - if (parent) - spin_unlock(&parent->d_lock); - break; - } - inode = dentry->d_inode; /* can't be NULL */ - if (unlikely(!spin_trylock(&inode->i_lock))) { - spin_unlock(&dentry->d_lock); - if (parent) - spin_unlock(&parent->d_lock); - cpu_relax(); - continue; - } - __dentry_kill(dentry); - dentry = parent; - } + while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) + dentry = dentry_kill(dentry); } } @@ -2374,32 +2393,22 @@ EXPORT_SYMBOL(d_hash_and_lookup); void d_delete(struct dentry * dentry) { - struct inode *inode; - int isdir = 0; + struct inode *inode = dentry->d_inode; + int isdir = d_is_dir(dentry); + + spin_lock(&inode->i_lock); + spin_lock(&dentry->d_lock); /* * Are we the only user? 
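+ * (i.e. is d_lockref.count == 1, now checked with both i_lock and
+ * d_lock held)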
*/ -again: - spin_lock(&dentry->d_lock); - inode = dentry->d_inode; - isdir = S_ISDIR(inode->i_mode); if (dentry->d_lockref.count == 1) { - if (!spin_trylock(&inode->i_lock)) { - spin_unlock(&dentry->d_lock); - cpu_relax(); - goto again; - } dentry->d_flags &= ~DCACHE_CANT_MOUNT; dentry_unlink_inode(dentry); - fsnotify_nameremove(dentry, isdir); - return; - } - - if (!d_unhashed(dentry)) + } else { __d_drop(dentry); - - spin_unlock(&dentry->d_lock); - + spin_unlock(&dentry->d_lock); + spin_unlock(&inode->i_lock); + } fsnotify_nameremove(dentry, isdir); } EXPORT_SYMBOL(d_delete); @@ -2474,7 +2483,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent, retry: rcu_read_lock(); - seq = smp_load_acquire(&parent->d_inode->i_dir_seq) & ~1; + seq = smp_load_acquire(&parent->d_inode->i_dir_seq); r_seq = read_seqbegin(&rename_lock); dentry = __d_lookup_rcu(parent, name, &d_seq); if (unlikely(dentry)) { @@ -2495,8 +2504,14 @@ retry: rcu_read_unlock(); goto retry; } + + if (unlikely(seq & 1)) { + rcu_read_unlock(); + goto retry; + } + hlist_bl_lock(b); - if (unlikely(parent->d_inode->i_dir_seq != seq)) { + if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) { hlist_bl_unlock(b); rcu_read_unlock(); goto retry; @@ -2758,57 +2773,6 @@ static void copy_name(struct dentry *dentry, struct dentry *target) kfree_rcu(old_name, u.head); } -static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target) -{ - /* - * XXXX: do we really need to take target->d_lock? - */ - if (IS_ROOT(dentry) || dentry->d_parent == target->d_parent) - spin_lock(&target->d_parent->d_lock); - else { - if (d_ancestor(dentry->d_parent, target->d_parent)) { - spin_lock(&dentry->d_parent->d_lock); - spin_lock_nested(&target->d_parent->d_lock, - DENTRY_D_LOCK_NESTED); - } else { - spin_lock(&target->d_parent->d_lock); - spin_lock_nested(&dentry->d_parent->d_lock, - DENTRY_D_LOCK_NESTED); - } - } - if (target < dentry) { - spin_lock_nested(&target->d_lock, 2); - spin_lock_nested(&dentry->d_lock, 3); - } else { - spin_lock_nested(&dentry->d_lock, 2); - spin_lock_nested(&target->d_lock, 3); - } -} - -static void dentry_unlock_for_move(struct dentry *dentry, struct dentry *target) -{ - if (target->d_parent != dentry->d_parent) - spin_unlock(&dentry->d_parent->d_lock); - if (target->d_parent != target) - spin_unlock(&target->d_parent->d_lock); - spin_unlock(&target->d_lock); - spin_unlock(&dentry->d_lock); -} - -/* - * When switching names, the actual string doesn't strictly have to - * be preserved in the target - because we're dropping the target - * anyway. As such, we can just do a simple memcpy() to copy over - * the new name before we switch, unless we are going to rehash - * it. Note that if we *do* unhash the target, we are not allowed - * to rehash it without giving it a new name/hash key - whether - * we swap or overwrite the names here, resulting name won't match - * the reality in filesystem; it's only there for d_path() purposes. - * Note that all of this is happening under rename_lock, so the - * any hash lookup seeing it in the middle of manipulations will - * be discarded anyway. So we do not care what happens to the hash - * key in that case. 
- */ /* * __d_move - move a dentry * @dentry: entry to move @@ -2823,15 +2787,34 @@ static void dentry_unlock_for_move(struct dentry *dentry, struct dentry *target) static void __d_move(struct dentry *dentry, struct dentry *target, bool exchange) { + struct dentry *old_parent, *p; struct inode *dir = NULL; unsigned n; - if (!dentry->d_inode) - printk(KERN_WARNING "VFS: moving negative dcache entry\n"); - BUG_ON(d_ancestor(dentry, target)); + WARN_ON(!dentry->d_inode); + if (WARN_ON(dentry == target)) + return; + BUG_ON(d_ancestor(target, dentry)); + old_parent = dentry->d_parent; + p = d_ancestor(old_parent, target); + if (IS_ROOT(dentry)) { + BUG_ON(p); + spin_lock(&target->d_parent->d_lock); + } else if (!p) { + /* target is not a descendent of dentry->d_parent */ + spin_lock(&target->d_parent->d_lock); + spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED); + } else { + BUG_ON(p == dentry); + spin_lock(&old_parent->d_lock); + if (p != target) + spin_lock_nested(&target->d_parent->d_lock, + DENTRY_D_LOCK_NESTED); + } + spin_lock_nested(&dentry->d_lock, 2); + spin_lock_nested(&target->d_lock, 3); - dentry_lock_for_move(dentry, target); if (unlikely(d_in_lookup(target))) { dir = target->d_parent->d_inode; n = start_dir_add(dir); @@ -2842,47 +2825,44 @@ static void __d_move(struct dentry *dentry, struct dentry *target, write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED); /* unhash both */ - /* ___d_drop does write_seqcount_barrier, but they're OK to nest. */ - ___d_drop(dentry); - ___d_drop(target); + if (!d_unhashed(dentry)) + ___d_drop(dentry); + if (!d_unhashed(target)) + ___d_drop(target); - /* Switch the names.. */ - if (exchange) - swap_names(dentry, target); - else + /* ... and switch them in the tree */ + dentry->d_parent = target->d_parent; + if (!exchange) { copy_name(dentry, target); - - /* rehash in new place(s) */ - __d_rehash(dentry); - if (exchange) - __d_rehash(target); - else target->d_hash.pprev = NULL; - - /* ... 
and switch them in the tree */ - if (IS_ROOT(dentry)) { - /* splicing a tree */ - dentry->d_flags |= DCACHE_RCUACCESS; - dentry->d_parent = target->d_parent; - target->d_parent = target; - list_del_init(&target->d_child); - list_move(&dentry->d_child, &dentry->d_parent->d_subdirs); + dentry->d_parent->d_lockref.count++; + if (dentry == old_parent) + dentry->d_flags |= DCACHE_RCUACCESS; + else + WARN_ON(!--old_parent->d_lockref.count); } else { - /* swapping two dentries */ - swap(dentry->d_parent, target->d_parent); + target->d_parent = old_parent; + swap_names(dentry, target); list_move(&target->d_child, &target->d_parent->d_subdirs); - list_move(&dentry->d_child, &dentry->d_parent->d_subdirs); - if (exchange) - fsnotify_update_flags(target); - fsnotify_update_flags(dentry); + __d_rehash(target); + fsnotify_update_flags(target); } + list_move(&dentry->d_child, &dentry->d_parent->d_subdirs); + __d_rehash(dentry); + fsnotify_update_flags(dentry); write_seqcount_end(&target->d_seq); write_seqcount_end(&dentry->d_seq); if (dir) end_dir_add(dir, n); - dentry_unlock_for_move(dentry, target); + + if (dentry->d_parent != old_parent) + spin_unlock(&dentry->d_parent->d_lock); + if (dentry != old_parent) + spin_unlock(&old_parent->d_lock); + spin_unlock(&target->d_lock); + spin_unlock(&dentry->d_lock); } /* @@ -3030,12 +3010,14 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) inode->i_sb->s_type->name, inode->i_sb->s_id); } else if (!IS_ROOT(new)) { + struct dentry *old_parent = dget(new->d_parent); int err = __d_unalias(inode, dentry, new); write_sequnlock(&rename_lock); if (err) { dput(new); new = ERR_PTR(err); } + dput(old_parent); } else { __d_move(new, dentry, false); write_sequnlock(&rename_lock); @@ -3050,467 +3032,6 @@ out: } EXPORT_SYMBOL(d_splice_alias); -static int prepend(char **buffer, int *buflen, const char *str, int namelen) -{ - *buflen -= namelen; - if (*buflen < 0) - return -ENAMETOOLONG; - *buffer -= namelen; - memcpy(*buffer, str, namelen); - return 0; -} - -/** - * prepend_name - prepend a pathname in front of current buffer pointer - * @buffer: buffer pointer - * @buflen: allocated length of the buffer - * @name: name string and length qstr structure - * - * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to - * make sure that either the old or the new name pointer and length are - * fetched. However, there may be mismatch between length and pointer. - * The length cannot be trusted, we need to copy it byte-by-byte until - * the length is reached or a null byte is found. It also prepends "/" at - * the beginning of the name. The sequence number check at the caller will - * retry it again when a d_move() does happen. So any garbage in the buffer - * due to mismatched pointer and length will be discarded. - * - * Load acquire is needed to make sure that we see that terminating NUL. 
- */ -static int prepend_name(char **buffer, int *buflen, const struct qstr *name) -{ - const char *dname = smp_load_acquire(&name->name); /* ^^^ */ - u32 dlen = READ_ONCE(name->len); - char *p; - - *buflen -= dlen + 1; - if (*buflen < 0) - return -ENAMETOOLONG; - p = *buffer -= dlen + 1; - *p++ = '/'; - while (dlen--) { - char c = *dname++; - if (!c) - break; - *p++ = c; - } - return 0; -} - -/** - * prepend_path - Prepend path string to a buffer - * @path: the dentry/vfsmount to report - * @root: root vfsmnt/dentry - * @buffer: pointer to the end of the buffer - * @buflen: pointer to buffer length - * - * The function will first try to write out the pathname without taking any - * lock other than the RCU read lock to make sure that dentries won't go away. - * It only checks the sequence number of the global rename_lock as any change - * in the dentry's d_seq will be preceded by changes in the rename_lock - * sequence number. If the sequence number had been changed, it will restart - * the whole pathname back-tracing sequence again by taking the rename_lock. - * In this case, there is no need to take the RCU read lock as the recursive - * parent pointer references will keep the dentry chain alive as long as no - * rename operation is performed. - */ -static int prepend_path(const struct path *path, - const struct path *root, - char **buffer, int *buflen) -{ - struct dentry *dentry; - struct vfsmount *vfsmnt; - struct mount *mnt; - int error = 0; - unsigned seq, m_seq = 0; - char *bptr; - int blen; - - rcu_read_lock(); -restart_mnt: - read_seqbegin_or_lock(&mount_lock, &m_seq); - seq = 0; - rcu_read_lock(); -restart: - bptr = *buffer; - blen = *buflen; - error = 0; - dentry = path->dentry; - vfsmnt = path->mnt; - mnt = real_mount(vfsmnt); - read_seqbegin_or_lock(&rename_lock, &seq); - while (dentry != root->dentry || vfsmnt != root->mnt) { - struct dentry * parent; - - if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { - struct mount *parent = READ_ONCE(mnt->mnt_parent); - /* Escaped? */ - if (dentry != vfsmnt->mnt_root) { - bptr = *buffer; - blen = *buflen; - error = 3; - break; - } - /* Global root? */ - if (mnt != parent) { - dentry = READ_ONCE(mnt->mnt_mountpoint); - mnt = parent; - vfsmnt = &mnt->mnt; - continue; - } - if (!error) - error = is_mounted(vfsmnt) ? 1 : 2; - break; - } - parent = dentry->d_parent; - prefetch(parent); - error = prepend_name(&bptr, &blen, &dentry->d_name); - if (error) - break; - - dentry = parent; - } - if (!(seq & 1)) - rcu_read_unlock(); - if (need_seqretry(&rename_lock, seq)) { - seq = 1; - goto restart; - } - done_seqretry(&rename_lock, seq); - - if (!(m_seq & 1)) - rcu_read_unlock(); - if (need_seqretry(&mount_lock, m_seq)) { - m_seq = 1; - goto restart_mnt; - } - done_seqretry(&mount_lock, m_seq); - - if (error >= 0 && bptr == *buffer) { - if (--blen < 0) - error = -ENAMETOOLONG; - else - *--bptr = '/'; - } - *buffer = bptr; - *buflen = blen; - return error; -} - -/** - * __d_path - return the path of a dentry - * @path: the dentry/vfsmount to report - * @root: root vfsmnt/dentry - * @buf: buffer to return value in - * @buflen: buffer length - * - * Convert a dentry into an ASCII path name. - * - * Returns a pointer into the buffer or an error code if the - * path was too long. - * - * "buflen" should be positive. - * - * If the path is not reachable from the supplied root, return %NULL. 
- */ -char *__d_path(const struct path *path, - const struct path *root, - char *buf, int buflen) -{ - char *res = buf + buflen; - int error; - - prepend(&res, &buflen, "\0", 1); - error = prepend_path(path, root, &res, &buflen); - - if (error < 0) - return ERR_PTR(error); - if (error > 0) - return NULL; - return res; -} - -char *d_absolute_path(const struct path *path, - char *buf, int buflen) -{ - struct path root = {}; - char *res = buf + buflen; - int error; - - prepend(&res, &buflen, "\0", 1); - error = prepend_path(path, &root, &res, &buflen); - - if (error > 1) - error = -EINVAL; - if (error < 0) - return ERR_PTR(error); - return res; -} - -/* - * same as __d_path but appends "(deleted)" for unlinked files. - */ -static int path_with_deleted(const struct path *path, - const struct path *root, - char **buf, int *buflen) -{ - prepend(buf, buflen, "\0", 1); - if (d_unlinked(path->dentry)) { - int error = prepend(buf, buflen, " (deleted)", 10); - if (error) - return error; - } - - return prepend_path(path, root, buf, buflen); -} - -static int prepend_unreachable(char **buffer, int *buflen) -{ - return prepend(buffer, buflen, "(unreachable)", 13); -} - -static void get_fs_root_rcu(struct fs_struct *fs, struct path *root) -{ - unsigned seq; - - do { - seq = read_seqcount_begin(&fs->seq); - *root = fs->root; - } while (read_seqcount_retry(&fs->seq, seq)); -} - -/** - * d_path - return the path of a dentry - * @path: path to report - * @buf: buffer to return value in - * @buflen: buffer length - * - * Convert a dentry into an ASCII path name. If the entry has been deleted - * the string " (deleted)" is appended. Note that this is ambiguous. - * - * Returns a pointer into the buffer or an error code if the path was - * too long. Note: Callers should use the returned pointer, not the passed - * in buffer, to use the name! The implementation often starts at an offset - * into the buffer, and may leave 0 bytes at the start. - * - * "buflen" should be positive. - */ -char *d_path(const struct path *path, char *buf, int buflen) -{ - char *res = buf + buflen; - struct path root; - int error; - - /* - * We have various synthetic filesystems that never get mounted. On - * these filesystems dentries are never used for lookup purposes, and - * thus don't need to be hashed. They also don't need a name until a - * user wants to identify the object in /proc/pid/fd/. The little hack - * below allows us to generate a name for these objects on demand: - * - * Some pseudo inodes are mountable. When they are mounted - * path->dentry == path->mnt->mnt_root. In that case don't call d_dname - * and instead have d_path return the mounted path. - */ - if (path->dentry->d_op && path->dentry->d_op->d_dname && - (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root)) - return path->dentry->d_op->d_dname(path->dentry, buf, buflen); - - rcu_read_lock(); - get_fs_root_rcu(current->fs, &root); - error = path_with_deleted(path, &root, &res, &buflen); - rcu_read_unlock(); - - if (error < 0) - res = ERR_PTR(error); - return res; -} -EXPORT_SYMBOL(d_path); - -/* - * Helper function for dentry_operations.d_dname() members - */ -char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen, - const char *fmt, ...) 
-{ - va_list args; - char temp[64]; - int sz; - - va_start(args, fmt); - sz = vsnprintf(temp, sizeof(temp), fmt, args) + 1; - va_end(args); - - if (sz > sizeof(temp) || sz > buflen) - return ERR_PTR(-ENAMETOOLONG); - - buffer += buflen - sz; - return memcpy(buffer, temp, sz); -} - -char *simple_dname(struct dentry *dentry, char *buffer, int buflen) -{ - char *end = buffer + buflen; - /* these dentries are never renamed, so d_lock is not needed */ - if (prepend(&end, &buflen, " (deleted)", 11) || - prepend(&end, &buflen, dentry->d_name.name, dentry->d_name.len) || - prepend(&end, &buflen, "/", 1)) - end = ERR_PTR(-ENAMETOOLONG); - return end; -} -EXPORT_SYMBOL(simple_dname); - -/* - * Write full pathname from the root of the filesystem into the buffer. - */ -static char *__dentry_path(struct dentry *d, char *buf, int buflen) -{ - struct dentry *dentry; - char *end, *retval; - int len, seq = 0; - int error = 0; - - if (buflen < 2) - goto Elong; - - rcu_read_lock(); -restart: - dentry = d; - end = buf + buflen; - len = buflen; - prepend(&end, &len, "\0", 1); - /* Get '/' right */ - retval = end-1; - *retval = '/'; - read_seqbegin_or_lock(&rename_lock, &seq); - while (!IS_ROOT(dentry)) { - struct dentry *parent = dentry->d_parent; - - prefetch(parent); - error = prepend_name(&end, &len, &dentry->d_name); - if (error) - break; - - retval = end; - dentry = parent; - } - if (!(seq & 1)) - rcu_read_unlock(); - if (need_seqretry(&rename_lock, seq)) { - seq = 1; - goto restart; - } - done_seqretry(&rename_lock, seq); - if (error) - goto Elong; - return retval; -Elong: - return ERR_PTR(-ENAMETOOLONG); -} - -char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen) -{ - return __dentry_path(dentry, buf, buflen); -} -EXPORT_SYMBOL(dentry_path_raw); - -char *dentry_path(struct dentry *dentry, char *buf, int buflen) -{ - char *p = NULL; - char *retval; - - if (d_unlinked(dentry)) { - p = buf + buflen; - if (prepend(&p, &buflen, "//deleted", 10) != 0) - goto Elong; - buflen++; - } - retval = __dentry_path(dentry, buf, buflen); - if (!IS_ERR(retval) && p) - *p = '/'; /* restore '/' overridden with '\0' */ - return retval; -Elong: - return ERR_PTR(-ENAMETOOLONG); -} - -static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root, - struct path *pwd) -{ - unsigned seq; - - do { - seq = read_seqcount_begin(&fs->seq); - *root = fs->root; - *pwd = fs->pwd; - } while (read_seqcount_retry(&fs->seq, seq)); -} - -/* - * NOTE! The user-level library version returns a - * character pointer. The kernel system call just - * returns the length of the buffer filled (which - * includes the ending '\0' character), or a negative - * error value.
So libc would do something like - * - * char *getcwd(char * buf, size_t size) - * { - * int retval; - * - * retval = sys_getcwd(buf, size); - * if (retval >= 0) - * return buf; - * errno = -retval; - * return NULL; - * } - */ -SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) -{ - int error; - struct path pwd, root; - char *page = __getname(); - - if (!page) - return -ENOMEM; - - rcu_read_lock(); - get_fs_root_and_pwd_rcu(current->fs, &root, &pwd); - - error = -ENOENT; - if (!d_unlinked(pwd.dentry)) { - unsigned long len; - char *cwd = page + PATH_MAX; - int buflen = PATH_MAX; - - prepend(&cwd, &buflen, "\0", 1); - error = prepend_path(&pwd, &root, &cwd, &buflen); - rcu_read_unlock(); - - if (error < 0) - goto out; - - /* Unreachable from current root */ - if (error > 0) { - error = prepend_unreachable(&cwd, &buflen); - if (error) - goto out; - } - - error = -ERANGE; - len = PATH_MAX + page - cwd; - if (len <= size) { - error = len; - if (copy_to_user(buf, cwd, len)) - error = -EFAULT; - } - } else { - rcu_read_unlock(); - } - -out: - __putname(page); - return error; -} - /* * Test whether new_dentry is a subdirectory of old_dentry. * @@ -3574,6 +3095,8 @@ void d_genocide(struct dentry *parent) d_walk(parent, parent, d_genocide_kill, NULL); } +EXPORT_SYMBOL(d_genocide); + void d_tmpfile(struct dentry *dentry, struct inode *inode) { inode_dec_link_count(inode); @@ -3653,8 +3176,6 @@ static void __init dcache_init(void) struct kmem_cache *names_cachep __read_mostly; EXPORT_SYMBOL(names_cachep); -EXPORT_SYMBOL(d_genocide); - void __init vfs_caches_init_early(void) { int i; diff --git a/fs/dcookies.c b/fs/dcookies.c index 0d0461cf2431..57bc96435feb 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -146,7 +146,7 @@ out: /* And here is where the userspace process can look up the cookie value * to retrieve the path. */ -SYSCALL_DEFINE3(lookup_dcookie, u64, cookie64, char __user *, buf, size_t, len) +static int do_lookup_dcookie(u64 cookie64, char __user *buf, size_t len) { unsigned long cookie = (unsigned long)cookie64; int err = -EINVAL; @@ -203,13 +203,18 @@ out: return err; } +SYSCALL_DEFINE3(lookup_dcookie, u64, cookie64, char __user *, buf, size_t, len) +{ + return do_lookup_dcookie(cookie64, buf, len); +} + #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, compat_size_t, len) { #ifdef __BIG_ENDIAN - return sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len); + return do_lookup_dcookie(((u64)w0 << 32) | w1, buf, len); #else - return sys_lookup_dcookie(((u64)w1 << 32) | w0, buf, len); + return do_lookup_dcookie(((u64)w1 << 32) | w0, buf, len); #endif } #endif diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 63a998c3f252..13b01351dd1c 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -270,10 +270,7 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent) if (!parent) parent = debugfs_mount->mnt_root; - inode_lock(d_inode(parent)); - dentry = lookup_one_len(name, parent, strlen(name)); - inode_unlock(d_inode(parent)); - + dentry = lookup_one_len_unlocked(name, parent, strlen(name)); if (IS_ERR(dentry)) return NULL; if (!d_really_is_positive(dentry)) { diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index e31d6ed3ec32..e072e955ce33 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -138,10 +138,6 @@ static int devpts_ptmx_path(struct path *path) struct super_block *sb; int err; - /* Has the devpts filesystem already been found? 
*/ - if (path->mnt->mnt_sb->s_magic == DEVPTS_SUPER_MAGIC) - return 0; - /* Is a devpts filesystem at "pts" in the same directory? */ err = path_pts(path); if (err) @@ -156,25 +152,53 @@ static int devpts_ptmx_path(struct path *path) return 0; } +/* + * Try to find a suitable devpts filesystem. We support the following + * scenarios: + * - The ptmx device node is located in the same directory as the devpts + * mount where the pts device nodes are located. + * This is e.g. the case when calling open on the /dev/pts/ptmx device + * node when the devpts filesystem is mounted at /dev/pts. + * - The ptmx device node is located outside the devpts filesystem mount + * where the pts device nodes are located. For example, the ptmx device + * is a symlink, separate device node, or bind-mount. + * A supported scenario is bind-mounting /dev/pts/ptmx to /dev/ptmx and + * then calling open on /dev/ptmx. In this case a suitable pts + * subdirectory can be found in the common parent directory /dev of the + * devpts mount and the ptmx bind-mount, after resolving the /dev/ptmx + * bind-mount. + * If no suitable pts subdirectory can be found this function will fail. + * This is e.g. the case when bind-mounting /dev/pts/ptmx to /ptmx. + */ struct vfsmount *devpts_mntget(struct file *filp, struct pts_fs_info *fsi) { struct path path; - int err; + int err = 0; path = filp->f_path; path_get(&path); - err = devpts_ptmx_path(&path); + /* Walk upward while the start point is a bind mount of + * a single file. + */ + while (path.mnt->mnt_root == path.dentry) + if (follow_up(&path) == 0) + break; + + /* devpts_ptmx_path() finds a devpts fs or returns an error. */ + if ((path.mnt->mnt_sb->s_magic != DEVPTS_SUPER_MAGIC) || + (DEVPTS_SB(path.mnt->mnt_sb) != fsi)) + err = devpts_ptmx_path(&path); dput(path.dentry); - if (err) { - mntput(path.mnt); - return ERR_PTR(err); - } - if (DEVPTS_SB(path.mnt->mnt_sb) != fsi) { - mntput(path.mnt); - return ERR_PTR(-ENODEV); + if (!err) { + if (DEVPTS_SB(path.mnt->mnt_sb) == fsi) + return path.mnt; + + err = -ENODEV; } - return path.mnt; + + mntput(path.mnt); + return ERR_PTR(err); } struct pts_fs_info *devpts_acquire(struct file *filp) @@ -182,15 +206,19 @@ struct pts_fs_info *devpts_acquire(struct file *filp) struct pts_fs_info *result; struct path path; struct super_block *sb; - int err; path = filp->f_path; path_get(&path); - err = devpts_ptmx_path(&path); - if (err) { - result = ERR_PTR(err); - goto out; + /* Has the devpts filesystem already been found? 
*/ + if (path.mnt->mnt_sb->s_magic != DEVPTS_SUPER_MAGIC) { + int err; + + err = devpts_ptmx_path(&path); + if (err) { + result = ERR_PTR(err); + goto out; + } } /* diff --git a/fs/direct-io.c b/fs/direct-io.c index a0ca9e48e993..874607bb6e02 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -315,8 +315,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) dio_warn_stale_pagecache(dio->iocb->ki_filp); } - if (!(dio->flags & DIO_SKIP_DIO_COUNT)) - inode_dio_end(dio->inode); + inode_dio_end(dio->inode); if (flags & DIO_COMPLETE_ASYNC) { /* @@ -1178,9 +1177,9 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, unsigned blkbits = i_blkbits; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; - size_t count = iov_iter_count(iter); + const size_t count = iov_iter_count(iter); loff_t offset = iocb->ki_pos; - loff_t end = offset + count; + const loff_t end = offset + count; struct dio *dio; struct dio_submit sdio = { 0, }; struct buffer_head map_bh = { 0, }; @@ -1201,7 +1200,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, } /* watch out for a 0 len io from a tricksy fs */ - if (iov_iter_rw(iter) == READ && !iov_iter_count(iter)) + if (iov_iter_rw(iter) == READ && !count) return 0; dio = kmem_cache_alloc(dio_cache, GFP_KERNEL); @@ -1252,8 +1251,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, */ if (is_sync_kiocb(iocb)) dio->is_async = false; - else if (!(dio->flags & DIO_ASYNC_EXTEND) && - iov_iter_rw(iter) == WRITE && end > i_size_read(inode)) + else if (iov_iter_rw(iter) == WRITE && end > i_size_read(inode)) dio->is_async = false; else dio->is_async = true; @@ -1274,8 +1272,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, */ if (dio->is_async && iov_iter_rw(iter) == WRITE) { retval = 0; - if ((iocb->ki_filp->f_flags & O_DSYNC) || - IS_SYNC(iocb->ki_filp->f_mapping->host)) + if (iocb->ki_flags & IOCB_DSYNC) retval = dio_set_defer_completion(dio); else if (!dio->inode->i_sb->s_dio_done_wq) { /* @@ -1298,8 +1295,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, /* * Will be decremented at I/O completion time. 
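With DIO_SKIP_DIO_COUNT gone (inode_dio_end() in dio_complete() above, and the now-unconditional inode_dio_begin() just below), every do_blockdev_direct_IO() request takes part in the plain i_dio_count handshake. A sketch of that protocol as a whole, assembled from its three parties rather than copied from any one function:

	/* submission side */
	inode_dio_begin(inode);		/* i_dio_count++ before the bios go out */

	/* completion side (dio_complete) */
	inode_dio_end(inode);		/* i_dio_count--; wakes inode_dio_wait() */

	/* layout changers: truncate, hole punch, fallocate, ... */
	inode_lock(inode);		/* stop new submitters */
	inode_dio_wait(inode);		/* drain everything in flight */
	/* ... now safe to move or zero blocks ... */
	inode_unlock(inode);

The ext4 hunks further down, which drop ext4_inode_block_unlocked_dio()/ext4_inode_resume_unlocked_dio() around their inode_dio_wait() calls, lean on this same pairing.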
*/ - if (!(dio->flags & DIO_SKIP_DIO_COUNT)) - inode_dio_begin(inode); + inode_dio_begin(inode); retval = 0; sdio.blkbits = blkbits; @@ -1319,8 +1315,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, dio->should_dirty = (iter->type == ITER_IOVEC); sdio.iter = iter; - sdio.final_block_in_request = - (offset + iov_iter_count(iter)) >> blkbits; + sdio.final_block_in_request = end >> blkbits; /* * In case of non-aligned buffers, we may need 2 more diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index cff79ea0c01d..5243989a60cc 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -482,7 +482,6 @@ static void lowcomms_error_report(struct sock *sk) { struct connection *con; struct sockaddr_storage saddr; - int buflen; void (*orig_report)(struct sock *) = NULL; read_lock_bh(&sk->sk_callback_lock); @@ -492,7 +491,7 @@ static void lowcomms_error_report(struct sock *sk) orig_report = listen_sock.sk_error_report; if (con->sock == NULL || - kernel_getpeername(con->sock, (struct sockaddr *)&saddr, &buflen)) { + kernel_getpeername(con->sock, (struct sockaddr *)&saddr) < 0) { printk_ratelimited(KERN_ERR "dlm: node %d: socket error " "sending to node %d, port %d, " "sk_err=%d/%d\n", dlm_our_nodeid(), @@ -757,8 +756,8 @@ static int tcp_accept_from_sock(struct connection *con) /* Get the connected socket's peer */ memset(&peeraddr, 0, sizeof(peeraddr)); - if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr, - &len, 2)) { + len = newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr, 2); + if (len < 0) { result = -ECONNABORTED; goto accept_err; } diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index 5f22e74bbade..8e568428c88b 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -8,6 +8,7 @@ */ #include <linux/efi.h> +#include <linux/delay.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/mount.h> @@ -74,6 +75,11 @@ static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf, ssize_t size = 0; int err; + while (!__ratelimit(&file->f_cred->user->ratelimit)) { + if (!msleep_interruptible(50)) + return -EINTR; + } + err = efivar_entry_size(var, &datasize); /* diff --git a/fs/eventfd.c b/fs/eventfd.c index 012f5bd46dfa..08d3bd602f73 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -380,7 +380,7 @@ struct eventfd_ctx *eventfd_ctx_fileget(struct file *file) } EXPORT_SYMBOL_GPL(eventfd_ctx_fileget); -SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) +static int do_eventfd(unsigned int count, int flags) { struct eventfd_ctx *ctx; int fd; @@ -409,8 +409,13 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) return fd; } +SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) +{ + return do_eventfd(count, flags); +} + SYSCALL_DEFINE1(eventfd, unsigned int, count) { - return sys_eventfd2(count, 0); + return do_eventfd(count, 0); } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 0f3494ed3ed0..602ca4285b2e 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1936,7 +1936,7 @@ static void clear_tfile_check_list(void) /* * Open an eventpoll file descriptor. 
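The eventfd conversion above, the epoll ones below, and the earlier lookup_dcookie change all follow one recipe: the syscall body moves into an internal do_xyz() helper, in-kernel callers use the helper, and SYSCALL_DEFINEn() becomes an ABI-only wrapper (so its calling convention can change without breaking internal users). The shape, with hypothetical names (do_frobnicate is not from this patch):

	static int do_frobnicate(unsigned int arg, int flags)
	{
		/* ... work formerly done directly in the syscall body ... */
		return 0;
	}

	SYSCALL_DEFINE2(frobnicate2, unsigned int, arg, int, flags)
	{
		return do_frobnicate(arg, flags);
	}

	SYSCALL_DEFINE1(frobnicate, unsigned int, arg)	/* legacy variant */
	{
		return do_frobnicate(arg, 0);		/* not sys_frobnicate2() */
	}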
*/ -SYSCALL_DEFINE1(epoll_create1, int, flags) +static int do_epoll_create(int flags) { int error, fd; struct eventpoll *ep = NULL; @@ -1979,12 +1979,17 @@ out_free_ep: return error; } +SYSCALL_DEFINE1(epoll_create1, int, flags) +{ + return do_epoll_create(flags); +} + SYSCALL_DEFINE1(epoll_create, int, size) { if (size <= 0) return -EINVAL; - return sys_epoll_create1(0); + return do_epoll_create(0); } /* @@ -2148,8 +2153,8 @@ error_return: * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_wait(2). */ -SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, - int, maxevents, int, timeout) +static int do_epoll_wait(int epfd, struct epoll_event __user *events, + int maxevents, int timeout) { int error; struct fd f; @@ -2190,6 +2195,12 @@ error_fput: return error; } +SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, + int, maxevents, int, timeout) +{ + return do_epoll_wait(epfd, events, maxevents, timeout); +} + /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_pwait(2). @@ -2214,7 +2225,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, set_current_blocked(&ksigmask); } - error = sys_epoll_wait(epfd, events, maxevents, timeout); + error = do_epoll_wait(epfd, events, maxevents, timeout); /* * If we changed the signal mask, we need to restore the original one. @@ -2257,7 +2268,7 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, set_current_blocked(&ksigmask); } - err = sys_epoll_wait(epfd, events, maxevents, timeout); + err = do_epoll_wait(epfd, events, maxevents, timeout); /* * If we changed the signal mask, we need to restore the original one. diff --git a/fs/exec.c b/fs/exec.c index 7eb8d21bcab9..a919a827d181 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -895,13 +895,13 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size, if (!S_ISREG(file_inode(file)->i_mode) || max_size < 0) return -EINVAL; - ret = security_kernel_read_file(file, id); + ret = deny_write_access(file); if (ret) return ret; - ret = deny_write_access(file); + ret = security_kernel_read_file(file, id); if (ret) - return ret; + goto out; i_size = i_size_read(file_inode(file)); if (max_size > 0 && i_size > max_size) { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 7666c065b96f..de1694512f1f 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -827,7 +827,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) unsigned long logic_sb_block; unsigned long offset = 0; unsigned long def_mount_opts; - long ret = -EINVAL; + long ret = -ENOMEM; int blocksize = BLOCK_SIZE; int db_count; int i, j; @@ -835,7 +835,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) int err; struct ext2_mount_options opts; - err = -ENOMEM; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) goto failed; @@ -851,6 +850,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_daxdev = dax_dev; spin_lock_init(&sbi->s_lock); + ret = -EINVAL; /* * See what the current blocksize for the device is, and diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index f9b3e0a83526..a33d8fb1bf2a 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -243,8 +243,6 @@ static int ext4_init_block_bitmap(struct super_block *sb, */ ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group), sb->s_blocksize * 8, bh->b_data); - ext4_block_bitmap_csum_set(sb, block_group, gdp, bh); - 
ext4_group_desc_csum_set(sb, block_group, gdp); return 0; } @@ -340,20 +338,25 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, /* check whether block bitmap block number is set */ blk = ext4_block_bitmap(sb, desc); offset = blk - group_first_block; - if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) + if (offset < 0 || EXT4_B2C(sbi, offset) >= sb->s_blocksize || + !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; /* check whether the inode bitmap block number is set */ blk = ext4_inode_bitmap(sb, desc); offset = blk - group_first_block; - if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) + if (offset < 0 || EXT4_B2C(sbi, offset) >= sb->s_blocksize || + !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; /* check whether the inode table block number is set */ blk = ext4_inode_table(sb, desc); offset = blk - group_first_block; + if (offset < 0 || EXT4_B2C(sbi, offset) >= sb->s_blocksize || + EXT4_B2C(sbi, offset + sbi->s_itb_per_group) >= sb->s_blocksize) + return blk; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, EXT4_B2C(sbi, offset + sbi->s_itb_per_group), EXT4_B2C(sbi, offset)); @@ -419,6 +422,7 @@ struct buffer_head * ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) { struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh; ext4_fsblk_t bitmap_blk; int err; @@ -427,6 +431,12 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) if (!desc) return ERR_PTR(-EFSCORRUPTED); bitmap_blk = ext4_block_bitmap(sb, desc); + if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { + ext4_error(sb, "Invalid block bitmap block %llu in " + "block_group %u", bitmap_blk, block_group); + return ERR_PTR(-EFSCORRUPTED); + } bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { ext4_error(sb, "Cannot get buffer for block bitmap - " @@ -448,6 +458,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) err = ext4_init_block_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); + set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); if (err) { diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index da87cf757f7d..e2902d394f1b 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -365,13 +365,15 @@ static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; int dx_dir = is_dx_dir(inode); - loff_t htree_max = ext4_get_htree_eof(file); + loff_t ret, htree_max = ext4_get_htree_eof(file); if (likely(dx_dir)) - return generic_file_llseek_size(file, offset, whence, + ret = generic_file_llseek_size(file, offset, whence, htree_max, htree_max); else - return ext4_llseek(file, offset, whence); + ret = ext4_llseek(file, offset, whence); + file->f_version = inode_peek_iversion(inode) - 1; + return ret; } /* diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3241475a1733..a42e71203e53 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1522,8 +1522,6 @@ enum { EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ EXT4_STATE_NEWENTRY, /* File just added to dir */ - EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read - nolocking */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ EXT4_STATE_LUSTRE_EA_INODE, 
/* Lustre-style ea_inode */ @@ -3181,21 +3179,6 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); } -/* - * Disable DIO read nolock optimization, so new dioreaders will be forced - * to grab i_mutex - */ -static inline void ext4_inode_block_unlocked_dio(struct inode *inode) -{ - ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); - smp_mb(); -} -static inline void ext4_inode_resume_unlocked_dio(struct inode *inode) -{ - smp_mb(); - ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); -} - #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) /* For ioend & aio unwritten conversion wait queues */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 2d593201cf7a..7c70b08d104c 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -166,13 +166,6 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, might_sleep(); if (ext4_handle_valid(handle)) { - struct super_block *sb; - - sb = handle->h_transaction->t_journal->j_private; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) { - jbd2_journal_abort_handle(handle); - return -EIO; - } err = jbd2_journal_get_write_access(handle, bh); if (err) ext4_journal_abort_handle(where, line, __func__, bh, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 054416e9d827..0a7315961bac 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4796,7 +4796,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, flags |= EXT4_GET_BLOCKS_KEEP_SIZE; /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); /* Preallocate the range including the unaligned edges */ @@ -4807,7 +4806,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, round_down(offset, 1 << blkbits)) >> blkbits, new_size, flags); if (ret) - goto out_dio; + goto out_mutex; } @@ -4824,7 +4823,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, ret = ext4_update_disksize_before_punch(inode, offset, len); if (ret) { up_write(&EXT4_I(inode)->i_mmap_sem); - goto out_dio; + goto out_mutex; } /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); @@ -4834,10 +4833,10 @@ static long ext4_zero_range(struct file *file, loff_t offset, flags); up_write(&EXT4_I(inode)->i_mmap_sem); if (ret) - goto out_dio; + goto out_mutex; } if (!partial_begin && !partial_end) - goto out_dio; + goto out_mutex; /* * In worst case we have to writeout two nonadjacent unwritten @@ -4850,7 +4849,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(inode->i_sb, ret); - goto out_dio; + goto out_mutex; } inode->i_mtime = inode->i_ctime = current_time(inode); @@ -4875,8 +4874,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, ext4_handle_sync(handle); ext4_journal_stop(handle); -out_dio: - ext4_inode_resume_unlocked_dio(inode); out_mutex: inode_unlock(inode); return ret; @@ -4964,11 +4961,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) } /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); - ext4_inode_resume_unlocked_dio(inode); if (ret) goto out; @@ -5485,7 +5480,6 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) } /* Wait for existing dio 
to complete */ - ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); /* @@ -5562,7 +5556,6 @@ out_stop: ext4_journal_stop(handle); out_mmap: up_write(&EXT4_I(inode)->i_mmap_sem); - ext4_inode_resume_unlocked_dio(inode); out_mutex: inode_unlock(inode); return ret; @@ -5635,7 +5628,6 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) } /* Wait for existing dio to complete */ - ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); /* @@ -5737,7 +5729,6 @@ out_stop: ext4_journal_stop(handle); out_mmap: up_write(&EXT4_I(inode)->i_mmap_sem); - ext4_inode_resume_unlocked_dio(inode); out_mutex: inode_unlock(inode); return ret; @@ -5751,7 +5742,7 @@ out_mutex: * @lblk1: Start block for first inode * @lblk2: Start block for second inode * @count: Number of blocks to swap - * @mark_unwritten: Mark second inode's extents as unwritten after swap + * @unwritten: Mark second inode's extents as unwritten after swap * @erp: Pointer to save error value * * This helper routine does exactly what is promised "swap extents". All other * details are handled by the caller. */ int ext4_swap_extents(handle_t *handle, struct inode *inode1, - struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, + struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, int unwritten, int *erp) { struct ext4_ext_path *path1 = NULL; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 7830d28df331..df92e3ec9913 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -66,44 +66,6 @@ void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap) memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); } -/* Initializes an uninitialized inode bitmap */ -static int ext4_init_inode_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t block_group, - struct ext4_group_desc *gdp) -{ - struct ext4_group_info *grp; - struct ext4_sb_info *sbi = EXT4_SB(sb); - J_ASSERT_BH(bh, buffer_locked(bh)); - - /* If checksum is bad mark all blocks and inodes used to prevent - * allocation, essentially implementing a per-group read-only flag.
*/ - if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { - grp = ext4_get_group_info(sb, block_group); - if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) - percpu_counter_sub(&sbi->s_freeclusters_counter, - grp->bb_free); - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); - if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { - int count; - count = ext4_free_inodes_count(sb, gdp); - percpu_counter_sub(&sbi->s_freeinodes_counter, - count); - } - set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); - return -EFSBADCRC; - } - - memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); - ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, - bh->b_data); - ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh, - EXT4_INODES_PER_GROUP(sb) / 8); - ext4_group_desc_csum_set(sb, block_group, gdp); - - return 0; -} - void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate) { if (uptodate) { @@ -160,6 +122,7 @@ static struct buffer_head * ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) { struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh = NULL; ext4_fsblk_t bitmap_blk; int err; @@ -169,6 +132,12 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) return ERR_PTR(-EFSCORRUPTED); bitmap_blk = ext4_inode_bitmap(sb, desc); + if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { + ext4_error(sb, "Invalid inode bitmap blk %llu in " + "block_group %u", bitmap_blk, block_group); + return ERR_PTR(-EFSCORRUPTED); + } bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { ext4_error(sb, "Cannot read inode bitmap - " @@ -187,17 +156,14 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { - err = ext4_init_inode_bitmap(sb, bh, block_group, desc); + memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), + sb->s_blocksize * 8, bh->b_data); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); - if (err) { - ext4_error(sb, "Failed to init inode bitmap for group " - "%u: %d", block_group, err); - goto out; - } return bh; } ext4_unlock_group(sb, block_group); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c94780075b04..129205028300 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2694,15 +2694,6 @@ out: return err; } -static int __writepage(struct page *page, struct writeback_control *wbc, - void *data) -{ - struct address_space *mapping = data; - int ret = ext4_writepage(page, wbc); - mapping_set_error(mapping, ret); - return ret; -} - static int ext4_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -2740,11 +2731,7 @@ static int ext4_writepages(struct address_space *mapping, goto out_writepages; if (ext4_should_journal_data(inode)) { - struct blk_plug plug; - - blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, __writepage, mapping); - blk_finish_plug(&plug); + ret = generic_writepages(mapping, wbc); goto out_writepages; } @@ -3524,7 +3511,7 @@ retry: iomap->flags |= IOMAP_F_DIRTY; iomap->bdev = inode->i_sb->s_bdev; iomap->dax_dev = sbi->s_daxdev; - iomap->offset = first_block << blkbits; + iomap->offset = (u64)first_block << blkbits; iomap->length = (u64)map.m_len << blkbits; if (ret == 0) { @@ -3669,7 +3656,7 @@ static ssize_t 
ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) int orphan = 0; handle_t *handle; - if (final_size > inode->i_size) { + if (final_size > inode->i_size || final_size > ei->i_disksize) { /* Credits for sb + inode write */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { @@ -3682,7 +3669,7 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) goto out; } orphan = 1; - ei->i_disksize = inode->i_size; + ext4_update_i_disksize(inode, inode->i_size); ext4_journal_stop(handle); } @@ -3789,9 +3776,10 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) ext4_orphan_del(handle, inode); if (ret > 0) { loff_t end = offset + ret; - if (end > inode->i_size) { - ei->i_disksize = end; - i_size_write(inode, end); + if (end > inode->i_size || end > ei->i_disksize) { + ext4_update_i_disksize(inode, end); + if (end > inode->i_size) + i_size_write(inode, end); /* * We're going to return a positive `ret' * here due to non-zero-length I/O, so there's @@ -4251,7 +4239,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) } /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); /* @@ -4324,7 +4311,6 @@ out_stop: ext4_journal_stop(handle); out_dio: up_write(&EXT4_I(inode)->i_mmap_sem); - ext4_inode_resume_unlocked_dio(inode); out_mutex: inode_unlock(inode); return ret; @@ -4746,6 +4732,12 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) goto bad_inode; raw_inode = ext4_raw_inode(&iloc); + if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) { + EXT4_ERROR_INODE(inode, "root inode unallocated"); + ret = -EFSCORRUPTED; + goto bad_inode; + } + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > @@ -5032,12 +5024,12 @@ static int other_inode_match(struct inode * inode, unsigned long ino, if ((inode->i_ino != ino) || (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | - I_DIRTY_SYNC | I_DIRTY_DATASYNC)) || + I_DIRTY_INODE)) || ((inode->i_state & I_DIRTY_TIME) == 0)) return 0; spin_lock(&inode->i_lock); if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | - I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0) && + I_DIRTY_INODE)) == 0) && (inode->i_state & I_DIRTY_TIME)) { struct ext4_inode_info *ei = EXT4_I(inode); @@ -5506,9 +5498,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) */ if (orphan) { if (!ext4_should_journal_data(inode)) { - ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); - ext4_inode_resume_unlocked_dio(inode); } else ext4_wait_for_tail_page_commit(inode); } @@ -5999,7 +5989,6 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return -EROFS; /* Wait for all existing dio workers */ - ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); /* @@ -6015,7 +6004,6 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) err = filemap_write_and_wait(inode->i_mapping); if (err < 0) { up_write(&EXT4_I(inode)->i_mmap_sem); - ext4_inode_resume_unlocked_dio(inode); return err; } } @@ -6038,7 +6026,6 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) if (err < 0) { jbd2_journal_unlock_updates(journal); percpu_up_write(&sbi->s_journal_flag_rwsem); - ext4_inode_resume_unlocked_dio(inode); return err; } ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); @@ -6050,7 +6037,6 @@ int 
ext4_change_inode_journal_flag(struct inode *inode, int val) if (val) up_write(&EXT4_I(inode)->i_mmap_sem); - ext4_inode_resume_unlocked_dio(inode); /* Finally we can mark the inode as dirty. */ diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 7e99ad02f1ba..a7074115d6f6 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -124,8 +124,6 @@ static long swap_inode_boot_loader(struct super_block *sb, truncate_inode_pages(&inode_bl->i_data, 0); /* Wait for all existing dio workers */ - ext4_inode_block_unlocked_dio(inode); - ext4_inode_block_unlocked_dio(inode_bl); inode_dio_wait(inode); inode_dio_wait(inode_bl); @@ -186,8 +184,6 @@ static long swap_inode_boot_loader(struct super_block *sb, ext4_double_up_write_data_sem(inode, inode_bl); journal_err_out: - ext4_inode_resume_unlocked_dio(inode); - ext4_inode_resume_unlocked_dio(inode_bl); unlock_two_nondirectories(inode, inode_bl); iput(inode_bl); return err; @@ -481,6 +477,7 @@ static int ext4_shutdown(struct super_block *sb, unsigned long arg) return 0; ext4_msg(sb, KERN_ALERT, "shut down requested (%d)", flags); + trace_ext4_shutdown(sb, flags); switch (flags) { case EXT4_GOING_FLAGS_DEFAULT: @@ -492,15 +489,13 @@ static int ext4_shutdown(struct super_block *sb, unsigned long arg) set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) { (void) ext4_force_commit(sb); - jbd2_journal_abort(sbi->s_journal, 0); + jbd2_journal_abort(sbi->s_journal, -ESHUTDOWN); } break; case EXT4_GOING_FLAGS_NOLOGFLUSH: set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); - if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) { - msleep(100); - jbd2_journal_abort(sbi->s_journal, 0); - } + if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) + jbd2_journal_abort(sbi->s_journal, -ESHUTDOWN); break; default: return -EINVAL; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index b96e4bd3b3ec..8e17efdcbf11 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -601,8 +601,6 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, lock_two_nondirectories(orig_inode, donor_inode); /* Wait for all existing dio workers */ - ext4_inode_block_unlocked_dio(orig_inode); - ext4_inode_block_unlocked_dio(donor_inode); inode_dio_wait(orig_inode); inode_dio_wait(donor_inode); @@ -693,8 +691,6 @@ out: ext4_ext_drop_refs(path); kfree(path); ext4_double_up_write_data_sem(orig_inode, donor_inode); - ext4_inode_resume_unlocked_dio(orig_inode); - ext4_inode_resume_unlocked_dio(donor_inode); unlock_two_nondirectories(orig_inode, donor_inode); return ret; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 39bf464c35f1..185f7e61f4cf 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -101,15 +101,13 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, * i_data_sem (rw) * * truncate: - * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) -> - * i_mmap_rwsem (w) -> page lock - * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) -> - * transaction start -> i_data_sem (rw) + * sb_start_write -> i_mutex -> i_mmap_sem (w) -> i_mmap_rwsem (w) -> page lock + * sb_start_write -> i_mutex -> i_mmap_sem (w) -> transaction start -> + * i_data_sem (rw) * * direct IO: - * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) -> mmap_sem - * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) -> - * transaction start -> i_data_sem (rw) + * sb_start_write -> i_mutex -> mmap_sem + * sb_start_write -> i_mutex -> transaction 
start -> i_data_sem (rw) * * writepages: * transaction start -> page lock(s) -> i_data_sem (rw) @@ -448,6 +446,7 @@ void __ext4_error(struct super_block *sb, const char *function, if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) return; + trace_ext4_error(sb, function, line); if (ext4_error_ratelimit(sb)) { va_start(args, fmt); vaf.fmt = fmt; @@ -472,6 +471,7 @@ void __ext4_error_inode(struct inode *inode, const char *function, if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return; + trace_ext4_error(inode->i_sb, function, line); es->s_last_error_ino = cpu_to_le32(inode->i_ino); es->s_last_error_block = cpu_to_le64(block); if (ext4_error_ratelimit(inode->i_sb)) { @@ -507,6 +507,7 @@ void __ext4_error_file(struct file *file, const char *function, if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return; + trace_ext4_error(inode->i_sb, function, line); es = EXT4_SB(inode->i_sb)->s_es; es->s_last_error_ino = cpu_to_le32(inode->i_ino); if (ext4_error_ratelimit(inode->i_sb)) { @@ -719,6 +720,7 @@ __acquires(bitlock) if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) return; + trace_ext4_error(sb, function, line); es->s_last_error_ino = cpu_to_le32(ino); es->s_last_error_block = cpu_to_le64(block); __save_error_info(sb, function, line); @@ -2019,7 +2021,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; - int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt; + int def_errors, def_mount_opt = sbi->s_def_mount_opt; const struct mount_opts *m; char sep = nodefs ? '\n' : ','; @@ -2034,7 +2036,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || (m->flags & MOPT_CLEAR_ERR)) continue; - if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) + if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) continue; /* skip if same as the default */ if ((want_set && (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) || @@ -2068,7 +2070,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PUTS("i_version"); if (nodefs || sbi->s_stripe) SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); - if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) { + if (nodefs || EXT4_MOUNT_DATA_FLAGS & + (sbi->s_mount_opt ^ def_mount_opt)) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) SEQ_OPTS_PUTS("data=journal"); else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) @@ -2081,7 +2084,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PRINT("inode_readahead_blks=%u", sbi->s_inode_readahead_blks); - if (nodefs || (test_opt(sb, INIT_INODE_TABLE) && + if (test_opt(sb, INIT_INODE_TABLE) && (nodefs || (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); if (nodefs || sbi->s_max_dir_size_kb) @@ -2333,6 +2336,8 @@ static int ext4_check_descriptors(struct super_block *sb, ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u overlaps " "superblock", i); + if (!sb_rdonly(sb)) + return 0; } if (block_bitmap < first_block || block_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " @@ -2345,6 +2350,8 @@ static int ext4_check_descriptors(struct super_block *sb, ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u overlaps " "superblock", i); + if (!sb_rdonly(sb)) + return 0; } if (inode_bitmap < 
first_block || inode_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " @@ -2357,6 +2364,8 @@ ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u overlaps " "superblock", i); + if (!sb_rdonly(sb)) + return 0; } if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { @@ -3490,15 +3499,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } /* Load the checksum driver */ - if (ext4_has_feature_metadata_csum(sb) || - ext4_has_feature_ea_inode(sb)) { - sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); - if (IS_ERR(sbi->s_chksum_driver)) { - ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); - ret = PTR_ERR(sbi->s_chksum_driver); - sbi->s_chksum_driver = NULL; - goto failed_mount; - } + sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(sbi->s_chksum_driver)) { + ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); + ret = PTR_ERR(sbi->s_chksum_driver); + sbi->s_chksum_driver = NULL; + goto failed_mount; + } /* Check superblock checksum */ @@ -3660,6 +3666,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_msg(sb, KERN_INFO, "mounting ext2 file system " "using the ext4 subsystem"); else { + /* + * If we're probing, be silent if this looks like + * it's actually an ext[34] filesystem. + */ + if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) + goto failed_mount; ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " "to feature incompatibilities"); goto failed_mount; @@ -3671,6 +3683,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_msg(sb, KERN_INFO, "mounting ext3 file system " "using the ext4 subsystem"); else { + /* + * If we're probing, be silent if this looks like + * it's actually an ext4 filesystem.
+ */ + if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) + goto failed_mount; ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " "to feature incompatibilities"); goto failed_mount; @@ -4094,10 +4112,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * cope, else JOURNAL_DATA */ if (jbd2_journal_check_available_features - (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) + (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { set_opt(sb, ORDERED_DATA); - else + sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA; + } else { set_opt(sb, JOURNAL_DATA); + sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; + } break; case EXT4_MOUNT_ORDERED_DATA: diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 1205261f130c..9ebd26c957c2 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -49,8 +49,7 @@ struct ext4_attr { } u; }; -static ssize_t session_write_kbytes_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) +static ssize_t session_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) { struct super_block *sb = sbi->s_buddy_cache->i_sb; @@ -61,8 +60,7 @@ static ssize_t session_write_kbytes_show(struct ext4_attr *a, sbi->s_sectors_written_start) >> 1); } -static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) +static ssize_t lifetime_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) { struct super_block *sb = sbi->s_buddy_cache->i_sb; @@ -74,8 +72,7 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, EXT4_SB(sb)->s_sectors_written_start) >> 1))); } -static ssize_t inode_readahead_blks_store(struct ext4_attr *a, - struct ext4_sb_info *sbi, +static ssize_t inode_readahead_blks_store(struct ext4_sb_info *sbi, const char *buf, size_t count) { unsigned long t; @@ -92,8 +89,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a, return count; } -static ssize_t reserved_clusters_store(struct ext4_attr *a, - struct ext4_sb_info *sbi, +static ssize_t reserved_clusters_store(struct ext4_sb_info *sbi, const char *buf, size_t count) { unsigned long long val; @@ -109,8 +105,7 @@ static ssize_t reserved_clusters_store(struct ext4_attr *a, return count; } -static ssize_t trigger_test_error(struct ext4_attr *a, - struct ext4_sb_info *sbi, +static ssize_t trigger_test_error(struct ext4_sb_info *sbi, const char *buf, size_t count) { int len = count; @@ -268,9 +263,9 @@ static ssize_t ext4_attr_show(struct kobject *kobj, (s64) EXT4_C2B(sbi, percpu_counter_sum(&sbi->s_dirtyclusters_counter))); case attr_session_write_kbytes: - return session_write_kbytes_show(a, sbi, buf); + return session_write_kbytes_show(sbi, buf); case attr_lifetime_write_kbytes: - return lifetime_write_kbytes_show(a, sbi, buf); + return lifetime_write_kbytes_show(sbi, buf); case attr_reserved_clusters: return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long) @@ -306,7 +301,7 @@ static ssize_t ext4_attr_store(struct kobject *kobj, switch (a->attr_id) { case attr_reserved_clusters: - return reserved_clusters_store(a, sbi, buf, len); + return reserved_clusters_store(sbi, buf, len); case attr_pointer_ui: if (!ptr) return 0; @@ -316,9 +311,9 @@ static ssize_t ext4_attr_store(struct kobject *kobj, *((unsigned int *) ptr) = t; return len; case attr_inode_readahead: - return inode_readahead_blks_store(a, sbi, buf, len); + return inode_readahead_blks_store(sbi, buf, len); case attr_trigger_test_error: - return trigger_test_error(a, sbi, buf, len); + return trigger_test_error(sbi, buf, len); } return 0; } @@ -330,13 +325,6 
@@ static void ext4_sb_release(struct kobject *kobj) complete(&sbi->s_kobj_unregister); } -static void ext4_kset_release(struct kobject *kobj) -{ - struct kset *kset = container_of(kobj, struct kset, kobj); - - kfree(kset); -} - static const struct sysfs_ops ext4_attr_ops = { .show = ext4_attr_show, .store = ext4_attr_store, @@ -348,19 +336,14 @@ static struct kobj_type ext4_sb_ktype = { .release = ext4_sb_release, }; -static struct kobj_type ext4_ktype = { - .sysfs_ops = &ext4_attr_ops, - .release = ext4_kset_release, -}; - -static struct kset *ext4_kset; - static struct kobj_type ext4_feat_ktype = { .default_attrs = ext4_feat_attrs, .sysfs_ops = &ext4_attr_ops, .release = (void (*)(struct kobject *))kfree, }; +static struct kobject *ext4_root; + static struct kobject *ext4_feat; #define PROC_FILE_SHOW_DEFN(name) \ @@ -398,9 +381,8 @@ int ext4_register_sysfs(struct super_block *sb) const struct ext4_proc_files *p; int err; - sbi->s_kobj.kset = ext4_kset; init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, NULL, + err = kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, ext4_root, "%s", sb->s_id); if (err) { kobject_put(&sbi->s_kobj); @@ -436,26 +418,18 @@ int __init ext4_init_sysfs(void) { int ret; - ext4_kset = kzalloc(sizeof(*ext4_kset), GFP_KERNEL); - if (!ext4_kset) + ext4_root = kobject_create_and_add("ext4", fs_kobj); + if (!ext4_root) return -ENOMEM; - kobject_set_name(&ext4_kset->kobj, "ext4"); - ext4_kset->kobj.parent = fs_kobj; - ext4_kset->kobj.ktype = &ext4_ktype; - ret = kset_register(ext4_kset); - if (ret) - goto kset_err; - ext4_feat = kzalloc(sizeof(*ext4_feat), GFP_KERNEL); if (!ext4_feat) { ret = -ENOMEM; - goto kset_err; + goto root_err; } - ext4_feat->kset = ext4_kset; ret = kobject_init_and_add(ext4_feat, &ext4_feat_ktype, - NULL, "features"); + ext4_root, "features"); if (ret) goto feat_err; @@ -464,17 +438,19 @@ int __init ext4_init_sysfs(void) feat_err: kobject_put(ext4_feat); -kset_err: - kset_unregister(ext4_kset); - ext4_kset = NULL; + ext4_feat = NULL; +root_err: + kobject_put(ext4_root); + ext4_root = NULL; return ret; } void ext4_exit_sysfs(void) { kobject_put(ext4_feat); - kset_unregister(ext4_kset); - ext4_kset = NULL; + ext4_feat = NULL; + kobject_put(ext4_root); + ext4_root = NULL; remove_proc_entry(proc_dirname, NULL); ext4_proc_root = NULL; } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 63656dbafdc4..499cb4b1fbd2 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -195,10 +195,13 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, /* Check the values */ while (!IS_LAST_ENTRY(entry)) { - if (entry->e_value_size != 0 && - entry->e_value_inum == 0) { + u32 size = le32_to_cpu(entry->e_value_size); + + if (size > EXT4_XATTR_SIZE_MAX) + return -EFSCORRUPTED; + + if (size != 0 && entry->e_value_inum == 0) { u16 offs = le16_to_cpu(entry->e_value_offs); - u32 size = le32_to_cpu(entry->e_value_size); void *value; /* @@ -222,25 +225,36 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, } static inline int -ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh) +__ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, + const char *function, unsigned int line) { - int error; + int error = -EFSCORRUPTED; if (buffer_verified(bh)) return 0; if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) - return -EFSCORRUPTED; + goto errout; + error = -EFSBADCRC; if (!ext4_xattr_block_csum_verify(inode, bh)) - 
return -EFSBADCRC; + goto errout; error = ext4_xattr_check_entries(BFIRST(bh), bh->b_data + bh->b_size, bh->b_data); - if (!error) +errout: + if (error) + __ext4_error_inode(inode, function, line, 0, + "corrupted xattr block %llu", + (unsigned long long) bh->b_blocknr); + else set_buffer_verified(bh); return error; } +#define ext4_xattr_check_block(inode, bh) \ + __ext4_xattr_check_block((inode), (bh), __func__, __LINE__) + + static int __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, void *end, const char *function, unsigned int line) @@ -262,18 +276,22 @@ errout: __xattr_check_inode((inode), (header), (end), __func__, __LINE__) static int -ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, - const char *name, int sorted) +xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry, + void *end, int name_index, const char *name, int sorted) { - struct ext4_xattr_entry *entry; + struct ext4_xattr_entry *entry, *next; size_t name_len; int cmp = 1; if (name == NULL) return -EINVAL; name_len = strlen(name); - entry = *pentry; - for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { + for (entry = *pentry; !IS_LAST_ENTRY(entry); entry = next) { + next = EXT4_XATTR_NEXT(entry); + if ((void *) next >= end) { + EXT4_ERROR_INODE(inode, "corrupted xattr entries"); + return -EFSCORRUPTED; + } cmp = name_index - entry->e_name_index; if (!cmp) cmp = name_len - entry->e_name_len; @@ -495,6 +513,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, struct buffer_head *bh = NULL; struct ext4_xattr_entry *entry; size_t size; + void *end; int error; struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); @@ -511,20 +530,20 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, goto cleanup; ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); - if (ext4_xattr_check_block(inode, bh)) { - EXT4_ERROR_INODE(inode, "bad block %llu", - EXT4_I(inode)->i_file_acl); - error = -EFSCORRUPTED; + error = ext4_xattr_check_block(inode, bh); + if (error) goto cleanup; - } ext4_xattr_block_cache_insert(ea_block_cache, bh); entry = BFIRST(bh); - error = ext4_xattr_find_entry(&entry, name_index, name, 1); + end = bh->b_data + bh->b_size; + error = xattr_find_entry(inode, &entry, end, name_index, name, 1); if (error) goto cleanup; size = le32_to_cpu(entry->e_value_size); + error = -ERANGE; + if (unlikely(size > EXT4_XATTR_SIZE_MAX)) + goto cleanup; if (buffer) { - error = -ERANGE; if (size > buffer_size) goto cleanup; if (entry->e_value_inum) { @@ -533,8 +552,12 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, if (error) goto cleanup; } else { - memcpy(buffer, bh->b_data + - le16_to_cpu(entry->e_value_offs), size); + u16 offset = le16_to_cpu(entry->e_value_offs); + void *p = bh->b_data + offset; + + if (unlikely(p + size > end)) + goto cleanup; + memcpy(buffer, p, size); } } error = size; @@ -568,12 +591,14 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, if (error) goto cleanup; entry = IFIRST(header); - error = ext4_xattr_find_entry(&entry, name_index, name, 0); + error = xattr_find_entry(inode, &entry, end, name_index, name, 0); if (error) goto cleanup; size = le32_to_cpu(entry->e_value_size); + error = -ERANGE; + if (unlikely(size > EXT4_XATTR_SIZE_MAX)) + goto cleanup; if (buffer) { - error = -ERANGE; if (size > buffer_size) goto cleanup; if (entry->e_value_inum) { @@ -582,8 +607,12 @@ 
ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, if (error) goto cleanup; } else { - memcpy(buffer, (void *)IFIRST(header) + - le16_to_cpu(entry->e_value_offs), size); + u16 offset = le16_to_cpu(entry->e_value_offs); + void *p = (void *)IFIRST(header) + offset; + + if (unlikely(p + size > end)) + goto cleanup; + memcpy(buffer, p, size); } } error = size; @@ -676,12 +705,9 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) goto cleanup; ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); - if (ext4_xattr_check_block(inode, bh)) { - EXT4_ERROR_INODE(inode, "bad block %llu", - EXT4_I(inode)->i_file_acl); - error = -EFSCORRUPTED; + error = ext4_xattr_check_block(inode, bh); + if (error) goto cleanup; - } ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh); error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); @@ -808,10 +834,9 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) goto out; } - if (ext4_xattr_check_block(inode, bh)) { - ret = -EFSCORRUPTED; + ret = ext4_xattr_check_block(inode, bh); + if (ret) goto out; - } for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) @@ -1793,19 +1818,16 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, ea_bdebug(bs->bh, "b_count=%d, refcount=%d", atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); - if (ext4_xattr_check_block(inode, bs->bh)) { - EXT4_ERROR_INODE(inode, "bad block %llu", - EXT4_I(inode)->i_file_acl); - error = -EFSCORRUPTED; + error = ext4_xattr_check_block(inode, bs->bh); + if (error) goto cleanup; - } /* Find the named attribute. */ bs->s.base = BHDR(bs->bh); bs->s.first = BFIRST(bs->bh); bs->s.end = bs->bh->b_data + bs->bh->b_size; bs->s.here = bs->s.first; - error = ext4_xattr_find_entry(&bs->s.here, i->name_index, - i->name, 1); + error = xattr_find_entry(inode, &bs->s.here, bs->s.end, + i->name_index, i->name, 1); if (error && error != -ENODATA) goto cleanup; bs->s.not_found = error; @@ -2164,8 +2186,8 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, if (error) return error; /* Find the named attribute. 
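All the ext4 xattr lookup changes above enforce a single pattern: cap e_value_size at EXT4_XATTR_SIZE_MAX before any pointer arithmetic, after which one comparison against the block end is a complete bounds check. Condensed from the hunks above (a sketch, not a verbatim function; the get paths report -ERANGE rather than -EFSCORRUPTED for an oversized value):

	u32 size = le32_to_cpu(entry->e_value_size);
	void *p;

	if (unlikely(size > EXT4_XATTR_SIZE_MAX))	/* cap: keeps p + size from wrapping */
		return -EFSCORRUPTED;
	p = bh->b_data + le16_to_cpu(entry->e_value_offs);
	if (unlikely(p + size > end))			/* end = bh->b_data + bh->b_size */
		return -EFSCORRUPTED;
	memcpy(buffer, p, size);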
*/ - error = ext4_xattr_find_entry(&is->s.here, i->name_index, - i->name, 0); + error = xattr_find_entry(inode, &is->s.here, is->s.end, + i->name_index, i->name, 0); if (error && error != -ENODATA) return error; is->s.not_found = error; @@ -2721,13 +2743,9 @@ retry: error = -EIO; if (!bh) goto cleanup; - if (ext4_xattr_check_block(inode, bh)) { - EXT4_ERROR_INODE(inode, "bad block %llu", - EXT4_I(inode)->i_file_acl); - error = -EFSCORRUPTED; - brelse(bh); + error = ext4_xattr_check_block(inode, bh); + if (error) goto cleanup; - } base = BHDR(bh); end = bh->b_data + bh->b_size; min_offs = end - base; @@ -2884,11 +2902,8 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, goto cleanup; } error = ext4_xattr_check_block(inode, bh); - if (error) { - EXT4_ERROR_INODE(inode, "bad block %llu (error %d)", - EXT4_I(inode)->i_file_acl, error); + if (error) goto cleanup; - } if (ext4_has_feature_ea_inode(inode->i_sb)) { for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index dd54c4f995c8..f39cad2abe2a 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -71,6 +71,17 @@ struct ext4_xattr_entry { #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) /* + * XATTR_SIZE_MAX is currently 64k, but for the purposes of checking + * for file system consistency errors, we use a somewhat bigger value. + * This allows XATTR_SIZE_MAX to grow in the future, but by using this + * instead of INT_MAX for certain consistency checks, we don't need to + * worry about arithmetic overflows. (Actually XATTR_SIZE_MAX is + * defined in include/uapi/linux/limits.h, so changing it is + * not going to be trivial....) + */ +#define EXT4_XATTR_SIZE_MAX (1 << 24) + +/* * The minimum size of EA value when you start storing it in an external inode * size of block - size of header - size of 1 entry - 4 null bytes */ diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 512dca8abc7d..bf779461df13 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -68,6 +68,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, .old_blkaddr = index, .new_blkaddr = index, .encrypted_page = NULL, + .is_meta = is_meta, }; if (unlikely(!is_meta)) @@ -162,6 +163,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, .op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD, .encrypted_page = NULL, .in_list = false, + .is_meta = (type != META_POR), }; struct blk_plug plug; @@ -569,13 +571,8 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) struct node_info ni; int err = acquire_orphan_inode(sbi); - if (err) { - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_msg(sbi->sb, KERN_WARNING, - "%s: orphan failed (ino=%x), run fsck to fix.", - __func__, ino); - return err; - } + if (err) + goto err_out; __add_ino_entry(sbi, ino, 0, ORPHAN_INO); @@ -589,6 +586,11 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) return PTR_ERR(inode); } + err = dquot_initialize(inode); + if (err) + goto err_out; + + dquot_initialize(inode); clear_nlink(inode); /* truncate all the data during iput */ @@ -598,14 +600,18 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) /* ENOMEM was fully retried in f2fs_evict_inode.
*/ if (ni.blk_addr != NULL_ADDR) { - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_msg(sbi->sb, KERN_WARNING, - "%s: orphan failed (ino=%x) by kernel, retry mount.", - __func__, ino); - return -EIO; + err = -EIO; + goto err_out; } __remove_ino_entry(sbi, ino, ORPHAN_INO); return 0; + +err_out: + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: orphan failed (ino=%x), run fsck to fix.", + __func__, ino); + return err; } int recover_orphan_inodes(struct f2fs_sb_info *sbi) @@ -1136,6 +1142,8 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (cpc->reason & CP_TRIMMED) __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); + else + __clear_ckpt_flags(ckpt, CP_TRIMMED_FLAG); if (cpc->reason & CP_UMOUNT) __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); @@ -1162,6 +1170,39 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) spin_unlock_irqrestore(&sbi->cp_lock, flags); } +static void commit_checkpoint(struct f2fs_sb_info *sbi, + void *src, block_t blk_addr) +{ + struct writeback_control wbc = { + .for_reclaim = 0, + }; + + /* + * pagevec_lookup_tag and lock_page again will take + * some extra time. Therefore, update_meta_pages and + * sync_meta_pages are combined in this function. + */ + struct page *page = grab_meta_page(sbi, blk_addr); + int err; + + memcpy(page_address(page), src, PAGE_SIZE); + set_page_dirty(page); + + f2fs_wait_on_page_writeback(page, META, true); + f2fs_bug_on(sbi, PageWriteback(page)); + if (unlikely(!clear_page_dirty_for_io(page))) + f2fs_bug_on(sbi, 1); + + /* writeout cp pack 2 page */ + err = __f2fs_write_meta_page(page, &wbc, FS_CP_META_IO); + f2fs_bug_on(sbi, err); + + f2fs_put_page(page, 0); + + /* submit checkpoint (with barrier if NOBARRIER is not set) */ + f2fs_submit_merged_write(sbi, META_FLUSH); +} + static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1264,16 +1305,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) } } - /* need to wait for end_io results */ - wait_on_all_pages_writeback(sbi); - if (unlikely(f2fs_cp_error(sbi))) - return -EIO; - - /* flush all device cache */ - err = f2fs_flush_device_cache(sbi); - if (err) - return err; - /* write out checkpoint buffer at block 0 */ update_meta_page(sbi, ckpt, start_blk++); @@ -1301,26 +1332,26 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk += NR_CURSEG_NODE_TYPE; } - /* writeout checkpoint block */ - update_meta_page(sbi, ckpt, start_blk); + /* update user_block_counts */ + sbi->last_valid_block_count = sbi->total_valid_block_count; + percpu_counter_set(&sbi->alloc_valid_block_count, 0); + + /* Here, we have one bio having CP pack except cp pack 2 page */ + sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); - /* wait for previous submitted node/meta pages writeback */ + /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) return -EIO; - filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LLONG_MAX); - filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX); - - /* update user_block_counts */ - sbi->last_valid_block_count = sbi->total_valid_block_count; - percpu_counter_set(&sbi->alloc_valid_block_count, 0); - - /* Here, we only have one bio having CP pack */ - sync_meta_pages(sbi, META_FLUSH, LONG_MAX, FS_CP_META_IO); + /* flush all device cache */ + err = f2fs_flush_device_cache(sbi); + if (err) + return err; - /* wait for previous 
submitted meta pages writeback */ + /* barrier and flush checkpoint cp pack 2 page if it can */ + commit_checkpoint(sbi, ckpt, start_blk); wait_on_all_pages_writeback(sbi); release_ino_entry(sbi, false); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7578ed1a85e0..db50686f5096 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -175,15 +175,22 @@ static bool __same_bdev(struct f2fs_sb_info *sbi, */ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, struct writeback_control *wbc, - int npages, bool is_read) + int npages, bool is_read, + enum page_type type, enum temp_type temp) { struct bio *bio; bio = f2fs_bio_alloc(sbi, npages, true); f2fs_target_device(sbi, blk_addr, bio); - bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; - bio->bi_private = is_read ? NULL : sbi; + if (is_read) { + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = NULL; + } else { + bio->bi_end_io = f2fs_write_end_io; + bio->bi_private = sbi; + bio->bi_write_hint = io_type_to_rw_hint(sbi, type, temp); + } if (wbc) wbc_init_bio(wbc, bio); @@ -196,13 +203,12 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, if (!is_read_io(bio_op(bio))) { unsigned int start; - if (f2fs_sb_mounted_blkzoned(sbi->sb) && - current->plug && (type == DATA || type == NODE)) - blk_finish_plug(current->plug); - if (type != DATA && type != NODE) goto submit_io; + if (f2fs_sb_has_blkzoned(sbi->sb) && current->plug) + blk_finish_plug(current->plug); + start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; start %= F2FS_IO_SIZE(sbi); @@ -377,12 +383,13 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page; + verify_block_addr(fio, fio->new_blkaddr); trace_f2fs_submit_page_bio(page, fio); f2fs_trace_ios(fio, 0); /* Allocate a new bio */ bio = __bio_alloc(fio->sbi, fio->new_blkaddr, fio->io_wbc, - 1, is_read_io(fio->op)); + 1, is_read_io(fio->op), fio->type, fio->temp); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); @@ -422,8 +429,8 @@ next: } if (fio->old_blkaddr != NEW_ADDR) - verify_block_addr(sbi, fio->old_blkaddr); - verify_block_addr(sbi, fio->new_blkaddr); + verify_block_addr(fio, fio->old_blkaddr); + verify_block_addr(fio, fio->new_blkaddr); bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; @@ -445,7 +452,8 @@ alloc_new: goto out_fail; } io->bio = __bio_alloc(sbi, fio->new_blkaddr, fio->io_wbc, - BIO_MAX_PAGES, false); + BIO_MAX_PAGES, false, + fio->type, fio->temp); io->fio = *fio; } @@ -832,13 +840,6 @@ alloc: return 0; } -static inline bool __force_buffered_io(struct inode *inode, int rw) -{ - return (f2fs_encrypted_file(inode) || - (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || - F2FS_I_SB(inode)->s_ndevs); -} - int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); @@ -870,7 +871,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) if (direct_io) { map.m_seg_type = rw_hint_to_seg_type(iocb->ki_hint); - flag = __force_buffered_io(inode, WRITE) ? + flag = f2fs_force_buffered_io(inode, WRITE) ? 
F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO; goto map_blocks; @@ -1114,6 +1115,31 @@ out: return err; } +bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) +{ + struct f2fs_map_blocks map; + block_t last_lblk; + int err; + + if (pos + len > i_size_read(inode)) + return false; + + map.m_lblk = F2FS_BYTES_TO_BLK(pos); + map.m_next_pgofs = NULL; + map.m_next_extent = NULL; + map.m_seg_type = NO_CHECK_TYPE; + last_lblk = F2FS_BLK_ALIGN(pos + len); + + while (map.m_lblk < last_lblk) { + map.m_len = last_lblk - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); + if (err || map.m_len == 0) + return false; + map.m_lblk += map.m_len; + } + return true; +} + static int __get_data_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create, int flag, pgoff_t *next_pgofs, int seg_type) @@ -2287,25 +2313,41 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); size_t count = iov_iter_count(iter); loff_t offset = iocb->ki_pos; int rw = iov_iter_rw(iter); int err; + enum rw_hint hint = iocb->ki_hint; + int whint_mode = F2FS_OPTION(sbi).whint_mode; err = check_direct_IO(inode, iter, offset); if (err) return err; - if (__force_buffered_io(inode, rw)) + if (f2fs_force_buffered_io(inode, rw)) return 0; trace_f2fs_direct_IO_enter(inode, offset, count, rw); - down_read(&F2FS_I(inode)->dio_rwsem[rw]); + if (rw == WRITE && whint_mode == WHINT_MODE_OFF) + iocb->ki_hint = WRITE_LIFE_NOT_SET; + + if (!down_read_trylock(&F2FS_I(inode)->dio_rwsem[rw])) { + if (iocb->ki_flags & IOCB_NOWAIT) { + iocb->ki_hint = hint; + err = -EAGAIN; + goto out; + } + down_read(&F2FS_I(inode)->dio_rwsem[rw]); + } + err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio); up_read(&F2FS_I(inode)->dio_rwsem[rw]); if (rw == WRITE) { + if (whint_mode == WHINT_MODE_OFF) + iocb->ki_hint = hint; if (err > 0) { f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, err); @@ -2315,6 +2357,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } } +out: trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); return err; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index f00b5ed8c011..fe661274ff10 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -94,14 +94,12 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, struct f2fs_dir_entry *de; struct f2fs_dentry_ptr d; - dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page); + dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page); make_dentry_ptr_block(NULL, &d, dentry_blk); de = find_target_dentry(fname, namehash, max_slots, &d); if (de) *res_page = dentry_page; - else - kunmap(dentry_page); return de; } @@ -287,7 +285,6 @@ ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, de = f2fs_find_entry(dir, qstr, page); if (de) { res = le32_to_cpu(de->ino); - f2fs_dentry_kunmap(dir, *page); f2fs_put_page(*page, 0); } @@ -302,7 +299,6 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, f2fs_wait_on_page_writeback(page, type, true); de->ino = cpu_to_le32(inode->i_ino); set_de_type(de, inode->i_mode); - f2fs_dentry_kunmap(dir, page); set_page_dirty(page); dir->i_mtime = dir->i_ctime = current_time(dir); @@ -350,13 +346,11 @@ static int make_empty_dir(struct inode *inode, if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); - dentry_blk = kmap_atomic(dentry_page); + dentry_blk = 
page_address(dentry_page); make_dentry_ptr_block(NULL, &d, dentry_blk); do_make_empty_dir(inode, parent, &d); - kunmap_atomic(dentry_blk); - set_page_dirty(dentry_page); f2fs_put_page(dentry_page, 1); return 0; @@ -367,6 +361,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, struct page *dpage) { struct page *page; + int dummy_encrypt = DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(dir)); int err; if (is_inode_flag_set(inode, FI_NEW_INODE)) { @@ -393,7 +388,8 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, if (err) goto put_error; - if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) { + if ((f2fs_encrypted_inode(dir) || dummy_encrypt) && + f2fs_may_encrypt(inode)) { err = fscrypt_inherit_context(dir, inode, page, false); if (err) goto put_error; @@ -402,8 +398,6 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, page = get_node_page(F2FS_I_SB(dir), inode->i_ino); if (IS_ERR(page)) return page; - - set_cold_node(inode, page); } if (new_name) { @@ -547,13 +541,12 @@ start: if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); - dentry_blk = kmap(dentry_page); + dentry_blk = page_address(dentry_page); bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, slots, NR_DENTRY_IN_BLOCK); if (bit_pos < NR_DENTRY_IN_BLOCK) goto add_dentry; - kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } @@ -588,7 +581,6 @@ fail: if (inode) up_write(&F2FS_I(inode)->i_sem); - kunmap(dentry_page); f2fs_put_page(dentry_page, 1); return err; @@ -642,7 +634,6 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, F2FS_I(dir)->task = NULL; } if (de) { - f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); err = -EEXIST; } else if (IS_ERR(page)) { @@ -713,7 +704,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); - add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); + if (F2FS_OPTION(F2FS_I_SB(dir)).fsync_mode == FSYNC_MODE_STRICT) + add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); if (f2fs_has_inline_dentry(dir)) return f2fs_delete_inline_entry(dentry, page, dir, inode); @@ -730,7 +722,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, NR_DENTRY_IN_BLOCK, 0); - kunmap(page); /* kunmap - pair of f2fs_find_entry */ set_page_dirty(page); dir->i_ctime = dir->i_mtime = current_time(dir); @@ -775,7 +766,7 @@ bool f2fs_empty_dir(struct inode *dir) return false; } - dentry_blk = kmap_atomic(dentry_page); + dentry_blk = page_address(dentry_page); if (bidx == 0) bit_pos = 2; else @@ -783,7 +774,6 @@ bool f2fs_empty_dir(struct inode *dir) bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, NR_DENTRY_IN_BLOCK, bit_pos); - kunmap_atomic(dentry_blk); f2fs_put_page(dentry_page, 1); @@ -901,19 +891,17 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) } } - dentry_blk = kmap(dentry_page); + dentry_blk = page_address(dentry_page); make_dentry_ptr_block(inode, &d, dentry_blk); err = f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr); if (err) { - kunmap(dentry_page); f2fs_put_page(dentry_page, 1); break; } - kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } out_free: diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index ff2352a0ed15..d5a861bf2b42 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -460,7 +460,7 @@ static struct extent_node *__insert_extent_tree(struct inode *inode, struct rb_node *insert_parent) { 
struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct rb_node **p = &et->root.rb_node; + struct rb_node **p; struct rb_node *parent = NULL; struct extent_node *en = NULL; @@ -706,6 +706,9 @@ void f2fs_drop_extent_tree(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et = F2FS_I(inode)->extent_tree; + if (!f2fs_may_extent_tree(inode)) + return; + set_inode_flag(inode, FI_NO_EXTENT); write_lock(&et->lock); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6300ac5bcbe4..1df7f10476d6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -98,9 +98,10 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000 #define F2FS_MOUNT_RESERVE_ROOT 0x01000000 -#define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) -#define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) -#define test_opt(sbi, option) ((sbi)->mount_opt.opt & F2FS_MOUNT_##option) +#define F2FS_OPTION(sbi) ((sbi)->mount_opt) +#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) +#define set_opt(sbi, option) (F2FS_OPTION(sbi).opt |= F2FS_MOUNT_##option) +#define test_opt(sbi, option) (F2FS_OPTION(sbi).opt & F2FS_MOUNT_##option) #define ver_after(a, b) (typecheck(unsigned long long, a) && \ typecheck(unsigned long long, b) && \ @@ -113,7 +114,26 @@ typedef u32 block_t; /* typedef u32 nid_t; struct f2fs_mount_info { - unsigned int opt; + unsigned int opt; + int write_io_size_bits; /* Write IO size bits */ + block_t root_reserved_blocks; /* root reserved blocks */ + kuid_t s_resuid; /* reserved blocks for uid */ + kgid_t s_resgid; /* reserved blocks for gid */ + int active_logs; /* # of active logs */ + int inline_xattr_size; /* inline xattr size */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct f2fs_fault_info fault_info; /* For fault injection */ +#endif +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char *s_qf_names[MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif + /* For which write hints are passed down to block layer */ + int whint_mode; + int alloc_mode; /* segment allocation policy */ + int fsync_mode; /* fsync policy */ + bool test_dummy_encryption; /* test dummy encryption */ }; #define F2FS_FEATURE_ENCRYPT 0x0001 @@ -125,6 +145,8 @@ struct f2fs_mount_info { #define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x0040 #define F2FS_FEATURE_QUOTA_INO 0x0080 #define F2FS_FEATURE_INODE_CRTIME 0x0100 +#define F2FS_FEATURE_LOST_FOUND 0x0200 +#define F2FS_FEATURE_VERITY 0x0400 /* reserved */ #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -450,7 +472,7 @@ static inline void make_dentry_ptr_block(struct inode *inode, d->inode = inode; d->max = NR_DENTRY_IN_BLOCK; d->nr_bitmap = SIZE_OF_DENTRY_BITMAP; - d->bitmap = &t->dentry_bitmap; + d->bitmap = t->dentry_bitmap; d->dentry = t->dentry; d->filename = t->filename; } @@ -576,6 +598,8 @@ enum { #define FADVISE_ENCRYPT_BIT 0x04 #define FADVISE_ENC_NAME_BIT 0x08 #define FADVISE_KEEP_SIZE_BIT 0x10 +#define FADVISE_HOT_BIT 0x20 +#define FADVISE_VERITY_BIT 0x40 /* reserved */ #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) #define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) @@ -590,6 +614,9 @@ enum { #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) #define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT) #define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT) +#define file_is_hot(inode) is_file(inode, 
FADVISE_HOT_BIT) +#define file_set_hot(inode) set_file(inode, FADVISE_HOT_BIT) +#define file_clear_hot(inode) clear_file(inode, FADVISE_HOT_BIT) #define DEF_DIR_LEVEL 0 @@ -637,6 +664,7 @@ struct f2fs_inode_info { kprojid_t i_projid; /* id for project quota */ int i_inline_xattr_size; /* inline xattr size */ struct timespec i_crtime; /* inode creation time */ + struct timespec i_disk_time[4]; /* inode disk times */ }; static inline void get_extent_info(struct extent_info *ext, @@ -743,7 +771,7 @@ struct f2fs_nm_info { unsigned int nid_cnt[MAX_NID_STATE]; /* the number of free node id */ spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ - unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; + unsigned char **free_nid_bitmap; unsigned char *nat_block_bitmap; unsigned short *free_nid_count; /* free nid count of NAT block */ @@ -976,6 +1004,7 @@ struct f2fs_io_info { bool submitted; /* indicate IO submission */ int need_lock; /* indicate we need to lock cp_rwsem */ bool in_list; /* indicate fio is in io_list */ + bool is_meta; /* indicate borrow meta inode mapping or not */ enum iostat_type io_type; /* io type */ struct writeback_control *io_wbc; /* writeback control */ }; @@ -1037,10 +1066,34 @@ enum { MAX_TIME, }; +enum { + WHINT_MODE_OFF, /* not pass down write hints */ + WHINT_MODE_USER, /* try to pass down hints given by users */ + WHINT_MODE_FS, /* pass down hints with F2FS policy */ +}; + +enum { + ALLOC_MODE_DEFAULT, /* stay default */ + ALLOC_MODE_REUSE, /* reuse segments as much as possible */ +}; + +enum fsync_mode { + FSYNC_MODE_POSIX, /* fsync follows posix semantics */ + FSYNC_MODE_STRICT, /* fsync behaves in line with ext4 */ +}; + +#ifdef CONFIG_F2FS_FS_ENCRYPTION +#define DUMMY_ENCRYPTION_ENABLED(sbi) \ + (unlikely(F2FS_OPTION(sbi).test_dummy_encryption)) +#else +#define DUMMY_ENCRYPTION_ENABLED(sbi) (0) +#endif + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ struct f2fs_super_block *raw_super; /* raw super block pointer */ + struct rw_semaphore sb_lock; /* lock for raw super block */ int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ @@ -1060,7 +1113,6 @@ struct f2fs_sb_info { struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ struct mutex wio_mutex[NR_PAGE_TYPE - 1][NR_TEMP_TYPE]; /* bio ordering for NODE/DATA */ - int write_io_size_bits; /* Write IO size bits */ mempool_t *write_io_dummy; /* Dummy pages */ /* for checkpoint */ @@ -1110,9 +1162,7 @@ struct f2fs_sb_info { unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ loff_t max_file_blocks; /* max block index of file */ - int active_logs; /* # of active logs */ int dir_level; /* directory level */ - int inline_xattr_size; /* inline xattr size */ unsigned int trigger_ssr_threshold; /* threshold to trigger ssr */ int readdir_ra; /* readahead inode in readdir */ @@ -1122,9 +1172,6 @@ struct f2fs_sb_info { block_t last_valid_block_count; /* for recovery */ block_t reserved_blocks; /* configurable reserved blocks */ block_t current_reserved_blocks; /* current reserved blocks */ - block_t root_reserved_blocks; /* root reserved blocks */ - kuid_t s_resuid; /* reserved blocks for uid */ - kgid_t s_resgid; /* reserved blocks for gid */ unsigned int nquota_files; /* # of quota sysfile */ @@ -1209,17 +1256,6 @@ struct f2fs_sb_info { /* Precomputed FS UUID 
checksum for seeding other checksums */ __u32 s_chksum_seed; - - /* For fault injection */ -#ifdef CONFIG_F2FS_FAULT_INJECTION - struct f2fs_fault_info fault_info; -#endif - -#ifdef CONFIG_QUOTA - /* Names of quota files with journalled quota */ - char *s_qf_names[MAXQUOTAS]; - int s_jquota_fmt; /* Format of quota to use */ -#endif }; #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -1229,7 +1265,7 @@ struct f2fs_sb_info { __func__, __builtin_return_address(0)) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { - struct f2fs_fault_info *ffi = &sbi->fault_info; + struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; if (!ffi->inject_rate) return false; @@ -1586,12 +1622,12 @@ static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, return false; if (IS_NOQUOTA(inode)) return true; - if (capable(CAP_SYS_RESOURCE)) + if (uid_eq(F2FS_OPTION(sbi).s_resuid, current_fsuid())) return true; - if (uid_eq(sbi->s_resuid, current_fsuid())) + if (!gid_eq(F2FS_OPTION(sbi).s_resgid, GLOBAL_ROOT_GID) && + in_group_p(F2FS_OPTION(sbi).s_resgid)) return true; - if (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && - in_group_p(sbi->s_resgid)) + if (capable(CAP_SYS_RESOURCE)) return true; return false; } @@ -1627,7 +1663,7 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, sbi->current_reserved_blocks; if (!__allow_reserved_blocks(sbi, inode)) - avail_user_block_count -= sbi->root_reserved_blocks; + avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { diff = sbi->total_valid_block_count - avail_user_block_count; @@ -1762,6 +1798,12 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); int offset; + if (is_set_ckpt_flags(sbi, CP_LARGE_NAT_BITMAP_FLAG)) { + offset = (flag == SIT_BITMAP) ? 
+ le32_to_cpu(ckpt->nat_ver_bitmap_bytesize) : 0; + return &ckpt->sit_nat_version_bitmap + offset; + } + if (__cp_payload(sbi) > 0) { if (flag == NAT_BITMAP) return &ckpt->sit_nat_version_bitmap; @@ -1828,7 +1870,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, sbi->current_reserved_blocks + 1; if (!__allow_reserved_blocks(sbi, inode)) - valid_block_count += sbi->root_reserved_blocks; + valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks; if (unlikely(valid_block_count > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); @@ -2399,12 +2441,6 @@ static inline int f2fs_has_inline_dentry(struct inode *inode) return is_inode_flag_set(inode, FI_INLINE_DENTRY); } -static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page) -{ - if (!f2fs_has_inline_dentry(dir)) - kunmap(page); -} - static inline int is_file(struct inode *inode, int type) { return F2FS_I(inode)->i_advise & type; @@ -2436,7 +2472,17 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) } if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || file_keep_isize(inode) || - i_size_read(inode) & PAGE_MASK) + i_size_read(inode) & ~PAGE_MASK) + return false; + + if (!timespec_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime)) + return false; + if (!timespec_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime)) + return false; + if (!timespec_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) + return false; + if (!timespec_equal(F2FS_I(inode)->i_disk_time + 3, + &F2FS_I(inode)->i_crtime)) return false; down_read(&F2FS_I(inode)->i_sem); @@ -2446,9 +2492,9 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) return ret; } -static inline int f2fs_readonly(struct super_block *sb) +static inline bool f2fs_readonly(struct super_block *sb) { - return sb->s_flags & SB_RDONLY; + return sb_rdonly(sb); } static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) @@ -2596,6 +2642,8 @@ void handle_failed_inode(struct inode *inode); /* * namei.c */ +int update_extension_list(struct f2fs_sb_info *sbi, const char *name, + bool hot, bool set); struct dentry *f2fs_get_parent(struct dentry *child); /* @@ -2768,6 +2816,8 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi); int __init create_segment_manager_caches(void); void destroy_segment_manager_caches(void); int rw_hint_to_seg_type(enum rw_hint hint); +enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, enum page_type type, + enum temp_type temp); /* * checkpoint.c @@ -2850,6 +2900,7 @@ int f2fs_release_page(struct page *page, gfp_t wait); int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode); #endif +bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); /* * gc.c @@ -3172,45 +3223,21 @@ static inline bool f2fs_bio_encrypted(struct bio *bio) return bio->bi_private != NULL; } -static inline int f2fs_sb_has_crypto(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); -} - -static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); -} - -static inline int f2fs_sb_has_extra_attr(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_EXTRA_ATTR); -} - -static inline int f2fs_sb_has_project_quota(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_PRJQUOTA); -} - -static inline int f2fs_sb_has_inode_chksum(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM); -} - -static 
inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_FLEXIBLE_INLINE_XATTR); -} - -static inline int f2fs_sb_has_quota_ino(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_QUOTA_INO); +#define F2FS_FEATURE_FUNCS(name, flagname) \ +static inline int f2fs_sb_has_##name(struct super_block *sb) \ +{ \ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_##flagname); \ } -static inline int f2fs_sb_has_inode_crtime(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CRTIME); -} +F2FS_FEATURE_FUNCS(encrypt, ENCRYPT); +F2FS_FEATURE_FUNCS(blkzoned, BLKZONED); +F2FS_FEATURE_FUNCS(extra_attr, EXTRA_ATTR); +F2FS_FEATURE_FUNCS(project_quota, PRJQUOTA); +F2FS_FEATURE_FUNCS(inode_chksum, INODE_CHKSUM); +F2FS_FEATURE_FUNCS(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR); +F2FS_FEATURE_FUNCS(quota_ino, QUOTA_INO); +F2FS_FEATURE_FUNCS(inode_crtime, INODE_CRTIME); +F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND); #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, @@ -3230,7 +3257,7 @@ static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) { struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); - return blk_queue_discard(q) || f2fs_sb_mounted_blkzoned(sbi->sb); + return blk_queue_discard(q) || f2fs_sb_has_blkzoned(sbi->sb); } static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) @@ -3259,4 +3286,11 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #endif } +static inline bool f2fs_force_buffered_io(struct inode *inode, int rw) +{ + return (f2fs_encrypted_file(inode) || + (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || + F2FS_I_SB(inode)->s_ndevs); +} + #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 672a542e5464..6b94f19b3fa8 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -163,9 +163,10 @@ static inline enum cp_reason_type need_do_checkpoint(struct inode *inode) cp_reason = CP_NODE_NEED_CP; else if (test_opt(sbi, FASTBOOT)) cp_reason = CP_FASTBOOT_MODE; - else if (sbi->active_logs == 2) + else if (F2FS_OPTION(sbi).active_logs == 2) cp_reason = CP_SPEC_LOG_NUM; - else if (need_dentry_mark(sbi, inode->i_ino) && + else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT && + need_dentry_mark(sbi, inode->i_ino) && exist_written_data(sbi, F2FS_I(inode)->i_pino, TRANS_DIR_INO)) cp_reason = CP_RECOVER_DIR; @@ -479,6 +480,9 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) if (err) return err; + + filp->f_mode |= FMODE_NOWAIT; + return dquot_file_open(inode, filp); } @@ -569,7 +573,6 @@ truncate_out: int truncate_blocks(struct inode *inode, u64 from, bool lock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - unsigned int blocksize = inode->i_sb->s_blocksize; struct dnode_of_data dn; pgoff_t free_from; int count = 0, err = 0; @@ -578,7 +581,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) trace_f2fs_truncate_blocks_enter(inode, from); - free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1); + free_from = (pgoff_t)F2FS_BLK_ALIGN(from); if (free_from >= sbi->max_file_blocks) goto free_partial; @@ -1348,8 +1351,12 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, } out: - if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) - f2fs_i_size_write(inode, new_size); + if (new_size > i_size_read(inode)) { + if (mode & FALLOC_FL_KEEP_SIZE) + file_set_keep_isize(inode); + else + f2fs_i_size_write(inode, new_size); + } out_sem: 
up_write(&F2FS_I(inode)->i_mmap_sem); @@ -1711,6 +1718,8 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) inode_lock(inode); + down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + if (f2fs_is_volatile_file(inode)) goto err_out; @@ -1729,6 +1738,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } err_out: + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1938,7 +1948,7 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); - if (!f2fs_sb_has_crypto(inode->i_sb)) + if (!f2fs_sb_has_encrypt(inode->i_sb)) return -EOPNOTSUPP; f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); @@ -1948,7 +1958,7 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { - if (!f2fs_sb_has_crypto(file_inode(filp)->i_sb)) + if (!f2fs_sb_has_encrypt(file_inode(filp)->i_sb)) return -EOPNOTSUPP; return fscrypt_ioctl_get_policy(filp, (void __user *)arg); } @@ -1959,16 +1969,18 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int err; - if (!f2fs_sb_has_crypto(inode->i_sb)) + if (!f2fs_sb_has_encrypt(inode->i_sb)) return -EOPNOTSUPP; - if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt)) - goto got_it; - err = mnt_want_write_file(filp); if (err) return err; + down_write(&sbi->sb_lock); + + if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt)) + goto got_it; + /* update superblock with uuid */ generate_random_uuid(sbi->raw_super->encrypt_pw_salt); @@ -1976,15 +1988,16 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) if (err) { /* undo new data */ memset(sbi->raw_super->encrypt_pw_salt, 0, 16); - mnt_drop_write_file(filp); - return err; + goto out_err; } - mnt_drop_write_file(filp); got_it: if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt, 16)) - return -EFAULT; - return 0; + err = -EFAULT; +out_err: + up_write(&sbi->sb_lock); + mnt_drop_write_file(filp); + return err; } static int f2fs_ioc_gc(struct file *filp, unsigned long arg) @@ -2045,8 +2058,10 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) return ret; end = range.start + range.len; - if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) - return -EINVAL; + if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) { + ret = -EINVAL; + goto out; + } do_more: if (!range.sync) { if (!mutex_trylock(&sbi->gc_mutex)) { @@ -2885,25 +2900,54 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; - inode_lock(inode); + if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + return -EINVAL; + + if (!inode_trylock(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock(inode); + } + ret = generic_write_checks(iocb, from); if (ret > 0) { + bool preallocated = false; + size_t target_size = 0; int err; if (iov_iter_fault_in_readable(from, iov_iter_count(from))) set_inode_flag(inode, FI_NO_PREALLOC); - err = f2fs_preallocate_blocks(iocb, from); - if (err) { - clear_inode_flag(inode, FI_NO_PREALLOC); - inode_unlock(inode); - return err; + if ((iocb->ki_flags & IOCB_NOWAIT) && + (iocb->ki_flags & IOCB_DIRECT)) { + if (!f2fs_overwrite_io(inode, iocb->ki_pos, + iov_iter_count(from)) || + 
f2fs_has_inline_data(inode) || + f2fs_force_buffered_io(inode, WRITE)) { + inode_unlock(inode); + return -EAGAIN; + } + + } else { + preallocated = true; + target_size = iocb->ki_pos + iov_iter_count(from); + + err = f2fs_preallocate_blocks(iocb, from); + if (err) { + clear_inode_flag(inode, FI_NO_PREALLOC); + inode_unlock(inode); + return err; + } } blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); blk_finish_plug(&plug); clear_inode_flag(inode, FI_NO_PREALLOC); + /* if we couldn't write data, we should deallocate blocks. */ + if (preallocated && i_size_read(inode) < target_size) + f2fs_truncate(inode); + if (ret > 0) f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index aa720cc44509..bfb7a4a3a929 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -76,14 +76,15 @@ static int gc_thread_func(void *data) * invalidated soon after by user update or deletion. * So, I'd like to wait some time to collect dirty segments. */ - if (!mutex_trylock(&sbi->gc_mutex)) - goto next; - if (gc_th->gc_urgent) { wait_ms = gc_th->urgent_sleep_time; + mutex_lock(&sbi->gc_mutex); goto do_gc; } + if (!mutex_trylock(&sbi->gc_mutex)) + goto next; + if (!is_idle(sbi)) { increase_sleep_time(gc_th, &wait_ms); mutex_unlock(&sbi->gc_mutex); @@ -161,12 +162,17 @@ static int select_gc_type(struct f2fs_gc_kthread *gc_th, int gc_type) { int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY; - if (gc_th && gc_th->gc_idle) { + if (!gc_th) + return gc_mode; + + if (gc_th->gc_idle) { if (gc_th->gc_idle == 1) gc_mode = GC_CB; else if (gc_th->gc_idle == 2) gc_mode = GC_GREEDY; } + if (gc_th->gc_urgent) + gc_mode = GC_GREEDY; return gc_mode; } @@ -188,11 +194,14 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, } /* we need to check every dirty segments in the FG_GC case */ - if (gc_type != FG_GC && p->max_search > sbi->max_victim_search) + if (gc_type != FG_GC && + (sbi->gc_thread && !sbi->gc_thread->gc_urgent) && + p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; - /* let's select beginning hot/small space first */ - if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + /* let's select beginning hot/small space first in no_heap mode*/ + if (test_opt(sbi, NOHEAP) && + (type == CURSEG_HOT_DATA || IS_NODESEG(type))) p->offset = 0; else p->offset = SIT_I(sbi)->last_victim[p->gc_mode]; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 90e38d8ea688..3b77d6421218 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -369,7 +369,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, f2fs_wait_on_page_writeback(page, DATA, true); zero_user_segment(page, MAX_INLINE_DATA(dir), PAGE_SIZE); - dentry_blk = kmap_atomic(page); + dentry_blk = page_address(page); make_dentry_ptr_inline(dir, &src, inline_dentry); make_dentry_ptr_block(dir, &dst, dentry_blk); @@ -386,7 +386,6 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max); memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN); - kunmap_atomic(dentry_blk); if (!PageUptodate(page)) SetPageUptodate(page); set_page_dirty(page); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 205add3d0f3a..e0d9e8f27ed2 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -284,6 +284,10 @@ static int do_read_inode(struct inode *inode) fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec); } + F2FS_I(inode)->i_disk_time[0] = inode->i_atime; + F2FS_I(inode)->i_disk_time[1] = 
inode->i_ctime; + F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; + F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -328,7 +332,7 @@ make_now: inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); + inode_nohighmem(inode); } else if (S_ISLNK(inode->i_mode)) { if (f2fs_encrypted_inode(inode)) inode->i_op = &f2fs_encrypted_symlink_inode_operations; @@ -439,12 +443,15 @@ void update_inode(struct inode *inode, struct page *node_page) } __set_inode_rdev(inode, ri); - set_cold_node(inode, node_page); /* deleted inode */ if (inode->i_nlink == 0) clear_inline_node(node_page); + F2FS_I(inode)->i_disk_time[0] = inode->i_atime; + F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; + F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; + F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; } void update_inode_page(struct inode *inode) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index b68e7b03959f..d5098efe577c 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -78,7 +78,8 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) set_inode_flag(inode, FI_NEW_INODE); /* If the directory encrypted, then we should encrypt the inode. */ - if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) + if ((f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) && + f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); if (f2fs_sb_has_extra_attr(sbi->sb)) { @@ -97,7 +98,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); if (f2fs_has_inline_xattr(inode)) - xattr_size = sbi->inline_xattr_size; + xattr_size = F2FS_OPTION(sbi).inline_xattr_size; /* Otherwise, will be 0 */ } else if (f2fs_has_inline_xattr(inode) || f2fs_has_inline_dentry(inode)) { @@ -142,7 +143,7 @@ fail_drop: return ERR_PTR(err); } -static int is_multimedia_file(const unsigned char *s, const char *sub) +static int is_extension_exist(const unsigned char *s, const char *sub) { size_t slen = strlen(s); size_t sublen = strlen(sub); @@ -168,19 +169,94 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) /* * Set multimedia files as cold files for hot/cold data separation */ -static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode, +static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, const unsigned char *name) { - int i; - __u8 (*extlist)[8] = sbi->raw_super->extension_list; + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int i, cold_count, hot_count; + + down_read(&sbi->sb_lock); + + cold_count = le32_to_cpu(sbi->raw_super->extension_count); + hot_count = sbi->raw_super->hot_ext_count; - int count = le32_to_cpu(sbi->raw_super->extension_count); - for (i = 0; i < count; i++) { - if (is_multimedia_file(name, extlist[i])) { + for (i = 0; i < cold_count + hot_count; i++) { + if (!is_extension_exist(name, extlist[i])) + continue; + if (i < cold_count) file_set_cold(inode); - break; - } + else + file_set_hot(inode); + break; } + + up_read(&sbi->sb_lock); +} + +int update_extension_list(struct f2fs_sb_info *sbi, const char *name, + bool hot, bool set) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int cold_count = le32_to_cpu(sbi->raw_super->extension_count); + int hot_count = 
sbi->raw_super->hot_ext_count; + int total_count = cold_count + hot_count; + int start, count; + int i; + + if (set) { + if (total_count == F2FS_MAX_EXTENSION) + return -EINVAL; + } else { + if (!hot && !cold_count) + return -EINVAL; + if (hot && !hot_count) + return -EINVAL; + } + + if (hot) { + start = cold_count; + count = total_count; + } else { + start = 0; + count = cold_count; + } + + for (i = start; i < count; i++) { + if (strcmp(name, extlist[i])) + continue; + + if (set) + return -EINVAL; + + memcpy(extlist[i], extlist[i + 1], + F2FS_EXTENSION_LEN * (total_count - i - 1)); + memset(extlist[total_count - 1], 0, F2FS_EXTENSION_LEN); + if (hot) + sbi->raw_super->hot_ext_count = hot_count - 1; + else + sbi->raw_super->extension_count = + cpu_to_le32(cold_count - 1); + return 0; + } + + if (!set) + return -EINVAL; + + if (hot) { + strncpy(extlist[count], name, strlen(name)); + sbi->raw_super->hot_ext_count = hot_count + 1; + } else { + char buf[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN]; + + memcpy(buf, &extlist[cold_count], + F2FS_EXTENSION_LEN * hot_count); + memset(extlist[cold_count], 0, F2FS_EXTENSION_LEN); + strncpy(extlist[cold_count], name, strlen(name)); + memcpy(&extlist[cold_count + 1], buf, + F2FS_EXTENSION_LEN * hot_count); + sbi->raw_super->extension_count = cpu_to_le32(cold_count + 1); + } + return 0; } static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, @@ -203,7 +279,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, return PTR_ERR(inode); if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) - set_cold_files(sbi, inode, dentry->d_name.name); + set_file_temperature(sbi, inode, dentry->d_name.name); inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; @@ -317,7 +393,6 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) de = f2fs_find_entry(dir, &dot, &page); if (de) { - f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); } else if (IS_ERR(page)) { err = PTR_ERR(page); @@ -329,14 +404,12 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) } de = f2fs_find_entry(dir, &dotdot, &page); - if (de) { - f2fs_dentry_kunmap(dir, page); + if (de) f2fs_put_page(page, 0); - } else if (IS_ERR(page)) { + else if (IS_ERR(page)) err = PTR_ERR(page); - } else { + else err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR); - } out: if (!err) clear_inode_flag(dir, FI_INLINE_DOTS); @@ -377,7 +450,6 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, } ino = le32_to_cpu(de->ino); - f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); inode = f2fs_iget(dir->i_sb, ino); @@ -452,7 +524,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) err = acquire_orphan_inode(sbi); if (err) { f2fs_unlock_op(sbi); - f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); goto fail; } @@ -579,7 +650,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); + inode_nohighmem(inode); set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); @@ -717,10 +788,12 @@ out: static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { - if (unlikely(f2fs_cp_error(F2FS_I_SB(dir)))) + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + + if (unlikely(f2fs_cp_error(sbi))) return -EIO; - if (f2fs_encrypted_inode(dir)) { + if 
(f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) { int err = fscrypt_get_encryption_info(dir); if (err) return err; @@ -893,16 +966,15 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } if (old_dir_entry) { - if (old_dir != new_dir && !whiteout) { + if (old_dir != new_dir && !whiteout) f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); - } else { - f2fs_dentry_kunmap(old_inode, old_dir_page); + else f2fs_put_page(old_dir_page, 0); - } f2fs_i_links_write(old_dir, false); } - add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) + add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); f2fs_unlock_op(sbi); @@ -912,20 +984,15 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, put_out_dir: f2fs_unlock_op(sbi); - if (new_page) { - f2fs_dentry_kunmap(new_dir, new_page); + if (new_page) f2fs_put_page(new_page, 0); - } out_whiteout: if (whiteout) iput(whiteout); out_dir: - if (old_dir_entry) { - f2fs_dentry_kunmap(old_inode, old_dir_page); + if (old_dir_entry) f2fs_put_page(old_dir_page, 0); - } out_old: - f2fs_dentry_kunmap(old_dir, old_page); f2fs_put_page(old_page, 0); out: return err; @@ -1057,8 +1124,10 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_mark_inode_dirty_sync(new_dir, false); - add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO); - add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) { + add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO); + add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + } f2fs_unlock_op(sbi); @@ -1067,19 +1136,15 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, return 0; out_new_dir: if (new_dir_entry) { - f2fs_dentry_kunmap(new_inode, new_dir_page); f2fs_put_page(new_dir_page, 0); } out_old_dir: if (old_dir_entry) { - f2fs_dentry_kunmap(old_inode, old_dir_page); f2fs_put_page(old_dir_page, 0); } out_new: - f2fs_dentry_kunmap(new_dir, new_page); f2fs_put_page(new_page, 0); out_old: - f2fs_dentry_kunmap(old_dir, old_page); f2fs_put_page(old_page, 0); out: return err; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 177c438e4a56..9a99243054ba 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -193,8 +193,8 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) __free_nat_entry(e); } -static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, - struct nat_entry *ne) +static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i, + struct nat_entry *ne) { nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); struct nat_entry_set *head; @@ -209,15 +209,36 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, head->entry_cnt = 0; f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head); } + return head; +} + +static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, + struct nat_entry *ne) +{ + struct nat_entry_set *head; + bool new_ne = nat_get_blkaddr(ne) == NEW_ADDR; + + if (!new_ne) + head = __grab_nat_entry_set(nm_i, ne); + + /* + * update entry_cnt in below condition: + * 1. update NEW_ADDR to valid block address; + * 2. 
update old block address to new one; + */ + if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) || + !get_nat_flag(ne, IS_DIRTY))) + head->entry_cnt++; + + set_nat_flag(ne, IS_PREALLOC, new_ne); if (get_nat_flag(ne, IS_DIRTY)) goto refresh_list; nm_i->dirty_nat_cnt++; - head->entry_cnt++; set_nat_flag(ne, IS_DIRTY, true); refresh_list: - if (nat_get_blkaddr(ne) == NEW_ADDR) + if (new_ne) list_del_init(&ne->list); else list_move_tail(&ne->list, &head->entry_list); @@ -1076,7 +1097,7 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) f2fs_wait_on_page_writeback(page, NODE, true); fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); - set_cold_node(dn->inode, page); + set_cold_node(page, S_ISDIR(dn->inode->i_mode)); if (!PageUptodate(page)) SetPageUptodate(page); if (set_page_dirty(page)) @@ -2291,6 +2312,7 @@ retry: if (!PageUptodate(ipage)) SetPageUptodate(ipage); fill_node_footer(ipage, ino, ino, 0, true); + set_cold_node(page, false); src = F2FS_INODE(page); dst = F2FS_INODE(ipage); @@ -2580,8 +2602,7 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) if (!enabled_nat_bits(sbi, NULL)) return 0; - nm_i->nat_bits_blocks = F2FS_BYTES_TO_BLK((nat_bits_bytes << 1) + 8 + - F2FS_BLKSIZE - 1); + nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); nm_i->nat_bits = f2fs_kzalloc(sbi, nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL); if (!nm_i->nat_bits) @@ -2707,12 +2728,20 @@ static int init_node_manager(struct f2fs_sb_info *sbi) static int init_free_nid_cache(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); + int i; - nm_i->free_nid_bitmap = f2fs_kvzalloc(sbi, nm_i->nat_blocks * - NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL); + nm_i->free_nid_bitmap = f2fs_kzalloc(sbi, nm_i->nat_blocks * + sizeof(unsigned char *), GFP_KERNEL); if (!nm_i->free_nid_bitmap) return -ENOMEM; + for (i = 0; i < nm_i->nat_blocks; i++) { + nm_i->free_nid_bitmap[i] = f2fs_kvzalloc(sbi, + NAT_ENTRY_BITMAP_SIZE_ALIGNED, GFP_KERNEL); + if (!nm_i->free_nid_bitmap[i]) + return -ENOMEM; + } + nm_i->nat_block_bitmap = f2fs_kvzalloc(sbi, nm_i->nat_blocks / 8, GFP_KERNEL); if (!nm_i->nat_block_bitmap) @@ -2803,7 +2832,13 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) up_write(&nm_i->nat_tree_lock); kvfree(nm_i->nat_block_bitmap); - kvfree(nm_i->free_nid_bitmap); + if (nm_i->free_nid_bitmap) { + int i; + + for (i = 0; i < nm_i->nat_blocks; i++) + kvfree(nm_i->free_nid_bitmap[i]); + kfree(nm_i->free_nid_bitmap); + } kvfree(nm_i->free_nid_count); kfree(nm_i->nat_bitmap); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 081ef0d672bf..b95e49e4a928 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -44,6 +44,7 @@ enum { HAS_FSYNCED_INODE, /* is the inode fsynced before? */ HAS_LAST_FSYNC, /* has the latest node fsync mark? */ IS_DIRTY, /* this nat entry is dirty?
*/ + IS_PREALLOC, /* nat entry is preallocated */ }; /* @@ -422,12 +423,12 @@ static inline void clear_inline_node(struct page *page) ClearPageChecked(page); } -static inline void set_cold_node(struct inode *inode, struct page *page) +static inline void set_cold_node(struct page *page, bool is_dir) { struct f2fs_node *rn = F2FS_NODE(page); unsigned int flag = le32_to_cpu(rn->footer.flag); - if (S_ISDIR(inode->i_mode)) + if (is_dir) flag &= ~(0x1 << COLD_BIT_SHIFT); else flag |= (0x1 << COLD_BIT_SHIFT); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 337f3363f48f..1b23d3febe4c 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -144,7 +144,7 @@ static int recover_dentry(struct inode *inode, struct page *ipage, retry: de = __f2fs_find_entry(dir, &fname, &page); if (de && inode->i_ino == le32_to_cpu(de->ino)) - goto out_unmap_put; + goto out_put; if (de) { einode = f2fs_iget_retry(inode->i_sb, le32_to_cpu(de->ino)); @@ -153,19 +153,19 @@ retry: err = PTR_ERR(einode); if (err == -ENOENT) err = -EEXIST; - goto out_unmap_put; + goto out_put; } err = dquot_initialize(einode); if (err) { iput(einode); - goto out_unmap_put; + goto out_put; } err = acquire_orphan_inode(F2FS_I_SB(inode)); if (err) { iput(einode); - goto out_unmap_put; + goto out_put; } f2fs_delete_entry(de, page, dir, einode); iput(einode); @@ -180,8 +180,7 @@ retry: goto retry; goto out; -out_unmap_put: - f2fs_dentry_kunmap(dir, page); +out_put: f2fs_put_page(page, 0); out: if (file_enc_name(inode)) @@ -243,6 +242,9 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, struct curseg_info *curseg; struct page *page = NULL; block_t blkaddr; + unsigned int loop_cnt = 0; + unsigned int free_blocks = sbi->user_block_count - + valid_user_blocks(sbi); int err = 0; /* get node pages in the current segment */ @@ -295,6 +297,17 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, if (IS_INODE(page) && is_dent_dnode(page)) entry->last_dentry = blkaddr; next: + /* sanity check in order to detect looped node chain */ + if (++loop_cnt >= free_blocks || + blkaddr == next_blkaddr_of_node(page)) { + f2fs_msg(sbi->sb, KERN_NOTICE, + "%s: detect looped node chain, " + "blkaddr:%u, next:%u", + __func__, blkaddr, next_blkaddr_of_node(page)); + err = -EINVAL; + break; + } + /* check next segment */ blkaddr = next_blkaddr_of_node(page); f2fs_put_page(page, 1); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b16a8e6625aa..5854cc4e1d67 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1411,12 +1411,11 @@ static int issue_discard_thread(void *data) if (kthread_should_stop()) return 0; - if (dcc->discard_wake) { + if (dcc->discard_wake) dcc->discard_wake = 0; - if (sbi->gc_thread && sbi->gc_thread->gc_urgent) - init_discard_policy(&dpolicy, - DPOLICY_FORCE, 1); - } + + if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + init_discard_policy(&dpolicy, DPOLICY_FORCE, 1); sb_start_intwrite(sbi->sb); @@ -1485,7 +1484,7 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { #ifdef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_mounted_blkzoned(sbi->sb) && + if (f2fs_sb_has_blkzoned(sbi->sb) && bdev_zoned_model(bdev) != BLK_ZONED_NONE) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif @@ -1683,7 +1682,7 @@ find_next: sbi->blocks_per_seg, cur_pos); len = next_pos - cur_pos; - if (f2fs_sb_mounted_blkzoned(sbi->sb) || + if (f2fs_sb_has_blkzoned(sbi->sb) || (force && len < cpc->trim_minlen)) goto skip; 
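The recovery change above guards find_fsync_dnodes() against a corrupted, looped node chain by bounding the walk rather than running a classic cycle detector: a legitimate fsync chain can never contain more nodes than the free blocks that were available to write it, so exceeding that count (or a node whose next pointer refers back to itself) can only mean corruption. Below is a minimal userspace sketch of the same bounded-walk idea; struct node, walk_chain and free_blocks are illustrative stand-ins, not f2fs API.

#include <stdbool.h>
#include <stdio.h>

struct node {
	unsigned int blkaddr;
	struct node *next;	/* stands in for next_blkaddr_of_node() */
};

/* Walk the chain; fail once we visit more nodes than could possibly exist. */
static bool walk_chain(const struct node *head, unsigned int free_blocks)
{
	unsigned int loop_cnt = 0;
	const struct node *cur;

	for (cur = head; cur; cur = cur->next) {
		/* ... process one recovery candidate here ... */

		/* sanity check in order to detect looped node chain */
		if (++loop_cnt >= free_blocks || cur->next == cur)
			return false;	/* f2fs returns -EINVAL here */
	}
	return true;
}

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };

	printf("straight chain ok: %d\n", walk_chain(&a, 100));	/* 1 */
	c.next = &b;	/* corrupt the chain: a -> b -> c -> b -> ... */
	printf("looped chain ok: %d\n", walk_chain(&a, 100));	/* 0 */
	return 0;
}

Compared with a tortoise-and-hare detector, the counter needs no extra traversal state, and the bound falls out of a filesystem invariant that is already known during recovery.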
@@ -1727,7 +1726,7 @@ void init_discard_policy(struct discard_policy *dpolicy, } else if (discard_type == DPOLICY_FORCE) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; - dpolicy->io_aware = true; + dpolicy->io_aware = false; } else if (discard_type == DPOLICY_FSTRIM) { dpolicy->io_aware = false; } else if (discard_type == DPOLICY_UMOUNT) { @@ -1863,7 +1862,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) sbi->discard_blks--; /* don't overwrite by SSR to keep node chain */ - if (se->type == CURSEG_WARM_NODE) { + if (IS_NODESEG(se->type)) { if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) se->ckpt_valid_blocks++; } @@ -2164,11 +2163,17 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) if (sbi->segs_per_sec != 1) return CURSEG_I(sbi, type)->segno; - if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + if (test_opt(sbi, NOHEAP) && + (type == CURSEG_HOT_DATA || IS_NODESEG(type))) return 0; if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) return SIT_I(sbi)->last_victim[ALLOC_NEXT]; + + /* find segments from 0 to reuse freed segments */ + if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) + return 0; + return CURSEG_I(sbi, type)->segno; } @@ -2455,6 +2460,101 @@ int rw_hint_to_seg_type(enum rw_hint hint) } } +/* This returns write hints for each segment type. These hints will be + * passed down to the block layer. There are mapping tables which depend on + * the mount option 'whint_mode'. + * + * 1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET. + * + * 2) whint_mode=user-based. F2FS tries to pass down hints given by users. + * + * User F2FS Block + * ---- ---- ----- + * META WRITE_LIFE_NOT_SET + * HOT_NODE " + * WARM_NODE " + * COLD_NODE " + * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME + * extension list " " + * + * -- buffered io + * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME + * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT + * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET + * WRITE_LIFE_NONE " " + * WRITE_LIFE_MEDIUM " " + * WRITE_LIFE_LONG " " + * + * -- direct io + * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME + * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT + * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET + * WRITE_LIFE_NONE " WRITE_LIFE_NONE + * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM + * WRITE_LIFE_LONG " WRITE_LIFE_LONG + * + * 3) whint_mode=fs-based. F2FS passes down hints with its policy.
+ * + * User F2FS Block + * ---- ---- ----- + * META WRITE_LIFE_MEDIUM; + * HOT_NODE WRITE_LIFE_NOT_SET + * WARM_NODE " + * COLD_NODE WRITE_LIFE_NONE + * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME + * extension list " " + * + * -- buffered io + * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME + * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT + * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG + * WRITE_LIFE_NONE " " + * WRITE_LIFE_MEDIUM " " + * WRITE_LIFE_LONG " " + * + * -- direct io + * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME + * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT + * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET + * WRITE_LIFE_NONE " WRITE_LIFE_NONE + * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM + * WRITE_LIFE_LONG " WRITE_LIFE_LONG + */ + +enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, + enum page_type type, enum temp_type temp) +{ + if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) { + if (type == DATA) { + if (temp == WARM) + return WRITE_LIFE_NOT_SET; + else if (temp == HOT) + return WRITE_LIFE_SHORT; + else if (temp == COLD) + return WRITE_LIFE_EXTREME; + } else { + return WRITE_LIFE_NOT_SET; + } + } else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) { + if (type == DATA) { + if (temp == WARM) + return WRITE_LIFE_LONG; + else if (temp == HOT) + return WRITE_LIFE_SHORT; + else if (temp == COLD) + return WRITE_LIFE_EXTREME; + } else if (type == NODE) { + if (temp == WARM || temp == HOT) + return WRITE_LIFE_NOT_SET; + else if (temp == COLD) + return WRITE_LIFE_NONE; + } else if (type == META) { + return WRITE_LIFE_MEDIUM; + } + } + return WRITE_LIFE_NOT_SET; +} + static int __get_segment_type_2(struct f2fs_io_info *fio) { if (fio->type == DATA) @@ -2487,7 +2587,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (is_cold_data(fio->page) || file_is_cold(inode)) return CURSEG_COLD_DATA; - if (is_inode_flag_set(inode, FI_HOT_DATA)) + if (file_is_hot(inode) || + is_inode_flag_set(inode, FI_HOT_DATA)) return CURSEG_HOT_DATA; return rw_hint_to_seg_type(inode->i_write_hint); } else { @@ -2502,7 +2603,7 @@ static int __get_segment_type(struct f2fs_io_info *fio) { int type = 0; - switch (fio->sbi->active_logs) { + switch (F2FS_OPTION(fio->sbi).active_logs) { case 2: type = __get_segment_type_2(fio); break; @@ -2642,6 +2743,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_io_info fio = { .sbi = sbi, .type = META, + .temp = HOT, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = page->index, @@ -2688,8 +2790,15 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) int rewrite_data_page(struct f2fs_io_info *fio) { int err; + struct f2fs_sb_info *sbi = fio->sbi; fio->new_blkaddr = fio->old_blkaddr; + /* i/o temperature is needed for passing down write hints */ + __get_segment_type(fio); + + f2fs_bug_on(sbi, !IS_DATASEG(get_seg_entry(sbi, + GET_SEGNO(sbi, fio->new_blkaddr))->type)); + stat_inc_inplace_blocks(fio->sbi); err = f2fs_submit_page_bio(fio); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index f11c4bc82c78..3325d0769723 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -53,13 +53,19 @@ ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ (sbi)->segs_per_sec)) \ -#define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr) -#define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr) +#define MAIN_BLKADDR(sbi) \ + (SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \ + le32_to_cpu(F2FS_RAW_SUPER(sbi)->main_blkaddr)) +#define SEG0_BLKADDR(sbi) \ + (SM_I(sbi) ? 
SM_I(sbi)->seg0_blkaddr : \ + le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment0_blkaddr)) #define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments) #define MAIN_SECS(sbi) ((sbi)->total_sections) -#define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count) +#define TOTAL_SEGS(sbi) \ + (SM_I(sbi) ? SM_I(sbi)->segment_count : \ + le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count)) #define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi)) @@ -596,6 +602,8 @@ static inline int utilization(struct f2fs_sb_info *sbi) #define DEF_MIN_FSYNC_BLOCKS 8 #define DEF_MIN_HOT_BLOCKS 16 +#define SMALL_VOLUME_SEGMENTS (16 * 512) /* 16GB */ + enum { F2FS_IPU_FORCE, F2FS_IPU_SSR, @@ -630,10 +638,17 @@ static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) f2fs_bug_on(sbi, segno > TOTAL_SEGS(sbi) - 1); } -static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) +static inline void verify_block_addr(struct f2fs_io_info *fio, block_t blk_addr) { - BUG_ON(blk_addr < SEG0_BLKADDR(sbi) - || blk_addr >= MAX_BLKADDR(sbi)); + struct f2fs_sb_info *sbi = fio->sbi; + + if (PAGE_TYPE_OF_BIO(fio->type) == META && + (!is_read_io(fio->op) || fio->is_meta)) + BUG_ON(blk_addr < SEG0_BLKADDR(sbi) || + blk_addr >= MAIN_BLKADDR(sbi)); + else + BUG_ON(blk_addr < MAIN_BLKADDR(sbi) || + blk_addr >= MAX_BLKADDR(sbi)); } /* diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8173ae688814..42d564c5ccd0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -60,7 +60,7 @@ char *fault_name[FAULT_MAX] = { static void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate) { - struct f2fs_fault_info *ffi = &sbi->fault_info; + struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; if (rate) { atomic_set(&ffi->inject_ops, 0); @@ -129,6 +129,10 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, + Opt_whint, + Opt_alloc, + Opt_fsync, + Opt_test_dummy_encryption, Opt_err, }; @@ -182,6 +186,10 @@ static match_table_t f2fs_tokens = { {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, + {Opt_whint, "whint_mode=%s"}, + {Opt_alloc, "alloc_mode=%s"}, + {Opt_fsync, "fsync_mode=%s"}, + {Opt_test_dummy_encryption, "test_dummy_encryption"}, {Opt_err, NULL}, }; @@ -202,21 +210,24 @@ static inline void limit_reserve_root(struct f2fs_sb_info *sbi) block_t limit = (sbi->user_block_count << 1) / 1000; /* limit is 0.2% */ - if (test_opt(sbi, RESERVE_ROOT) && sbi->root_reserved_blocks > limit) { - sbi->root_reserved_blocks = limit; + if (test_opt(sbi, RESERVE_ROOT) && + F2FS_OPTION(sbi).root_reserved_blocks > limit) { + F2FS_OPTION(sbi).root_reserved_blocks = limit; f2fs_msg(sbi->sb, KERN_INFO, "Reduce reserved blocks for root = %u", - sbi->root_reserved_blocks); + F2FS_OPTION(sbi).root_reserved_blocks); } if (!test_opt(sbi, RESERVE_ROOT) && - (!uid_eq(sbi->s_resuid, + (!uid_eq(F2FS_OPTION(sbi).s_resuid, make_kuid(&init_user_ns, F2FS_DEF_RESUID)) || - !gid_eq(sbi->s_resgid, + !gid_eq(F2FS_OPTION(sbi).s_resgid, make_kgid(&init_user_ns, F2FS_DEF_RESGID)))) f2fs_msg(sbi->sb, KERN_INFO, "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root", - from_kuid_munged(&init_user_ns, sbi->s_resuid), - from_kgid_munged(&init_user_ns, sbi->s_resgid)); + from_kuid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resuid), + from_kgid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resgid)); } static void init_once(void *foo) @@ -236,7 +247,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, char *qname; 
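/*
 * [Editor's aside, an illustrative sketch rather than part of the commit:
 * the reworked verify_block_addr() in segment.h above splits the range
 * check by I/O class. Meta-area I/O must land in
 * [SEG0_BLKADDR, MAIN_BLKADDR) and everything else in
 * [MAIN_BLKADDR, MAX_BLKADDR). The same rule as a stand-alone predicate,
 * with a hypothetical helper name:
 *
 *	static bool f2fs_blkaddr_in_area(struct f2fs_sb_info *sbi,
 *					 block_t blk, bool meta)
 *	{
 *		if (meta)
 *			return blk >= SEG0_BLKADDR(sbi) &&
 *			       blk < MAIN_BLKADDR(sbi);
 *		return blk >= MAIN_BLKADDR(sbi) &&
 *		       blk < MAX_BLKADDR(sbi);
 *	}
 *
 * The SM_I(sbi) fallbacks added to the macros are what make this usable
 * before the segment manager exists, by reading the raw superblock
 * fields instead.]
 */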
int ret = -EINVAL; - if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { + if (sb_any_quota_loaded(sb) && !F2FS_OPTION(sbi).s_qf_names[qtype]) { f2fs_msg(sb, KERN_ERR, "Cannot change journaled " "quota options when quota turned on"); @@ -254,8 +265,8 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, "Not enough memory for storing quotafile name"); return -EINVAL; } - if (sbi->s_qf_names[qtype]) { - if (strcmp(sbi->s_qf_names[qtype], qname) == 0) + if (F2FS_OPTION(sbi).s_qf_names[qtype]) { + if (strcmp(F2FS_OPTION(sbi).s_qf_names[qtype], qname) == 0) ret = 0; else f2fs_msg(sb, KERN_ERR, @@ -268,7 +279,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, "quotafile must be on filesystem root"); goto errout; } - sbi->s_qf_names[qtype] = qname; + F2FS_OPTION(sbi).s_qf_names[qtype] = qname; set_opt(sbi, QUOTA); return 0; errout: @@ -280,13 +291,13 @@ static int f2fs_clear_qf_name(struct super_block *sb, int qtype) { struct f2fs_sb_info *sbi = F2FS_SB(sb); - if (sb_any_quota_loaded(sb) && sbi->s_qf_names[qtype]) { + if (sb_any_quota_loaded(sb) && F2FS_OPTION(sbi).s_qf_names[qtype]) { f2fs_msg(sb, KERN_ERR, "Cannot change journaled quota options" " when quota turned on"); return -EINVAL; } - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; + kfree(F2FS_OPTION(sbi).s_qf_names[qtype]); + F2FS_OPTION(sbi).s_qf_names[qtype] = NULL; return 0; } @@ -302,15 +313,19 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) "Cannot enable project quota enforcement."); return -1; } - if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA] || - sbi->s_qf_names[PRJQUOTA]) { - if (test_opt(sbi, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] || + F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] || + F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) { + if (test_opt(sbi, USRQUOTA) && + F2FS_OPTION(sbi).s_qf_names[USRQUOTA]) clear_opt(sbi, USRQUOTA); - if (test_opt(sbi, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + if (test_opt(sbi, GRPQUOTA) && + F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]) clear_opt(sbi, GRPQUOTA); - if (test_opt(sbi, PRJQUOTA) && sbi->s_qf_names[PRJQUOTA]) + if (test_opt(sbi, PRJQUOTA) && + F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) clear_opt(sbi, PRJQUOTA); if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) || @@ -320,19 +335,19 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) return -1; } - if (!sbi->s_jquota_fmt) { + if (!F2FS_OPTION(sbi).s_jquota_fmt) { f2fs_msg(sbi->sb, KERN_ERR, "journaled quota format " "not specified"); return -1; } } - if (f2fs_sb_has_quota_ino(sbi->sb) && sbi->s_jquota_fmt) { + if (f2fs_sb_has_quota_ino(sbi->sb) && F2FS_OPTION(sbi).s_jquota_fmt) { f2fs_msg(sbi->sb, KERN_INFO, "QUOTA feature is enabled, so ignore jquota_fmt"); - sbi->s_jquota_fmt = 0; + F2FS_OPTION(sbi).s_jquota_fmt = 0; } - if (f2fs_sb_has_quota_ino(sbi->sb) && sb_rdonly(sbi->sb)) { + if (f2fs_sb_has_quota_ino(sbi->sb) && f2fs_readonly(sbi->sb)) { f2fs_msg(sbi->sb, KERN_INFO, "Filesystem with quota feature cannot be mounted RDWR " "without CONFIG_QUOTA"); @@ -403,14 +418,14 @@ static int parse_options(struct super_block *sb, char *options) q = bdev_get_queue(sb->s_bdev); if (blk_queue_discard(q)) { set_opt(sbi, DISCARD); - } else if (!f2fs_sb_mounted_blkzoned(sb)) { + } else if (!f2fs_sb_has_blkzoned(sb)) { f2fs_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but " "the device does not support discard"); } break; case Opt_nodiscard: - if (f2fs_sb_mounted_blkzoned(sb)) { + if (f2fs_sb_has_blkzoned(sb)) { 
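/*
 * [Editor's aside, an assumption since the macro body is outside this
 * diff: given the wholesale sbi->field to F2FS_OPTION(sbi).field
 * conversion in this file, and the org_mount_opt snapshot and restore of
 * sbi->mount_opt in f2fs_remount() below, F2FS_OPTION() is presumably a
 * thin accessor along the lines of:
 *
 *	#define F2FS_OPTION(sbi)	((sbi)->mount_opt)
 *
 * so every remount-sensitive option lives in one struct that can be
 * saved and rolled back as a unit.]
 */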
f2fs_msg(sb, KERN_WARNING, "discard is required for zoned block devices"); return -EINVAL; @@ -440,7 +455,7 @@ static int parse_options(struct super_block *sb, char *options) if (args->from && match_int(args, &arg)) return -EINVAL; set_opt(sbi, INLINE_XATTR_SIZE); - sbi->inline_xattr_size = arg; + F2FS_OPTION(sbi).inline_xattr_size = arg; break; #else case Opt_user_xattr: @@ -480,7 +495,7 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE) return -EINVAL; - sbi->active_logs = arg; + F2FS_OPTION(sbi).active_logs = arg; break; case Opt_disable_ext_identify: set_opt(sbi, DISABLE_EXT_IDENTIFY); @@ -524,9 +539,9 @@ static int parse_options(struct super_block *sb, char *options) if (test_opt(sbi, RESERVE_ROOT)) { f2fs_msg(sb, KERN_INFO, "Preserve previous reserve_root=%u", - sbi->root_reserved_blocks); + F2FS_OPTION(sbi).root_reserved_blocks); } else { - sbi->root_reserved_blocks = arg; + F2FS_OPTION(sbi).root_reserved_blocks = arg; set_opt(sbi, RESERVE_ROOT); } break; @@ -539,7 +554,7 @@ static int parse_options(struct super_block *sb, char *options) "Invalid uid value %d", arg); return -EINVAL; } - sbi->s_resuid = uid; + F2FS_OPTION(sbi).s_resuid = uid; break; case Opt_resgid: if (args->from && match_int(args, &arg)) @@ -550,7 +565,7 @@ static int parse_options(struct super_block *sb, char *options) "Invalid gid value %d", arg); return -EINVAL; } - sbi->s_resgid = gid; + F2FS_OPTION(sbi).s_resgid = gid; break; case Opt_mode: name = match_strdup(&args[0]); @@ -559,7 +574,7 @@ static int parse_options(struct super_block *sb, char *options) return -ENOMEM; if (strlen(name) == 8 && !strncmp(name, "adaptive", 8)) { - if (f2fs_sb_mounted_blkzoned(sb)) { + if (f2fs_sb_has_blkzoned(sb)) { f2fs_msg(sb, KERN_WARNING, "adaptive mode is not allowed with " "zoned block device feature"); @@ -585,7 +600,7 @@ static int parse_options(struct super_block *sb, char *options) 1 << arg, BIO_MAX_PAGES); return -EINVAL; } - sbi->write_io_size_bits = arg; + F2FS_OPTION(sbi).write_io_size_bits = arg; break; case Opt_fault_injection: if (args->from && match_int(args, &arg)) @@ -646,13 +661,13 @@ static int parse_options(struct super_block *sb, char *options) return ret; break; case Opt_jqfmt_vfsold: - sbi->s_jquota_fmt = QFMT_VFS_OLD; + F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_OLD; break; case Opt_jqfmt_vfsv0: - sbi->s_jquota_fmt = QFMT_VFS_V0; + F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V0; break; case Opt_jqfmt_vfsv1: - sbi->s_jquota_fmt = QFMT_VFS_V1; + F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V1; break; case Opt_noquota: clear_opt(sbi, QUOTA); @@ -679,6 +694,73 @@ static int parse_options(struct super_block *sb, char *options) "quota operations not supported"); break; #endif + case Opt_whint: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (strlen(name) == 10 && + !strncmp(name, "user-based", 10)) { + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_USER; + } else if (strlen(name) == 3 && + !strncmp(name, "off", 3)) { + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; + } else if (strlen(name) == 8 && + !strncmp(name, "fs-based", 8)) { + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_alloc: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + + if (strlen(name) == 7 && + !strncmp(name, "default", 7)) { + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; + } else if (strlen(name) == 5 && + !strncmp(name, "reuse", 5)) { + 
F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_fsync: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (strlen(name) == 5 && + !strncmp(name, "posix", 5)) { + F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; + } else if (strlen(name) == 6 && + !strncmp(name, "strict", 6)) { + F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_STRICT; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_test_dummy_encryption: +#ifdef CONFIG_F2FS_FS_ENCRYPTION + if (!f2fs_sb_has_encrypt(sb)) { + f2fs_msg(sb, KERN_ERR, "Encrypt feature is off"); + return -EINVAL; + } + + F2FS_OPTION(sbi).test_dummy_encryption = true; + f2fs_msg(sb, KERN_INFO, + "Test dummy encryption mode enabled"); +#else + f2fs_msg(sb, KERN_INFO, + "Test dummy encryption mount option ignored"); +#endif + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -699,14 +781,22 @@ static int parse_options(struct super_block *sb, char *options) } if (test_opt(sbi, INLINE_XATTR_SIZE)) { + if (!f2fs_sb_has_extra_attr(sb) || + !f2fs_sb_has_flexible_inline_xattr(sb)) { + f2fs_msg(sb, KERN_ERR, + "extra_attr or flexible_inline_xattr " + "feature is off"); + return -EINVAL; + } if (!test_opt(sbi, INLINE_XATTR)) { f2fs_msg(sb, KERN_ERR, "inline_xattr_size option should be " "set with inline_xattr option"); return -EINVAL; } - if (!sbi->inline_xattr_size || - sbi->inline_xattr_size >= DEF_ADDRS_PER_INODE - + if (!F2FS_OPTION(sbi).inline_xattr_size || + F2FS_OPTION(sbi).inline_xattr_size >= + DEF_ADDRS_PER_INODE - F2FS_TOTAL_EXTRA_ATTR_SIZE - DEF_INLINE_RESERVED_SIZE - DEF_MIN_INLINE_SIZE) { @@ -715,6 +805,12 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } } + + /* Do not pass down write hints if the number of active logs is less + than NR_CURSEG_TYPE. 
+ */ + if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_TYPE) + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; return 0; } @@ -731,7 +827,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Initialize f2fs-specific inode info */ atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; - fi->i_advise = 0; init_rwsem(&fi->i_sem); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); @@ -743,10 +838,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_rwsem(&fi->i_mmap_sem); init_rwsem(&fi->i_xattr_sem); -#ifdef CONFIG_QUOTA - memset(&fi->i_dquot, 0, sizeof(fi->i_dquot)); - fi->i_reserved_quota = 0; -#endif /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; @@ -956,7 +1047,7 @@ static void f2fs_put_super(struct super_block *sb) mempool_destroy(sbi->write_io_dummy); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) - kfree(sbi->s_qf_names[i]); + kfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif destroy_percpu_info(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) @@ -1070,8 +1161,9 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; buf->f_bfree = user_block_count - valid_user_blocks(sbi) - sbi->current_reserved_blocks; - if (buf->f_bfree > sbi->root_reserved_blocks) - buf->f_bavail = buf->f_bfree - sbi->root_reserved_blocks; + if (buf->f_bfree > F2FS_OPTION(sbi).root_reserved_blocks) + buf->f_bavail = buf->f_bfree - + F2FS_OPTION(sbi).root_reserved_blocks; else buf->f_bavail = 0; @@ -1106,10 +1198,10 @@ static inline void f2fs_show_quota_options(struct seq_file *seq, #ifdef CONFIG_QUOTA struct f2fs_sb_info *sbi = F2FS_SB(sb); - if (sbi->s_jquota_fmt) { + if (F2FS_OPTION(sbi).s_jquota_fmt) { char *fmtname = ""; - switch (sbi->s_jquota_fmt) { + switch (F2FS_OPTION(sbi).s_jquota_fmt) { case QFMT_VFS_OLD: fmtname = "vfsold"; break; @@ -1123,14 +1215,17 @@ static inline void f2fs_show_quota_options(struct seq_file *seq, seq_printf(seq, ",jqfmt=%s", fmtname); } - if (sbi->s_qf_names[USRQUOTA]) - seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); + if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA]) + seq_show_option(seq, "usrjquota", + F2FS_OPTION(sbi).s_qf_names[USRQUOTA]); - if (sbi->s_qf_names[GRPQUOTA]) - seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); + if (F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]) + seq_show_option(seq, "grpjquota", + F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]); - if (sbi->s_qf_names[PRJQUOTA]) - seq_show_option(seq, "prjjquota", sbi->s_qf_names[PRJQUOTA]); + if (F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) + seq_show_option(seq, "prjjquota", + F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]); #endif } @@ -1165,7 +1260,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",noinline_xattr"); if (test_opt(sbi, INLINE_XATTR_SIZE)) seq_printf(seq, ",inline_xattr_size=%u", - sbi->inline_xattr_size); + F2FS_OPTION(sbi).inline_xattr_size); #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL if (test_opt(sbi, POSIX_ACL)) @@ -1201,18 +1296,20 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, "adaptive"); else if (test_opt(sbi, LFS)) seq_puts(seq, "lfs"); - seq_printf(seq, ",active_logs=%u", sbi->active_logs); + seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs); if (test_opt(sbi, RESERVE_ROOT)) seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u", - sbi->root_reserved_blocks, - from_kuid_munged(&init_user_ns, sbi->s_resuid), - from_kgid_munged(&init_user_ns, sbi->s_resgid)); + 
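/*
 * [Editor's aside, a usage sketch rather than part of the commit: with
 * the new tokens parsed in parse_options() and echoed back by
 * f2fs_show_options() here, a mount line such as the following becomes
 * valid:
 *
 *	mount -t f2fs -o whint_mode=user-based,alloc_mode=reuse,fsync_mode=strict \
 *		/dev/sdXN /mnt
 *
 * The device and mountpoint are placeholders.]
 */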
F2FS_OPTION(sbi).root_reserved_blocks, + from_kuid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resuid), + from_kgid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resgid)); if (F2FS_IO_SIZE_BITS(sbi)) seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); #ifdef CONFIG_F2FS_FAULT_INJECTION if (test_opt(sbi, FAULT_INJECTION)) seq_printf(seq, ",fault_injection=%u", - sbi->fault_info.inject_rate); + F2FS_OPTION(sbi).fault_info.inject_rate); #endif #ifdef CONFIG_QUOTA if (test_opt(sbi, QUOTA)) @@ -1225,15 +1322,37 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",prjquota"); #endif f2fs_show_quota_options(seq, sbi->sb); + if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) + seq_printf(seq, ",whint_mode=%s", "user-based"); + else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) + seq_printf(seq, ",whint_mode=%s", "fs-based"); +#ifdef CONFIG_F2FS_FS_ENCRYPTION + if (F2FS_OPTION(sbi).test_dummy_encryption) + seq_puts(seq, ",test_dummy_encryption"); +#endif + + if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_DEFAULT) + seq_printf(seq, ",alloc_mode=%s", "default"); + else if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) + seq_printf(seq, ",alloc_mode=%s", "reuse"); + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_POSIX) + seq_printf(seq, ",fsync_mode=%s", "posix"); + else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) + seq_printf(seq, ",fsync_mode=%s", "strict"); return 0; } static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ - sbi->active_logs = NR_CURSEG_TYPE; - sbi->inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + F2FS_OPTION(sbi).active_logs = NR_CURSEG_TYPE; + F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; + F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; + F2FS_OPTION(sbi).test_dummy_encryption = false; + sbi->readdir_ra = 1; set_opt(sbi, BG_GC); set_opt(sbi, INLINE_XATTR); @@ -1243,7 +1362,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, NOHEAP); sbi->sb->s_flags |= SB_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - if (f2fs_sb_mounted_blkzoned(sbi->sb)) { + if (f2fs_sb_has_blkzoned(sbi->sb)) { set_opt_mode(sbi, F2FS_MOUNT_LFS); set_opt(sbi, DISCARD); } else { @@ -1270,16 +1389,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) struct f2fs_sb_info *sbi = F2FS_SB(sb); struct f2fs_mount_info org_mount_opt; unsigned long old_sb_flags; - int err, active_logs; + int err; bool need_restart_gc = false; bool need_stop_gc = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); -#ifdef CONFIG_F2FS_FAULT_INJECTION - struct f2fs_fault_info ffi = sbi->fault_info; -#endif #ifdef CONFIG_QUOTA - int s_jquota_fmt; - char *s_qf_names[MAXQUOTAS]; int i, j; #endif @@ -1289,21 +1403,21 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) */ org_mount_opt = sbi->mount_opt; old_sb_flags = sb->s_flags; - active_logs = sbi->active_logs; #ifdef CONFIG_QUOTA - s_jquota_fmt = sbi->s_jquota_fmt; + org_mount_opt.s_jquota_fmt = F2FS_OPTION(sbi).s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { - if (sbi->s_qf_names[i]) { - s_qf_names[i] = kstrdup(sbi->s_qf_names[i], - GFP_KERNEL); - if (!s_qf_names[i]) { + if (F2FS_OPTION(sbi).s_qf_names[i]) { + org_mount_opt.s_qf_names[i] = + kstrdup(F2FS_OPTION(sbi).s_qf_names[i], + GFP_KERNEL); + if (!org_mount_opt.s_qf_names[i]) { for (j = 0; j < i; j++) - kfree(s_qf_names[j]); + kfree(org_mount_opt.s_qf_names[j]); return 
-ENOMEM; } } else { - s_qf_names[i] = NULL; + org_mount_opt.s_qf_names[i] = NULL; } } #endif @@ -1373,7 +1487,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } - if (*flags & SB_RDONLY) { + if (*flags & SB_RDONLY || + F2FS_OPTION(sbi).whint_mode != org_mount_opt.whint_mode) { writeback_inodes_sb(sb, WB_REASON_SYNC); sync_inodes_sb(sb); @@ -1399,7 +1514,7 @@ skip: #ifdef CONFIG_QUOTA /* Release old quota file names */ for (i = 0; i < MAXQUOTAS; i++) - kfree(s_qf_names[i]); + kfree(org_mount_opt.s_qf_names[i]); #endif /* Update the POSIXACL Flag */ sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | @@ -1417,18 +1532,14 @@ restore_gc: } restore_opts: #ifdef CONFIG_QUOTA - sbi->s_jquota_fmt = s_jquota_fmt; + F2FS_OPTION(sbi).s_jquota_fmt = org_mount_opt.s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { - kfree(sbi->s_qf_names[i]); - sbi->s_qf_names[i] = s_qf_names[i]; + kfree(F2FS_OPTION(sbi).s_qf_names[i]); + F2FS_OPTION(sbi).s_qf_names[i] = org_mount_opt.s_qf_names[i]; } #endif sbi->mount_opt = org_mount_opt; - sbi->active_logs = active_logs; sb->s_flags = old_sb_flags; -#ifdef CONFIG_F2FS_FAULT_INJECTION - sbi->fault_info = ffi; -#endif return err; } @@ -1456,7 +1567,7 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, while (toread > 0) { tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); repeat: - page = read_mapping_page(mapping, blkidx, NULL); + page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS); if (IS_ERR(page)) { if (PTR_ERR(page) == -ENOMEM) { congestion_wait(BLK_RW_ASYNC, HZ/50); @@ -1550,8 +1661,8 @@ static qsize_t *f2fs_get_reserved_space(struct inode *inode) static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) { - return dquot_quota_on_mount(sbi->sb, sbi->s_qf_names[type], - sbi->s_jquota_fmt, type); + return dquot_quota_on_mount(sbi->sb, F2FS_OPTION(sbi).s_qf_names[type], + F2FS_OPTION(sbi).s_jquota_fmt, type); } int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly) @@ -1570,7 +1681,7 @@ int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly) } for (i = 0; i < MAXQUOTAS; i++) { - if (sbi->s_qf_names[i]) { + if (F2FS_OPTION(sbi).s_qf_names[i]) { err = f2fs_quota_on_mount(sbi, i); if (!err) { enabled = 1; @@ -1797,11 +1908,28 @@ static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + /* + * Encrypting the root directory is not allowed because fsck + * expects lost+found directory to exist and remain unencrypted + * if LOST_FOUND feature is enabled. + * + */ + if (f2fs_sb_has_lost_found(sbi->sb) && + inode->i_ino == F2FS_ROOT_INO(sbi)) + return -EPERM; + return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len, fs_data, XATTR_CREATE); } +static bool f2fs_dummy_context(struct inode *inode) +{ + return DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(inode)); +} + static unsigned f2fs_max_namelen(struct inode *inode) { return S_ISLNK(inode->i_mode) ? 
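/*
 * [Editor's aside, an assumption since DUMMY_ENCRYPTION_ENABLED() is not
 * shown in this diff: judging from the test_dummy_encryption flag set in
 * parse_options(), f2fs_dummy_context() above presumably reduces to
 * something like:
 *
 *	#define DUMMY_ENCRYPTION_ENABLED(sbi) \
 *		(unlikely(F2FS_OPTION(sbi).test_dummy_encryption))
 *
 * When ->dummy_context() reports true, fscrypt attaches a fake encryption
 * context to new inodes, letting the encryption paths be exercised
 * without provisioning real keys.]
 */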
@@ -1812,6 +1940,7 @@ static const struct fscrypt_operations f2fs_cryptops = { .key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, + .dummy_context = f2fs_dummy_context, .empty_dir = f2fs_empty_dir, .max_namelen = f2fs_max_namelen, }; @@ -1894,7 +2023,6 @@ static int __f2fs_commit_super(struct buffer_head *bh, lock_buffer(bh); if (super) memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super)); - set_buffer_uptodate(bh); set_buffer_dirty(bh); unlock_buffer(bh); @@ -2181,6 +2309,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->dirty_device = 0; spin_lock_init(&sbi->dev_lock); + + init_rwsem(&sbi->sb_lock); } static int init_percpu_info(struct f2fs_sb_info *sbi) @@ -2206,7 +2336,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) unsigned int n = 0; int err = -EIO; - if (!f2fs_sb_mounted_blkzoned(sbi->sb)) + if (!f2fs_sb_has_blkzoned(sbi->sb)) return 0; if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != @@ -2334,7 +2464,7 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) } /* write back-up superblock first */ - bh = sb_getblk(sbi->sb, sbi->valid_super_block ? 0: 1); + bh = sb_bread(sbi->sb, sbi->valid_super_block ? 0 : 1); if (!bh) return -EIO; err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); @@ -2345,7 +2475,7 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return err; /* write current valid superblock */ - bh = sb_getblk(sbi->sb, sbi->valid_super_block); + bh = sb_bread(sbi->sb, sbi->valid_super_block); if (!bh) return -EIO; err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); @@ -2413,7 +2543,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) #ifdef CONFIG_BLK_DEV_ZONED if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && - !f2fs_sb_mounted_blkzoned(sbi->sb)) { + !f2fs_sb_has_blkzoned(sbi->sb)) { f2fs_msg(sbi->sb, KERN_ERR, "Zoned block device feature not enabled\n"); return -EINVAL; @@ -2447,6 +2577,18 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) return 0; } +static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) +{ + struct f2fs_sm_info *sm_i = SM_I(sbi); + + /* adjust parameters according to the volume size */ + if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) { + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; + sm_i->dcc_info->discard_granularity = 1; + sm_i->ipu_policy = 1 << F2FS_IPU_FORCE; + } +} + static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; @@ -2494,8 +2636,8 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; - sbi->s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); - sbi->s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); + F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); + F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); /* precompute checksum seed for metadata */ if (f2fs_sb_has_inode_chksum(sb)) @@ -2508,7 +2650,7 @@ try_onemore: * devices, but mandatory for host-managed zoned block devices. */ #ifndef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_mounted_blkzoned(sb)) { + if (f2fs_sb_has_blkzoned(sb)) { f2fs_msg(sb, KERN_ERR, "Zoned block device support is not enabled\n"); err = -EOPNOTSUPP; @@ -2724,7 +2866,7 @@ try_onemore: * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. 
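 *
 * [Editor's aside on f2fs_tuning_parameters() above: SMALL_VOLUME_SEGMENTS
 * is 16 * 512 = 8192 segments, and at f2fs's default segment size of
 * 2 MiB (512 blocks of 4 KiB) that comes to exactly 16 GiB, matching the
 * "16GB" comment in segment.h. Volumes at or below that size are switched
 * to alloc_mode=reuse, a discard granularity of 1, and the F2FS_IPU_FORCE
 * in-place-update policy at the end of mount.]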
*/ - if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb)) { + if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) { err = f2fs_enable_quotas(sb); if (err) { f2fs_msg(sb, KERN_ERR, @@ -2799,6 +2941,8 @@ skip_recovery: f2fs_join_shrinker(sbi); + f2fs_tuning_parameters(sbi); + f2fs_msg(sbi->sb, KERN_NOTICE, "Mounted with checkpoint version = %llx", cur_cp_version(F2FS_CKPT(sbi))); f2fs_update_time(sbi, CP_TIME); @@ -2807,7 +2951,7 @@ skip_recovery: free_meta: #ifdef CONFIG_QUOTA - if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb)) + if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) f2fs_quota_off_umount(sbi->sb); #endif f2fs_sync_inode_meta(sbi); @@ -2851,7 +2995,7 @@ free_bio_info: free_options: #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) - kfree(sbi->s_qf_names[i]); + kfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif kfree(options); free_sb_buf: diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index d978c7b6ea04..f33a56d6e6dd 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -58,7 +58,7 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) #ifdef CONFIG_F2FS_FAULT_INJECTION else if (struct_type == FAULT_INFO_RATE || struct_type == FAULT_INFO_TYPE) - return (unsigned char *)&sbi->fault_info; + return (unsigned char *)&F2FS_OPTION(sbi).fault_info; #endif return NULL; } @@ -92,10 +92,10 @@ static ssize_t features_show(struct f2fs_attr *a, if (!sb->s_bdev->bd_part) return snprintf(buf, PAGE_SIZE, "0\n"); - if (f2fs_sb_has_crypto(sb)) + if (f2fs_sb_has_encrypt(sb)) len += snprintf(buf, PAGE_SIZE - len, "%s", "encryption"); - if (f2fs_sb_mounted_blkzoned(sb)) + if (f2fs_sb_has_blkzoned(sb)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "blkzoned"); if (f2fs_sb_has_extra_attr(sb)) @@ -116,6 +116,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_inode_crtime(sb)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "inode_crtime"); + if (f2fs_sb_has_lost_found(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? 
", " : "", "lost_found"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } @@ -136,6 +139,27 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, if (!ptr) return -EINVAL; + if (!strcmp(a->attr.name, "extension_list")) { + __u8 (*extlist)[F2FS_EXTENSION_LEN] = + sbi->raw_super->extension_list; + int cold_count = le32_to_cpu(sbi->raw_super->extension_count); + int hot_count = sbi->raw_super->hot_ext_count; + int len = 0, i; + + len += snprintf(buf + len, PAGE_SIZE - len, + "cold file extenstion:\n"); + for (i = 0; i < cold_count; i++) + len += snprintf(buf + len, PAGE_SIZE - len, "%s\n", + extlist[i]); + + len += snprintf(buf + len, PAGE_SIZE - len, + "hot file extenstion:\n"); + for (i = cold_count; i < cold_count + hot_count; i++) + len += snprintf(buf + len, PAGE_SIZE - len, "%s\n", + extlist[i]); + return len; + } + ui = (unsigned int *)(ptr + a->offset); return snprintf(buf, PAGE_SIZE, "%u\n", *ui); @@ -154,6 +178,41 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, if (!ptr) return -EINVAL; + if (!strcmp(a->attr.name, "extension_list")) { + const char *name = strim((char *)buf); + bool set = true, hot; + + if (!strncmp(name, "[h]", 3)) + hot = true; + else if (!strncmp(name, "[c]", 3)) + hot = false; + else + return -EINVAL; + + name += 3; + + if (*name == '!') { + name++; + set = false; + } + + if (strlen(name) >= F2FS_EXTENSION_LEN) + return -EINVAL; + + down_write(&sbi->sb_lock); + + ret = update_extension_list(sbi, name, hot, set); + if (ret) + goto out; + + ret = f2fs_commit_super(sbi, false); + if (ret) + update_extension_list(sbi, name, hot, !set); +out: + up_write(&sbi->sb_lock); + return ret ? ret : count; + } + ui = (unsigned int *)(ptr + a->offset); ret = kstrtoul(skip_spaces(buf), 0, &t); @@ -166,7 +225,7 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); if (t > (unsigned long)(sbi->user_block_count - - sbi->root_reserved_blocks)) { + F2FS_OPTION(sbi).root_reserved_blocks)) { spin_unlock(&sbi->stat_lock); return -EINVAL; } @@ -236,6 +295,7 @@ enum feat_id { FEAT_FLEXIBLE_INLINE_XATTR, FEAT_QUOTA_INO, FEAT_INODE_CRTIME, + FEAT_LOST_FOUND, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -251,6 +311,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_FLEXIBLE_INLINE_XATTR: case FEAT_QUOTA_INO: case FEAT_INODE_CRTIME: + case FEAT_LOST_FOUND: return snprintf(buf, PAGE_SIZE, "supported\n"); } return 0; @@ -307,6 +368,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); +F2FS_RW_ATTR(F2FS_SBI, f2fs_super_block, extension_list, extension_list); #ifdef CONFIG_F2FS_FAULT_INJECTION F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); @@ -329,6 +391,7 @@ F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO); F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME); +F2FS_FEATURE_RO_ATTR(lost_found, FEAT_LOST_FOUND); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -357,6 +420,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(iostat_enable), ATTR_LIST(readdir_ra), 
ATTR_LIST(gc_pin_file_thresh), + ATTR_LIST(extension_list), #ifdef CONFIG_F2FS_FAULT_INJECTION ATTR_LIST(inject_rate), ATTR_LIST(inject_type), @@ -383,6 +447,7 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(flexible_inline_xattr), ATTR_LIST(quota_ino), ATTR_LIST(inode_crtime), + ATTR_LIST(lost_found), NULL, }; diff --git a/fs/fcntl.c b/fs/fcntl.c index 1e97f1fda90c..d737ff082472 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -607,8 +607,8 @@ static int fixup_compat_flock(struct flock *flock) return 0; } -COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, - compat_ulong_t, arg) +static long do_compat_fcntl64(unsigned int fd, unsigned int cmd, + compat_ulong_t arg) { struct fd f = fdget_raw(fd); struct flock flock; @@ -672,6 +672,12 @@ out_put: return err; } +COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, + compat_ulong_t, arg) +{ + return do_compat_fcntl64(fd, cmd, arg); +} + COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg) { @@ -684,7 +690,7 @@ COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, case F_OFD_SETLKW: return -EINVAL; } - return compat_sys_fcntl64(fd, cmd, arg); + return do_compat_fcntl64(fd, cmd, arg); } #endif diff --git a/fs/file.c b/fs/file.c index 42f0db4bd0fb..7ffd6e9d103d 100644 --- a/fs/file.c +++ b/fs/file.c @@ -638,6 +638,7 @@ out_unlock: spin_unlock(&files->file_lock); return -EBADF; } +EXPORT_SYMBOL(__close_fd); /* for ksys_close() */ void do_close_on_exec(struct files_struct *files) { @@ -870,7 +871,7 @@ out_unlock: return err; } -SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) +static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; struct file *file; @@ -904,6 +905,11 @@ out_unlock: return err; } +SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) +{ + return ksys_dup3(oldfd, newfd, flags); +} + SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) { if (unlikely(newfd == oldfd)) { /* corner case */ @@ -916,10 +922,10 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) rcu_read_unlock(); return retval; } - return sys_dup3(oldfd, newfd, 0); + return ksys_dup3(oldfd, newfd, 0); } -SYSCALL_DEFINE1(dup, unsigned int, fildes) +int ksys_dup(unsigned int fildes) { int ret = -EBADF; struct file *file = fget_raw(fildes); @@ -934,6 +940,11 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes) return ret; } +SYSCALL_DEFINE1(dup, unsigned int, fildes) +{ + return ksys_dup(fildes); +} + int f_dupfd(unsigned int from, struct file *file, unsigned flags) { int err; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d4d04fee568a..1280f915079b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1343,7 +1343,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) dirty = inode->i_state & I_DIRTY; if (inode->i_state & I_DIRTY_TIME) { - if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) || + if ((dirty & I_DIRTY_INODE) || wbc->sync_mode == WB_SYNC_ALL || unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) || unlikely(time_after(jiffies, @@ -2112,7 +2112,6 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) */ void __mark_inode_dirty(struct inode *inode, int flags) { -#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) struct super_block *sb = inode->i_sb; int dirtytime; @@ -2122,7 +2121,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) * Don't do this for I_DIRTY_PAGES - that doesn't actually * dirty the 
inode itself */ - if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) { + if (flags & (I_DIRTY_INODE | I_DIRTY_TIME)) { trace_writeback_dirty_inode_start(inode, flags); if (sb->s_op->dirty_inode) @@ -2197,7 +2196,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (dirtytime) inode->dirtied_time_when = jiffies; - if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES)) + if (inode->i_state & I_DIRTY) dirty_list = &wb->b_dirty; else dirty_list = &wb->b_dirty_time; @@ -2221,8 +2220,6 @@ void __mark_inode_dirty(struct inode *inode, int flags) } out_unlock_inode: spin_unlock(&inode->i_lock); - -#undef I_DIRTY_INODE } EXPORT_SYMBOL(__mark_inode_dirty); diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index 56cce7fdd39e..c184c5a356ff 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -125,7 +125,7 @@ struct fscache_cache *fscache_select_cache_for_object( } /* the parent is unbacked */ - if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) { /* cookie not an index and is unbacked */ spin_unlock(&cookie->lock); _leave(" = NULL [cookie ub,ni]"); diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index ff84258132bb..7dc55b93a830 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -21,12 +21,54 @@ struct kmem_cache *fscache_cookie_jar; static atomic_t fscache_object_debug_id = ATOMIC_INIT(0); -static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie); +#define fscache_cookie_hash_shift 15 +static struct hlist_bl_head fscache_cookie_hash[1 << fscache_cookie_hash_shift]; + +static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie, + loff_t object_size); static int fscache_alloc_object(struct fscache_cache *cache, struct fscache_cookie *cookie); static int fscache_attach_object(struct fscache_cookie *cookie, struct fscache_object *object); +static void fscache_print_cookie(struct fscache_cookie *cookie, char prefix) +{ + struct hlist_node *object; + const u8 *k; + unsigned loop; + + pr_err("%c-cookie c=%p [p=%p fl=%lx nc=%u na=%u]\n", + prefix, cookie, cookie->parent, cookie->flags, + atomic_read(&cookie->n_children), + atomic_read(&cookie->n_active)); + pr_err("%c-cookie d=%p n=%p\n", + prefix, cookie->def, cookie->netfs_data); + + object = READ_ONCE(cookie->backing_objects.first); + if (object) + pr_err("%c-cookie o=%p\n", + prefix, hlist_entry(object, struct fscache_object, cookie_link)); + + pr_err("%c-key=[%u] '", prefix, cookie->key_len); + k = (cookie->key_len <= sizeof(cookie->inline_key)) ? + cookie->inline_key : cookie->key; + for (loop = 0; loop < cookie->key_len; loop++) + pr_cont("%02x", k[loop]); + pr_cont("'\n"); +} + +void fscache_free_cookie(struct fscache_cookie *cookie) +{ + if (cookie) { + BUG_ON(!hlist_empty(&cookie->backing_objects)); + if (cookie->aux_len > sizeof(cookie->inline_aux)) + kfree(cookie->aux); + if (cookie->key_len > sizeof(cookie->inline_key)) + kfree(cookie->key); + kmem_cache_free(fscache_cookie_jar, cookie); + } +} + /* * initialise an cookie jar slab element prior to any use */ @@ -41,6 +83,170 @@ void fscache_cookie_init_once(void *_cookie) } /* + * Set the index key in a cookie. The cookie struct has space for a 12-byte + * key plus length and hash, but if that's not big enough, it's instead a + * pointer to a buffer containing 3 bytes of hash, 1 byte of length and then + * the key data. 
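+ *
+ * [Editor's aside: the hash below folds the parent pointer, the key
+ *  length plus cookie type, and each 32-bit word of the (zero-padded)
+ *  key into h, then mixes the halves:
+ *
+ *	key_hash = h ^ (h >> 32);
+ *
+ *  fscache_compare_cookie() then checks hash, parent, length and type
+ *  before the final memcmp() of the key bytes, so the cheap comparisons
+ *  run first.]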
+ */ +static int fscache_set_key(struct fscache_cookie *cookie, + const void *index_key, size_t index_key_len) +{ + unsigned long long h; + u32 *buf; + int i; + + cookie->key_len = index_key_len; + + if (index_key_len > sizeof(cookie->inline_key)) { + buf = kzalloc(index_key_len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + cookie->key = buf; + } else { + buf = (u32 *)cookie->inline_key; + buf[0] = 0; + buf[1] = 0; + buf[2] = 0; + } + + memcpy(buf, index_key, index_key_len); + + /* Calculate a hash and combine this with the length in the first word + * or first half word + */ + h = (unsigned long)cookie->parent; + h += index_key_len + cookie->type; + for (i = 0; i < (index_key_len + sizeof(u32) - 1) / sizeof(u32); i++) + h += buf[i]; + + cookie->key_hash = h ^ (h >> 32); + return 0; +} + +static long fscache_compare_cookie(const struct fscache_cookie *a, + const struct fscache_cookie *b) +{ + const void *ka, *kb; + + if (a->key_hash != b->key_hash) + return (long)a->key_hash - (long)b->key_hash; + if (a->parent != b->parent) + return (long)a->parent - (long)b->parent; + if (a->key_len != b->key_len) + return (long)a->key_len - (long)b->key_len; + if (a->type != b->type) + return (long)a->type - (long)b->type; + + if (a->key_len <= sizeof(a->inline_key)) { + ka = &a->inline_key; + kb = &b->inline_key; + } else { + ka = a->key; + kb = b->key; + } + return memcmp(ka, kb, a->key_len); +} + +/* + * Allocate a cookie. + */ +struct fscache_cookie *fscache_alloc_cookie( + struct fscache_cookie *parent, + const struct fscache_cookie_def *def, + const void *index_key, size_t index_key_len, + const void *aux_data, size_t aux_data_len, + void *netfs_data, + loff_t object_size) +{ + struct fscache_cookie *cookie; + + /* allocate and initialise a cookie */ + cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL); + if (!cookie) + return NULL; + + cookie->key_len = index_key_len; + cookie->aux_len = aux_data_len; + + if (fscache_set_key(cookie, index_key, index_key_len) < 0) + goto nomem; + + if (cookie->aux_len <= sizeof(cookie->inline_aux)) { + memcpy(cookie->inline_aux, aux_data, cookie->aux_len); + } else { + cookie->aux = kmemdup(aux_data, cookie->aux_len, GFP_KERNEL); + if (!cookie->aux) + goto nomem; + } + + atomic_set(&cookie->usage, 1); + atomic_set(&cookie->n_children, 0); + + /* We keep the active count elevated until relinquishment to prevent an + * attempt to wake up every time the object operations queue quiesces. + */ + atomic_set(&cookie->n_active, 1); + + cookie->def = def; + cookie->parent = parent; + cookie->netfs_data = netfs_data; + cookie->flags = (1 << FSCACHE_COOKIE_NO_DATA_YET); + cookie->type = def->type; + + /* radix tree insertion won't use the preallocation pool unless it's + * told it may not wait */ + INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); + return cookie; + +nomem: + fscache_free_cookie(cookie); + return NULL; +} + +/* + * Attempt to insert the new cookie into the hash. If there's a collision, we + * return the old cookie if it's not in use and an error otherwise. 
+ */ +struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *candidate) +{ + struct fscache_cookie *cursor; + struct hlist_bl_head *h; + struct hlist_bl_node *p; + unsigned int bucket; + + bucket = candidate->key_hash & (ARRAY_SIZE(fscache_cookie_hash) - 1); + h = &fscache_cookie_hash[bucket]; + + hlist_bl_lock(h); + hlist_bl_for_each_entry(cursor, p, h, hash_link) { + if (fscache_compare_cookie(candidate, cursor) == 0) + goto collision; + } + + __set_bit(FSCACHE_COOKIE_ACQUIRED, &candidate->flags); + fscache_cookie_get(candidate->parent, fscache_cookie_get_acquire_parent); + atomic_inc(&candidate->parent->n_children); + hlist_bl_add_head(&candidate->hash_link, h); + hlist_bl_unlock(h); + return candidate; + +collision: + if (test_and_set_bit(FSCACHE_COOKIE_ACQUIRED, &cursor->flags)) { + trace_fscache_cookie(cursor, fscache_cookie_collision, + atomic_read(&cursor->usage)); + pr_err("Duplicate cookie detected\n"); + fscache_print_cookie(cursor, 'O'); + fscache_print_cookie(candidate, 'N'); + hlist_bl_unlock(h); + return NULL; + } + + fscache_cookie_get(cursor, fscache_cookie_get_reacquire); + hlist_bl_unlock(h); + return cursor; +} + +/* * request a cookie to represent an object (index, datafile, xattr, etc) * - parent specifies the parent object * - the top level index cookie for each netfs is stored in the fscache_netfs @@ -58,10 +264,13 @@ void fscache_cookie_init_once(void *_cookie) struct fscache_cookie *__fscache_acquire_cookie( struct fscache_cookie *parent, const struct fscache_cookie_def *def, + const void *index_key, size_t index_key_len, + const void *aux_data, size_t aux_data_len, void *netfs_data, + loff_t object_size, bool enable) { - struct fscache_cookie *cookie; + struct fscache_cookie *candidate, *cookie; BUG_ON(!def); @@ -69,6 +278,13 @@ struct fscache_cookie *__fscache_acquire_cookie( parent ? (char *) parent->def->name : "<no-parent>", def->name, netfs_data, enable); + if (!index_key || !index_key_len || index_key_len > 255 || aux_data_len > 255) + return NULL; + if (!aux_data || !aux_data_len) { + aux_data = NULL; + aux_data_len = 0; + } + fscache_stat(&fscache_n_acquires); /* if there's no parent cookie, then we don't create one here either */ @@ -79,41 +295,31 @@ struct fscache_cookie *__fscache_acquire_cookie( } /* validate the definition */ - BUG_ON(!def->get_key); BUG_ON(!def->name[0]); BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX && - parent->def->type != FSCACHE_COOKIE_TYPE_INDEX); + parent->type != FSCACHE_COOKIE_TYPE_INDEX); - /* allocate and initialise a cookie */ - cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL); - if (!cookie) { + candidate = fscache_alloc_cookie(parent, def, + index_key, index_key_len, + aux_data, aux_data_len, + netfs_data, object_size); + if (!candidate) { fscache_stat(&fscache_n_acquires_oom); _leave(" [ENOMEM]"); return NULL; } - atomic_set(&cookie->usage, 1); - atomic_set(&cookie->n_children, 0); - - /* We keep the active count elevated until relinquishment to prevent an - * attempt to wake up every time the object operations queue quiesces. 
- */ - atomic_set(&cookie->n_active, 1); - - atomic_inc(&parent->usage); - atomic_inc(&parent->n_children); + cookie = fscache_hash_cookie(candidate); + if (!cookie) { + trace_fscache_cookie(candidate, fscache_cookie_discard, 1); + goto out; + } - cookie->def = def; - cookie->parent = parent; - cookie->netfs_data = netfs_data; - cookie->flags = (1 << FSCACHE_COOKIE_NO_DATA_YET); + if (cookie == candidate) + candidate = NULL; - /* radix tree insertion won't use the preallocation pool unless it's - * told it may not wait */ - INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); - - switch (cookie->def->type) { + switch (cookie->type) { case FSCACHE_COOKIE_TYPE_INDEX: fscache_stat(&fscache_n_cookie_index); break; @@ -125,16 +331,19 @@ struct fscache_cookie *__fscache_acquire_cookie( break; } + trace_fscache_acquire(cookie); + if (enable) { /* if the object is an index then we need do nothing more here * - we create indices on disk when we need them as an index * may exist in multiple caches */ - if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { - if (fscache_acquire_non_index_cookie(cookie) == 0) { + if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) { + if (fscache_acquire_non_index_cookie(cookie, object_size) == 0) { set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); } else { atomic_dec(&parent->n_children); - __fscache_cookie_put(cookie); + fscache_cookie_put(cookie, + fscache_cookie_put_acquire_nobufs); fscache_stat(&fscache_n_acquires_nobufs); _leave(" = NULL"); return NULL; @@ -145,7 +354,9 @@ struct fscache_cookie *__fscache_acquire_cookie( } fscache_stat(&fscache_n_acquires_ok); - _leave(" = %p", cookie); + +out: + fscache_free_cookie(candidate); return cookie; } EXPORT_SYMBOL(__fscache_acquire_cookie); @@ -154,24 +365,30 @@ EXPORT_SYMBOL(__fscache_acquire_cookie); * Enable a cookie to permit it to accept new operations. 
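 *
 * [Editor's aside, a hypothetical caller rather than part of the commit:
 *  with the get_key()/get_aux() callbacks gone, a netfs now passes its
 *  index key and auxiliary data inline when acquiring a cookie:
 *
 *	cookie = fscache_acquire_cookie(parent, &my_index_def,
 *					&object_id, sizeof(object_id),
 *					&version, sizeof(version),
 *					netfs_priv, i_size, true);
 *
 *  my_index_def, object_id, version, netfs_priv and i_size stand in for
 *  the netfs's own definitions.]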
*/ void __fscache_enable_cookie(struct fscache_cookie *cookie, + const void *aux_data, + loff_t object_size, bool (*can_enable)(void *data), void *data) { _enter("%p", cookie); + trace_fscache_enable(cookie); + wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, TASK_UNINTERRUPTIBLE); + fscache_update_aux(cookie, aux_data); + if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) goto out_unlock; if (can_enable && !can_enable(data)) { /* The netfs decided it didn't want to enable after all */ - } else if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + } else if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) { /* Wait for outstanding disablement to complete */ __fscache_wait_on_invalidate(cookie); - if (fscache_acquire_non_index_cookie(cookie) == 0) + if (fscache_acquire_non_index_cookie(cookie, object_size) == 0) set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); } else { set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); @@ -188,11 +405,11 @@ EXPORT_SYMBOL(__fscache_enable_cookie); * - this must make sure the index chain is instantiated and instantiate the * object representation too */ -static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) +static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie, + loff_t object_size) { struct fscache_object *object; struct fscache_cache *cache; - uint64_t i_size; int ret; _enter(""); @@ -231,9 +448,6 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) return ret; } - /* pass on how big the object we're caching is supposed to be */ - cookie->def->get_attr(cookie->netfs_data, &i_size); - spin_lock(&cookie->lock); if (hlist_empty(&cookie->backing_objects)) { spin_unlock(&cookie->lock); @@ -243,7 +457,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); - fscache_set_store_limit(object, i_size); + fscache_set_store_limit(object, object_size); /* initiate the process of looking up all the objects in the chain * (done by fscache_initialise_object()) */ @@ -318,7 +532,7 @@ static int fscache_alloc_object(struct fscache_cache *cache, * attached to the cookie */ if (fscache_attach_object(cookie, object) < 0) { fscache_stat(&fscache_n_cop_put_object); - cache->ops->put_object(object); + cache->ops->put_object(object, fscache_obj_put_attach_fail); fscache_stat_d(&fscache_n_cop_put_object); } @@ -338,7 +552,7 @@ object_already_extant: error_put: fscache_stat(&fscache_n_cop_put_object); - cache->ops->put_object(object); + cache->ops->put_object(object, fscache_obj_put_alloc_fail); fscache_stat_d(&fscache_n_cop_put_object); error: _leave(" = %d", ret); @@ -398,7 +612,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie, /* attach to the cookie */ object->cookie = cookie; - atomic_inc(&cookie->usage); + fscache_cookie_get(cookie, fscache_cookie_get_attach_object); hlist_add_head(&object->cookie_link, &cookie->backing_objects); fscache_objlist_add(object); @@ -426,10 +640,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie) * there, and if it's doing that, it may as well just retire the * cookie. */ - ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE); - - /* We will be updating the cookie too. */ - BUG_ON(!cookie->def->get_aux); + ASSERTCMP(cookie->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE); /* If there's an object, we tell the object state machine to handle the * invalidation on our behalf, otherwise there's nothing to do. 
@@ -473,7 +684,7 @@ EXPORT_SYMBOL(__fscache_wait_on_invalidate); /* * update the index entries backing a cookie */ -void __fscache_update_cookie(struct fscache_cookie *cookie) +void __fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data) { struct fscache_object *object; @@ -487,10 +698,10 @@ void __fscache_update_cookie(struct fscache_cookie *cookie) _enter("{%s}", cookie->def->name); - BUG_ON(!cookie->def->get_aux); - spin_lock(&cookie->lock); + fscache_update_aux(cookie, aux_data); + if (fscache_cookie_enabled(cookie)) { /* update the index entry on disk in each cache backing this * cookie. @@ -509,13 +720,17 @@ EXPORT_SYMBOL(__fscache_update_cookie); /* * Disable a cookie to stop it from accepting new requests from the netfs. */ -void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) +void __fscache_disable_cookie(struct fscache_cookie *cookie, + const void *aux_data, + bool invalidate) { struct fscache_object *object; bool awaken = false; _enter("%p,%u", cookie, invalidate); + trace_fscache_disable(cookie); + ASSERTCMP(atomic_read(&cookie->n_active), >, 0); if (atomic_read(&cookie->n_children) != 0) { @@ -526,6 +741,9 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, TASK_UNINTERRUPTIBLE); + + fscache_update_aux(cookie, aux_data); + if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) goto out_unlock_enable; @@ -557,12 +775,13 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) * n_active reaches 0). This makes sure outstanding reads and writes * have completed. */ - if (!atomic_dec_and_test(&cookie->n_active)) - wait_on_atomic_t(&cookie->n_active, atomic_t_wait, - TASK_UNINTERRUPTIBLE); + if (!atomic_dec_and_test(&cookie->n_active)) { + wait_var_event(&cookie->n_active, + !atomic_read(&cookie->n_active)); + } /* Make sure any pending writes are cancelled. 
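 *
 * [Editor's aside: the switch from wait_on_atomic_t() to wait_var_event()
 *  just above implies the waker side uses wake_up_var(); in substance,
 *  whichever path drops the last active reference does (sketch, the waker
 *  is outside this hunk):
 *
 *	if (atomic_dec_and_test(&cookie->n_active))
 *		wake_up_var(&cookie->n_active);
 *
 *  The wait/wake pair replaces the old atomic_t waitqueue API.]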
*/ - if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) + if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) fscache_invalidate_writes(cookie); /* Reset the cookie state if it wasn't relinquished */ @@ -584,7 +803,9 @@ EXPORT_SYMBOL(__fscache_disable_cookie); * - all dependents of this cookie must have already been unregistered * (indices/files/pages) */ -void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire) +void __fscache_relinquish_cookie(struct fscache_cookie *cookie, + const void *aux_data, + bool retire) { fscache_stat(&fscache_n_relinquishes); if (retire) @@ -600,10 +821,13 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire) cookie, cookie->def->name, cookie->netfs_data, atomic_read(&cookie->n_active), retire); + trace_fscache_relinquish(cookie, retire); + /* No further netfs-accessing operations on this cookie permitted */ - set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags); + if (test_and_set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags)) + BUG(); - __fscache_disable_cookie(cookie, retire); + __fscache_disable_cookie(cookie, aux_data, retire); /* Clear pointers back to the netfs */ cookie->netfs_data = NULL; @@ -618,35 +842,54 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire) /* Dispose of the netfs's link to the cookie */ ASSERTCMP(atomic_read(&cookie->usage), >, 0); - fscache_cookie_put(cookie); + fscache_cookie_put(cookie, fscache_cookie_put_relinquish); _leave(""); } EXPORT_SYMBOL(__fscache_relinquish_cookie); /* - * destroy a cookie + * Remove a cookie from the hash table. */ -void __fscache_cookie_put(struct fscache_cookie *cookie) +static void fscache_unhash_cookie(struct fscache_cookie *cookie) +{ + struct hlist_bl_head *h; + unsigned int bucket; + + bucket = cookie->key_hash & (ARRAY_SIZE(fscache_cookie_hash) - 1); + h = &fscache_cookie_hash[bucket]; + + hlist_bl_lock(h); + hlist_bl_del(&cookie->hash_link); + hlist_bl_unlock(h); +} + +/* + * Drop a reference to a cookie. 
+ */ +void fscache_cookie_put(struct fscache_cookie *cookie, + enum fscache_cookie_trace where) { struct fscache_cookie *parent; + int usage; _enter("%p", cookie); - for (;;) { - _debug("FREE COOKIE %p", cookie); - parent = cookie->parent; - BUG_ON(!hlist_empty(&cookie->backing_objects)); - kmem_cache_free(fscache_cookie_jar, cookie); + do { + usage = atomic_dec_return(&cookie->usage); + trace_fscache_cookie(cookie, where, usage); - if (!parent) - break; + if (usage > 0) + return; + BUG_ON(usage < 0); + + parent = cookie->parent; + fscache_unhash_cookie(cookie); + fscache_free_cookie(cookie); cookie = parent; - BUG_ON(atomic_read(&cookie->usage) <= 0); - if (!atomic_dec_and_test(&cookie->usage)) - break; - } + where = fscache_cookie_put_parent; + } while (cookie); _leave(""); } @@ -656,7 +899,8 @@ void __fscache_cookie_put(struct fscache_cookie *cookie) * * NOTE: it only serves no-index type */ -int __fscache_check_consistency(struct fscache_cookie *cookie) +int __fscache_check_consistency(struct fscache_cookie *cookie, + const void *aux_data) { struct fscache_operation *op; struct fscache_object *object; @@ -665,7 +909,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) _enter("%p,", cookie); - ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE); + ASSERTCMP(cookie->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE); if (fscache_wait_for_deferred_lookup(cookie) < 0) return -ERESTARTSYS; @@ -677,13 +921,16 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) if (!op) return -ENOMEM; - fscache_operation_init(op, NULL, NULL, NULL); + fscache_operation_init(cookie, op, NULL, NULL, NULL); op->flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING) | (1 << FSCACHE_OP_UNUSE_COOKIE); + trace_fscache_page_op(cookie, NULL, op, fscache_page_op_check_consistency); spin_lock(&cookie->lock); + fscache_update_aux(cookie, aux_data); + if (!fscache_cookie_enabled(cookie) || hlist_empty(&cookie->backing_objects)) goto inconsistent; diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c index 5a117df2a9ef..aa46e48d8c75 100644 --- a/fs/fscache/fsdef.c +++ b/fs/fscache/fsdef.c @@ -13,16 +13,11 @@ #include <linux/module.h> #include "internal.h" -static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax); - -static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax); - static enum fscache_checkaux fscache_fsdef_netfs_check_aux(void *cookie_netfs_data, const void *data, - uint16_t datalen); + uint16_t datalen, + loff_t object_size); /* * The root index is owned by FS-Cache itself. 
@@ -60,6 +55,7 @@ struct fscache_cookie fscache_fsdef_index = { .backing_objects = HLIST_HEAD_INIT, .def = &fscache_fsdef_index_def, .flags = 1 << FSCACHE_COOKIE_ENABLED, + .type = FSCACHE_COOKIE_TYPE_INDEX, }; EXPORT_SYMBOL(fscache_fsdef_index); @@ -71,59 +67,18 @@ EXPORT_SYMBOL(fscache_fsdef_index); struct fscache_cookie_def fscache_fsdef_netfs_def = { .name = "FSDEF.netfs", .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = fscache_fsdef_netfs_get_key, - .get_aux = fscache_fsdef_netfs_get_aux, .check_aux = fscache_fsdef_netfs_check_aux, }; /* - * get the key data for an FSDEF index record - this is the name of the netfs - * for which this entry is created - */ -static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct fscache_netfs *netfs = cookie_netfs_data; - unsigned klen; - - _enter("{%s.%u},", netfs->name, netfs->version); - - klen = strlen(netfs->name); - if (klen > bufmax) - return 0; - - memcpy(buffer, netfs->name, klen); - return klen; -} - -/* - * get the auxiliary data for an FSDEF index record - this is the index - * structure version number of the netfs for which this version is created - */ -static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct fscache_netfs *netfs = cookie_netfs_data; - unsigned dlen; - - _enter("{%s.%u},", netfs->name, netfs->version); - - dlen = sizeof(uint32_t); - if (dlen > bufmax) - return 0; - - memcpy(buffer, &netfs->version, dlen); - return dlen; -} - -/* * check that the index structure version number stored in the auxiliary data * matches the one the netfs gave us */ static enum fscache_checkaux fscache_fsdef_netfs_check_aux( void *cookie_netfs_data, const void *data, - uint16_t datalen) + uint16_t datalen, + loff_t object_size) { struct fscache_netfs *netfs = cookie_netfs_data; uint32_t version; diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 0ff4b49a0037..500650f938fe 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -29,6 +29,7 @@ #define pr_fmt(fmt) "FS-Cache: " fmt #include <linux/fscache-cache.h> +#include <trace/events/fscache.h> #include <linux/sched.h> #define FSCACHE_MIN_THREADS 4 @@ -48,8 +49,16 @@ extern struct fscache_cache *fscache_select_cache_for_object( */ extern struct kmem_cache *fscache_cookie_jar; +extern void fscache_free_cookie(struct fscache_cookie *); extern void fscache_cookie_init_once(void *); -extern void __fscache_cookie_put(struct fscache_cookie *); +extern struct fscache_cookie *fscache_alloc_cookie(struct fscache_cookie *, + const struct fscache_cookie_def *, + const void *, size_t, + const void *, size_t, + void *, loff_t); +extern struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *); +extern void fscache_cookie_put(struct fscache_cookie *, + enum fscache_cookie_trace); /* * fsdef.c @@ -311,14 +320,12 @@ static inline void fscache_raise_event(struct fscache_object *object, fscache_enqueue_object(object); } -/* - * drop a reference to a cookie - */ -static inline void fscache_cookie_put(struct fscache_cookie *cookie) +static inline void fscache_cookie_get(struct fscache_cookie *cookie, + enum fscache_cookie_trace where) { - BUG_ON(atomic_read(&cookie->usage) <= 0); - if (atomic_dec_and_test(&cookie->usage)) - __fscache_cookie_put(cookie); + int usage = atomic_inc_return(&cookie->usage); + + trace_fscache_cookie(cookie, where, usage); } /* @@ -342,6 +349,27 @@ void fscache_put_context(struct fscache_cookie *cookie, void *context) 
cookie->def->put_context(cookie->netfs_data, context); } +/* + * Update the auxiliary data on a cookie. + */ +static inline +void fscache_update_aux(struct fscache_cookie *cookie, const void *aux_data) +{ + void *p; + + if (!aux_data) + return; + if (cookie->aux_len <= sizeof(cookie->inline_aux)) + p = cookie->inline_aux; + else + p = cookie->aux; + + if (memcmp(p, aux_data, cookie->aux_len) != 0) { + memcpy(p, aux_data, cookie->aux_len); + set_bit(FSCACHE_COOKIE_AUX_UPDATED, &cookie->flags); + } +} + /*****************************************************************************/ /* * debug tracing diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 249968dcbf5c..7dce110bf17d 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -16,6 +16,7 @@ #include <linux/completion.h> #include <linux/slab.h> #include <linux/seq_file.h> +#define CREATE_TRACE_POINTS #include "internal.h" MODULE_DESCRIPTION("FS Cache Manager"); diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c index a8aa00be4444..c2f605483cc5 100644 --- a/fs/fscache/netfs.c +++ b/fs/fscache/netfs.c @@ -14,69 +14,51 @@ #include <linux/slab.h> #include "internal.h" -static LIST_HEAD(fscache_netfs_list); - /* * register a network filesystem for caching */ int __fscache_register_netfs(struct fscache_netfs *netfs) { - struct fscache_netfs *ptr; - struct fscache_cookie *cookie; - int ret; + struct fscache_cookie *candidate, *cookie; _enter("{%s}", netfs->name); - INIT_LIST_HEAD(&netfs->link); - /* allocate a cookie for the primary index */ - cookie = kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL); - - if (!cookie) { + candidate = fscache_alloc_cookie(&fscache_fsdef_index, + &fscache_fsdef_netfs_def, + netfs->name, strlen(netfs->name), + &netfs->version, sizeof(netfs->version), + netfs, 0); + if (!candidate) { _leave(" = -ENOMEM"); return -ENOMEM; } - /* initialise the primary index cookie */ - atomic_set(&cookie->usage, 1); - atomic_set(&cookie->n_children, 0); - atomic_set(&cookie->n_active, 1); - - cookie->def = &fscache_fsdef_netfs_def; - cookie->parent = &fscache_fsdef_index; - cookie->netfs_data = netfs; - cookie->flags = 1 << FSCACHE_COOKIE_ENABLED; - - spin_lock_init(&cookie->lock); - spin_lock_init(&cookie->stores_lock); - INIT_HLIST_HEAD(&cookie->backing_objects); + candidate->flags = 1 << FSCACHE_COOKIE_ENABLED; /* check the netfs type is not already present */ - down_write(&fscache_addremove_sem); - - ret = -EEXIST; - list_for_each_entry(ptr, &fscache_netfs_list, link) { - if (strcmp(ptr->name, netfs->name) == 0) - goto already_registered; + cookie = fscache_hash_cookie(candidate); + if (!cookie) + goto already_registered; + if (cookie != candidate) { + trace_fscache_cookie(candidate, fscache_cookie_discard, 1); + fscache_free_cookie(candidate); } - atomic_inc(&cookie->parent->usage); + fscache_cookie_get(cookie->parent, fscache_cookie_get_register_netfs); atomic_inc(&cookie->parent->n_children); netfs->primary_index = cookie; - list_add(&netfs->link, &fscache_netfs_list); - ret = 0; pr_notice("Netfs '%s' registered for caching\n", netfs->name); + trace_fscache_netfs(netfs); + _leave(" = 0"); + return 0; already_registered: - up_write(&fscache_addremove_sem); - - if (ret < 0) - kmem_cache_free(fscache_cookie_jar, cookie); - - _leave(" = %d", ret); - return ret; + fscache_cookie_put(candidate, fscache_cookie_put_dup_netfs); + _leave(" = -EEXIST"); + return -EEXIST; } EXPORT_SYMBOL(__fscache_register_netfs); @@ -88,15 +70,8 @@ void __fscache_unregister_netfs(struct fscache_netfs *netfs) { _enter("{%s.%u}", netfs->name, 
netfs->version); - down_write(&fscache_addremove_sem); - - list_del(&netfs->link); - fscache_relinquish_cookie(netfs->primary_index, 0); - - up_write(&fscache_addremove_sem); - - pr_notice("Netfs '%s' unregistered from caching\n", - netfs->name); + fscache_relinquish_cookie(netfs->primary_index, NULL, false); + pr_notice("Netfs '%s' unregistered from caching\n", netfs->name); _leave(""); } diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 0438d4cd91ef..43e6e28c164f 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -36,8 +36,6 @@ struct fscache_objlist_data { #define FSCACHE_OBJLIST_CONFIG_NOEVENTS 0x00000800 /* show objects without no events */ #define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with work */ #define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without work */ - - u8 buf[512]; /* key and aux data buffer */ }; /* @@ -170,7 +168,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v) struct fscache_cookie *cookie; unsigned long config = data->config; char _type[3], *type; - u8 *buf = data->buf, *p; + u8 *p; if ((unsigned long) v == 1) { seq_puts(m, "OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS" @@ -254,7 +252,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v) if (fscache_use_cookie(obj)) { uint16_t keylen = 0, auxlen = 0; - switch (cookie->def->type) { + switch (cookie->type) { case 0: type = "IX"; break; @@ -263,7 +261,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v) break; default: snprintf(_type, sizeof(_type), "%02u", - cookie->def->type); + cookie->type); type = _type; break; } @@ -274,30 +272,30 @@ static int fscache_objlist_show(struct seq_file *m, void *v) cookie->flags, cookie->netfs_data); - if (cookie->def->get_key && - config & FSCACHE_OBJLIST_CONFIG_KEY) - keylen = cookie->def->get_key(cookie->netfs_data, - buf, 400); + if (config & FSCACHE_OBJLIST_CONFIG_KEY) + keylen = cookie->key_len; - if (cookie->def->get_aux && - config & FSCACHE_OBJLIST_CONFIG_AUX) - auxlen = cookie->def->get_aux(cookie->netfs_data, - buf + keylen, 512 - keylen); - fscache_unuse_cookie(obj); + if (config & FSCACHE_OBJLIST_CONFIG_AUX) + auxlen = cookie->aux_len; if (keylen > 0 || auxlen > 0) { seq_puts(m, " "); - for (p = buf; keylen > 0; keylen--) + p = keylen <= sizeof(cookie->inline_key) ? + cookie->inline_key : cookie->key; + for (; keylen > 0; keylen--) seq_printf(m, "%02x", *p++); if (auxlen > 0) { if (config & FSCACHE_OBJLIST_CONFIG_KEY) seq_puts(m, ", "); + p = auxlen <= sizeof(cookie->inline_aux) ? 
+ cookie->inline_aux : cookie->aux; for (; auxlen > 0; auxlen--) seq_printf(m, "%02x", *p++); } } seq_puts(m, "\n"); + fscache_unuse_cookie(obj); } else { seq_puts(m, "<no_netfs>\n"); } diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 7a182c87f378..1085ca12e25c 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -138,10 +138,13 @@ static const struct fscache_transition fscache_osm_run_oob[] = { { 0, NULL } }; -static int fscache_get_object(struct fscache_object *); -static void fscache_put_object(struct fscache_object *); +static int fscache_get_object(struct fscache_object *, + enum fscache_obj_ref_trace); +static void fscache_put_object(struct fscache_object *, + enum fscache_obj_ref_trace); static bool fscache_enqueue_dependents(struct fscache_object *, int); static void fscache_dequeue_object(struct fscache_object *); +static void fscache_update_aux_data(struct fscache_object *); /* * we need to notify the parent when an op completes that we had outstanding @@ -170,6 +173,7 @@ static void fscache_object_sm_dispatcher(struct fscache_object *object) const struct fscache_transition *t; const struct fscache_state *state, *new_state; unsigned long events, event_mask; + bool oob; int event = -1; ASSERT(object != NULL); @@ -188,6 +192,7 @@ restart_masked: if (events & object->oob_event_mask) { _debug("{OBJ%x} oob %lx", object->debug_id, events & object->oob_event_mask); + oob = true; for (t = object->oob_table; t->events; t++) { if (events & t->events) { state = t->transit_to; @@ -199,6 +204,7 @@ restart_masked: } } } + oob = false; /* Wait states are just transition tables */ if (!state->work) { @@ -207,6 +213,8 @@ restart_masked: if (events & t->events) { new_state = t->transit_to; event = fls(events & t->events) - 1; + trace_fscache_osm(object, state, + true, false, event); clear_bit(event, &object->events); _debug("{OBJ%x} ev %d: %s -> %s", object->debug_id, event, @@ -226,6 +234,7 @@ restart_masked: execute_work_state: _debug("{OBJ%x} exec %s", object->debug_id, state->name); + trace_fscache_osm(object, state, false, oob, event); new_state = state->work(object, event); event = -1; if (new_state == NO_TRANSIT) { @@ -279,7 +288,7 @@ static void fscache_object_work_func(struct work_struct *work) start = jiffies; fscache_object_sm_dispatcher(object); fscache_hist(fscache_objs_histogram, start); - fscache_put_object(object); + fscache_put_object(object, fscache_obj_put_work); } /** @@ -397,7 +406,7 @@ static const struct fscache_state *fscache_initialise_object(struct fscache_obje fscache_stat(&fscache_n_cop_grab_object); success = false; if (fscache_object_is_live(parent) && - object->cache->ops->grab_object(object)) { + object->cache->ops->grab_object(object, fscache_obj_get_add_to_deps)) { list_add(&object->dep_link, &parent->dependents); success = true; } @@ -703,6 +712,11 @@ static const struct fscache_state *fscache_drop_object(struct fscache_object *ob ASSERT(cookie != NULL); ASSERT(!hlist_unhashed(&object->cookie_link)); + if (test_bit(FSCACHE_COOKIE_AUX_UPDATED, &cookie->flags)) { + _debug("final update"); + fscache_update_aux_data(object); + } + /* Make sure the cookie no longer points here and that the netfs isn't * waiting for us. 
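fscache_drop_object() above now checks FSCACHE_COOKIE_AUX_UPDATED and writes the auxiliary data back one final time before the object is discarded; the fscache_update_aux() helper added to internal.h earlier is what sets that bit when the netfs hands in changed aux data. A rough standalone model of the mark-dirty-then-flush-on-teardown idea (the fixed-size aux buffer is an assumption for brevity):

#include <stdbool.h>
#include <string.h>

struct cached {
	unsigned char aux[8];		/* models cookie->inline_aux */
	bool aux_updated;		/* models FSCACHE_COOKIE_AUX_UPDATED */
};

/* Accept new aux data; mark dirty only if the bytes actually changed. */
static void update_aux(struct cached *c, const void *aux, size_t len)
{
	if (len != sizeof(c->aux) || memcmp(c->aux, aux, len) == 0)
		return;
	memcpy(c->aux, aux, len);
	c->aux_updated = true;
}

/* On teardown, flush dirty aux data exactly once. */
static void drop_object(struct cached *c, void (*write_back)(struct cached *))
{
	if (c->aux_updated) {
		write_back(c);
		c->aux_updated = false;
	}
}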
*/ @@ -745,7 +759,7 @@ static const struct fscache_state *fscache_drop_object(struct fscache_object *ob } /* this just shifts the object release to the work processor */ - fscache_put_object(object); + fscache_put_object(object, fscache_obj_put_drop_obj); fscache_stat(&fscache_n_object_dead); _leave(""); @@ -755,12 +769,13 @@ static const struct fscache_state *fscache_drop_object(struct fscache_object *ob /* * get a ref on an object */ -static int fscache_get_object(struct fscache_object *object) +static int fscache_get_object(struct fscache_object *object, + enum fscache_obj_ref_trace why) { int ret; fscache_stat(&fscache_n_cop_grab_object); - ret = object->cache->ops->grab_object(object) ? 0 : -EAGAIN; + ret = object->cache->ops->grab_object(object, why) ? 0 : -EAGAIN; fscache_stat_d(&fscache_n_cop_grab_object); return ret; } @@ -768,10 +783,11 @@ static int fscache_get_object(struct fscache_object *object) /* * Discard a ref on an object */ -static void fscache_put_object(struct fscache_object *object) +static void fscache_put_object(struct fscache_object *object, + enum fscache_obj_ref_trace why) { fscache_stat(&fscache_n_cop_put_object); - object->cache->ops->put_object(object); + object->cache->ops->put_object(object, why); fscache_stat_d(&fscache_n_cop_put_object); } @@ -786,7 +802,7 @@ void fscache_object_destroy(struct fscache_object *object) fscache_objlist_remove(object); /* We can get rid of the cookie now */ - fscache_cookie_put(object->cookie); + fscache_cookie_put(object->cookie, fscache_cookie_put_object); object->cookie = NULL; } EXPORT_SYMBOL(fscache_object_destroy); @@ -798,7 +814,7 @@ void fscache_enqueue_object(struct fscache_object *object) { _enter("{OBJ%x}", object->debug_id); - if (fscache_get_object(object) >= 0) { + if (fscache_get_object(object, fscache_obj_get_queue) >= 0) { wait_queue_head_t *cong_wq = &get_cpu_var(fscache_object_cong_wait); @@ -806,7 +822,7 @@ void fscache_enqueue_object(struct fscache_object *object) if (fscache_object_congested()) wake_up(cong_wq); } else - fscache_put_object(object); + fscache_put_object(object, fscache_obj_put_queue); put_cpu_var(fscache_object_cong_wait); } @@ -866,7 +882,7 @@ static bool fscache_enqueue_dependents(struct fscache_object *object, int event) list_del_init(&dep->dep_link); fscache_raise_event(dep, event); - fscache_put_object(dep); + fscache_put_object(dep, fscache_obj_put_enq_dep); if (!list_empty(&object->dependents) && need_resched()) { ret = false; @@ -906,7 +922,8 @@ static void fscache_dequeue_object(struct fscache_object *object) * and creation). 
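fscache_get_object() and fscache_put_object() above now carry an enum fscache_obj_ref_trace through to the backend's grab_object()/put_object(), so tracepoints can tell the call sites apart. The same idea reduced to plain C (the enum values and printf() stand in for real tracepoints):

#include <stdatomic.h>
#include <stdio.h>

/* One value per call site that takes or drops a reference. */
enum ref_trace { ref_get_queue, ref_put_queue, ref_put_work };

struct object {
	atomic_int refcount;
	int id;
};

static void trace_ref(struct object *obj, enum ref_trace why, int count)
{
	printf("obj %d refcount %d (why=%d)\n", obj->id, count, why);
}

static void object_get(struct object *obj, enum ref_trace why)
{
	trace_ref(obj, why, atomic_fetch_add(&obj->refcount, 1) + 1);
}

static void object_put(struct object *obj, enum ref_trace why)
{
	trace_ref(obj, why, atomic_fetch_sub(&obj->refcount, 1) - 1);
}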
*/ enum fscache_checkaux fscache_check_aux(struct fscache_object *object, - const void *data, uint16_t datalen) + const void *data, uint16_t datalen, + loff_t object_size) { enum fscache_checkaux result; @@ -916,7 +933,7 @@ enum fscache_checkaux fscache_check_aux(struct fscache_object *object, } result = object->cookie->def->check_aux(object->cookie->netfs_data, - data, datalen); + data, datalen, object_size); switch (result) { /* entry okay as is */ case FSCACHE_CHECKAUX_OKAY: @@ -972,11 +989,12 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj if (!op) goto nomem; - fscache_operation_init(op, object->cache->ops->invalidate_object, + fscache_operation_init(cookie, op, object->cache->ops->invalidate_object, NULL, NULL); op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE) | (1 << FSCACHE_OP_UNUSE_COOKIE); + trace_fscache_page_op(cookie, NULL, op, fscache_page_op_invalidate); spin_lock(&cookie->lock); if (fscache_submit_exclusive_op(object, op) < 0) @@ -1026,6 +1044,17 @@ static const struct fscache_state *fscache_invalidate_object(struct fscache_obje } /* + * Update auxiliary data. + */ +static void fscache_update_aux_data(struct fscache_object *object) +{ + fscache_stat(&fscache_n_updates_run); + fscache_stat(&fscache_n_cop_update_object); + object->cache->ops->update_object(object); + fscache_stat_d(&fscache_n_cop_update_object); +} + +/* * Asynchronously update an object. */ static const struct fscache_state *fscache_update_object(struct fscache_object *object, @@ -1033,10 +1062,7 @@ static const struct fscache_state *fscache_update_object(struct fscache_object * { _enter("{OBJ%x},%d", object->debug_id, event); - fscache_stat(&fscache_n_updates_run); - fscache_stat(&fscache_n_cop_update_object); - object->cache->ops->update_object(object); - fscache_stat_d(&fscache_n_cop_update_object); + fscache_update_aux_data(object); _leave(""); return transit_to(WAIT_FOR_CMD); diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index de67745e1cd7..e30c5975ea58 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -32,7 +32,8 @@ static void fscache_operation_dummy_cancel(struct fscache_operation *op) * Do basic initialisation of an operation. The caller must still set flags, * object and processor if needed. 
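fscache_check_aux() above now passes the object's size through to the netfs ->check_aux() hook alongside the stored auxiliary data, so a filesystem can retire a cache object whose recorded size no longer matches. A sketch of what such a hook might look like; the version-counter scheme is an assumption for illustration, not lifted from any particular filesystem:

#include <stdint.h>
#include <string.h>

enum checkaux { CHECKAUX_OKAY, CHECKAUX_NEEDS_UPDATE, CHECKAUX_OBSOLETE };

struct my_inode {
	uint32_t version;	/* bumped whenever the file changes */
	int64_t size;		/* authoritative file size */
};

/* Validate cached aux data (a version stamp) plus the cached size. */
static enum checkaux my_check_aux(const struct my_inode *inode,
				  const void *aux, uint16_t auxlen,
				  int64_t object_size)
{
	uint32_t cached_version;

	if (auxlen != sizeof(cached_version))
		return CHECKAUX_OBSOLETE;	/* malformed: discard */
	memcpy(&cached_version, aux, sizeof(cached_version));
	if (cached_version != inode->version)
		return CHECKAUX_OBSOLETE;	/* contents changed */
	if (object_size != inode->size)
		return CHECKAUX_OBSOLETE;	/* stale size: discard */
	return CHECKAUX_OKAY;
}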
*/ -void fscache_operation_init(struct fscache_operation *op, +void fscache_operation_init(struct fscache_cookie *cookie, + struct fscache_operation *op, fscache_operation_processor_t processor, fscache_operation_cancel_t cancel, fscache_operation_release_t release) @@ -46,6 +47,7 @@ void fscache_operation_init(struct fscache_operation *op, op->release = release; INIT_LIST_HEAD(&op->pend_link); fscache_stat(&fscache_n_op_initialised); + trace_fscache_op(cookie, op, fscache_op_init); } EXPORT_SYMBOL(fscache_operation_init); @@ -59,6 +61,8 @@ EXPORT_SYMBOL(fscache_operation_init); */ void fscache_enqueue_operation(struct fscache_operation *op) { + struct fscache_cookie *cookie = op->object->cookie; + _enter("{OBJ%x OP%x,%u}", op->object->debug_id, op->debug_id, atomic_read(&op->usage)); @@ -71,12 +75,14 @@ void fscache_enqueue_operation(struct fscache_operation *op) fscache_stat(&fscache_n_op_enqueue); switch (op->flags & FSCACHE_OP_TYPE) { case FSCACHE_OP_ASYNC: + trace_fscache_op(cookie, op, fscache_op_enqueue_async); _debug("queue async"); atomic_inc(&op->usage); if (!queue_work(fscache_op_wq, &op->work)) fscache_put_operation(op); break; case FSCACHE_OP_MYTHREAD: + trace_fscache_op(cookie, op, fscache_op_enqueue_mythread); _debug("queue for caller's attention"); break; default: @@ -101,6 +107,8 @@ static void fscache_run_op(struct fscache_object *object, wake_up_bit(&op->flags, FSCACHE_OP_WAITING); if (op->processor) fscache_enqueue_operation(op); + else + trace_fscache_op(object->cookie, op, fscache_op_run); fscache_stat(&fscache_n_op_run); } @@ -155,6 +163,8 @@ int fscache_submit_exclusive_op(struct fscache_object *object, _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); + trace_fscache_op(object->cookie, op, fscache_op_submit_ex); + ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED); ASSERTCMP(atomic_read(&op->usage), >, 0); @@ -240,6 +250,8 @@ int fscache_submit_op(struct fscache_object *object, _enter("{OBJ%x OP%x},{%u}", object->debug_id, op->debug_id, atomic_read(&op->usage)); + trace_fscache_op(object->cookie, op, fscache_op_submit); + ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED); ASSERTCMP(atomic_read(&op->usage), >, 0); @@ -357,6 +369,8 @@ int fscache_cancel_op(struct fscache_operation *op, _enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id); + trace_fscache_op(object->cookie, op, fscache_op_cancel); + ASSERTCMP(op->state, >=, FSCACHE_OP_ST_PENDING); ASSERTCMP(op->state, !=, FSCACHE_OP_ST_CANCELLED); ASSERTCMP(atomic_read(&op->usage), >, 0); @@ -419,6 +433,8 @@ void fscache_cancel_all_ops(struct fscache_object *object) fscache_stat(&fscache_n_op_cancelled); list_del_init(&op->pend_link); + trace_fscache_op(object->cookie, op, fscache_op_cancel_all); + ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING); op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; @@ -454,9 +470,11 @@ void fscache_op_complete(struct fscache_operation *op, bool cancelled) spin_lock(&object->lock); if (!cancelled) { + trace_fscache_op(object->cookie, op, fscache_op_completed); op->state = FSCACHE_OP_ST_COMPLETE; } else { op->cancel(op); + trace_fscache_op(object->cookie, op, fscache_op_cancelled); op->state = FSCACHE_OP_ST_CANCELLED; } @@ -488,6 +506,8 @@ void fscache_put_operation(struct fscache_operation *op) if (!atomic_dec_and_test(&op->usage)) return; + trace_fscache_op(op->object ? 
op->object->cookie : NULL, op, fscache_op_put); + _debug("PUT OP"); ASSERTIFCMP(op->state != FSCACHE_OP_ST_INITIALISED && op->state != FSCACHE_OP_ST_COMPLETE, @@ -563,6 +583,8 @@ void fscache_operation_gc(struct work_struct *work) spin_unlock(&cache->op_gc_list_lock); object = op->object; + trace_fscache_op(object->cookie, op, fscache_op_gc); + spin_lock(&object->lock); _debug("GC DEFERRED REL OBJ%x OP%x", @@ -601,6 +623,8 @@ void fscache_op_work_func(struct work_struct *work) _enter("{OBJ%x OP%x,%d}", op->object->debug_id, op->debug_id, atomic_read(&op->usage)); + trace_fscache_op(op->object->cookie, op, fscache_op_work); + ASSERT(op->processor != NULL); start = jiffies; op->processor(op); diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 961029e04027..111349f67d98 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -27,6 +27,7 @@ bool __fscache_check_page_write(struct fscache_cookie *cookie, struct page *page rcu_read_lock(); val = radix_tree_lookup(&cookie->stores, page->index); rcu_read_unlock(); + trace_fscache_check_page(cookie, page, val, 0); return val != NULL; } @@ -39,6 +40,8 @@ void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *pa { wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0); + trace_fscache_page(cookie, page, fscache_page_write_wait); + wait_event(*wq, !__fscache_check_page_write(cookie, page)); } EXPORT_SYMBOL(__fscache_wait_on_page_write); @@ -69,6 +72,8 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie, _enter("%p,%p,%x", cookie, page, gfp); + trace_fscache_page(cookie, page, fscache_page_maybe_release); + try_again: rcu_read_lock(); val = radix_tree_lookup(&cookie->stores, page->index); @@ -101,6 +106,7 @@ try_again: } xpage = radix_tree_delete(&cookie->stores, page->index); + trace_fscache_page(cookie, page, fscache_page_radix_delete); spin_unlock(&cookie->stores_lock); if (xpage) { @@ -112,6 +118,7 @@ try_again: } wake_up_bit(&cookie->flags, 0); + trace_fscache_wake_cookie(cookie); if (xpage) put_page(xpage); __fscache_uncache_page(cookie, page); @@ -144,7 +151,7 @@ static void fscache_end_page_write(struct fscache_object *object, struct page *page) { struct fscache_cookie *cookie; - struct page *xpage = NULL; + struct page *xpage = NULL, *val; spin_lock(&object->lock); cookie = object->cookie; @@ -154,13 +161,24 @@ static void fscache_end_page_write(struct fscache_object *object, spin_lock(&cookie->stores_lock); radix_tree_tag_clear(&cookie->stores, page->index, FSCACHE_COOKIE_STORING_TAG); + trace_fscache_page(cookie, page, fscache_page_radix_clear_store); if (!radix_tree_tag_get(&cookie->stores, page->index, FSCACHE_COOKIE_PENDING_TAG)) { fscache_stat(&fscache_n_store_radix_deletes); xpage = radix_tree_delete(&cookie->stores, page->index); + trace_fscache_page(cookie, page, fscache_page_radix_delete); + trace_fscache_page(cookie, page, fscache_page_write_end); + + val = radix_tree_lookup(&cookie->stores, page->index); + trace_fscache_check_page(cookie, page, val, 1); + } else { + trace_fscache_page(cookie, page, fscache_page_write_end_pend); } spin_unlock(&cookie->stores_lock); wake_up_bit(&cookie->flags, 0); + trace_fscache_wake_cookie(cookie); + } else { + trace_fscache_page(cookie, page, fscache_page_write_end_noc); } spin_unlock(&object->lock); if (xpage) @@ -185,9 +203,11 @@ static void fscache_attr_changed_op(struct fscache_operation *op) fscache_stat_d(&fscache_n_cop_attr_changed); if (ret < 0) fscache_abort_object(object); + fscache_op_complete(op, ret < 0); + } else { + 
fscache_op_complete(op, true); } - fscache_op_complete(op, true); _leave(""); } @@ -213,7 +233,8 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) return -ENOMEM; } - fscache_operation_init(op, fscache_attr_changed_op, NULL, NULL); + fscache_operation_init(cookie, op, fscache_attr_changed_op, NULL, NULL); + trace_fscache_page_op(cookie, NULL, op, fscache_page_op_attr_changed); op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE) | (1 << FSCACHE_OP_UNUSE_COOKIE); @@ -297,7 +318,7 @@ static struct fscache_retrieval *fscache_alloc_retrieval( return NULL; } - fscache_operation_init(&op->op, NULL, + fscache_operation_init(cookie, &op->op, NULL, fscache_do_cancel_retrieval, fscache_release_retrieval_op); op->op.flags = FSCACHE_OP_MYTHREAD | @@ -368,6 +389,7 @@ int fscache_wait_for_operation_activation(struct fscache_object *object, fscache_stat(stat_op_waits); if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING, TASK_INTERRUPTIBLE) != 0) { + trace_fscache_op(object->cookie, op, fscache_op_signal); ret = fscache_cancel_op(op, false); if (ret == 0) return -ERESTARTSYS; @@ -389,6 +411,7 @@ check_if_dead: if (unlikely(fscache_object_is_dying(object) || fscache_cache_is_broken(object))) { enum fscache_operation_state state = op->state; + trace_fscache_op(object->cookie, op, fscache_op_signal); fscache_cancel_op(op, true); if (stat_object_dead) fscache_stat(stat_object_dead); @@ -443,6 +466,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, return -ENOMEM; } atomic_set(&op->n_pages, 1); + trace_fscache_page_op(cookie, page, &op->op, fscache_page_op_retr_one); spin_lock(&cookie->lock); @@ -571,6 +595,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, if (!op) return -ENOMEM; atomic_set(&op->n_pages, *nr_pages); + trace_fscache_page_op(cookie, NULL, &op->op, fscache_page_op_retr_multi); spin_lock(&cookie->lock); @@ -682,6 +707,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, if (!op) return -ENOMEM; atomic_set(&op->n_pages, 1); + trace_fscache_page_op(cookie, page, &op->op, fscache_page_op_alloc_one); spin_lock(&cookie->lock); @@ -776,15 +802,17 @@ static void fscache_write_op(struct fscache_operation *_op) _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage)); +again: spin_lock(&object->lock); cookie = object->cookie; if (!fscache_object_is_active(object)) { - /* If we get here, then the on-disk cache object likely longer - * exists, so we should just cancel this write operation. + /* If we get here, then the on-disk cache object likely no + * longer exists, so we should just cancel this write + * operation. */ spin_unlock(&object->lock); - fscache_op_complete(&op->op, false); + fscache_op_complete(&op->op, true); _leave(" [inactive]"); return; } @@ -797,7 +825,7 @@ static void fscache_write_op(struct fscache_operation *_op) * cancel this write operation. 
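This hunk (here and in the cookie-check path just below) flips the second argument of fscache_op_complete() to true, so a write abandoned because the backing object died is accounted as cancelled rather than completed. A simplified model of the check-liveness-under-lock, cancel-if-dead shape (pthread locking stands in for the kernel spinlocks):

#include <pthread.h>
#include <stdbool.h>

struct object {
	pthread_mutex_t lock;
	bool active;
};

struct operation {
	bool done;
	bool cancelled;
};

static void op_complete(struct operation *op, bool cancelled)
{
	op->done = true;
	op->cancelled = cancelled;
}

/* Run the work only while the object is live; otherwise record a
 * cancellation, never a successful completion. */
static bool run_write(struct object *obj, struct operation *op)
{
	pthread_mutex_lock(&obj->lock);
	if (!obj->active) {
		pthread_mutex_unlock(&obj->lock);
		op_complete(op, true);		/* object died: cancel */
		return false;
	}
	pthread_mutex_unlock(&obj->lock);
	/* ... do the write, then op_complete(op, false) ... */
	return true;
}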
*/ spin_unlock(&object->lock); - fscache_op_complete(&op->op, false); + fscache_op_complete(&op->op, true); _leave(" [cancel] op{f=%lx s=%u} obj{s=%s f=%lx}", _op->flags, _op->state, object->state->short_name, object->flags); @@ -809,30 +837,33 @@ static void fscache_write_op(struct fscache_operation *_op) fscache_stat(&fscache_n_store_calls); /* find a page to store */ + results[0] = NULL; page = NULL; n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 1, FSCACHE_COOKIE_PENDING_TAG); + trace_fscache_gang_lookup(cookie, &op->op, results, n, op->store_limit); if (n != 1) goto superseded; page = results[0]; _debug("gang %d [%lx]", n, page->index); - if (page->index >= op->store_limit) { - fscache_stat(&fscache_n_store_pages_over_limit); - goto superseded; - } radix_tree_tag_set(&cookie->stores, page->index, FSCACHE_COOKIE_STORING_TAG); radix_tree_tag_clear(&cookie->stores, page->index, FSCACHE_COOKIE_PENDING_TAG); + trace_fscache_page(cookie, page, fscache_page_radix_pend2store); spin_unlock(&cookie->stores_lock); spin_unlock(&object->lock); + if (page->index >= op->store_limit) + goto discard_page; + fscache_stat(&fscache_n_store_pages); fscache_stat(&fscache_n_cop_write_page); ret = object->cache->ops->write_page(op, page); fscache_stat_d(&fscache_n_cop_write_page); + trace_fscache_wrote_page(cookie, page, &op->op, ret); fscache_end_page_write(object, page); if (ret < 0) { fscache_abort_object(object); @@ -844,6 +875,12 @@ static void fscache_write_op(struct fscache_operation *_op) _leave(""); return; +discard_page: + fscache_stat(&fscache_n_store_pages_over_limit); + trace_fscache_wrote_page(cookie, page, &op->op, -ENOBUFS); + fscache_end_page_write(object, page); + goto again; + superseded: /* this writer is going away and there aren't any more things to * write */ @@ -851,7 +888,7 @@ superseded: spin_unlock(&cookie->stores_lock); clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); spin_unlock(&object->lock); - fscache_op_complete(&op->op, true); + fscache_op_complete(&op->op, false); _leave(""); } @@ -879,6 +916,8 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) for (i = n - 1; i >= 0; i--) { page = results[i]; radix_tree_delete(&cookie->stores, page->index); + trace_fscache_page(cookie, page, fscache_page_radix_delete); + trace_fscache_page(cookie, page, fscache_page_inval); } spin_unlock(&cookie->stores_lock); @@ -888,6 +927,7 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) } wake_up_bit(&cookie->flags, 0); + trace_fscache_wake_cookie(cookie); _leave(""); } @@ -923,6 +963,7 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) */ int __fscache_write_page(struct fscache_cookie *cookie, struct page *page, + loff_t object_size, gfp_t gfp) { struct fscache_storage *op; @@ -946,7 +987,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, if (!op) goto nomem; - fscache_operation_init(&op->op, fscache_write_op, NULL, + fscache_operation_init(cookie, &op->op, fscache_write_op, NULL, fscache_release_write_op); op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING) | @@ -956,6 +997,8 @@ int __fscache_write_page(struct fscache_cookie *cookie, if (ret < 0) goto nomem_free; + trace_fscache_page_op(cookie, page, &op->op, fscache_page_op_write_one); + ret = -ENOBUFS; spin_lock(&cookie->lock); @@ -967,9 +1010,15 @@ int __fscache_write_page(struct fscache_cookie *cookie, if (test_bit(FSCACHE_IOERROR, &object->cache->flags)) goto nobufs; + trace_fscache_page(cookie, page, fscache_page_write); + /* add the page to the pending-storage 
radix tree on the backing * object */ spin_lock(&object->lock); + + if (object->store_limit_l != object_size) + fscache_set_store_limit(object, object_size); + spin_lock(&cookie->stores_lock); _debug("store limit %llx", (unsigned long long) object->store_limit); @@ -982,8 +1031,10 @@ goto nobufs_unlock_obj; } + trace_fscache_page(cookie, page, fscache_page_radix_insert); radix_tree_tag_set(&cookie->stores, page->index, FSCACHE_COOKIE_PENDING_TAG); + trace_fscache_page(cookie, page, fscache_page_radix_set_pend); get_page(page); /* we only want one writer at a time, but we do need to queue new @@ -1026,6 +1077,7 @@ already_pending: submit_failed: spin_lock(&cookie->stores_lock); radix_tree_delete(&cookie->stores, page->index); + trace_fscache_page(cookie, page, fscache_page_radix_delete); spin_unlock(&cookie->stores_lock); wake_cookie = __fscache_unuse_cookie(cookie); put_page(page); @@ -1072,6 +1124,8 @@ void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page) if (!PageFsCache(page)) goto done; + trace_fscache_page(cookie, page, fscache_page_uncache); + /* get the object */ spin_lock(&cookie->lock); @@ -1120,6 +1174,8 @@ void fscache_mark_page_cached(struct fscache_retrieval *op, struct page *page) atomic_inc(&fscache_n_marks); #endif + trace_fscache_page(cookie, page, fscache_page_cached); + _debug("- mark %p{%lx}", page, page->index); if (TestSetPageFsCache(page)) { static bool once_only; diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 7ac6e839b065..fcc8c2f2690e 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -21,7 +21,6 @@ atomic_t fscache_n_op_pend; atomic_t fscache_n_op_run; atomic_t fscache_n_op_enqueue; -atomic_t fscache_n_op_requeue; atomic_t fscache_n_op_deferred_release; atomic_t fscache_n_op_initialised; atomic_t fscache_n_op_release; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 624f18bbfd2b..ef309958e060 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1080,6 +1080,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_time_gran = 1; sb->s_export_op = &fuse_export_operations; + sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE; + if (sb->s_user_ns != &init_user_ns) + sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; file = fget(d.fd); err = -EINVAL; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 2f725b4a386b..f58716567972 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -940,13 +940,13 @@ failed: } /** - * gfs2_set_page_dirty - Page dirtying function + * jdata_set_page_dirty - Page dirtying function * @page: The page to dirty * * Returns: 1 if it dirtied the page, or 0 otherwise */ -static int gfs2_set_page_dirty(struct page *page) +static int jdata_set_page_dirty(struct page *page) { SetPageChecked(page); return __set_page_dirty_buffers(page); } @@ -1214,7 +1214,7 @@ static const struct address_space_operations gfs2_ordered_aops = { .readpages = gfs2_readpages, .write_begin = gfs2_write_begin, .write_end = gfs2_write_end, - .set_page_dirty = gfs2_set_page_dirty, + .set_page_dirty = __set_page_dirty_buffers, .bmap = gfs2_bmap, .invalidatepage = gfs2_invalidatepage, .releasepage = gfs2_releasepage, @@ -1231,7 +1231,7 @@ static const struct address_space_operations gfs2_jdata_aops = { .readpages = gfs2_readpages, .write_begin = gfs2_write_begin, .write_end = gfs2_write_end, - .set_page_dirty = gfs2_set_page_dirty, + .set_page_dirty = jdata_set_page_dirty, .bmap = gfs2_bmap, .invalidatepage =
gfs2_invalidatepage, .releasepage = gfs2_releasepage, diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 86d6a4435c87..685c305cbeb6 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -491,14 +491,12 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - struct super_block *sb = sdp->sd_vfs; struct buffer_head *dibh = mp->mp_bh[0]; u64 bn; unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; unsigned dblks = 0; unsigned ptrs_per_blk; const unsigned end_of_metadata = mp->mp_fheight - 1; - int ret; enum alloc_state state; __be64 *ptr; __be64 zero_bn = 0; @@ -607,15 +605,6 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, iomap->flags |= IOMAP_F_NEW; while (n-- > 0) *ptr++ = cpu_to_be64(bn++); - if (flags & IOMAP_ZERO) { - ret = sb_issue_zeroout(sb, iomap->addr >> inode->i_blkbits, - dblks, GFP_NOFS); - if (ret) { - fs_err(sdp, - "Failed to zero data buffers\n"); - flags &= ~IOMAP_ZERO; - } - } break; } } while (iomap->addr == IOMAP_NULL_ADDR); @@ -807,23 +796,27 @@ do_alloc: iomap->length = hole_size(inode, lblock, &mp); else iomap->length = size - pos; - } else { - if (height <= ip->i_height) - iomap->length = hole_size(inode, lblock, &mp); } goto out_release; } /** - * gfs2_block_map - Map a block from an inode to a disk block + * gfs2_block_map - Map one or more blocks of an inode to a disk block * @inode: The inode * @lblock: The logical block number * @bh_map: The bh to be mapped * @create: True if it's ok to alloc blocks to satisfy the request * - * Sets buffer_mapped() if successful, sets buffer_boundary() if a - * read of metadata will be required before the next block can be - * mapped. Sets buffer_new() if new blocks were allocated. + * The size of the requested mapping is defined in bh_map->b_size. + * + * Clears buffer_mapped(bh_map) and leaves bh_map->b_size unchanged + * when @lblock is not mapped. Sets buffer_mapped(bh_map) and + * bh_map->b_size to indicate the size of the mapping when @lblock and + * successive blocks are mapped, up to the requested size. + * + * Sets buffer_boundary() if a read of metadata will be required + * before the next block can be mapped. Sets buffer_new() if new + * blocks were allocated. * * Returns: errno */ @@ -842,8 +835,6 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, if (create) flags |= IOMAP_WRITE; - if (buffer_zeronew(bh_map)) - flags |= IOMAP_ZERO; ret = gfs2_iomap_begin(inode, (loff_t)lblock << inode->i_blkbits, bh_map->b_size, flags, &iomap); if (ret) { @@ -1347,6 +1338,7 @@ static inline bool walk_done(struct gfs2_sbd *sdp, static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + u64 maxsize = sdp->sd_heightsize[ip->i_height]; struct metapath mp = {}; struct buffer_head *dibh, *bh; struct gfs2_holder rd_gh; @@ -1362,6 +1354,14 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length) u64 prev_bnr = 0; __be64 *start, *end; + if (offset >= maxsize) { + /* + * The starting point lies beyond the allocated meta-data; + * there are no blocks to deallocate. + */ + return 0; + } + /* * The start position of the hole is defined by lblock, start_list, and * start_aligned.
The end position of the hole is defined by lend, @@ -1375,7 +1375,6 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length) */ if (length) { - u64 maxsize = sdp->sd_heightsize[ip->i_height]; u64 end_offset = offset + length; u64 lend; diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 7c21aea0266b..d9fb0ad6cc30 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1940,7 +1940,6 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, { struct buffer_head *bh; struct gfs2_dirent *dent; - int error; dent = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, &bh); if (!dent) { @@ -1953,18 +1952,10 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, gfs2_trans_add_meta(dip->i_gl, bh); gfs2_inum_out(nip, dent); dent->de_type = cpu_to_be16(new_type); - - if (dip->i_diskflags & GFS2_DIF_EXHASH) { - brelse(bh); - error = gfs2_meta_inode_buffer(dip, &bh); - if (error) - return error; - gfs2_trans_add_meta(dip->i_gl, bh); - } + brelse(bh); dip->i_inode.i_mtime = dip->i_inode.i_ctime = current_time(&dip->i_inode); - gfs2_dinode_out(dip, bh->b_data); - brelse(bh); + mark_inode_dirty_sync(&dip->i_inode); return 0; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 4f88e201b3f0..4b71f021a9e2 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -729,11 +729,12 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, int mode) { + struct super_block *sb = inode->i_sb; struct gfs2_inode *ip = GFS2_I(inode); + loff_t end = offset + len; struct buffer_head *dibh; + struct iomap iomap; int error; - unsigned int nr_blks; - sector_t lblock = offset >> inode->i_blkbits; error = gfs2_meta_inode_buffer(ip, &dibh); if (unlikely(error)) @@ -747,21 +748,19 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, goto out; } - while (len) { - struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; - bh_map.b_size = len; - set_buffer_zeronew(&bh_map); - - error = gfs2_block_map(inode, lblock, &bh_map, 1); - if (unlikely(error)) + while (offset < end) { + error = gfs2_iomap_begin(inode, offset, end - offset, + IOMAP_WRITE, &iomap); + if (error) goto out; - len -= bh_map.b_size; - nr_blks = bh_map.b_size >> inode->i_blkbits; - lblock += nr_blks; - if (!buffer_new(&bh_map)) + offset = iomap.offset + iomap.length; + if (iomap.type != IOMAP_HOLE) continue; - if (unlikely(!buffer_zeronew(&bh_map))) { - error = -EIO; + error = sb_issue_zeroout(sb, iomap.addr >> inode->i_blkbits, + iomap.length >> inode->i_blkbits, + GFP_NOFS); + if (error) { + fs_err(GFS2_SB(inode), "Failed to zero data buffers\n"); goto out; } } @@ -809,7 +808,7 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_alloc_parms ap = { .aflags = 0, }; unsigned int data_blocks = 0, ind_blocks = 0, rblocks; - loff_t bytes, max_bytes, max_blks = UINT_MAX; + loff_t bytes, max_bytes, max_blks; int error; const loff_t pos = offset; const loff_t count = len; @@ -861,7 +860,8 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t return error; /* ap.allowed tells us how many blocks quota will allow * us to write. 
Check if this reduces max_blks */ - if (ap.allowed && ap.allowed < max_blks) + max_blks = UINT_MAX; + if (ap.allowed) max_blks = ap.allowed; error = gfs2_inplace_reserve(ip, &ap); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index e0557b8a590a..1b6b1e3f5caf 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -130,15 +130,12 @@ static inline bool gfs2_rbm_eq(const struct gfs2_rbm *rbm1, enum gfs2_state_bits { BH_Pinned = BH_PrivateStart, BH_Escaped = BH_PrivateStart + 1, - BH_Zeronew = BH_PrivateStart + 2, }; BUFFER_FNS(Pinned, pinned) TAS_BUFFER_FNS(Pinned, pinned) BUFFER_FNS(Escaped, escaped) TAS_BUFFER_FNS(Escaped, escaped) -BUFFER_FNS(Zeronew, zeronew) -TAS_BUFFER_FNS(Zeronew, zeronew) struct gfs2_bufdata { struct buffer_head *bd_bh; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 59e0560180ec..8700eb815638 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1326,19 +1326,11 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) static int update_moved_ino(struct gfs2_inode *ip, struct gfs2_inode *ndip, int dir_rename) { - int error; - struct buffer_head *dibh; - if (dir_rename) return gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR); - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - return error; ip->i_inode.i_ctime = current_time(&ip->i_inode); - gfs2_trans_add_meta(ip->i_gl, dibh); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); + mark_inode_dirty_sync(&ip->i_inode); return 0; } diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index cf6b46247df4..0248835625f1 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -73,7 +73,7 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, * */ -void gfs2_remove_from_ail(struct gfs2_bufdata *bd) +static void gfs2_remove_from_ail(struct gfs2_bufdata *bd) { bd->bd_tr = NULL; list_del_init(&bd->bd_ail_st_list); diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 93b52ac1ca1f..1862e310a067 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -70,7 +70,6 @@ extern void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 type); extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); -extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc); extern void gfs2_log_shutdown(struct gfs2_sbd *sdp); diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 5e47c935a515..836f29480be6 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -45,6 +45,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); int ret; + + ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */ if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return 0; ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index b6b258998bcd..d8b622c375ab 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -15,6 +15,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/crc32c.h> +#include <linux/ktime.h> #include "gfs2.h" #include "incore.h" @@ -409,12 +410,13 @@ void gfs2_recover_func(struct work_struct *work) struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct gfs2_log_header_host head; struct gfs2_holder j_gh, ji_gh, thaw_gh; - unsigned long t; + ktime_t t_start, t_jlck, t_jhd, t_tlck, t_rep; int ro = 0; unsigned int pass; int error; int jlocked = 0; + t_start = ktime_get(); if 
(sdp->sd_args.ar_spectator || (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) { fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n", @@ -446,6 +448,7 @@ void gfs2_recover_func(struct work_struct *work) fs_info(sdp, "jid=%u, already locked for use\n", jd->jd_jid); } + t_jlck = ktime_get(); fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid); error = gfs2_jdesc_check(jd); @@ -455,13 +458,12 @@ void gfs2_recover_func(struct work_struct *work) error = gfs2_find_jhead(jd, &head); if (error) goto fail_gunlock_ji; + t_jhd = ktime_get(); if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n", jd->jd_jid); - t = jiffies; - /* Acquire a shared hold on the freeze lock */ error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, @@ -495,6 +497,7 @@ void gfs2_recover_func(struct work_struct *work) goto fail_gunlock_thaw; } + t_tlck = ktime_get(); fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid); for (pass = 0; pass < 2; pass++) { @@ -509,9 +512,14 @@ void gfs2_recover_func(struct work_struct *work) clean_journal(jd, &head); gfs2_glock_dq_uninit(&thaw_gh); - t = DIV_ROUND_UP(jiffies - t, HZ); - fs_info(sdp, "jid=%u: Journal replayed in %lus\n", - jd->jd_jid, t); + t_rep = ktime_get(); + fs_info(sdp, "jid=%u: Journal replayed in %lldms [jlck:%lldms, " + "jhead:%lldms, tlck:%lldms, replay:%lldms]\n", + jd->jd_jid, ktime_ms_delta(t_rep, t_start), + ktime_ms_delta(t_jlck, t_start), + ktime_ms_delta(t_jhd, t_jlck), + ktime_ms_delta(t_tlck, t_jhd), + ktime_ms_delta(t_rep, t_tlck)); } gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 620be0521866..cf5c7f3080d2 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -800,7 +800,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) int need_endtrans = 0; int ret; - if (!(flags & (I_DIRTY_DATASYNC|I_DIRTY_SYNC))) + if (!(flags & I_DIRTY_INODE)) return; if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) return; diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index b9318b49ff8f..cb10b95efe0f 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -515,6 +515,7 @@ TRACE_EVENT(gfs2_iomap_end, __field( u64, inum ) __field( loff_t, offset ) __field( ssize_t, length ) + __field( sector_t, pblock ) __field( u16, flags ) __field( u16, type ) __field( int, ret ) @@ -525,16 +526,20 @@ TRACE_EVENT(gfs2_iomap_end, __entry->inum = ip->i_no_addr; __entry->offset = iomap->offset; __entry->length = iomap->length; + __entry->pblock = iomap->addr == IOMAP_NULL_ADDR ? 
0 : + (iomap->addr >> ip->i_inode.i_blkbits); __entry->flags = iomap->flags; __entry->type = iomap->type; __entry->ret = ret; ), - TP_printk("%u,%u bmap %llu iomap end %llu/%lu ty:%d flags:%08x rc:%d", + TP_printk("%u,%u bmap %llu iomap end %llu/%lu to %llu ty:%d flags:%08x rc:%d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->inum, (unsigned long long)__entry->offset, - (unsigned long)__entry->length, (u16)__entry->type, + (unsigned long)__entry->length, + (long long)__entry->pblock, + (u16)__entry->type, (u16)__entry->flags, __entry->ret) ); diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 05de20954659..f2bce1e0f6fb 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -308,7 +308,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, } ip->i_inode.i_ctime = current_time(&ip->i_inode); - __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); + __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC); gfs2_trans_end(sdp); @@ -768,7 +768,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, goto out_end_trans; ip->i_inode.i_ctime = current_time(&ip->i_inode); - __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); + __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC); out_end_trans: gfs2_trans_end(GFS2_SB(&ip->i_inode)); @@ -896,7 +896,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, ea_set_remove_stuffed(ip, es->es_el); ip->i_inode.i_ctime = current_time(&ip->i_inode); - __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); + __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC); gfs2_trans_end(GFS2_SB(&ip->i_inode)); return error; @@ -1114,7 +1114,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) } ip->i_inode.i_ctime = current_time(&ip->i_inode); - __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); + __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC); gfs2_trans_end(GFS2_SB(&ip->i_inode)); diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index ffaec2e7526c..cb8374af08a6 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -84,7 +84,7 @@ extern int set_attr(const char *file, struct hostfs_iattr *attrs, int fd); extern int make_symlink(const char *from, const char *to); extern int unlink_file(const char *file); extern int do_mkdir(const char *file, int mode); -extern int do_rmdir(const char *file); +extern int hostfs_do_rmdir(const char *file); extern int do_mknod(const char *file, int mode, unsigned int major, unsigned int minor); extern int link_file(const char *from, const char *to); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index c148e7f4f451..3cd85eb5bbb1 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -706,7 +706,7 @@ static int hostfs_rmdir(struct inode *ino, struct dentry *dentry) if ((file = dentry_name(dentry)) == NULL) return -ENOMEM; - err = do_rmdir(file); + err = hostfs_do_rmdir(file); __putname(file); return err; } diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 9c1e0f019880..5ecc4706172b 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -304,7 +304,7 @@ int do_mkdir(const char *file, int mode) return 0; } -int do_rmdir(const char *file) +int hostfs_do_rmdir(const char *file) { int err; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8fe1b0aa2896..d508c7844681 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -108,6 +108,16 @@ static void 
huge_pagevec_release(struct pagevec *pvec) pagevec_reinit(pvec); } +/* + * Mask used when checking the page offset value passed in via system + * calls. This value will be converted to a loff_t which is signed. + * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the + * value. The extra bit (- 1 in the shift value) is to take the sign + * bit into account. + */ +#define PGOFF_LOFFT_MAX \ + (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1))) + static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); @@ -127,12 +137,17 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_ops = &hugetlb_vm_ops; /* - * Offset passed to mmap (before page shift) could have been - * negative when represented as a (l)off_t. + * page based offset in vm_pgoff could be sufficiently large to + * overflow a loff_t when converted to byte offset. This can + * only happen on architectures where sizeof(loff_t) == + * sizeof(unsigned long). So, only check in those instances. */ - if (((loff_t)vma->vm_pgoff << PAGE_SHIFT) < 0) - return -EINVAL; + if (sizeof(unsigned long) == sizeof(loff_t)) { + if (vma->vm_pgoff & PGOFF_LOFFT_MAX) + return -EINVAL; + } + /* must be huge page aligned */ if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; diff --git a/fs/inode.c b/fs/inode.c index ef362364d396..b153aeaa61ea 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -346,9 +346,8 @@ void inc_nlink(struct inode *inode) } EXPORT_SYMBOL(inc_nlink); -void address_space_init_once(struct address_space *mapping) +static void __address_space_init_once(struct address_space *mapping) { - memset(mapping, 0, sizeof(*mapping)); INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC | __GFP_ACCOUNT); spin_lock_init(&mapping->tree_lock); init_rwsem(&mapping->i_mmap_rwsem); @@ -356,6 +355,12 @@ void address_space_init_once(struct address_space *mapping) spin_lock_init(&mapping->private_lock); mapping->i_mmap = RB_ROOT_CACHED; } + +void address_space_init_once(struct address_space *mapping) +{ + memset(mapping, 0, sizeof(*mapping)); + __address_space_init_once(mapping); +} EXPORT_SYMBOL(address_space_init_once); /* @@ -371,7 +376,7 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_wb_list); INIT_LIST_HEAD(&inode->i_lru); - address_space_init_once(&inode->i_data); + __address_space_init_once(&inode->i_data); i_size_ordered_init(inode); } EXPORT_SYMBOL(inode_init_once); @@ -1533,7 +1538,6 @@ retry: if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) { if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) { atomic_inc(&inode->i_count); - inode->i_state &= ~I_DIRTY_TIME; spin_unlock(&inode->i_lock); trace_writeback_lazytime_iput(inode); mark_inode_dirty_sync(inode); diff --git a/fs/internal.h b/fs/internal.h index df262f41a0ef..e08972db0303 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -55,7 +55,15 @@ extern void __init chrdev_init(void); extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); +long do_mknodat(int dfd, const char __user *filename, umode_t mode, + unsigned int dev); +long do_mkdirat(int dfd, const char __user *pathname, umode_t mode); +long do_rmdir(int dfd, const char __user *pathname); long do_unlinkat(int dfd, struct filename *name); +long do_symlinkat(const char __user *oldname, int newdfd, + 
const char __user *newname); +int do_linkat(int olddfd, const char __user *oldname, int newdfd, + const char __user *newname, int flags); /* * namespace.c @@ -111,7 +119,12 @@ extern struct file *do_filp_open(int dfd, struct filename *pathname, extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, const char *, const struct open_flags *); -extern int open_check_o_direct(struct file *f); +long do_sys_ftruncate(unsigned int fd, loff_t length, int small); +long do_faccessat(int dfd, const char __user *filename, int mode); +int do_fchmodat(int dfd, const char __user *filename, umode_t mode); +int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, + int flag); + extern int vfs_open(const struct path *, struct file *, const struct cred *); extern struct file *filp_clone_open(struct file *); diff --git a/fs/ioctl.c b/fs/ioctl.c index 5ace7efb0d04..4823431d1c9d 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -689,7 +689,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, return error; } -SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) +int ksys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) { int error; struct fd f = fdget(fd); @@ -702,3 +702,8 @@ SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) fdput(f); return error; } + +SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) +{ + return ksys_ioctl(fd, cmd, arg); +} diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 3fbf48ec2188..dfb057900e79 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -974,7 +974,7 @@ out: } /* - * This is a variaon of __jbd2_update_log_tail which checks for validity of + * This is a variation of __jbd2_update_log_tail which checks for validity of * provided log tail and locks j_checkpoint_mutex. So it is safe against races * with other threads updating log tail. */ @@ -1417,6 +1417,9 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, journal_superblock_t *sb = journal->j_superblock; int ret; + if (is_journal_aborted(journal)) + return -EIO; + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", tail_block, tail_tid); @@ -1483,12 +1486,15 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op) void jbd2_journal_update_sb_errno(journal_t *journal) { journal_superblock_t *sb = journal->j_superblock; + int errcode; read_lock(&journal->j_state_lock); - jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", - journal->j_errno); - sb->s_errno = cpu_to_be32(journal->j_errno); + errcode = journal->j_errno; read_unlock(&journal->j_state_lock); + if (errcode == -ESHUTDOWN) + errcode = 0; + jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode); + sb->s_errno = cpu_to_be32(errcode); jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA); } @@ -2105,12 +2111,22 @@ void __jbd2_journal_abort_hard(journal_t *journal) * but don't do any other IO. 
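jbd2_journal_update_sb_errno() above now snapshots j_errno under j_state_lock, releases the lock before the slow superblock write, and filters -ESHUTDOWN down to 0 so a deliberate shutdown is not persisted on disk as an error. A compressed sketch of that snapshot-filter-write sequence (field names are stand-ins for the journal structures):

#include <errno.h>
#include <pthread.h>

struct journal {
	pthread_rwlock_t state_lock;	/* models j_state_lock */
	int errno_val;			/* models journal->j_errno */
	int sb_errno;			/* models the on-disk field */
};

static void update_sb_errno(struct journal *j)
{
	int errcode;

	/* Copy the error code under the lock ... */
	pthread_rwlock_rdlock(&j->state_lock);
	errcode = j->errno_val;
	pthread_rwlock_unlock(&j->state_lock);

	/* ... filter a clean shutdown ... */
	if (errcode == -ESHUTDOWN)
		errcode = 0;

	/* ... and only then do the (potentially slow) write. */
	j->sb_errno = errcode;
}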
*/ static void __journal_abort_soft (journal_t *journal, int errno) { - if (journal->j_flags & JBD2_ABORT) - return; + int old_errno; - if (!journal->j_errno) + write_lock(&journal->j_state_lock); + old_errno = journal->j_errno; + if (!journal->j_errno || errno == -ESHUTDOWN) journal->j_errno = errno; + if (journal->j_flags & JBD2_ABORT) { + write_unlock(&journal->j_state_lock); + if (!old_errno && old_errno != -ESHUTDOWN && + errno == -ESHUTDOWN) + jbd2_journal_update_sb_errno(journal); + return; + } + write_unlock(&journal->j_state_lock); + __jbd2_journal_abort_hard(journal); if (errno) { diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index f99910b69c78..a4967b27ffb6 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -600,8 +600,8 @@ static int do_one_pass(journal_t *journal, success = -EFSBADCRC; printk(KERN_ERR "JBD2: Invalid " "checksum recovering " - "block %llu in log\n", - blocknr); + "data block %llu in " + "log\n", blocknr); block_error = 1; goto skip_write; } diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index 4a6cf289be24..83b8f06b4a64 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -21,14 +21,6 @@ #include <linux/pagemap.h> #include "nodelist.h" -struct erase_priv_struct { - struct jffs2_eraseblock *jeb; - struct jffs2_sb_info *c; -}; - -#ifndef __ECOS -static void jffs2_erase_callback(struct erase_info *); -#endif static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset); static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); @@ -51,7 +43,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, jffs2_dbg(1, "%s(): erase block %#08x (range %#08x-%#08x)\n", __func__, jeb->offset, jeb->offset, jeb->offset + c->sector_size); - instr = kmalloc(sizeof(struct erase_info) + sizeof(struct erase_priv_struct), GFP_KERNEL); + instr = kmalloc(sizeof(struct erase_info), GFP_KERNEL); if (!instr) { pr_warn("kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n"); mutex_lock(&c->erase_free_sem); @@ -67,18 +59,15 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, memset(instr, 0, sizeof(*instr)); - instr->mtd = c->mtd; instr->addr = jeb->offset; instr->len = c->sector_size; - instr->callback = jffs2_erase_callback; - instr->priv = (unsigned long)(&instr[1]); - - ((struct erase_priv_struct *)instr->priv)->jeb = jeb; - ((struct erase_priv_struct *)instr->priv)->c = c; ret = mtd_erase(c->mtd, instr); - if (!ret) + if (!ret) { + jffs2_erase_succeeded(c, jeb); + kfree(instr); return; + } bad_offset = instr->fail_addr; kfree(instr); @@ -214,22 +203,6 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock wake_up(&c->erase_wait); } -#ifndef __ECOS -static void jffs2_erase_callback(struct erase_info *instr) -{ - struct erase_priv_struct *priv = (void *)instr->priv; - - if(instr->state != MTD_ERASE_DONE) { - pr_warn("Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", - (unsigned long long)instr->addr, instr->state); - jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr); - } else { - jffs2_erase_succeeded(priv->c, priv->jeb); - } - kfree(instr); -} -#endif /* !__ECOS */ - /* Hmmm. Maybe we should accept the extra space it takes and make this a standard doubly-linked list? 
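The jffs2 hunks above delete the erase callback and its erase_priv_struct because mtd_erase() is treated as synchronous here: the outcome is known when the call returns, so success handling moves inline and the erase_info can be freed on the spot. A toy version of the simplified control flow (fake_erase() is a stub standing in for the MTD call):

#include <stdint.h>
#include <stdlib.h>

struct erase_info {
	uint64_t addr;
	uint64_t len;
	uint64_t fail_addr;
};

/* Stub for mtd_erase(); returns 0 on success. */
static int fake_erase(struct erase_info *instr)
{
	(void)instr;
	return 0;
}

static int erase_block(uint64_t offset, uint64_t size)
{
	struct erase_info *instr = calloc(1, sizeof(*instr));
	int ret;

	if (!instr)
		return -1;
	instr->addr = offset;
	instr->len = size;

	/* Synchronous model: no callback, no carried-around private
	 * state; the result is handled right here. */
	ret = fake_erase(instr);
	free(instr);
	return ret;
}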
*/ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c, diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 9c36d614bf89..346ed161756d 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -57,8 +57,8 @@ static struct task_struct *nlmsvc_task; static struct svc_rqst *nlmsvc_rqst; unsigned long nlmsvc_timeout; -atomic_t nlm_ntf_refcnt = ATOMIC_INIT(0); -DECLARE_WAIT_QUEUE_HEAD(nlm_ntf_wq); +static atomic_t nlm_ntf_refcnt = ATOMIC_INIT(0); +static DECLARE_WAIT_QUEUE_HEAD(nlm_ntf_wq); unsigned int lockd_net_id; diff --git a/fs/locks.c b/fs/locks.c index d6ff4beb70ce..62bbe8b31f26 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -559,7 +559,7 @@ static const struct lock_manager_operations lease_manager_ops = { * Initialize a lease, use the default lock manager operations */ static int lease_init(struct file *filp, long type, struct file_lock *fl) - { +{ if (assign_type(fl, type) != 0) return -EINVAL; diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig index f2a0cfcef11d..bcd53a79156f 100644 --- a/fs/minix/Kconfig +++ b/fs/minix/Kconfig @@ -18,7 +18,7 @@ config MINIX_FS config MINIX_FS_NATIVE_ENDIAN def_bool MINIX_FS - depends on M32R || MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU) + depends on MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU) config MINIX_FS_BIG_ENDIAN_16BIT_INDEXED def_bool MINIX_FS diff --git a/fs/namei.c b/fs/namei.c index 921ae32dbc80..a66ed5a1622a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -39,6 +39,7 @@ #include <linux/bitops.h> #include <linux/init_task.h> #include <linux/uaccess.h> +#include <linux/build_bug.h> #include "internal.h" #include "mount.h" @@ -130,6 +131,7 @@ getname_flags(const char __user *filename, int flags, int *empty) struct filename *result; char *kname; int len; + BUILD_BUG_ON(offsetof(struct filename, iname) % sizeof(long) != 0); result = audit_reusename(filename); if (result) @@ -559,9 +561,10 @@ static int __nd_alloc_stack(struct nameidata *nd) static bool path_connected(const struct path *path) { struct vfsmount *mnt = path->mnt; + struct super_block *sb = mnt->mnt_sb; - /* Only bind mounts can have disconnected paths */ - if (mnt->mnt_root == mnt->mnt_sb->s_root) + /* Bind mounts and multi-root filesystems can have disconnected paths */ + if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root)) return true; return is_subdir(path->dentry, mnt->mnt_root); @@ -926,7 +929,8 @@ static inline int may_follow_link(struct nameidata *nd) if (nd->flags & LOOKUP_RCU) return -ECHILD; - audit_log_link_denied("follow_link", &nd->stack[0].link); + audit_inode(nd->name, nd->stack[0].link.dentry, 0); + audit_log_link_denied("follow_link"); return -EACCES; } @@ -992,7 +996,7 @@ static int may_linkat(struct path *link) if (safe_hardlink_source(inode) || inode_owner_or_capable(inode)) return 0; - audit_log_link_denied("linkat", link); + audit_log_link_denied("linkat"); return -EPERM; } @@ -1473,43 +1477,36 @@ static struct dentry *lookup_dcache(const struct qstr *name, } /* - * Call i_op->lookup on the dentry. The dentry must be negative and - * unhashed. - * - * dir->d_inode->i_mutex must be held + * Parent directory has inode locked exclusive. This is one + * and only case when ->lookup() gets called on non in-lookup + * dentries - as the matter of fact, this only gets called + * when directory is guaranteed to have no in-lookup children + * at all. 
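
[Editorial aside: a recurring pattern in this series, already visible in ksys_ioctl() above and repeated below for mknodat, mkdirat, symlinkat, linkat, renameat2, umount and mount, is to move the syscall body into an in-kernel helper and reduce the SYSCALL_DEFINE stub to a one-line wrapper, so other kernel code can reach the functionality without calling sys_*(). A minimal sketch with hypothetical names:]

#include <linux/syscalls.h>

/* The real work lives in a do_*()/ksys_*() helper that kernel code
 * may call directly. */
long do_demo(int fd, unsigned int flags)
{
	/* ... actual implementation would go here ... */
	return 0;
}

/* The syscall entry point becomes a trivial wrapper. */
SYSCALL_DEFINE2(demo, int, fd, unsigned int, flags)
{
	return do_demo(fd, flags);
}
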
*/ -static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry, - unsigned int flags) -{ - struct dentry *old; - - /* Don't create child dentry for a dead directory. */ - if (unlikely(IS_DEADDIR(dir))) { - dput(dentry); - return ERR_PTR(-ENOENT); - } - - old = dir->i_op->lookup(dir, dentry, flags); - if (unlikely(old)) { - dput(dentry); - dentry = old; - } - return dentry; -} - static struct dentry *__lookup_hash(const struct qstr *name, struct dentry *base, unsigned int flags) { struct dentry *dentry = lookup_dcache(name, base, flags); + struct dentry *old; + struct inode *dir = base->d_inode; if (dentry) return dentry; + /* Don't create child dentry for a dead directory. */ + if (unlikely(IS_DEADDIR(dir))) + return ERR_PTR(-ENOENT); + dentry = d_alloc(base, name); if (unlikely(!dentry)) return ERR_PTR(-ENOMEM); - return lookup_real(base->d_inode, dentry, flags); + old = dir->i_op->lookup(dir, dentry, flags); + if (unlikely(old)) { + dput(dentry); + dentry = old; + } + return dentry; } static int lookup_fast(struct nameidata *nd, @@ -3380,9 +3377,7 @@ finish_open_created: goto out; *opened |= FILE_OPENED; opened: - error = open_check_o_direct(file); - if (!error) - error = ima_file_check(file, op->acc_mode, *opened); + error = ima_file_check(file, op->acc_mode, *opened); if (!error && will_truncate) error = handle_truncate(file); out: @@ -3462,9 +3457,6 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, error = finish_open(file, child, NULL, opened); if (error) goto out2; - error = open_check_o_direct(file); - if (error) - fput(file); out2: mnt_drop_write(path.mnt); out: @@ -3728,8 +3720,8 @@ static int may_mknod(umode_t mode) } } -SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, - unsigned, dev) +long do_mknodat(int dfd, const char __user *filename, umode_t mode, + unsigned int dev) { struct dentry *dentry; struct path path; @@ -3772,9 +3764,15 @@ out: return error; } +SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, + unsigned int, dev) +{ + return do_mknodat(dfd, filename, mode, dev); +} + SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev) { - return sys_mknodat(AT_FDCWD, filename, mode, dev); + return do_mknodat(AT_FDCWD, filename, mode, dev); } int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) @@ -3803,7 +3801,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) } EXPORT_SYMBOL(vfs_mkdir); -SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) +long do_mkdirat(int dfd, const char __user *pathname, umode_t mode) { struct dentry *dentry; struct path path; @@ -3828,9 +3826,14 @@ retry: return error; } +SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) +{ + return do_mkdirat(dfd, pathname, mode); +} + SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) { - return sys_mkdirat(AT_FDCWD, pathname, mode); + return do_mkdirat(AT_FDCWD, pathname, mode); } int vfs_rmdir(struct inode *dir, struct dentry *dentry) @@ -3872,7 +3875,7 @@ out: } EXPORT_SYMBOL(vfs_rmdir); -static long do_rmdir(int dfd, const char __user *pathname) +long do_rmdir(int dfd, const char __user *pathname) { int error = 0; struct filename *name; @@ -4108,8 +4111,8 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) } EXPORT_SYMBOL(vfs_symlink); -SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, - int, newdfd, const char __user *, newname) +long 
do_symlinkat(const char __user *oldname, int newdfd, + const char __user *newname) { int error; struct filename *from; @@ -4139,9 +4142,15 @@ out_putname: return error; } +SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, + int, newdfd, const char __user *, newname) +{ + return do_symlinkat(oldname, newdfd, newname); +} + SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname) { - return sys_symlinkat(oldname, AT_FDCWD, newname); + return do_symlinkat(oldname, AT_FDCWD, newname); } /** @@ -4233,8 +4242,8 @@ EXPORT_SYMBOL(vfs_link); * with linux 2.0, and to avoid hard-linking to directories * and other special files. --ADM */ -SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, - int, newdfd, const char __user *, newname, int, flags) +int do_linkat(int olddfd, const char __user *oldname, int newdfd, + const char __user *newname, int flags) { struct dentry *new_dentry; struct path old_path, new_path; @@ -4298,9 +4307,15 @@ out: return error; } +SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, + int, newdfd, const char __user *, newname, int, flags) +{ + return do_linkat(olddfd, oldname, newdfd, newname, flags); +} + SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname) { - return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); + return do_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } /** @@ -4478,8 +4493,8 @@ out: } EXPORT_SYMBOL(vfs_rename); -SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, - int, newdfd, const char __user *, newname, unsigned int, flags) +static int do_renameat2(int olddfd, const char __user *oldname, int newdfd, + const char __user *newname, unsigned int flags) { struct dentry *old_dentry, *new_dentry; struct dentry *trap; @@ -4621,15 +4636,21 @@ exit: return error; } +SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, + int, newdfd, const char __user *, newname, unsigned int, flags) +{ + return do_renameat2(olddfd, oldname, newdfd, newname, flags); +} + SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname) { - return sys_renameat2(olddfd, oldname, newdfd, newname, 0); + return do_renameat2(olddfd, oldname, newdfd, newname, 0); } SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) { - return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); + return do_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } int vfs_whiteout(struct inode *dir, struct dentry *dentry) diff --git a/fs/namespace.c b/fs/namespace.c index 9d1374ab6e06..e398f32d7541 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1680,7 +1680,7 @@ static inline bool may_mandlock(void) * unixes. 
Our API is identical to OSF/1 to avoid making a mess of AMD */ -SYSCALL_DEFINE2(umount, char __user *, name, int, flags) +int ksys_umount(char __user *name, int flags) { struct path path; struct mount *mnt; @@ -1720,6 +1720,11 @@ out: return retval; } +SYSCALL_DEFINE2(umount, char __user *, name, int, flags) +{ + return ksys_umount(name, flags); +} + #ifdef __ARCH_WANT_SYS_OLDUMOUNT /* @@ -1727,7 +1732,7 @@ out: */ SYSCALL_DEFINE1(oldumount, char __user *, name) { - return sys_umount(name, 0); + return ksys_umount(name, 0); } #endif @@ -3032,8 +3037,8 @@ struct dentry *mount_subtree(struct vfsmount *mnt, const char *name) } EXPORT_SYMBOL(mount_subtree); -SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, - char __user *, type, unsigned long, flags, void __user *, data) +int ksys_mount(char __user *dev_name, char __user *dir_name, char __user *type, + unsigned long flags, void __user *data) { int ret; char *kernel_type; @@ -3066,6 +3071,12 @@ out_type: return ret; } +SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, + char __user *, type, unsigned long, flags, void __user *, data) +{ + return ksys_mount(dev_name, dir_name, type, flags, data); +} + /* * Return true if path is reachable from root * diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 2435af56b87e..a50d7813e3ea 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -572,7 +572,7 @@ out: } static bool -validate_bitmap_values(unsigned long mask) +validate_bitmap_values(unsigned int mask) { return (mask & ~RCA4_TYPE_MASK_ALL) == 0; } @@ -596,17 +596,15 @@ __be32 nfs4_callback_recallany(void *argp, void *resp, goto out; status = cpu_to_be32(NFS4_OK); - if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) - &args->craa_type_mask)) + if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_RDATA_DLG)) flags = FMODE_READ; - if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) - &args->craa_type_mask)) + if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_WDATA_DLG)) flags |= FMODE_WRITE; - if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *) - &args->craa_type_mask)) - pnfs_recall_all_layouts(cps->clp); if (flags) nfs_expire_unused_delegation_types(cps->clp, flags); + + if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_FILE_LAYOUT)) + pnfs_recall_all_layouts(cps->clp); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 8c10b0562e75..621c517b325c 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -86,10 +86,10 @@ struct nfs_direct_req { struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX]; int mirror_count; + loff_t io_start; /* Start offset for I/O */ ssize_t count, /* bytes actually processed */ max_count, /* max expected count */ bytes_left, /* bytes left to be sent */ - io_start, /* start of IO */ error; /* any reported error */ struct completion completion; /* wait for i/o completion */ diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c index 0ee4b93d36ea..1c5d8d31fc0a 100644 --- a/fs/nfs/fscache-index.c +++ b/fs/nfs/fscache-index.c @@ -50,59 +50,6 @@ void nfs_fscache_unregister(void) } /* - * Layout of the key for an NFS server cache object. 
- */ -struct nfs_server_key { - uint16_t nfsversion; /* NFS protocol version */ - uint16_t family; /* address family */ - uint16_t port; /* IP port */ - union { - struct in_addr ipv4_addr; /* IPv4 address */ - struct in6_addr ipv6_addr; /* IPv6 address */ - } addr[0]; -}; - -/* - * Generate a key to describe a server in the main NFS index - * - We return the length of the key, or 0 if we can't generate one - */ -static uint16_t nfs_server_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct nfs_client *clp = cookie_netfs_data; - const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr; - const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr; - struct nfs_server_key *key = buffer; - uint16_t len = sizeof(struct nfs_server_key); - - memset(key, 0, len); - key->nfsversion = clp->rpc_ops->version; - key->family = clp->cl_addr.ss_family; - - switch (clp->cl_addr.ss_family) { - case AF_INET: - key->port = sin->sin_port; - key->addr[0].ipv4_addr = sin->sin_addr; - len += sizeof(key->addr[0].ipv4_addr); - break; - - case AF_INET6: - key->port = sin6->sin6_port; - key->addr[0].ipv6_addr = sin6->sin6_addr; - len += sizeof(key->addr[0].ipv6_addr); - break; - - default: - printk(KERN_WARNING "NFS: Unknown network family '%d'\n", - clp->cl_addr.ss_family); - len = 0; - break; - } - - return len; -} - -/* * Define the server object for FS-Cache. This is used to describe a server * object to fscache_acquire_cookie(). It is keyed by the NFS protocol and * server address parameters. @@ -110,33 +57,9 @@ static uint16_t nfs_server_get_key(const void *cookie_netfs_data, const struct fscache_cookie_def nfs_fscache_server_index_def = { .name = "NFS.server", .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = nfs_server_get_key, }; /* - * Generate a key to describe a superblock key in the main NFS index - */ -static uint16_t nfs_super_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct nfs_fscache_key *key; - const struct nfs_server *nfss = cookie_netfs_data; - uint16_t len; - - key = nfss->fscache_key; - len = sizeof(key->key) + key->key.uniq_len; - if (len > bufmax) { - len = 0; - } else { - memcpy(buffer, &key->key, sizeof(key->key)); - memcpy(buffer + sizeof(key->key), - key->key.uniquifier, key->key.uniq_len); - } - - return len; -} - -/* * Define the superblock object for FS-Cache. This is used to describe a * superblock object to fscache_acquire_cookie(). It is keyed by all the NFS * parameters that might cause a separate superblock. @@ -144,84 +67,9 @@ static uint16_t nfs_super_get_key(const void *cookie_netfs_data, const struct fscache_cookie_def nfs_fscache_super_index_def = { .name = "NFS.super", .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = nfs_super_get_key, }; /* - * Definition of the auxiliary data attached to NFS inode storage objects - * within the cache. - * - * The contents of this struct are recorded in the on-disk local cache in the - * auxiliary data attached to the data storage object backing an inode. This - * permits coherency to be managed when a new inode binds to an already extant - * cache object. 
- */ -struct nfs_fscache_inode_auxdata { - struct timespec mtime; - struct timespec ctime; - loff_t size; - u64 change_attr; -}; - -/* - * Generate a key to describe an NFS inode in an NFS server's index - */ -static uint16_t nfs_fscache_inode_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct nfs_inode *nfsi = cookie_netfs_data; - uint16_t nsize; - - /* use the inode's NFS filehandle as the key */ - nsize = nfsi->fh.size; - memcpy(buffer, nfsi->fh.data, nsize); - return nsize; -} - -/* - * Get certain file attributes from the netfs data - * - This function can be absent for an index - * - Not permitted to return an error - * - The netfs data from the cookie being used as the source is presented - */ -static void nfs_fscache_inode_get_attr(const void *cookie_netfs_data, - uint64_t *size) -{ - const struct nfs_inode *nfsi = cookie_netfs_data; - - *size = nfsi->vfs_inode.i_size; -} - -/* - * Get the auxiliary data from netfs data - * - This function can be absent if the index carries no state data - * - Should store the auxiliary data in the buffer - * - Should return the amount of amount stored - * - Not permitted to return an error - * - The netfs data from the cookie being used as the source is presented - */ -static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - struct nfs_fscache_inode_auxdata auxdata; - const struct nfs_inode *nfsi = cookie_netfs_data; - - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.size = nfsi->vfs_inode.i_size; - auxdata.mtime = nfsi->vfs_inode.i_mtime; - auxdata.ctime = nfsi->vfs_inode.i_ctime; - - if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) - auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode); - - if (bufmax > sizeof(auxdata)) - bufmax = sizeof(auxdata); - - memcpy(buffer, &auxdata, bufmax); - return bufmax; -} - -/* * Consult the netfs about the state of an object * - This function can be absent if the index carries no state data * - The netfs data from the cookie being used as the target is @@ -230,7 +78,8 @@ static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data, static enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, const void *data, - uint16_t datalen) + uint16_t datalen, + loff_t object_size) { struct nfs_fscache_inode_auxdata auxdata; struct nfs_inode *nfsi = cookie_netfs_data; @@ -239,7 +88,6 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, return FSCACHE_CHECKAUX_OBSOLETE; memset(&auxdata, 0, sizeof(auxdata)); - auxdata.size = nfsi->vfs_inode.i_size; auxdata.mtime = nfsi->vfs_inode.i_mtime; auxdata.ctime = nfsi->vfs_inode.i_ctime; @@ -288,9 +136,6 @@ static void nfs_fh_put_context(void *cookie_netfs_data, void *context) const struct fscache_cookie_def nfs_fscache_inode_object_def = { .name = "NFS.fh", .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .get_key = nfs_fscache_inode_get_key, - .get_attr = nfs_fscache_inode_get_attr, - .get_aux = nfs_fscache_inode_get_aux, .check_aux = nfs_fscache_inode_check_aux, .get_context = nfs_fh_get_context, .put_context = nfs_fh_put_context, diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index d63bea8bbfbb..b55fc7920c3b 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -18,6 +18,7 @@ #include <linux/in6.h> #include <linux/seq_file.h> #include <linux/slab.h> +#include <linux/iversion.h> #include "internal.h" #include "iostat.h" @@ -29,6 +30,21 @@ static struct rb_root nfs_fscache_keys = RB_ROOT; static 
DEFINE_SPINLOCK(nfs_fscache_keys_lock); /* + * Layout of the key for an NFS server cache object. + */ +struct nfs_server_key { + struct { + uint16_t nfsversion; /* NFS protocol version */ + uint16_t family; /* address family */ + __be16 port; /* IP port */ + } hdr; + union { + struct in_addr ipv4_addr; /* IPv4 address */ + struct in6_addr ipv6_addr; /* IPv6 address */ + }; +} __packed; + +/* * Get the per-client index cookie for an NFS client if the appropriate mount * flag was set * - We always try and get an index cookie for the client, but get filehandle @@ -36,10 +52,41 @@ static DEFINE_SPINLOCK(nfs_fscache_keys_lock); */ void nfs_fscache_get_client_cookie(struct nfs_client *clp) { + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr; + const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr; + struct nfs_server_key key; + uint16_t len = sizeof(key.hdr); + + memset(&key, 0, sizeof(key)); + key.hdr.nfsversion = clp->rpc_ops->version; + key.hdr.family = clp->cl_addr.ss_family; + + switch (clp->cl_addr.ss_family) { + case AF_INET: + key.hdr.port = sin->sin_port; + key.ipv4_addr = sin->sin_addr; + len += sizeof(key.ipv4_addr); + break; + + case AF_INET6: + key.hdr.port = sin6->sin6_port; + key.ipv6_addr = sin6->sin6_addr; + len += sizeof(key.ipv6_addr); + break; + + default: + printk(KERN_WARNING "NFS: Unknown network family '%d'\n", + clp->cl_addr.ss_family); + clp->fscache = NULL; + return; + } + /* create a cache index for looking up filehandles */ clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index, &nfs_fscache_server_index_def, - clp, true); + &key, len, + NULL, 0, + clp, 0, true); dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n", clp, clp->fscache); } @@ -52,7 +99,7 @@ void nfs_fscache_release_client_cookie(struct nfs_client *clp) dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n", clp, clp->fscache); - fscache_relinquish_cookie(clp->fscache, 0); + fscache_relinquish_cookie(clp->fscache, NULL, false); clp->fscache = NULL; } @@ -139,7 +186,9 @@ void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int /* create a cache index for looking up filehandles */ nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache, &nfs_fscache_super_index_def, - nfss, true); + key, sizeof(*key) + ulen, + NULL, 0, + nfss, 0, true); dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n", nfss, nfss->fscache); return; @@ -163,7 +212,7 @@ void nfs_fscache_release_super_cookie(struct super_block *sb) dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n", nfss, nfss->fscache); - fscache_relinquish_cookie(nfss->fscache, 0); + fscache_relinquish_cookie(nfss->fscache, NULL, false); nfss->fscache = NULL; if (nfss->fscache_key) { @@ -180,14 +229,25 @@ void nfs_fscache_release_super_cookie(struct super_block *sb) */ void nfs_fscache_init_inode(struct inode *inode) { + struct nfs_fscache_inode_auxdata auxdata; struct nfs_inode *nfsi = NFS_I(inode); nfsi->fscache = NULL; if (!S_ISREG(inode->i_mode)) return; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.mtime = nfsi->vfs_inode.i_mtime; + auxdata.ctime = nfsi->vfs_inode.i_ctime; + + if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) + auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode); + nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache, &nfs_fscache_inode_object_def, - nfsi, false); + nfsi->fh.data, nfsi->fh.size, + &auxdata, sizeof(auxdata), + nfsi, nfsi->vfs_inode.i_size, false); } /* @@ 
-195,12 +255,16 @@ void nfs_fscache_init_inode(struct inode *inode) */ void nfs_fscache_clear_inode(struct inode *inode) { + struct nfs_fscache_inode_auxdata auxdata; struct nfs_inode *nfsi = NFS_I(inode); struct fscache_cookie *cookie = nfs_i_fscache(inode); dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie); - fscache_relinquish_cookie(cookie, false); + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.mtime = nfsi->vfs_inode.i_mtime; + auxdata.ctime = nfsi->vfs_inode.i_ctime; + fscache_relinquish_cookie(cookie, &auxdata, false); nfsi->fscache = NULL; } @@ -232,20 +296,26 @@ static bool nfs_fscache_can_enable(void *data) */ void nfs_fscache_open_file(struct inode *inode, struct file *filp) { + struct nfs_fscache_inode_auxdata auxdata; struct nfs_inode *nfsi = NFS_I(inode); struct fscache_cookie *cookie = nfs_i_fscache(inode); if (!fscache_cookie_valid(cookie)) return; + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.mtime = nfsi->vfs_inode.i_mtime; + auxdata.ctime = nfsi->vfs_inode.i_ctime; + if (inode_is_open_for_write(inode)) { dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi); clear_bit(NFS_INO_FSCACHE, &nfsi->flags); - fscache_disable_cookie(cookie, true); + fscache_disable_cookie(cookie, &auxdata, true); fscache_uncache_all_inode_pages(cookie, inode); } else { dfprintk(FSCACHE, "NFS: nfsi 0x%p enabling cache\n", nfsi); - fscache_enable_cookie(cookie, nfs_fscache_can_enable, inode); + fscache_enable_cookie(cookie, &auxdata, nfsi->vfs_inode.i_size, + nfs_fscache_can_enable, inode); if (fscache_cookie_enabled(cookie)) set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); } @@ -422,7 +492,8 @@ void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync) "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n", nfs_i_fscache(inode), page, page->index, page->flags, sync); - ret = fscache_write_page(nfs_i_fscache(inode), page, GFP_KERNEL); + ret = fscache_write_page(nfs_i_fscache(inode), page, + inode->i_size, GFP_KERNEL); dfprintk(FSCACHE, "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n", page, page->index, page->flags, ret); diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index d7fe3e799f2f..161ba2edb9d0 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -57,6 +57,21 @@ struct nfs_fscache_key { }; /* + * Definition of the auxiliary data attached to NFS inode storage objects + * within the cache. + * + * The contents of this struct are recorded in the on-disk local cache in the + * auxiliary data attached to the data storage object backing an inode. This + * permits coherency to be managed when a new inode binds to an already extant + * cache object. 
+ */ +struct nfs_fscache_inode_auxdata { + struct timespec mtime; + struct timespec ctime; + u64 change_attr; +}; + +/* * fscache-index.c */ extern struct fscache_netfs nfs_fscache_netfs; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 7d893543cf3b..d17a90c4fa37 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -85,11 +85,6 @@ int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) } EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); -int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode) -{ - return nfs_wait_killable(mode); -} - /** * nfs_compat_user_ino64 - returns the user-visible inode number * @fileid: 64-bit fileid diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 49f848fd1f04..7327930ad970 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -873,7 +873,7 @@ static void nfs3_nlm_release_call(void *data) } } -const struct nlmclnt_operations nlmclnt_fl_close_lock_ops = { +static const struct nlmclnt_operations nlmclnt_fl_close_lock_ops = { .nlmclnt_alloc_call = nfs3_nlm_alloc_call, .nlmclnt_unlock_prepare = nfs3_nlm_unlock_prepare, .nlmclnt_release_call = nfs3_nlm_release_call, diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 04612c24d394..979631411a0e 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -868,8 +868,10 @@ static int nfs4_set_client(struct nfs_server *server, if (IS_ERR(clp)) return PTR_ERR(clp); - if (server->nfs_client == clp) + if (server->nfs_client == clp) { + nfs_put_client(clp); return -ELOOP; + } /* * Query for the lease time on clientid setup or renewal @@ -1244,11 +1246,11 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, clp->cl_proto, clnt->cl_timeout, clp->cl_minorversion, net); clear_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status); - nfs_put_client(clp); if (error != 0) { nfs_server_insert_lists(server); return error; } + nfs_put_client(clp); if (server->nfs_client->cl_hostname == NULL) server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 65c9c4175145..b993ad282de2 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -52,7 +52,6 @@ #include <linux/nfs.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> -#include <linux/fs_struct.h> #include "nfs4_fs.h" #include "internal.h" diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 18a7626ac638..67d19cd92e44 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -98,8 +98,8 @@ nfs_page_free(struct nfs_page *p) int nfs_iocounter_wait(struct nfs_lock_context *l_ctx) { - return wait_on_atomic_t(&l_ctx->io_count, nfs_wait_atomic_killable, - TASK_KILLABLE); + return wait_var_event_killable(&l_ctx->io_count, + !atomic_read(&l_ctx->io_count)); } /** @@ -395,7 +395,7 @@ static void nfs_clear_request(struct nfs_page *req) } if (l_ctx != NULL) { if (atomic_dec_and_test(&l_ctx->io_count)) { - wake_up_atomic_t(&l_ctx->io_count); + wake_up_var(&l_ctx->io_count); if (test_bit(NFS_CONTEXT_UNLOCK, &ctx->flags)) rpc_wake_up(&NFS_SERVER(d_inode(ctx->dentry))->uoc_rpcwaitq); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index c13e826614b5..ee723aa153a3 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -292,8 +292,11 @@ pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo) void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) { - struct inode *inode = lo->plh_inode; + struct inode *inode; + if (!lo) + return; + inode = lo->plh_inode; pnfs_layoutreturn_before_put_layout_hdr(lo); if (refcount_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { @@ -1241,10 +1244,12 @@ retry: spin_lock(&ino->i_lock); 
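
[Editorial aside: the NFS fscache conversion above reflects the reworked fscache_acquire_cookie() signature, which takes the index key and coherency data as explicit buffers instead of calling back into the netfs via ->get_key()/->get_attr()/->get_aux(). A hedged sketch of the new convention; the demo_* names are hypothetical:]

#include <linux/fscache.h>

static const struct fscache_cookie_def demo_index_def = {
	.name = "DEMO.index",
	.type = FSCACHE_COOKIE_TYPE_INDEX,
};

struct demo_key {
	u32 volume;
	u32 object;
} __packed;

static struct fscache_cookie *demo_acquire(struct fscache_cookie *parent,
					   void *netfs_data, u32 vol, u32 obj)
{
	struct demo_key key = { .volume = vol, .object = obj };

	/* Key and aux data are passed in directly; no netfs callbacks
	 * are consulted at acquire time. */
	return fscache_acquire_cookie(parent, &demo_index_def,
				      &key, sizeof(key),	/* index key */
				      NULL, 0,			/* no aux data */
				      netfs_data,
				      0,			/* object size */
				      true);			/* enable */
}
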
lo = nfsi->layout; if (!lo || !pnfs_layout_is_valid(lo) || - test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + lo = NULL; goto out_noroc; + } + pnfs_get_layout_hdr(lo); if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) { - pnfs_get_layout_hdr(lo); spin_unlock(&ino->i_lock); wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, TASK_UNINTERRUPTIBLE); @@ -1312,10 +1317,12 @@ out_noroc: struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; if (ld->prepare_layoutreturn) ld->prepare_layoutreturn(args); + pnfs_put_layout_hdr(lo); return true; } if (layoutreturn) pnfs_send_layoutreturn(lo, &stateid, iomode, true); + pnfs_put_layout_hdr(lo); return false; } diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 03aaa60c7768..32ba2d471853 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -245,7 +245,7 @@ pnfs_generic_commit_cancel_empty_pagelist(struct list_head *pages, { if (list_empty(pages)) { if (atomic_dec_and_test(&cinfo->mds->rpcs_out)) - wake_up_atomic_t(&cinfo->mds->rpcs_out); + wake_up_var(&cinfo->mds->rpcs_out); /* don't call nfs_commitdata_release - it tries to put * the open_context which is not acquired until nfs_init_commit * which has not been called on @data */ diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 29bacdc56f6a..5e470e233c83 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2631,6 +2631,8 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, /* initial superblock/root creation */ mount_info->fill_super(s, mount_info); nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned); + if (!(server->flags & NFS_MOUNT_UNSHARED)) + s->s_iflags |= SB_I_MULTIROOT; } mntroot = nfs_get_root(s, mount_info->mntfh, dev_name); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 7428a669d7a7..6579f3b367bd 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1620,8 +1620,8 @@ static void nfs_writeback_result(struct rpc_task *task, static int wait_on_commit(struct nfs_mds_commit_info *cinfo) { - return wait_on_atomic_t(&cinfo->rpcs_out, - nfs_wait_atomic_killable, TASK_KILLABLE); + return wait_var_event_killable(&cinfo->rpcs_out, + !atomic_read(&cinfo->rpcs_out)); } static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo) @@ -1632,7 +1632,7 @@ static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo) static void nfs_commit_end(struct nfs_mds_commit_info *cinfo) { if (atomic_dec_and_test(&cinfo->rpcs_out)) - wake_up_atomic_t(&cinfo->rpcs_out); + wake_up_var(&cinfo->rpcs_out); } void nfs_commitdata_release(struct nfs_commit_data *data) @@ -1876,40 +1876,43 @@ int nfs_generic_commit_list(struct inode *inode, struct list_head *head, return status; } -int nfs_commit_inode(struct inode *inode, int how) +static int __nfs_commit_inode(struct inode *inode, int how, + struct writeback_control *wbc) { LIST_HEAD(head); struct nfs_commit_info cinfo; int may_wait = how & FLUSH_SYNC; - int error = 0; - int res; + int ret, nscan; nfs_init_cinfo_from_inode(&cinfo, inode); nfs_commit_begin(cinfo.mds); - res = nfs_scan_commit(inode, &head, &cinfo); - if (res) - error = nfs_generic_commit_list(inode, &head, how, &cinfo); + for (;;) { + ret = nscan = nfs_scan_commit(inode, &head, &cinfo); + if (ret <= 0) + break; + ret = nfs_generic_commit_list(inode, &head, how, &cinfo); + if (ret < 0) + break; + ret = 0; + if (wbc && wbc->sync_mode == WB_SYNC_NONE) { + if (nscan < wbc->nr_to_write) + wbc->nr_to_write -= nscan; + else + wbc->nr_to_write = 0; + } + if (nscan < INT_MAX) + break; + cond_resched(); + } 
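
[Editorial aside: the pagelist.c and write.c hunks above convert wait_on_atomic_t()/wake_up_atomic_t() to the newer wait_var_event() family, which waits on an arbitrary variable's address and re-tests an arbitrary condition. A minimal sketch of the pattern, with illustrative names:]

#include <linux/atomic.h>
#include <linux/wait_bit.h>	/* wait_var_event_killable(), wake_up_var() */

struct demo_io {
	atomic_t inflight;	/* outstanding requests */
};

static int demo_wait_for_io(struct demo_io *io)
{
	/* Sleep killably until the count drops to zero; the condition
	 * is re-evaluated on every wakeup. */
	return wait_var_event_killable(&io->inflight,
				       !atomic_read(&io->inflight));
}

static void demo_end_io(struct demo_io *io)
{
	/* Waker side: wake_up_var() must follow the state change. */
	if (atomic_dec_and_test(&io->inflight))
		wake_up_var(&io->inflight);
}
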
nfs_commit_end(cinfo.mds); - if (res == 0) - return res; - if (error < 0) - goto out_error; - if (!may_wait) - goto out_mark_dirty; - error = wait_on_commit(cinfo.mds); - if (error < 0) - return error; - return res; -out_error: - res = error; - /* Note: If we exit without ensuring that the commit is complete, - * we must mark the inode as dirty. Otherwise, future calls to - * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure - * that the data is on the disk. - */ -out_mark_dirty: - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); - return res; + if (ret || !may_wait) + return ret; + return wait_on_commit(cinfo.mds); +} + +int nfs_commit_inode(struct inode *inode, int how) +{ + return __nfs_commit_inode(inode, how, NULL); } EXPORT_SYMBOL_GPL(nfs_commit_inode); @@ -1919,11 +1922,11 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) int flags = FLUSH_SYNC; int ret = 0; - /* no commits means nothing needs to be done */ - if (!atomic_long_read(&nfsi->commit_info.ncommit)) - return ret; - if (wbc->sync_mode == WB_SYNC_NONE) { + /* no commits means nothing needs to be done */ + if (!atomic_long_read(&nfsi->commit_info.ncommit)) + goto check_requests_outstanding; + /* Don't commit yet if this is a non-blocking flush and there * are a lot of outstanding writes for this mapping. */ @@ -1934,16 +1937,16 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) flags = 0; } - ret = nfs_commit_inode(inode, flags); - if (ret >= 0) { - if (wbc->sync_mode == WB_SYNC_NONE) { - if (ret < wbc->nr_to_write) - wbc->nr_to_write -= ret; - else - wbc->nr_to_write = 0; - } - return 0; - } + ret = __nfs_commit_inode(inode, flags, wbc); + if (!ret) { + if (flags & FLUSH_SYNC) + return 0; + } else if (atomic_long_read(&nfsi->commit_info.ncommit)) + goto out_mark_dirty; + +check_requests_outstanding: + if (!atomic_read(&nfsi->commit_info.rpcs_out)) + return ret; out_mark_dirty: __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return ret; diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 1d0ce3c57d93..6259a4b8579f 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -192,6 +192,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp) struct nfsd3_writeres *resp = rqstp->rq_resp; __be32 nfserr; unsigned long cnt = argp->len; + unsigned int nvecs; dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n", SVCFH_fmt(&argp->fh), @@ -201,9 +202,12 @@ nfsd3_proc_write(struct svc_rqst *rqstp) fh_copy(&resp->fh, &argp->fh); resp->committed = argp->stable; + nvecs = svc_fill_write_vector(rqstp, &argp->first, cnt); + if (!nvecs) + RETURN_STATUS(nfserr_io); nfserr = nfsd_write(rqstp, &resp->fh, argp->offset, - rqstp->rq_vec, argp->vlen, - &cnt, resp->committed); + rqstp->rq_vec, nvecs, &cnt, + resp->committed); resp->count = cnt; RETURN_STATUS(nfserr); } @@ -279,6 +283,16 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp) struct nfsd3_diropres *resp = rqstp->rq_resp; __be32 nfserr; + if (argp->tlen == 0) + RETURN_STATUS(nfserr_inval); + if (argp->tlen > NFS3_MAXPATHLEN) + RETURN_STATUS(nfserr_nametoolong); + + argp->tname = svc_fill_symlink_pathname(rqstp, &argp->first, + argp->tlen); + if (IS_ERR(argp->tname)) + RETURN_STATUS(nfserrno(PTR_ERR(argp->tname))); + dprintk("nfsd: SYMLINK(3) %s %.*s -> %.*s\n", SVCFH_fmt(&argp->ffh), argp->flen, argp->fname, diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 1a70581e1cb2..3192b544a441 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -391,7 +391,7 @@ int nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) { struct 
nfsd3_writeargs *args = rqstp->rq_argp; - unsigned int len, v, hdr, dlen; + unsigned int len, hdr, dlen; u32 max_blocksize = svc_max_payload(rqstp); struct kvec *head = rqstp->rq_arg.head; struct kvec *tail = rqstp->rq_arg.tail; @@ -433,17 +433,9 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) args->count = max_blocksize; len = args->len = max_blocksize; } - rqstp->rq_vec[0].iov_base = (void*)p; - rqstp->rq_vec[0].iov_len = head->iov_len - hdr; - v = 0; - while (len > rqstp->rq_vec[v].iov_len) { - len -= rqstp->rq_vec[v].iov_len; - v++; - rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]); - rqstp->rq_vec[v].iov_len = PAGE_SIZE; - } - rqstp->rq_vec[v].iov_len = len; - args->vlen = v + 1; + + args->first.iov_base = (void *)p; + args->first.iov_len = head->iov_len - hdr; return 1; } @@ -489,51 +481,24 @@ int nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_symlinkargs *args = rqstp->rq_argp; - unsigned int len, avail; - char *old, *new; - struct kvec *vec; + char *base = (char *)p; + size_t dlen; if (!(p = decode_fh(p, &args->ffh)) || - !(p = decode_filename(p, &args->fname, &args->flen)) - ) + !(p = decode_filename(p, &args->fname, &args->flen))) return 0; p = decode_sattr3(p, &args->attrs); - /* now decode the pathname, which might be larger than the first page. - * As we have to check for nul's anyway, we copy it into a new page - * This page appears in the rq_res.pages list, but as pages_len is always - * 0, it won't get in the way - */ - len = ntohl(*p++); - if (len == 0 || len > NFS3_MAXPATHLEN || len >= PAGE_SIZE) - return 0; - args->tname = new = page_address(*(rqstp->rq_next_page++)); - args->tlen = len; - /* first copy and check from the first page */ - old = (char*)p; - vec = &rqstp->rq_arg.head[0]; - if ((void *)old > vec->iov_base + vec->iov_len) - return 0; - avail = vec->iov_len - (old - (char*)vec->iov_base); - while (len && avail && *old) { - *new++ = *old++; - len--; - avail--; - } - /* now copy next page if there is one */ - if (len && !avail && rqstp->rq_arg.page_len) { - avail = min_t(unsigned int, rqstp->rq_arg.page_len, PAGE_SIZE); - old = page_address(rqstp->rq_arg.pages[0]); - } - while (len && avail && *old) { - *new++ = *old++; - len--; - avail--; - } - *new = '\0'; - if (len) - return 0; + args->tlen = ntohl(*p++); + + args->first.iov_base = p; + args->first.iov_len = rqstp->rq_arg.head[0].iov_len; + args->first.iov_len -= (char *)p - base; + dlen = args->first.iov_len + rqstp->rq_arg.page_len + + rqstp->rq_arg.tail[0].iov_len; + if (dlen < XDR_QUADLEN(args->tlen) << 2) + return 0; return 1; } diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 49b0a9e7ff18..1f04d2a70d25 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -223,8 +223,8 @@ static int nfs_cb_stat_to_errno(int status) return -status; } -static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected, - int *status) +static int decode_cb_op_status(struct xdr_stream *xdr, + enum nfs_cb_opnum4 expected, int *status) { __be32 *p; u32 op; diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 7d888369f85a..228faf00a594 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -165,7 +165,7 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid) struct nfs4_client *clp = ls->ls_stid.sc_client; struct nfs4_file *fp = ls->ls_stid.sc_file; - trace_layoutstate_free(&ls->ls_stid.sc_stateid); + trace_nfsd_layoutstate_free(&ls->ls_stid.sc_stateid); spin_lock(&clp->cl_lock); 
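
[Editorial aside: the reworked symlink decoder above defers copying the pathname and instead only validates that the receive buffer can hold it; note the quad-word rounding, since XDR encodes strings padded to 4-byte multiples. A hedged sketch of that length check, with illustrative names:]

#include <linux/sunrpc/xdr.h>

/* "consumed" is the number of head bytes already eaten by earlier
 * decode steps. */
static bool demo_symlink_len_ok(const struct xdr_buf *buf,
				size_t consumed, u32 len)
{
	size_t dlen = buf->head[0].iov_len - consumed +
		      buf->page_len + buf->tail[0].iov_len;

	/* XDR_QUADLEN() rounds a byte count up to whole 4-byte
	 * quads, matching the on-the-wire padding. */
	return dlen >= (size_t)(XDR_QUADLEN(len) << 2);
}
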
list_del_init(&ls->ls_perclnt); @@ -264,7 +264,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, list_add(&ls->ls_perfile, &fp->fi_lo_states); spin_unlock(&fp->fi_lock); - trace_layoutstate_alloc(&ls->ls_stid.sc_stateid); + trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); return ls; } @@ -334,7 +334,7 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls) if (list_empty(&ls->ls_layouts)) goto out_unlock; - trace_layout_recall(&ls->ls_stid.sc_stateid); + trace_nfsd_layout_recall(&ls->ls_stid.sc_stateid); refcount_inc(&ls->ls_stid.sc_count); nfsd4_run_cb(&ls->ls_recall); @@ -507,7 +507,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp, false, lrp->lr_layout_type, &ls); if (nfserr) { - trace_layout_return_lookup_fail(&lrp->lr_sid); + trace_nfsd_layout_return_lookup_fail(&lrp->lr_sid); return nfserr; } @@ -523,7 +523,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp, nfs4_inc_and_copy_stateid(&lrp->lr_sid, &ls->ls_stid); lrp->lrs_present = 1; } else { - trace_layoutstate_unhash(&ls->ls_stid.sc_stateid); + trace_nfsd_layoutstate_unhash(&ls->ls_stid.sc_stateid); nfs4_unhash_stid(&ls->ls_stid); lrp->lrs_present = 0; } @@ -694,7 +694,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) /* * Unknown error or non-responding client, we'll need to fence. */ - trace_layout_recall_fail(&ls->ls_stid.sc_stateid); + trace_nfsd_layout_recall_fail(&ls->ls_stid.sc_stateid); ops = nfsd4_layout_ops[ls->ls_layout_type]; if (ops->fence_client) @@ -703,7 +703,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) nfsd4_cb_layout_fail(ls); return -1; case -NFS4ERR_NOMATCHING_LAYOUT: - trace_layout_recall_done(&ls->ls_stid.sc_stateid); + trace_nfsd_layout_recall_done(&ls->ls_stid.sc_stateid); task->tk_status = 0; return 1; } @@ -716,7 +716,7 @@ nfsd4_cb_layout_release(struct nfsd4_callback *cb) container_of(cb, struct nfs4_layout_stateid, ls_recall); LIST_HEAD(reaplist); - trace_layout_recall_release(&ls->ls_stid.sc_stateid); + trace_nfsd_layout_recall_release(&ls->ls_stid.sc_stateid); nfsd4_return_all_layouts(ls, &reaplist); nfsd4_free_layouts(&reaplist); diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index a0bed2b2004d..5d99e8810b85 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -32,6 +32,7 @@ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include <linux/fs_struct.h> #include <linux/file.h> #include <linux/falloc.h> #include <linux/slab.h> @@ -252,11 +253,13 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru * Note: create modes (UNCHECKED,GUARDED...) are the same * in NFSv4 as in v3 except EXCLUSIVE4_1. 
*/ + current->fs->umask = open->op_umask; status = do_nfsd_create(rqstp, current_fh, open->op_fname.data, open->op_fname.len, &open->op_iattr, *resfh, open->op_createmode, (u32 *)open->op_verf.data, &open->op_truncate, &open->op_created); + current->fs->umask = 0; if (!status && open->op_label.len) nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval); @@ -603,6 +606,7 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) return status; + current->fs->umask = create->cr_umask; switch (create->cr_type) { case NF4LNK: status = nfsd_symlink(rqstp, &cstate->current_fh, @@ -611,20 +615,22 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; case NF4BLK: + status = nfserr_inval; rdev = MKDEV(create->cr_specdata1, create->cr_specdata2); if (MAJOR(rdev) != create->cr_specdata1 || MINOR(rdev) != create->cr_specdata2) - return nfserr_inval; + goto out_umask; status = nfsd_create(rqstp, &cstate->current_fh, create->cr_name, create->cr_namelen, &create->cr_iattr, S_IFBLK, rdev, &resfh); break; case NF4CHR: + status = nfserr_inval; rdev = MKDEV(create->cr_specdata1, create->cr_specdata2); if (MAJOR(rdev) != create->cr_specdata1 || MINOR(rdev) != create->cr_specdata2) - return nfserr_inval; + goto out_umask; status = nfsd_create(rqstp, &cstate->current_fh, create->cr_name, create->cr_namelen, &create->cr_iattr,S_IFCHR, rdev, &resfh); @@ -668,6 +674,8 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fh_dup2(&cstate->current_fh, &resfh); out: fh_put(&resfh); +out_umask: + current->fs->umask = 0; return status; } @@ -751,6 +759,9 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (read->rd_offset >= OFFSET_MAX) return nfserr_inval; + trace_nfsd_read_start(rqstp, &cstate->current_fh, + read->rd_offset, read->rd_length); + /* * If we do a zero copy read, then a client will see read data * that reflects the state of the file *after* performing the @@ -783,6 +794,8 @@ nfsd4_read_release(union nfsd4_op_u *u) { if (u->read.rd_filp) fput(u->read.rd_filp); + trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp, + u->read.rd_offset, u->read.rd_length); } static __be32 @@ -1001,6 +1014,9 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (write->wr_offset >= OFFSET_MAX) return nfserr_inval; + cnt = write->wr_buflen; + trace_nfsd_write_start(rqstp, &cstate->current_fh, + write->wr_offset, cnt); status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, stateid, WR_STATE, &filp, NULL); if (status) { @@ -1008,7 +1024,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; } - cnt = write->wr_buflen; write->wr_how_written = write->wr_stable_how; gen_boot_verifier(&write->wr_verifier, SVC_NET(rqstp)); @@ -1021,7 +1036,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fput(filp); write->wr_bytes_written = cnt; - + trace_nfsd_write_done(rqstp, &cstate->current_fh, + write->wr_offset, cnt); return status; } @@ -1106,7 +1122,6 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, else { copy->cp_res.wr_bytes_written = bytes; copy->cp_res.wr_stable_how = NFS_UNSTABLE; - copy->cp_consecutive = 1; copy->cp_synchronous = 1; gen_boot_verifier(©->cp_res.wr_verifier, SVC_NET(rqstp)); status = nfs_ok; @@ -1412,7 +1427,7 @@ nfsd4_layoutget(struct svc_rqst *rqstp, nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid, true, lgp->lg_layout_type, &ls); if 
(nfserr) { - trace_layout_get_lookup_fail(&lgp->lg_sid); + trace_nfsd_layout_get_lookup_fail(&lgp->lg_sid); goto out; } @@ -1481,7 +1496,7 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp, false, lcp->lc_layout_type, &ls); if (nfserr) { - trace_layout_commit_lookup_fail(&lcp->lc_sid); + trace_nfsd_layout_commit_lookup_fail(&lcp->lc_sid); /* fixup error code as per RFC5661 */ if (nfserr == nfserr_bad_stateid) nfserr = nfserr_badlayout; @@ -1714,12 +1729,10 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) goto encode_op; } + trace_nfsd_compound(rqstp, args->opcnt); while (!status && resp->opcnt < args->opcnt) { op = &args->ops[resp->opcnt++]; - dprintk("nfsv4 compound op #%d/%d: %d (%s)\n", - resp->opcnt, args->opcnt, op->opnum, - nfsd4_op_name(op->opnum)); /* * The XDR decode routines may have pre-set op->status; * for example, if there is a miscellaneous XDR error @@ -1793,9 +1806,8 @@ encode_op: status = op->status; } - dprintk("nfsv4 compound op %p opcnt %d #%d: %d: status %d\n", - args->ops, args->opcnt, resp->opcnt, op->opnum, - be32_to_cpu(status)); + trace_nfsd_compound_status(args->opcnt, resp->opcnt, status, + nfsd4_op_name(op->opnum)); nfsd4_cstate_clear_replay(cstate); nfsd4_increment_op_stats(op->opnum); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 150521c9671b..fc74d6f46bd5 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -98,6 +98,7 @@ enum nfsd4_st_mutex_lock_subclass { */ static DECLARE_WAIT_QUEUE_HEAD(close_wq); +static struct kmem_cache *client_slab; static struct kmem_cache *openowner_slab; static struct kmem_cache *lockowner_slab; static struct kmem_cache *file_slab; @@ -268,6 +269,35 @@ free_blocked_lock(struct nfsd4_blocked_lock *nbl) kfree(nbl); } +static void +remove_blocked_locks(struct nfs4_lockowner *lo) +{ + struct nfs4_client *clp = lo->lo_owner.so_client; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct nfsd4_blocked_lock *nbl; + LIST_HEAD(reaplist); + + /* Dequeue all blocked locks */ + spin_lock(&nn->blocked_locks_lock); + while (!list_empty(&lo->lo_blocked)) { + nbl = list_first_entry(&lo->lo_blocked, + struct nfsd4_blocked_lock, + nbl_list); + list_del_init(&nbl->nbl_list); + list_move(&nbl->nbl_lru, &reaplist); + } + spin_unlock(&nn->blocked_locks_lock); + + /* Now free them */ + while (!list_empty(&reaplist)) { + nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, + nbl_lru); + list_del_init(&nbl->nbl_lru); + posix_unblock_lock(&nbl->nbl_lock); + free_blocked_lock(nbl); + } +} + static int nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task) { @@ -777,7 +807,8 @@ static void block_delegations(struct knfsd_fh *fh) } static struct nfs4_delegation * -alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh, +alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, + struct svc_fh *current_fh, struct nfs4_clnt_odstate *odstate) { struct nfs4_delegation *dp; @@ -808,6 +839,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh, dp->dl_retries = 1; nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client, &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL); + get_nfs4_file(fp); + dp->dl_stid.sc_file = fp; return dp; out_dec: atomic_long_dec(&num_delegations); @@ -845,19 +878,35 @@ nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid) spin_unlock(&stid->sc_lock); } -static void nfs4_put_deleg_lease(struct nfs4_file *fp) +static void put_deleg_file(struct nfs4_file *fp) { struct file *filp = NULL; spin_lock(&fp->fi_lock); - if (fp->fi_deleg_file && 
--fp->fi_delegees == 0) + if (--fp->fi_delegees == 0) swap(filp, fp->fi_deleg_file); spin_unlock(&fp->fi_lock); - if (filp) { - vfs_setlease(filp, F_UNLCK, NULL, (void **)&fp); + if (filp) fput(filp); - } +} + +static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp) +{ + struct nfs4_file *fp = dp->dl_stid.sc_file; + struct file *filp = fp->fi_deleg_file; + + WARN_ON_ONCE(!fp->fi_delegees); + + vfs_setlease(filp, F_UNLCK, NULL, (void **)&dp); + put_deleg_file(fp); +} + +static void destroy_unhashed_deleg(struct nfs4_delegation *dp) +{ + put_clnt_odstate(dp->dl_clnt_odstate); + nfs4_unlock_deleg_lease(dp); + nfs4_put_stid(&dp->dl_stid); } void nfs4_unhash_stid(struct nfs4_stid *s) @@ -866,20 +915,16 @@ void nfs4_unhash_stid(struct nfs4_stid *s) } /** - * nfs4_get_existing_delegation - Discover if this delegation already exists + * nfs4_delegation_exists - Discover if this delegation already exists * @clp: a pointer to the nfs4_client we're granting a delegation to * @fp: a pointer to the nfs4_file we're granting a delegation on * * Return: - * On success: NULL if an existing delegation was not found. - * - * On error: -EAGAIN if one was previously granted to this nfs4_client - * for this nfs4_file. - * + * On success: true iff an existing delegation is found */ -static int -nfs4_get_existing_delegation(struct nfs4_client *clp, struct nfs4_file *fp) +static bool +nfs4_delegation_exists(struct nfs4_client *clp, struct nfs4_file *fp) { struct nfs4_delegation *searchdp = NULL; struct nfs4_client *searchclp = NULL; @@ -890,10 +935,10 @@ nfs4_get_existing_delegation(struct nfs4_client *clp, struct nfs4_file *fp) list_for_each_entry(searchdp, &fp->fi_delegations, dl_perfile) { searchclp = searchdp->dl_stid.sc_client; if (clp == searchclp) { - return -EAGAIN; + return true; } } - return 0; + return false; } /** @@ -912,16 +957,13 @@ nfs4_get_existing_delegation(struct nfs4_client *clp, struct nfs4_file *fp) static int hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) { - int status; struct nfs4_client *clp = dp->dl_stid.sc_client; lockdep_assert_held(&state_lock); lockdep_assert_held(&fp->fi_lock); - status = nfs4_get_existing_delegation(clp, fp); - if (status) - return status; - ++fp->fi_delegees; + if (nfs4_delegation_exists(clp, fp)) + return -EAGAIN; refcount_inc(&dp->dl_stid.sc_count); dp->dl_stid.sc_type = NFS4_DELEG_STID; list_add(&dp->dl_perfile, &fp->fi_delegations); @@ -957,11 +999,8 @@ static void destroy_delegation(struct nfs4_delegation *dp) spin_lock(&state_lock); unhashed = unhash_delegation_locked(dp); spin_unlock(&state_lock); - if (unhashed) { - put_clnt_odstate(dp->dl_clnt_odstate); - nfs4_put_deleg_lease(dp->dl_stid.sc_file); - nfs4_put_stid(&dp->dl_stid); - } + if (unhashed) + destroy_unhashed_deleg(dp); } static void revoke_delegation(struct nfs4_delegation *dp) @@ -970,17 +1009,14 @@ static void revoke_delegation(struct nfs4_delegation *dp) WARN_ON(!list_empty(&dp->dl_recall_lru)); - put_clnt_odstate(dp->dl_clnt_odstate); - nfs4_put_deleg_lease(dp->dl_stid.sc_file); - - if (clp->cl_minorversion == 0) - nfs4_put_stid(&dp->dl_stid); - else { + if (clp->cl_minorversion) { dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID; + refcount_inc(&dp->dl_stid.sc_count); spin_lock(&clp->cl_lock); list_add(&dp->dl_recall_lru, &clp->cl_revoked); spin_unlock(&clp->cl_lock); } + destroy_unhashed_deleg(dp); } /* @@ -1765,7 +1801,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) struct nfs4_client *clp; int i; - clp = kzalloc(sizeof(struct 
nfs4_client), GFP_KERNEL); + clp = kmem_cache_zalloc(client_slab, GFP_KERNEL); if (clp == NULL) return NULL; clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL); @@ -1796,7 +1832,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) err_no_hashtbl: kfree(clp->cl_name.data); err_no_name: - kfree(clp); + kmem_cache_free(client_slab, clp); return NULL; } @@ -1816,7 +1852,7 @@ free_client(struct nfs4_client *clp) kfree(clp->cl_ownerstr_hashtbl); kfree(clp->cl_name.data); idr_destroy(&clp->cl_stateids); - kfree(clp); + kmem_cache_free(client_slab, clp); } /* must be called under the client_lock */ @@ -1866,6 +1902,7 @@ static __be32 mark_client_expired_locked(struct nfs4_client *clp) static void __destroy_client(struct nfs4_client *clp) { + int i; struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct list_head reaplist; @@ -1881,9 +1918,7 @@ __destroy_client(struct nfs4_client *clp) while (!list_empty(&reaplist)) { dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); - put_clnt_odstate(dp->dl_clnt_odstate); - nfs4_put_deleg_lease(dp->dl_stid.sc_file); - nfs4_put_stid(&dp->dl_stid); + destroy_unhashed_deleg(dp); } while (!list_empty(&clp->cl_revoked)) { dp = list_entry(clp->cl_revoked.next, struct nfs4_delegation, dl_recall_lru); @@ -1895,6 +1930,16 @@ __destroy_client(struct nfs4_client *clp) nfs4_get_stateowner(&oo->oo_owner); release_openowner(oo); } + for (i = 0; i < OWNER_HASH_SIZE; i++) { + struct nfs4_stateowner *so, *tmp; + + list_for_each_entry_safe(so, tmp, &clp->cl_ownerstr_hashtbl[i], + so_strhash) { + /* Should be no openowners at this point */ + WARN_ON_ONCE(so->so_is_open_owner); + remove_blocked_locks(lockowner(so)); + } + } nfsd4_return_all_client_layouts(clp); nfsd4_shutdown_callback(clp); if (clp->cl_cb_conn.cb_xprt) @@ -2913,7 +2958,7 @@ out_no_session: static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) { if (!session) - return 0; + return false; return !memcmp(sid, &session->se_sessionid, sizeof(*sid)); } @@ -3431,21 +3476,26 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval, void nfsd4_free_slabs(void) { - kmem_cache_destroy(odstate_slab); + kmem_cache_destroy(client_slab); kmem_cache_destroy(openowner_slab); kmem_cache_destroy(lockowner_slab); kmem_cache_destroy(file_slab); kmem_cache_destroy(stateid_slab); kmem_cache_destroy(deleg_slab); + kmem_cache_destroy(odstate_slab); } int nfsd4_init_slabs(void) { + client_slab = kmem_cache_create("nfsd4_clients", + sizeof(struct nfs4_client), 0, 0, NULL); + if (client_slab == NULL) + goto out; openowner_slab = kmem_cache_create("nfsd4_openowners", sizeof(struct nfs4_openowner), 0, 0, NULL); if (openowner_slab == NULL) - goto out; + goto out_free_client_slab; lockowner_slab = kmem_cache_create("nfsd4_lockowners", sizeof(struct nfs4_lockowner), 0, 0, NULL); if (lockowner_slab == NULL) @@ -3478,6 +3528,8 @@ out_free_lockowner_slab: kmem_cache_destroy(lockowner_slab); out_free_openowner_slab: kmem_cache_destroy(openowner_slab); +out_free_client_slab: + kmem_cache_destroy(client_slab); out: dprintk("nfsd4: out of memory while initializing nfsv4\n"); return -ENOMEM; @@ -3905,17 +3957,9 @@ static bool nfsd_break_deleg_cb(struct file_lock *fl) { bool ret = false; - struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner; - struct nfs4_delegation *dp; + struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner; + struct nfs4_file *fp = dp->dl_stid.sc_file; - if (!fp) { - WARN(1, 
"(%p)->fl_owner NULL\n", fl); - return ret; - } - if (fp->fi_had_conflict) { - WARN(1, "duplicate break on %p\n", fp); - return ret; - } /* * We don't want the locks code to timeout the lease for us; * we'll remove it ourself if a delegation isn't returned @@ -3925,15 +3969,7 @@ nfsd_break_deleg_cb(struct file_lock *fl) spin_lock(&fp->fi_lock); fp->fi_had_conflict = true; - /* - * If there are no delegations on the list, then return true - * so that the lease code will go ahead and delete it. - */ - if (list_empty(&fp->fi_delegations)) - ret = true; - else - list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) - nfsd_break_one_deleg(dp); + nfsd_break_one_deleg(dp); spin_unlock(&fp->fi_lock); return ret; } @@ -4257,7 +4293,8 @@ static bool nfsd4_cb_channel_good(struct nfs4_client *clp) return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; } -static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag) +static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, + int flag) { struct file_lock *fl; @@ -4268,124 +4305,88 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag) fl->fl_flags = FL_DELEG; fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; fl->fl_end = OFFSET_MAX; - fl->fl_owner = (fl_owner_t)fp; + fl->fl_owner = (fl_owner_t)dp; fl->fl_pid = current->tgid; + fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file; return fl; } -/** - * nfs4_setlease - Obtain a delegation by requesting lease from vfs layer - * @dp: a pointer to the nfs4_delegation we're adding. - * - * Return: - * On success: Return code will be 0 on success. - * - * On error: -EAGAIN if there was an existing delegation. - * nonzero if there is an error in other cases. - * - */ - -static int nfs4_setlease(struct nfs4_delegation *dp) -{ - struct nfs4_file *fp = dp->dl_stid.sc_file; - struct file_lock *fl; - struct file *filp; - int status = 0; - - fl = nfs4_alloc_init_lease(fp, NFS4_OPEN_DELEGATE_READ); - if (!fl) - return -ENOMEM; - filp = find_readable_file(fp); - if (!filp) { - /* We should always have a readable file here */ - WARN_ON_ONCE(1); - locks_free_lock(fl); - return -EBADF; - } - fl->fl_file = filp; - status = vfs_setlease(filp, fl->fl_type, &fl, NULL); - if (fl) - locks_free_lock(fl); - if (status) - goto out_fput; - spin_lock(&state_lock); - spin_lock(&fp->fi_lock); - /* Did the lease get broken before we took the lock? 
*/ - status = -EAGAIN; - if (fp->fi_had_conflict) - goto out_unlock; - /* Race breaker */ - if (fp->fi_deleg_file) { - status = hash_delegation_locked(dp, fp); - goto out_unlock; - } - fp->fi_deleg_file = filp; - fp->fi_delegees = 0; - status = hash_delegation_locked(dp, fp); - spin_unlock(&fp->fi_lock); - spin_unlock(&state_lock); - if (status) { - /* Should never happen, this is a new fi_deleg_file */ - WARN_ON_ONCE(1); - goto out_fput; - } - return 0; -out_unlock: - spin_unlock(&fp->fi_lock); - spin_unlock(&state_lock); -out_fput: - fput(filp); - return status; -} - static struct nfs4_delegation * nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate) { - int status; + int status = 0; struct nfs4_delegation *dp; + struct file *filp; + struct file_lock *fl; + /* + * The fi_had_conflict and nfs_get_existing_delegation checks + * here are just optimizations; we'll need to recheck them at + * the end: + */ if (fp->fi_had_conflict) return ERR_PTR(-EAGAIN); + filp = find_readable_file(fp); + if (!filp) { + /* We should always have a readable file here */ + WARN_ON_ONCE(1); + return ERR_PTR(-EBADF); + } spin_lock(&state_lock); spin_lock(&fp->fi_lock); - status = nfs4_get_existing_delegation(clp, fp); + if (nfs4_delegation_exists(clp, fp)) + status = -EAGAIN; + else if (!fp->fi_deleg_file) { + fp->fi_deleg_file = filp; + /* increment early to prevent fi_deleg_file from being + * cleared */ + fp->fi_delegees = 1; + filp = NULL; + } else + fp->fi_delegees++; spin_unlock(&fp->fi_lock); spin_unlock(&state_lock); - + if (filp) + fput(filp); if (status) return ERR_PTR(status); - dp = alloc_init_deleg(clp, fh, odstate); + status = -ENOMEM; + dp = alloc_init_deleg(clp, fp, fh, odstate); if (!dp) - return ERR_PTR(-ENOMEM); + goto out_delegees; + + fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ); + if (!fl) + goto out_stid; + + status = vfs_setlease(fp->fi_deleg_file, fl->fl_type, &fl, NULL); + if (fl) + locks_free_lock(fl); + if (status) + goto out_clnt_odstate; - get_nfs4_file(fp); spin_lock(&state_lock); spin_lock(&fp->fi_lock); - dp->dl_stid.sc_file = fp; - if (!fp->fi_deleg_file) { - spin_unlock(&fp->fi_lock); - spin_unlock(&state_lock); - status = nfs4_setlease(dp); - goto out; - } - if (fp->fi_had_conflict) { + if (fp->fi_had_conflict) status = -EAGAIN; - goto out_unlock; - } - status = hash_delegation_locked(dp, fp); -out_unlock: + else + status = hash_delegation_locked(dp, fp); spin_unlock(&fp->fi_lock); spin_unlock(&state_lock); -out: - if (status) { - put_clnt_odstate(dp->dl_clnt_odstate); - nfs4_put_stid(&dp->dl_stid); - return ERR_PTR(status); - } + + if (status) + destroy_unhashed_deleg(dp); return dp; +out_clnt_odstate: + put_clnt_odstate(dp->dl_clnt_odstate); +out_stid: + nfs4_put_stid(&dp->dl_stid); +out_delegees: + put_deleg_file(fp); + return ERR_PTR(status); } static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) @@ -5481,15 +5482,26 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; stp->st_stid.sc_type = NFS4_CLOSED_STID; + + /* + * Technically we don't _really_ have to increment or copy it, since + * it should just be gone after this operation and we clobber the + * copied value below, but we continue to do so here just to ensure + * that racing ops see that there was a state change. 
+ */ nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid); nfsd4_close_open_stateid(stp); mutex_unlock(&stp->st_mutex); - /* See RFC5661 sectionm 18.2.4 */ - if (stp->st_stid.sc_client->cl_minorversion) - memcpy(&close->cl_stateid, &close_stateid, - sizeof(close->cl_stateid)); + /* v4.1+ suggests that we send a special stateid in here, since the + * clients should just ignore this anyway. Since this is not useful + * for v4.0 clients either, we set it to the special close_stateid + * universally. + * + * See RFC5661 section 18.2.4, and RFC7530 section 16.2.5 + */ + memcpy(&close->cl_stateid, &close_stateid, sizeof(close->cl_stateid)); /* put reference from nfs4_preprocess_seqid_op */ nfs4_put_stid(&stp->st_stid); @@ -6355,6 +6367,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, } spin_unlock(&clp->cl_lock); free_ol_stateid_reaplist(&reaplist); + remove_blocked_locks(lo); nfs4_put_stateowner(&lo->lo_owner); return status; @@ -7140,6 +7153,8 @@ nfs4_state_destroy_net(struct net *net) } } + WARN_ON(!list_empty(&nn->blocked_locks_lru)); + for (i = 0; i < CLIENT_HASH_SIZE; i++) { while (!list_empty(&nn->unconf_id_hashtbl[i])) { clp = list_entry(nn->unconf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); @@ -7206,7 +7221,6 @@ nfs4_state_shutdown_net(struct net *net) struct nfs4_delegation *dp = NULL; struct list_head *pos, *next, reaplist; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - struct nfsd4_blocked_lock *nbl; cancel_delayed_work_sync(&nn->laundromat_work); locks_end_grace(&nn->nfsd4_manager); @@ -7222,27 +7236,7 @@ nfs4_state_shutdown_net(struct net *net) list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); - put_clnt_odstate(dp->dl_clnt_odstate); - nfs4_put_deleg_lease(dp->dl_stid.sc_file); - nfs4_put_stid(&dp->dl_stid); - } - - BUG_ON(!list_empty(&reaplist)); - spin_lock(&nn->blocked_locks_lock); - while (!list_empty(&nn->blocked_locks_lru)) { - nbl = list_first_entry(&nn->blocked_locks_lru, - struct nfsd4_blocked_lock, nbl_lru); - list_move(&nbl->nbl_lru, &reaplist); - list_del_init(&nbl->nbl_list); - } - spin_unlock(&nn->blocked_locks_lock); - - while (!list_empty(&reaplist)) { - nbl = list_first_entry(&reaplist, - struct nfsd4_blocked_lock, nbl_lru); - list_del_init(&nbl->nbl_lru); - posix_unblock_lock(&nbl->nbl_lock); - free_blocked_lock(nbl); + destroy_unhashed_deleg(dp); } nfsd4_client_tracking_exit(net); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index e502fd16246b..1d048dd95464 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -33,7 +33,6 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include <linux/fs_struct.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/namei.h> @@ -682,7 +681,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, &create->cr_acl, &create->cr_label, - &current->fs->umask); + &create->cr_umask); if (status) goto out; @@ -927,7 +926,6 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) case NFS4_OPEN_NOCREATE: break; case NFS4_OPEN_CREATE: - current->fs->umask = 0; READ_BUF(4); open->op_createmode = be32_to_cpup(p++); switch (open->op_createmode) { @@ -935,7 +933,7 @@ case NFS4_CREATE_GUARDED: status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr, &open->op_acl, &open->op_label, - &current->fs->umask); + &open->op_umask); if (status) goto out; break; @@ -950,7 +948,7 @@ COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE); status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr, &open->op_acl, &open->op_label, - &current->fs->umask); + &open->op_umask); if (status) goto out; break; @@ -1759,7 +1757,7 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) p = xdr_decode_hyper(p, &copy->cp_src_pos); p = xdr_decode_hyper(p, &copy->cp_dst_pos); p = xdr_decode_hyper(p, &copy->cp_count); - copy->cp_consecutive = be32_to_cpup(p++); + p++; /* ca_consecutive: we always do consecutive copies */ copy->cp_synchronous = be32_to_cpup(p++); tmp = be32_to_cpup(p); /* Source server list not supported */ @@ -3427,8 +3425,9 @@ static __be32 nfsd4_encode_splice_read( return nfserr_resource; len = maxcount; - nfserr = nfsd_splice_read(read->rd_rqstp, file, - read->rd_offset, &maxcount); + nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp, + file, read->rd_offset, &maxcount); + read->rd_length = maxcount; if (nfserr) { /* * nfsd_splice_actor may have already messed with the @@ -3511,8 +3510,9 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, read->rd_vlen = v; len = maxcount; - nfserr = nfsd_readv(file, read->rd_offset, resp->rqstp->rq_vec, - read->rd_vlen, &maxcount); + nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, + resp->rqstp->rq_vec, read->rd_vlen, &maxcount); + read->rd_length = maxcount; if (nfserr) return nfserr; xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3)); @@ -4214,7 +4214,7 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, return nfserr; p = xdr_reserve_space(&resp->xdr, 4 + 4); - *p++ = cpu_to_be32(copy->cp_consecutive); + *p++ = xdr_one; /* cr_consecutive */ *p++ = cpu_to_be32(copy->cp_synchronous); return 0; } diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 8aa011820c4a..a008e7634181 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -87,13 +87,23 @@ nfsd_mode_check(struct svc_rqst *rqstp, struct dentry *dentry, return nfserr_inval; } +static bool nfsd_originating_port_ok(struct svc_rqst *rqstp, int flags) +{ + if (flags & NFSEXP_INSECURE_PORT) + return true; + /* We don't require gss requests to use low ports: */ + if (rqstp->rq_cred.cr_flavor >= RPC_AUTH_GSS) + return true; + return test_bit(RQ_SECURE, &rqstp->rq_flags); +} + static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, struct svc_export *exp) { int flags = nfsexp_flags(rqstp, exp); /* Check if the request originated from a secure port. 
*/ - if (!test_bit(RQ_SECURE, &rqstp->rq_flags) && !(flags & NFSEXP_INSECURE_PORT)) { + if (!nfsd_originating_port_ok(rqstp, flags)) { RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); dprintk("nfsd: request from insecure port %s!\n", svc_print_addr(rqstp, buf, sizeof(buf))); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 43c0419b8ddb..f107f9fa8e15 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -212,13 +212,18 @@ nfsd_proc_write(struct svc_rqst *rqstp) struct nfsd_attrstat *resp = rqstp->rq_resp; __be32 nfserr; unsigned long cnt = argp->len; + unsigned int nvecs; dprintk("nfsd: WRITE %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), argp->len, argp->offset); - nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset, - rqstp->rq_vec, argp->vlen, &cnt, NFS_DATA_SYNC); + nvecs = svc_fill_write_vector(rqstp, &argp->first, cnt); + if (!nvecs) + return nfserr_io; + nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), + argp->offset, rqstp->rq_vec, nvecs, + &cnt, NFS_DATA_SYNC); return nfsd_return_attrs(nfserr, resp); } @@ -444,17 +449,19 @@ nfsd_proc_symlink(struct svc_rqst *rqstp) struct svc_fh newfh; __be32 nfserr; + if (argp->tlen > NFS_MAXPATHLEN) + return nfserr_nametoolong; + + argp->tname = svc_fill_symlink_pathname(rqstp, &argp->first, + argp->tlen); + if (IS_ERR(argp->tname)) + return nfserrno(PTR_ERR(argp->tname)); + dprintk("nfsd: SYMLINK %s %.*s -> %.*s\n", SVCFH_fmt(&argp->ffh), argp->flen, argp->fname, argp->tlen, argp->tname); fh_init(&newfh, NFS_FHSIZE); - /* - * Crazy hack: the request fits in a page, and already-decoded - * attributes follow argp->tname, so it's safe to just write a - * null to ensure it's null-terminated: - */ - argp->tname[argp->tlen] = '\0'; nfserr = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen, argp->tname, &newfh); diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 79b6064f8977..a43e8260520a 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -71,22 +71,6 @@ decode_filename(__be32 *p, char **namp, unsigned int *lenp) } static __be32 * -decode_pathname(__be32 *p, char **namp, unsigned int *lenp) -{ - char *name; - unsigned int i; - - if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXPATHLEN)) != NULL) { - for (i = 0, name = *namp; i < *lenp; i++, name++) { - if (*name == '\0') - return NULL; - } - } - - return p; -} - -static __be32 * decode_sattr(__be32 *p, struct iattr *iap) { u32 tmp, tmp1; @@ -287,7 +271,6 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) struct nfsd_writeargs *args = rqstp->rq_argp; unsigned int len, hdr, dlen; struct kvec *head = rqstp->rq_arg.head; - int v; p = decode_fh(p, &args->fh); if (!p) @@ -323,17 +306,8 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) if (dlen < XDR_QUADLEN(len)*4) return 0; - rqstp->rq_vec[0].iov_base = (void*)p; - rqstp->rq_vec[0].iov_len = head->iov_len - hdr; - v = 0; - while (len > rqstp->rq_vec[v].iov_len) { - len -= rqstp->rq_vec[v].iov_len; - v++; - rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]); - rqstp->rq_vec[v].iov_len = PAGE_SIZE; - } - rqstp->rq_vec[v].iov_len = len; - args->vlen = v + 1; + args->first.iov_base = (void *)p; + args->first.iov_len = head->iov_len - hdr; return 1; } @@ -394,14 +368,39 @@ int nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p) { struct nfsd_symlinkargs *args = rqstp->rq_argp; + char *base = (char *)p; + size_t xdrlen; if ( !(p = decode_fh(p, &args->ffh)) - || !(p = decode_filename(p, &args->fname, &args->flen)) - || !(p = decode_pathname(p, &args->tname, 
&args->tlen))) + || !(p = decode_filename(p, &args->fname, &args->flen))) return 0; - p = decode_sattr(p, &args->attrs); - return xdr_argsize_check(rqstp, p); + args->tlen = ntohl(*p++); + if (args->tlen == 0) + return 0; + + args->first.iov_base = p; + args->first.iov_len = rqstp->rq_arg.head[0].iov_len; + args->first.iov_len -= (char *)p - base; + + /* This request is never larger than a page. Therefore, + * transport will deliver either: + * 1. pathname in the pagelist -> sattr is in the tail. + * 2. everything in the head buffer -> sattr is in the head. + */ + if (rqstp->rq_arg.page_len) { + if (args->tlen != rqstp->rq_arg.page_len) + return 0; + p = rqstp->rq_arg.tail[0].iov_base; + } else { + xdrlen = XDR_QUADLEN(args->tlen); + if (xdrlen > args->first.iov_len - (8 * sizeof(__be32))) + return 0; + p += xdrlen; + } + decode_sattr(p, &args->attrs); + + return 1; } int diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 8b2f1d92c579..80933e4334d8 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -11,39 +11,79 @@ #include <linux/tracepoint.h> #include "nfsfh.h" +TRACE_EVENT(nfsd_compound, + TP_PROTO(const struct svc_rqst *rqst, + u32 args_opcnt), + TP_ARGS(rqst, args_opcnt), + TP_STRUCT__entry( + __field(u32, xid) + __field(u32, args_opcnt) + ), + TP_fast_assign( + __entry->xid = be32_to_cpu(rqst->rq_xid); + __entry->args_opcnt = args_opcnt; + ), + TP_printk("xid=0x%08x opcnt=%u", + __entry->xid, __entry->args_opcnt) +) + +TRACE_EVENT(nfsd_compound_status, + TP_PROTO(u32 args_opcnt, + u32 resp_opcnt, + __be32 status, + const char *name), + TP_ARGS(args_opcnt, resp_opcnt, status, name), + TP_STRUCT__entry( + __field(u32, args_opcnt) + __field(u32, resp_opcnt) + __field(int, status) + __string(name, name) + ), + TP_fast_assign( + __entry->args_opcnt = args_opcnt; + __entry->resp_opcnt = resp_opcnt; + __entry->status = be32_to_cpu(status); + __assign_str(name, name); + ), + TP_printk("op=%u/%u %s status=%d", + __entry->resp_opcnt, __entry->args_opcnt, + __get_str(name), __entry->status) +) + DECLARE_EVENT_CLASS(nfsd_io_class, TP_PROTO(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, - int len), + unsigned long len), TP_ARGS(rqstp, fhp, offset, len), TP_STRUCT__entry( - __field(__be32, xid) - __field_struct(struct knfsd_fh, fh) + __field(u32, xid) + __field(u32, fh_hash) __field(loff_t, offset) - __field(int, len) + __field(unsigned long, len) ), TP_fast_assign( - __entry->xid = rqstp->rq_xid, - fh_copy_shallow(&__entry->fh, &fhp->fh_handle); + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); __entry->offset = offset; __entry->len = len; ), - TP_printk("xid=0x%x fh=0x%x offset=%lld len=%d", - __be32_to_cpu(__entry->xid), knfsd_fh_hash(&__entry->fh), + TP_printk("xid=0x%08x fh_hash=0x%08x offset=%lld len=%lu", + __entry->xid, __entry->fh_hash, __entry->offset, __entry->len) ) #define DEFINE_NFSD_IO_EVENT(name) \ -DEFINE_EVENT(nfsd_io_class, name, \ +DEFINE_EVENT(nfsd_io_class, nfsd_##name, \ TP_PROTO(struct svc_rqst *rqstp, \ struct svc_fh *fhp, \ loff_t offset, \ - int len), \ + unsigned long len), \ TP_ARGS(rqstp, fhp, offset, len)) DEFINE_NFSD_IO_EVENT(read_start); -DEFINE_NFSD_IO_EVENT(read_opened); +DEFINE_NFSD_IO_EVENT(read_splice); +DEFINE_NFSD_IO_EVENT(read_vector); DEFINE_NFSD_IO_EVENT(read_io_done); DEFINE_NFSD_IO_EVENT(read_done); DEFINE_NFSD_IO_EVENT(write_start); @@ -51,6 +91,40 @@ DEFINE_NFSD_IO_EVENT(write_opened); DEFINE_NFSD_IO_EVENT(write_io_done); DEFINE_NFSD_IO_EVENT(write_done); 
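
These io-class tracepoints are fired from the nfsd read and write paths in fs/nfsd/vfs.c later in this diff. A minimal sketch of such a call site follows; the enclosing function and the do_the_read() helper are hypothetical, while the trace_nfsd_* names and argument types are taken from the nfsd_io_class prototype above:

	/*
	 * Sketch only, assuming the nfsd "trace.h" above is included:
	 * trace_nfsd_read_start() and trace_nfsd_read_done() are generated
	 * by DEFINE_EVENT(nfsd_io_class, nfsd_read_start, ...) and friends;
	 * do_the_read() stands in for the real I/O and is not a real symbol.
	 */
	static __be32 example_read_op(struct svc_rqst *rqstp, struct svc_fh *fhp,
				      loff_t offset, unsigned long *count)
	{
		__be32 err;

		/* the len argument is now unsigned long, matching the class */
		trace_nfsd_read_start(rqstp, fhp, offset, *count);
		err = do_the_read(rqstp, fhp, offset, count);
		/* *count holds the bytes actually transferred when this fires */
		trace_nfsd_read_done(rqstp, fhp, offset, *count);
		return err;
	}

Note the design choice visible in TP_fast_assign above: the XID is converted with be32_to_cpu() at assign time, so the stored field is a host-order u32 and TP_printk can format it directly rather than calling __be32_to_cpu() on every trace read, as the old class did.
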
+DECLARE_EVENT_CLASS(nfsd_err_class, + TP_PROTO(struct svc_rqst *rqstp, + struct svc_fh *fhp, + loff_t offset, + int status), + TP_ARGS(rqstp, fhp, offset, status), + TP_STRUCT__entry( + __field(u32, xid) + __field(u32, fh_hash) + __field(loff_t, offset) + __field(int, status) + ), + TP_fast_assign( + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __entry->offset = offset; + __entry->status = status; + ), + TP_printk("xid=0x%08x fh_hash=0x%08x offset=%lld status=%d", + __entry->xid, __entry->fh_hash, + __entry->offset, __entry->status) +) + +#define DEFINE_NFSD_ERR_EVENT(name) \ +DEFINE_EVENT(nfsd_err_class, nfsd_##name, \ + TP_PROTO(struct svc_rqst *rqstp, \ + struct svc_fh *fhp, \ + loff_t offset, \ + int len), \ + TP_ARGS(rqstp, fhp, offset, len)) + +DEFINE_NFSD_ERR_EVENT(read_err); +DEFINE_NFSD_ERR_EVENT(write_err); + #include "state.h" DECLARE_EVENT_CLASS(nfsd_stateid_class, @@ -76,7 +150,7 @@ DECLARE_EVENT_CLASS(nfsd_stateid_class, ) #define DEFINE_STATEID_EVENT(name) \ -DEFINE_EVENT(nfsd_stateid_class, name, \ +DEFINE_EVENT(nfsd_stateid_class, nfsd_##name, \ TP_PROTO(stateid_t *stp), \ TP_ARGS(stp)) DEFINE_STATEID_EVENT(layoutstate_alloc); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index a3c9bfa77def..2410b093a2e6 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -881,20 +881,24 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, return __splice_from_pipe(pipe, sd, nfsd_splice_actor); } -static __be32 -nfsd_finish_read(struct file *file, unsigned long *count, int host_err) +static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, + unsigned long *count, int host_err) { if (host_err >= 0) { nfsdstats.io_read += host_err; *count = host_err; fsnotify_access(file); + trace_nfsd_read_io_done(rqstp, fhp, offset, *count); return 0; - } else + } else { + trace_nfsd_read_err(rqstp, fhp, offset, host_err); return nfserrno(host_err); + } } -__be32 nfsd_splice_read(struct svc_rqst *rqstp, - struct file *file, loff_t offset, unsigned long *count) +__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, unsigned long *count) { struct splice_desc sd = { .len = 0, @@ -904,21 +908,23 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, }; int host_err; + trace_nfsd_read_splice(rqstp, fhp, offset, *count); rqstp->rq_next_page = rqstp->rq_respages + 1; host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); - return nfsd_finish_read(file, count, host_err); + return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err); } -__be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen, - unsigned long *count) +__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, + struct kvec *vec, int vlen, unsigned long *count) { struct iov_iter iter; int host_err; + trace_nfsd_read_vector(rqstp, fhp, offset, *count); iov_iter_kvec(&iter, READ | ITER_KVEC, vec, vlen, *count); host_err = vfs_iter_read(file, &iter, &offset, 0); - - return nfsd_finish_read(file, count, host_err); + return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err); } /* @@ -965,13 +971,15 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, { struct svc_export *exp; struct iov_iter iter; - __be32 err = 0; + __be32 nfserr; int host_err; int use_wgather; loff_t pos = offset; unsigned int pflags = current->flags; rwf_t flags = 0; + trace_nfsd_write_opened(rqstp, 
fhp, offset, *cnt); + if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) /* * We want less throttling in balance_dirty_pages() @@ -994,22 +1002,23 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, host_err = vfs_iter_write(file, &iter, &pos, flags); if (host_err < 0) goto out_nfserr; - *cnt = host_err; - nfsdstats.io_write += host_err; + nfsdstats.io_write += *cnt; fsnotify_modify(file); if (stable && use_wgather) host_err = wait_for_concurrent_writes(file); out_nfserr: - dprintk("nfsd: write complete host_err=%d\n", host_err); - if (host_err >= 0) - err = 0; - else - err = nfserrno(host_err); + if (host_err >= 0) { + trace_nfsd_write_io_done(rqstp, fhp, offset, *cnt); + nfserr = nfs_ok; + } else { + trace_nfsd_write_err(rqstp, fhp, offset, host_err); + nfserr = nfserrno(host_err); + } if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) current_restore_flags(pflags, PF_LESS_THROTTLE); - return err; + return nfserr; } /* @@ -1024,27 +1033,23 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct raparms *ra; __be32 err; - trace_read_start(rqstp, fhp, offset, vlen); + trace_nfsd_read_start(rqstp, fhp, offset, *count); err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); if (err) return err; ra = nfsd_init_raparms(file); - trace_read_opened(rqstp, fhp, offset, vlen); - if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) - err = nfsd_splice_read(rqstp, file, offset, count); + err = nfsd_splice_read(rqstp, fhp, file, offset, count); else - err = nfsd_readv(file, offset, vec, vlen, count); - - trace_read_io_done(rqstp, fhp, offset, vlen); + err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count); if (ra) nfsd_put_raparams(file, ra); fput(file); - trace_read_done(rqstp, fhp, offset, vlen); + trace_nfsd_read_done(rqstp, fhp, offset, *count); return err; } @@ -1061,18 +1066,16 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, struct file *file = NULL; __be32 err = 0; - trace_write_start(rqstp, fhp, offset, vlen); + trace_nfsd_write_start(rqstp, fhp, offset, *cnt); err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); if (err) goto out; - trace_write_opened(rqstp, fhp, offset, vlen); err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stable); - trace_write_io_done(rqstp, fhp, offset, vlen); fput(file); out: - trace_write_done(rqstp, fhp, offset, vlen); + trace_nfsd_write_done(rqstp, fhp, offset, *cnt); return err; } diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index be6d8e00453f..a7e107309f76 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -78,10 +78,13 @@ __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); struct raparms; -__be32 nfsd_splice_read(struct svc_rqst *, - struct file *, loff_t, unsigned long *); -__be32 nfsd_readv(struct file *, loff_t, struct kvec *, int, - unsigned long *); +__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, + unsigned long *count); +__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, + struct kvec *vec, int vlen, + unsigned long *count); __be32 nfsd_read(struct svc_rqst *, struct svc_fh *, loff_t, struct kvec *, int, unsigned long *); __be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t, diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index 2f4f22e6b8cb..ea7cca3a64b7 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -34,7 +34,7 @@ struct nfsd_writeargs { svc_fh fh; __u32 offset; 
int len; - int vlen; + struct kvec first; }; struct nfsd_createargs { @@ -72,6 +72,7 @@ struct nfsd_symlinkargs { char * tname; unsigned int tlen; struct iattr attrs; + struct kvec first; }; struct nfsd_readdirargs { diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 056bf8a7364e..2cb29e961a76 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -41,7 +41,7 @@ struct nfsd3_writeargs { __u32 count; int stable; __u32 len; - int vlen; + struct kvec first; }; struct nfsd3_createargs { @@ -90,6 +90,7 @@ struct nfsd3_symlinkargs { char * tname; unsigned int tlen; struct iattr attrs; + struct kvec first; }; struct nfsd3_readdirargs { diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index bc29511b6405..17c453a7999c 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -110,6 +110,7 @@ struct nfsd4_create { struct { u32 datalen; char *data; + struct kvec first; } link; /* NF4LNK */ struct { u32 specdata1; @@ -118,12 +119,14 @@ struct nfsd4_create { } u; u32 cr_bmval[3]; /* request */ struct iattr cr_iattr; /* request */ + int cr_umask; /* request */ struct nfsd4_change_info cr_cinfo; /* response */ struct nfs4_acl *cr_acl; struct xdr_netobj cr_label; }; #define cr_datalen u.link.datalen #define cr_data u.link.data +#define cr_first u.link.first #define cr_specdata1 u.dev.specdata1 #define cr_specdata2 u.dev.specdata2 @@ -228,6 +231,7 @@ struct nfsd4_open { u32 op_why_no_deleg; /* response - DELEG_NONE_EXT only */ u32 op_create; /* request */ u32 op_createmode; /* request */ + int op_umask; /* request */ u32 op_bmval[3]; /* request */ struct iattr op_iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */ nfs4_verifier op_verf __attribute__((aligned(32))); @@ -518,7 +522,6 @@ struct nfsd4_copy { u64 cp_count; /* both */ - bool cp_consecutive; bool cp_synchronous; /* response */ diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 6702a6a0bbb5..d51e1bb781cf 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -139,23 +139,32 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, return false; } -struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, +struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group, + struct inode *inode, u32 mask, const struct path *path) { struct fanotify_event_info *event; + gfp_t gfp = GFP_KERNEL; + + /* + * For queues with unlimited length lost events are not expected and + * can possibly have security implications. Avoid losing events when + * memory is short. + */ + if (group->max_events == UINT_MAX) + gfp |= __GFP_NOFAIL; if (fanotify_is_perm_event(mask)) { struct fanotify_perm_event_info *pevent; - pevent = kmem_cache_alloc(fanotify_perm_event_cachep, - GFP_KERNEL); + pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp); if (!pevent) return NULL; event = &pevent->fae; pevent->response = 0; goto init; } - event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); + event = kmem_cache_alloc(fanotify_event_cachep, gfp); if (!event) return NULL; init: __maybe_unused @@ -210,10 +219,17 @@ static int fanotify_handle_event(struct fsnotify_group *group, return 0; } - event = fanotify_alloc_event(inode, mask, data); + event = fanotify_alloc_event(group, inode, mask, data); ret = -ENOMEM; - if (unlikely(!event)) + if (unlikely(!event)) { + /* + * We don't queue overflow events for permission events as + * there the access is denied and so no event is in fact lost. 
+ */ + if (!fanotify_is_perm_event(mask)) + fsnotify_queue_overflow(group); goto finish; + } fsn_event = &event->fse; ret = fsnotify_add_event(group, fsn_event, fanotify_merge); diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 256d9d1ddea9..8609ba06f474 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -52,5 +52,6 @@ static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) return container_of(fse, struct fanotify_event_info, fse); } -struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, +struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group, + struct inode *inode, u32 mask, const struct path *path); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index c07eb3d655ea..ec4d8c59d0e3 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -757,7 +757,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.user = user; atomic_inc(&user->fanotify_listeners); - oevent = fanotify_alloc_event(NULL, FS_Q_OVERFLOW, NULL); + oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL); if (unlikely(!oevent)) { fd = -ENOMEM; goto out_destroy_group; @@ -820,9 +820,8 @@ out_destroy_group: return fd; } -SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, - __u64, mask, int, dfd, - const char __user *, pathname) +static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, + int dfd, const char __user *pathname) { struct inode *inode = NULL; struct vfsmount *mnt = NULL; @@ -928,13 +927,20 @@ fput_and_out: return ret; } +SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, + __u64, mask, int, dfd, + const char __user *, pathname) +{ + return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname); +} + #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE6(fanotify_mark, int, fanotify_fd, unsigned int, flags, __u32, mask0, __u32, mask1, int, dfd, const char __user *, pathname) { - return sys_fanotify_mark(fanotify_fd, flags, + return do_fanotify_mark(fanotify_fd, flags, #ifdef __BIG_ENDIAN ((__u64)mask0 << 32) | mask1, #else diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 8b73332735ba..40dedb37a1f3 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -99,8 +99,14 @@ int inotify_handle_event(struct fsnotify_group *group, fsn_mark); event = kmalloc(alloc_len, GFP_KERNEL); - if (unlikely(!event)) + if (unlikely(!event)) { + /* + * Treat lost event due to ENOMEM the same way as queue + * overflow to let userspace know event was lost. 
+ */ + fsnotify_queue_overflow(group); return -ENOMEM; + } fsn_event = &event->fse; fsnotify_init_event(fsn_event, inode, mask); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 2c908b31d6c9..ef32f3657958 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -307,6 +307,20 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, spin_unlock(&group->notification_lock); ret = put_user(send_len, (int __user *) p); break; +#ifdef CONFIG_CHECKPOINT_RESTORE + case INOTIFY_IOC_SETNEXTWD: + ret = -EINVAL; + if (arg >= 1 && arg <= INT_MAX) { + struct inotify_group_private_data *data; + + data = &group->inotify_data; + spin_lock(&data->idr_lock); + idr_set_cursor(&data->idr, (unsigned int)arg); + spin_unlock(&data->idr_lock); + ret = 0; + } + break; +#endif /* CONFIG_CHECKPOINT_RESTORE */ } return ret; @@ -635,7 +649,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) /* inotify syscalls */ -SYSCALL_DEFINE1(inotify_init1, int, flags) +static int do_inotify_init(int flags) { struct fsnotify_group *group; int ret; @@ -660,9 +674,14 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) return ret; } +SYSCALL_DEFINE1(inotify_init1, int, flags) +{ + return do_inotify_init(flags); +} + SYSCALL_DEFINE0(inotify_init) { - return sys_inotify_init1(0); + return do_inotify_init(0); } SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 66f85c651c52..3c3e36745f59 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -111,7 +111,8 @@ int fsnotify_add_event(struct fsnotify_group *group, return 2; } - if (group->q_len >= group->max_events) { + if (event == group->overflow_event || + group->q_len >= group->max_events) { ret = 2; /* Queue overflow event only if it isn't already queued */ if (!list_empty(&group->overflow_event->list)) { diff --git a/fs/nsfs.c b/fs/nsfs.c index 36b0772701a0..60702d677bd4 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -184,6 +184,7 @@ int open_related_ns(struct ns_common *ns, return fd; } +EXPORT_SYMBOL_GPL(open_related_ns); static long ns_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 2831f495a674..32c523cf5a2d 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -381,7 +381,7 @@ unm_err_out: * vfs inode dirty. This ensures that any changes to the mft record are * written out to disk. * - * NOTE: We only set I_DIRTY_SYNC and I_DIRTY_DATASYNC (and not I_DIRTY_PAGES) + * NOTE: We only set I_DIRTY_DATASYNC (and not I_DIRTY_PAGES) * on the base vfs inode, because even though file data may have been modified, * it is dirty in the inode meta data rather than the data page cache of the * inode, and thus there are no data pages that need writing out. 
Therefore, a @@ -407,7 +407,7 @@ void __mark_mft_record_dirty(ntfs_inode *ni) else base_ni = ni->ext.base_ntfs_ino; mutex_unlock(&ni->extent_lock); - __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_SYNC | I_DIRTY_DATASYNC); + __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_DATASYNC); } static const char *ntfs_please_email = "Please email " diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 9a876bb07cac..0f157bbd3e0f 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7119,7 +7119,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, goto out_commit; did_quota = 1; - data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; + data_ac->ac_resv = &oi->ip_la_data_resv; ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &num); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index e8e205bf2e41..302cd7caa4a7 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -346,7 +346,7 @@ static int ocfs2_readpage(struct file *file, struct page *page) unlock = 0; out_alloc: - up_read(&OCFS2_I(inode)->ip_alloc_sem); + up_read(&oi->ip_alloc_sem); out_inode_unlock: ocfs2_inode_unlock(inode, 0); out: @@ -2213,7 +2213,7 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, down_write(&oi->ip_alloc_sem); if (first_get_block) { - if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + if (ocfs2_sparse_alloc(osb)) ret = ocfs2_zero_tail(inode, di_bh, pos); else ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 8614ff069d99..3494a62ed749 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -78,7 +78,7 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) /* * Using a named enum representing lock types in terms of #N bit stored in * iocb->private, which is going to be used for communication between - * ocfs2_dio_end_io() and ocfs2_file_aio_write/read(). + * ocfs2_dio_end_io() and ocfs2_file_write/read_iter(). 
*/ enum ocfs2_iocb_lock_bits { OCFS2_IOCB_RW_LOCK = 0, diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index ea8c551bcd7e..91a8889abf9b 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -570,7 +570,16 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, current_page, vec_len, vec_start); len = bio_add_page(bio, page, vec_len, vec_start); - if (len != vec_len) break; + if (len != vec_len) { + mlog(ML_ERROR, "Adding page[%d] to bio failed, " + "page %p, len %d, vec_len %u, vec_start %u, " + "bi_sector %llu\n", current_page, page, len, + vec_len, vec_start, + (unsigned long long)bio->bi_iter.bi_sector); + bio_put(bio); + bio = ERR_PTR(-EIO); + return bio; + } cs += vec_len / (PAGE_SIZE/spp); vec_start = 0; diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index eac5140aac47..e5076185cc1e 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1819,7 +1819,7 @@ int o2net_register_hb_callbacks(void) static int o2net_accept_one(struct socket *sock, int *more) { - int ret, slen; + int ret; struct sockaddr_in sin; struct socket *new_sock = NULL; struct o2nm_node *node = NULL; @@ -1864,9 +1864,7 @@ static int o2net_accept_one(struct socket *sock, int *more) goto out; } - slen = sizeof(sin); - ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, - &slen, 1); + ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, 1); if (ret < 0) goto out; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 977763d4c27d..b048d4fa3959 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -3072,7 +3072,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, * We need to return the correct block within the * cluster which should hold our entry. 
*/ - off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), + off = ocfs2_dx_dir_hash_idx(osb, &lookup->dl_hinfo); get_bh(dx_leaves[off]); lookup->dl_dx_leaf_bh = dx_leaves[off]; diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index fd6bbbbd7d78..39831fc2fd52 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -224,14 +224,12 @@ void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, struct dlm_lock *lock) { dlm_astlockfunc_t *fn; - struct dlm_lockstatus *lksb; mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name, res->lockname.len, res->lockname.name, dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); - lksb = lock->lksb; fn = lock->ast; BUG_ON(lock->ml.node != dlm->node_num); diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index e9f3705c4c9f..d06e27ec4be4 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -140,6 +140,7 @@ struct dlm_ctxt u8 node_num; u32 key; u8 joining_node; + u8 migrate_done; /* set to 1 means node has migrated all lock resources */ wait_queue_head_t dlm_join_events; unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; @@ -960,13 +961,10 @@ static inline int dlm_send_proxy_ast(struct dlm_ctxt *dlm, void dlm_print_one_lock_resource(struct dlm_lock_resource *res); void __dlm_print_one_lock_resource(struct dlm_lock_resource *res); -u8 dlm_nm_this_node(struct dlm_ctxt *dlm); void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); -int dlm_nm_init(struct dlm_ctxt *dlm); -int dlm_heartbeat_init(struct dlm_ctxt *dlm); void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data); void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data); diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index e1fea149f50b..425081be6161 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -461,6 +461,19 @@ redo_bucket: cond_resched_lock(&dlm->spinlock); num += n; } + + if (!num) { + if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) { + mlog(0, "%s: perhaps there are more lock resources " + "need to be migrated after dlm recovery\n", dlm->name); + ret = -EAGAIN; + } else { + mlog(0, "%s: we won't do dlm recovery after migrating " + "all lock resources\n", dlm->name); + dlm->migrate_done = 1; + } + } + spin_unlock(&dlm->spinlock); wake_up(&dlm->dlm_thread_wq); @@ -675,20 +688,6 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm) spin_unlock(&dlm->spinlock); } -int dlm_shutting_down(struct dlm_ctxt *dlm) -{ - int ret = 0; - - spin_lock(&dlm_domain_lock); - - if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN) - ret = 1; - - spin_unlock(&dlm_domain_lock); - - return ret; -} - void dlm_unregister_domain(struct dlm_ctxt *dlm) { int leave = 0; @@ -2052,6 +2051,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN; init_waitqueue_head(&dlm->dlm_join_events); + dlm->migrate_done = 0; + dlm->reco.new_master = O2NM_INVALID_NODE_NUM; dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h index fd6122a38dbd..8a9281411c18 100644 --- a/fs/ocfs2/dlm/dlmdomain.h +++ b/fs/ocfs2/dlm/dlmdomain.h @@ -28,7 +28,30 @@ extern spinlock_t dlm_domain_lock; extern struct list_head dlm_domains; -int dlm_shutting_down(struct dlm_ctxt *dlm); +static inline int 
dlm_joined(struct dlm_ctxt *dlm) +{ + int ret = 0; + + spin_lock(&dlm_domain_lock); + if (dlm->dlm_state == DLM_CTXT_JOINED) + ret = 1; + spin_unlock(&dlm_domain_lock); + + return ret; +} + +static inline int dlm_shutting_down(struct dlm_ctxt *dlm) +{ + int ret = 0; + + spin_lock(&dlm_domain_lock); + if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN) + ret = 1; + spin_unlock(&dlm_domain_lock); + + return ret; +} + void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, int node_num); diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 66c2a491f68d..74962315794e 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -77,8 +77,7 @@ int dlm_init_lock_cache(void) void dlm_destroy_lock_cache(void) { - if (dlm_lock_cache) - kmem_cache_destroy(dlm_lock_cache); + kmem_cache_destroy(dlm_lock_cache); } /* Tell us whether we can grant a new lock request. diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index a7df226f9449..aaca0949fe53 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -414,8 +414,7 @@ int dlm_init_mle_cache(void) void dlm_destroy_mle_cache(void) { - if (dlm_mle_cache) - kmem_cache_destroy(dlm_mle_cache); + kmem_cache_destroy(dlm_mle_cache); } static void dlm_mle_release(struct kref *kref) @@ -472,15 +471,11 @@ bail: void dlm_destroy_master_caches(void) { - if (dlm_lockname_cache) { - kmem_cache_destroy(dlm_lockname_cache); - dlm_lockname_cache = NULL; - } + kmem_cache_destroy(dlm_lockname_cache); + dlm_lockname_cache = NULL; - if (dlm_lockres_cache) { - kmem_cache_destroy(dlm_lockres_cache); - dlm_lockres_cache = NULL; - } + kmem_cache_destroy(dlm_lockres_cache); + dlm_lockres_cache = NULL; } static void dlm_lockres_release(struct kref *kref) @@ -2495,13 +2490,13 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) } /* - * A migrateable resource is one that is : + * A migratable resource is one that is : * 1. locally mastered, and, * 2. zero local locks, and, * 3. one or more non-local locks, or, one or more references * Returns 1 if yes, 0 if not. 
*/ -static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, +static int dlm_is_lockres_migratable(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { enum dlm_lockres_list idx; @@ -2532,7 +2527,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, continue; } cookie = be64_to_cpu(lock->ml.cookie); - mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on " + mlog(0, "%s: Not migratable res %.*s, lock %u:%llu on " "%s list\n", dlm->name, res->lockname.len, res->lockname.name, dlm_get_lock_cookie_node(cookie), @@ -2548,7 +2543,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, return 0; } - mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len, + mlog(0, "%s: res %.*s, Migratable\n", dlm->name, res->lockname.len, res->lockname.name); return 1; @@ -2792,7 +2787,7 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) assert_spin_locked(&dlm->spinlock); spin_lock(&res->spinlock); - if (dlm_is_lockres_migrateable(dlm, res)) + if (dlm_is_lockres_migratable(dlm, res)) target = dlm_pick_migration_target(dlm, res); spin_unlock(&res->spinlock); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index ec8f75813beb..802636d50365 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -62,7 +62,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node); static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node); static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, u8 dead_node); -static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node); +static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm); static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res); static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres, @@ -423,12 +423,11 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm) static void dlm_begin_recovery(struct dlm_ctxt *dlm) { - spin_lock(&dlm->spinlock); + assert_spin_locked(&dlm->spinlock); BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n", dlm->name, dlm->reco.dead_node); dlm->reco.state |= DLM_RECO_STATE_ACTIVE; - spin_unlock(&dlm->spinlock); } static void dlm_end_recovery(struct dlm_ctxt *dlm) @@ -456,6 +455,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) spin_lock(&dlm->spinlock); + if (dlm->migrate_done) { + mlog(0, "%s: no need do recovery after migrating all " + "lock resources\n", dlm->name); + spin_unlock(&dlm->spinlock); + return 0; + } + /* check to see if the new master has died */ if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM && test_bit(dlm->reco.new_master, dlm->recovery_map)) { @@ -490,12 +496,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n", dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.dead_node); - spin_unlock(&dlm->spinlock); /* take write barrier */ /* (stops the list reshuffling thread, proxy ast handling) */ dlm_begin_recovery(dlm); + spin_unlock(&dlm->spinlock); + if (dlm->reco.new_master == dlm->node_num) goto master_here; @@ -739,7 +746,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) } if (destroy) - dlm_destroy_recovery_area(dlm, dead_node); + dlm_destroy_recovery_area(dlm); return status; } @@ -764,7 +771,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) ndata = kzalloc(sizeof(*ndata), GFP_NOFS); if (!ndata) { - dlm_destroy_recovery_area(dlm, 
dead_node); + dlm_destroy_recovery_area(dlm); return -ENOMEM; } ndata->node_num = num; @@ -778,7 +785,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) return 0; } -static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) +static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm) { struct dlm_reco_node_data *ndata, *next; LIST_HEAD(tmplist); @@ -1378,6 +1385,15 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, if (!dlm_grab(dlm)) return -EINVAL; + if (!dlm_joined(dlm)) { + mlog(ML_ERROR, "Domain %s not joined! " + "lockres %.*s, master %u\n", + dlm->name, mres->lockname_len, + mres->lockname, mres->master); + dlm_put(dlm); + return -EINVAL; + } + BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION))); real_master = mres->master; @@ -1807,7 +1823,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, int i, j, bad; struct dlm_lock *lock; u8 from = O2NM_MAX_NODES; - unsigned int added = 0; __be64 c; mlog(0, "running %d locks for this lockres\n", mres->num_locks); @@ -1823,7 +1838,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); dlm_lockres_set_refmap_bit(dlm, res, from); spin_unlock(&res->spinlock); - added++; break; } BUG_ON(ml->highest_blocked != LKM_IVMODE); @@ -1911,7 +1925,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, /* do not alter lock refcount. switching lists. */ list_move_tail(&lock->list, queue); spin_unlock(&res->spinlock); - added++; mlog(0, "just reordered a local lock!\n"); continue; @@ -2037,7 +2050,6 @@ skip_lvb: "setting refmap bit\n", dlm->name, res->lockname.len, res->lockname.name, ml->node); dlm_lockres_set_refmap_bit(dlm, res, ml->node); - added++; } spin_unlock(&res->spinlock); } @@ -2331,13 +2343,6 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, __dlm_dirty_lockres(dlm, res); } -/* if this node is the recovery master, and there are no - * locks for a given lockres owned by this node that are in - * either PR or EX mode, zero out the lvb before requesting. - * - */ - - static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) { struct dlm_lock_resource *res; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 9479f99c2145..97a972efab83 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1756,8 +1756,7 @@ int ocfs2_rw_lock(struct inode *inode, int write) level = write ? DLM_LOCK_EX : DLM_LOCK_PR; - status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0, - 0); + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); if (status < 0) mlog_errno(status); @@ -1796,7 +1795,7 @@ void ocfs2_rw_unlock(struct inode *inode, int write) write ? "EXMODE" : "PRMODE"); if (!ocfs2_mount_local(osb)) - ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); + ocfs2_cluster_unlock(osb, lockres, level); } /* @@ -1816,8 +1815,7 @@ int ocfs2_open_lock(struct inode *inode) lockres = &OCFS2_I(inode)->ip_open_lockres; - status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, - DLM_LOCK_PR, 0, 0); + status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_PR, 0, 0); if (status < 0) mlog_errno(status); @@ -1854,8 +1852,7 @@ int ocfs2_try_open_lock(struct inode *inode, int write) * other nodes and the -EAGAIN will indicate to the caller that * this inode is still in use. 
*/ - status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, - level, DLM_LKF_NOQUEUE, 0); + status = ocfs2_cluster_lock(osb, lockres, level, DLM_LKF_NOQUEUE, 0); out: return status; @@ -1876,11 +1873,9 @@ void ocfs2_open_unlock(struct inode *inode) goto out; if(lockres->l_ro_holders) - ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, - DLM_LOCK_PR); + ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_PR); if(lockres->l_ex_holders) - ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, - DLM_LOCK_EX); + ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX); out: return; @@ -2601,9 +2596,9 @@ void ocfs2_inode_unlock(struct inode *inode, (unsigned long long)OCFS2_I(inode)->ip_blkno, ex ? "EXMODE" : "PRMODE"); - if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) && + if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) - ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); + ocfs2_cluster_unlock(osb, lockres, level); } /* @@ -3537,7 +3532,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb, * On DLM_LKF_VALBLK, fsdlm behaves differently with o2cb. It always * expects DLM_LKF_VALBLK being set if the LKB has LVB, so that * we can recover correctly from node failure. Otherwise, we may get - * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set. + * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set. */ if (!ocfs2_is_o2cb_active() && lockres->l_ops->flags & LOCK_TYPE_USES_LVB) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 5d1784a365a3..6ee94bc23f5b 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -101,7 +101,7 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) struct ocfs2_inode_info *oi = OCFS2_I(inode); trace_ocfs2_file_open(inode, file, file->f_path.dentry, - (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)oi->ip_blkno, file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name, mode); @@ -116,7 +116,7 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) /* Check that the inode hasn't been wiped from disk by another * node. If it hasn't then we're safe as long as we hold the * spin lock until our increment of open count. */ - if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { + if (oi->ip_flags & OCFS2_INODE_DELETED) { spin_unlock(&oi->ip_lock); status = -ENOENT; @@ -190,7 +190,7 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, bool needs_barrier = false; trace_ocfs2_sync_file(inode, file, file->f_path.dentry, - OCFS2_I(inode)->ip_blkno, + oi->ip_blkno, file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name, (unsigned long long)datasync); @@ -296,7 +296,7 @@ int ocfs2_update_inode_atime(struct inode *inode, ocfs2_journal_dirty(handle, bh); out_commit: - ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); + ocfs2_commit_trans(osb, handle); out: return ret; } @@ -2257,7 +2257,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0; - trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, + trace_ocfs2_file_write_iter(inode, file, file->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name, @@ -2405,7 +2405,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; int nowait = iocb->ki_flags & IOCB_NOWAIT ? 
1 : 0; - trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry, + trace_ocfs2_file_read_iter(inode, filp, filp->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, filp->f_path.dentry->d_name.len, filp->f_path.dentry->d_name.name, @@ -2448,7 +2448,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, * * Take and drop the meta data lock to update inode fields * like i_size. This allows the checks down below - * generic_file_aio_read() a chance of actually working. + * generic_file_read_iter() a chance of actually working. */ ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level, !nowait); @@ -2460,7 +2460,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, ocfs2_inode_unlock(inode, lock_level); ret = generic_file_read_iter(iocb, to); - trace_generic_file_aio_read_ret(ret); + trace_generic_file_read_iter_ret(ret); /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(ret == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT)); diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index e87279e49ba3..f65f2b2f594d 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -53,36 +53,6 @@ static const char * const ocfs2_filecheck_errs[] = { "UNSUPPORTED" }; -static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock); -static LIST_HEAD(ocfs2_filecheck_sysfs_list); - -struct ocfs2_filecheck { - struct list_head fc_head; /* File check entry list head */ - spinlock_t fc_lock; - unsigned int fc_max; /* Maximum number of entry in list */ - unsigned int fc_size; /* Current entry count in list */ - unsigned int fc_done; /* Finished entry count in list */ -}; - -struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per mounting */ - struct list_head fs_list; - atomic_t fs_count; - struct super_block *fs_sb; - struct kset *fs_devicekset; - struct kset *fs_fcheckkset; - struct ocfs2_filecheck *fs_fcheck; -}; - -#define OCFS2_FILECHECK_MAXSIZE 100 -#define OCFS2_FILECHECK_MINSIZE 10 - -/* File check operation type */ -enum { - OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */ - OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */ - OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */ -}; - struct ocfs2_filecheck_entry { struct list_head fe_list; unsigned long fe_ino; @@ -110,34 +80,84 @@ ocfs2_filecheck_error(int errno) return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1]; } -static ssize_t ocfs2_filecheck_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf); -static ssize_t ocfs2_filecheck_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count); -static struct kobj_attribute ocfs2_attr_filecheck_chk = +static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +static struct kobj_attribute ocfs2_filecheck_attr_chk = __ATTR(check, S_IRUSR | S_IWUSR, - ocfs2_filecheck_show, - ocfs2_filecheck_store); -static struct kobj_attribute ocfs2_attr_filecheck_fix = + ocfs2_filecheck_attr_show, + ocfs2_filecheck_attr_store); +static struct kobj_attribute ocfs2_filecheck_attr_fix = __ATTR(fix, S_IRUSR | S_IWUSR, - ocfs2_filecheck_show, - ocfs2_filecheck_store); -static struct kobj_attribute ocfs2_attr_filecheck_set = + ocfs2_filecheck_attr_show, + ocfs2_filecheck_attr_store); +static struct kobj_attribute ocfs2_filecheck_attr_set = __ATTR(set, S_IRUSR | S_IWUSR, - ocfs2_filecheck_show, - ocfs2_filecheck_store); + 
ocfs2_filecheck_attr_show, + ocfs2_filecheck_attr_store); +static struct attribute *ocfs2_filecheck_attrs[] = { + &ocfs2_filecheck_attr_chk.attr, + &ocfs2_filecheck_attr_fix.attr, + &ocfs2_filecheck_attr_set.attr, + NULL +}; + +static void ocfs2_filecheck_release(struct kobject *kobj) +{ + struct ocfs2_filecheck_sysfs_entry *entry = container_of(kobj, + struct ocfs2_filecheck_sysfs_entry, fs_kobj); + + complete(&entry->fs_kobj_unregister); +} + +static ssize_t +ocfs2_filecheck_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + ssize_t ret = -EIO; + struct kobj_attribute *kattr = container_of(attr, + struct kobj_attribute, attr); + + kobject_get(kobj); + if (kattr->show) + ret = kattr->show(kobj, kattr, buf); + kobject_put(kobj); + return ret; +} + +static ssize_t +ocfs2_filecheck_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + ssize_t ret = -EIO; + struct kobj_attribute *kattr = container_of(attr, + struct kobj_attribute, attr); + + kobject_get(kobj); + if (kattr->store) + ret = kattr->store(kobj, kattr, buf, count); + kobject_put(kobj); + return ret; +} + +static const struct sysfs_ops ocfs2_filecheck_ops = { + .show = ocfs2_filecheck_show, + .store = ocfs2_filecheck_store, +}; + +static struct kobj_type ocfs2_ktype_filecheck = { + .default_attrs = ocfs2_filecheck_attrs, + .sysfs_ops = &ocfs2_filecheck_ops, + .release = ocfs2_filecheck_release, +}; static void ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry) { struct ocfs2_filecheck_entry *p; - if (!atomic_dec_and_test(&entry->fs_count)) - wait_on_atomic_t(&entry->fs_count, atomic_t_wait, - TASK_UNINTERRUPTIBLE); - spin_lock(&entry->fs_fcheck->fc_lock); while (!list_empty(&entry->fs_fcheck->fc_head)) { p = list_first_entry(&entry->fs_fcheck->fc_head, @@ -148,151 +168,48 @@ ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry) } spin_unlock(&entry->fs_fcheck->fc_lock); - kset_unregister(entry->fs_fcheckkset); - kset_unregister(entry->fs_devicekset); kfree(entry->fs_fcheck); - kfree(entry); -} - -static void -ocfs2_filecheck_sysfs_add(struct ocfs2_filecheck_sysfs_entry *entry) -{ - spin_lock(&ocfs2_filecheck_sysfs_lock); - list_add_tail(&entry->fs_list, &ocfs2_filecheck_sysfs_list); - spin_unlock(&ocfs2_filecheck_sysfs_lock); + entry->fs_fcheck = NULL; } -static int ocfs2_filecheck_sysfs_del(const char *devname) +int ocfs2_filecheck_create_sysfs(struct ocfs2_super *osb) { - struct ocfs2_filecheck_sysfs_entry *p; - - spin_lock(&ocfs2_filecheck_sysfs_lock); - list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) { - if (!strcmp(p->fs_sb->s_id, devname)) { - list_del(&p->fs_list); - spin_unlock(&ocfs2_filecheck_sysfs_lock); - ocfs2_filecheck_sysfs_free(p); - return 0; - } - } - spin_unlock(&ocfs2_filecheck_sysfs_lock); - return 1; -} - -static void -ocfs2_filecheck_sysfs_put(struct ocfs2_filecheck_sysfs_entry *entry) -{ - if (atomic_dec_and_test(&entry->fs_count)) - wake_up_atomic_t(&entry->fs_count); -} - -static struct ocfs2_filecheck_sysfs_entry * -ocfs2_filecheck_sysfs_get(const char *devname) -{ - struct ocfs2_filecheck_sysfs_entry *p = NULL; - - spin_lock(&ocfs2_filecheck_sysfs_lock); - list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) { - if (!strcmp(p->fs_sb->s_id, devname)) { - atomic_inc(&p->fs_count); - spin_unlock(&ocfs2_filecheck_sysfs_lock); - return p; - } - } - spin_unlock(&ocfs2_filecheck_sysfs_lock); - return NULL; -} - -int ocfs2_filecheck_create_sysfs(struct super_block *sb) -{ - int ret = 0; - struct kset 
*device_kset = NULL; - struct kset *fcheck_kset = NULL; - struct ocfs2_filecheck *fcheck = NULL; - struct ocfs2_filecheck_sysfs_entry *entry = NULL; - struct attribute **attrs = NULL; - struct attribute_group attrgp; - - if (!ocfs2_kset) - return -ENOMEM; - - attrs = kmalloc(sizeof(struct attribute *) * 4, GFP_NOFS); - if (!attrs) { - ret = -ENOMEM; - goto error; - } else { - attrs[0] = &ocfs2_attr_filecheck_chk.attr; - attrs[1] = &ocfs2_attr_filecheck_fix.attr; - attrs[2] = &ocfs2_attr_filecheck_set.attr; - attrs[3] = NULL; - memset(&attrgp, 0, sizeof(attrgp)); - attrgp.attrs = attrs; - } + int ret; + struct ocfs2_filecheck *fcheck; + struct ocfs2_filecheck_sysfs_entry *entry = &osb->osb_fc_ent; fcheck = kmalloc(sizeof(struct ocfs2_filecheck), GFP_NOFS); - if (!fcheck) { - ret = -ENOMEM; - goto error; - } else { - INIT_LIST_HEAD(&fcheck->fc_head); - spin_lock_init(&fcheck->fc_lock); - fcheck->fc_max = OCFS2_FILECHECK_MINSIZE; - fcheck->fc_size = 0; - fcheck->fc_done = 0; - } - - if (strlen(sb->s_id) <= 0) { - mlog(ML_ERROR, - "Cannot get device basename when create filecheck sysfs\n"); - ret = -ENODEV; - goto error; - } - - device_kset = kset_create_and_add(sb->s_id, NULL, &ocfs2_kset->kobj); - if (!device_kset) { - ret = -ENOMEM; - goto error; - } + if (!fcheck) + return -ENOMEM; - fcheck_kset = kset_create_and_add("filecheck", NULL, - &device_kset->kobj); - if (!fcheck_kset) { - ret = -ENOMEM; - goto error; + INIT_LIST_HEAD(&fcheck->fc_head); + spin_lock_init(&fcheck->fc_lock); + fcheck->fc_max = OCFS2_FILECHECK_MINSIZE; + fcheck->fc_size = 0; + fcheck->fc_done = 0; + + entry->fs_kobj.kset = osb->osb_dev_kset; + init_completion(&entry->fs_kobj_unregister); + ret = kobject_init_and_add(&entry->fs_kobj, &ocfs2_ktype_filecheck, + NULL, "filecheck"); + if (ret) { + kfree(fcheck); + return ret; } - ret = sysfs_create_group(&fcheck_kset->kobj, &attrgp); - if (ret) - goto error; - - entry = kmalloc(sizeof(struct ocfs2_filecheck_sysfs_entry), GFP_NOFS); - if (!entry) { - ret = -ENOMEM; - goto error; - } else { - atomic_set(&entry->fs_count, 1); - entry->fs_sb = sb; - entry->fs_devicekset = device_kset; - entry->fs_fcheckkset = fcheck_kset; - entry->fs_fcheck = fcheck; - ocfs2_filecheck_sysfs_add(entry); - } - - kfree(attrs); + entry->fs_fcheck = fcheck; return 0; - -error: - kfree(attrs); - kfree(entry); - kfree(fcheck); - kset_unregister(fcheck_kset); - kset_unregister(device_kset); - return ret; } -int ocfs2_filecheck_remove_sysfs(struct super_block *sb) +void ocfs2_filecheck_remove_sysfs(struct ocfs2_super *osb) { - return ocfs2_filecheck_sysfs_del(sb->s_id); + if (!osb->osb_fc_ent.fs_fcheck) + return; + + kobject_del(&osb->osb_fc_ent.fs_kobj); + kobject_put(&osb->osb_fc_ent.fs_kobj); + wait_for_completion(&osb->osb_fc_ent.fs_kobj_unregister); + ocfs2_filecheck_sysfs_free(&osb->osb_fc_ent); } static int @@ -309,7 +226,7 @@ ocfs2_filecheck_adjust_max(struct ocfs2_filecheck_sysfs_entry *ent, spin_lock(&ent->fs_fcheck->fc_lock); if (len < (ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done)) { - mlog(ML_ERROR, + mlog(ML_NOTICE, "Cannot set online file check maximum entry number " "to %u due to too many pending entries(%u)\n", len, ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done); @@ -386,7 +303,7 @@ ocfs2_filecheck_args_parse(const char *name, const char *buf, size_t count, return 0; } -static ssize_t ocfs2_filecheck_show(struct kobject *kobj, +static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -394,19 +311,12 @@ static ssize_t 
ocfs2_filecheck_show(struct kobject *kobj, ssize_t ret = 0, total = 0, remain = PAGE_SIZE; unsigned int type; struct ocfs2_filecheck_entry *p; - struct ocfs2_filecheck_sysfs_entry *ent; + struct ocfs2_filecheck_sysfs_entry *ent = container_of(kobj, + struct ocfs2_filecheck_sysfs_entry, fs_kobj); if (ocfs2_filecheck_type_parse(attr->attr.name, &type)) return -EINVAL; - ent = ocfs2_filecheck_sysfs_get(kobj->parent->name); - if (!ent) { - mlog(ML_ERROR, - "Cannot get the corresponding entry via device basename %s\n", - kobj->name); - return -ENODEV; - } - if (type == OCFS2_FILECHECK_TYPE_SET) { spin_lock(&ent->fs_fcheck->fc_lock); total = snprintf(buf, remain, "%u\n", ent->fs_fcheck->fc_max); @@ -440,11 +350,26 @@ static ssize_t ocfs2_filecheck_show(struct kobject *kobj, spin_unlock(&ent->fs_fcheck->fc_lock); exit: - ocfs2_filecheck_sysfs_put(ent); return total; } -static int +static inline int +ocfs2_filecheck_is_dup_entry(struct ocfs2_filecheck_sysfs_entry *ent, + unsigned long ino) +{ + struct ocfs2_filecheck_entry *p; + + list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) { + if (!p->fe_done) { + if (p->fe_ino == ino) + return 1; + } + } + + return 0; +} + +static inline int ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent) { struct ocfs2_filecheck_entry *p; @@ -483,21 +408,21 @@ static void ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent, struct ocfs2_filecheck_entry *entry) { - entry->fe_done = 1; spin_lock(&ent->fs_fcheck->fc_lock); + entry->fe_done = 1; ent->fs_fcheck->fc_done++; spin_unlock(&ent->fs_fcheck->fc_lock); } static unsigned int -ocfs2_filecheck_handle(struct super_block *sb, +ocfs2_filecheck_handle(struct ocfs2_super *osb, unsigned long ino, unsigned int flags) { unsigned int ret = OCFS2_FILECHECK_ERR_SUCCESS; struct inode *inode = NULL; int rc; - inode = ocfs2_iget(OCFS2_SB(sb), ino, flags, 0); + inode = ocfs2_iget(osb, ino, flags, 0); if (IS_ERR(inode)) { rc = (int)(-(long)inode); if (rc >= OCFS2_FILECHECK_ERR_START && @@ -515,11 +440,14 @@ static void ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent, struct ocfs2_filecheck_entry *entry) { + struct ocfs2_super *osb = container_of(ent, struct ocfs2_super, + osb_fc_ent); + if (entry->fe_type == OCFS2_FILECHECK_TYPE_CHK) - entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb, + entry->fe_status = ocfs2_filecheck_handle(osb, entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_CHK); else if (entry->fe_type == OCFS2_FILECHECK_TYPE_FIX) - entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb, + entry->fe_status = ocfs2_filecheck_handle(osb, entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_FIX); else entry->fe_status = OCFS2_FILECHECK_ERR_UNSUPPORTED; @@ -527,30 +455,21 @@ ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent, ocfs2_filecheck_done_entry(ent, entry); } -static ssize_t ocfs2_filecheck_store(struct kobject *kobj, +static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + ssize_t ret = 0; struct ocfs2_filecheck_args args; struct ocfs2_filecheck_entry *entry; - struct ocfs2_filecheck_sysfs_entry *ent; - ssize_t ret = 0; + struct ocfs2_filecheck_sysfs_entry *ent = container_of(kobj, + struct ocfs2_filecheck_sysfs_entry, fs_kobj); if (count == 0) return count; - if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) { - mlog(ML_ERROR, "Invalid arguments for online file check\n"); + if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) return -EINVAL; - } - - ent = 
ocfs2_filecheck_sysfs_get(kobj->parent->name); - if (!ent) { - mlog(ML_ERROR, - "Cannot get the corresponding entry via device basename %s\n", - kobj->parent->name); - return -ENODEV; - } if (args.fa_type == OCFS2_FILECHECK_TYPE_SET) { ret = ocfs2_filecheck_adjust_max(ent, args.fa_len); @@ -564,13 +483,16 @@ static ssize_t ocfs2_filecheck_store(struct kobject *kobj, } spin_lock(&ent->fs_fcheck->fc_lock); - if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && - (ent->fs_fcheck->fc_done == 0)) { - mlog(ML_ERROR, + if (ocfs2_filecheck_is_dup_entry(ent, args.fa_ino)) { + ret = -EEXIST; + kfree(entry); + } else if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && + (ent->fs_fcheck->fc_done == 0)) { + mlog(ML_NOTICE, "Cannot do more file check " "since file check queue(%u) is full now\n", ent->fs_fcheck->fc_max); - ret = -EBUSY; + ret = -EAGAIN; kfree(entry); } else { if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && @@ -595,6 +517,5 @@ static ssize_t ocfs2_filecheck_store(struct kobject *kobj, ocfs2_filecheck_handle_entry(ent, entry); exit: - ocfs2_filecheck_sysfs_put(ent); return (!ret ? count : ret); } diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h index e5cd002a2c09..6a22ee79e8d0 100644 --- a/fs/ocfs2/filecheck.h +++ b/fs/ocfs2/filecheck.h @@ -43,7 +43,32 @@ enum { #define OCFS2_FILECHECK_ERR_START OCFS2_FILECHECK_ERR_FAILED #define OCFS2_FILECHECK_ERR_END OCFS2_FILECHECK_ERR_UNSUPPORTED -int ocfs2_filecheck_create_sysfs(struct super_block *sb); -int ocfs2_filecheck_remove_sysfs(struct super_block *sb); +struct ocfs2_filecheck { + struct list_head fc_head; /* File check entry list head */ + spinlock_t fc_lock; + unsigned int fc_max; /* Maximum number of entry in list */ + unsigned int fc_size; /* Current entry count in list */ + unsigned int fc_done; /* Finished entry count in list */ +}; + +#define OCFS2_FILECHECK_MAXSIZE 100 +#define OCFS2_FILECHECK_MINSIZE 10 + +/* File check operation type */ +enum { + OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */ + OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */ + OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */ +}; + +struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per partition */ + struct kobject fs_kobj; + struct completion fs_kobj_unregister; + struct ocfs2_filecheck *fs_fcheck; +}; + + +int ocfs2_filecheck_create_sysfs(struct ocfs2_super *osb); +void ocfs2_filecheck_remove_sysfs(struct ocfs2_super *osb); #endif /* FILECHECK_H */ diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index d51b80edd972..ddc3e9470c87 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1135,7 +1135,7 @@ static void ocfs2_clear_inode(struct inode *inode) trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno, inode->i_nlink); - mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, + mlog_bug_on_msg(osb == NULL, "Inode=%lu\n", inode->i_ino); dquot_drop(inode); @@ -1150,7 +1150,7 @@ static void ocfs2_clear_inode(struct inode *inode) ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres); ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres); - ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap, + ocfs2_resv_discard(&osb->osb_la_resmap, &oi->ip_la_data_resv); ocfs2_resv_init_once(&oi->ip_la_data_resv); @@ -1160,7 +1160,7 @@ static void ocfs2_clear_inode(struct inode *inode) * exception here are successfully wiped inodes - their * metadata can now be considered to be part of the system * inodes from which it came. 
*/ - if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED)) + if (!(oi->ip_flags & OCFS2_INODE_DELETED)) ocfs2_checkpoint_inode(inode); mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), @@ -1223,7 +1223,7 @@ static void ocfs2_clear_inode(struct inode *inode) * the journal is flushed before journal shutdown. Thus it is safe to * have inodes get cleaned up after journal shutdown. */ - jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal, + jbd2_journal_release_jbd_inode(osb->journal->j_journal, &oi->ip_jinode); } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index c801eddc4bf3..8dd6f703c819 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -525,7 +525,7 @@ static int __ocfs2_mknod_locked(struct inode *dir, * these are used by the support functions here and in * callers. */ inode->i_ino = ino_from_blkno(osb->sb, fe_blkno); - OCFS2_I(inode)->ip_blkno = fe_blkno; + oi->ip_blkno = fe_blkno; spin_lock(&osb->osb_lock); inode->i_generation = osb->s_next_generation++; spin_unlock(&osb->osb_lock); @@ -1186,8 +1186,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, } trace_ocfs2_double_lock_end( - (unsigned long long)OCFS2_I(inode1)->ip_blkno, - (unsigned long long)OCFS2_I(inode2)->ip_blkno); + (unsigned long long)oi1->ip_blkno, + (unsigned long long)oi2->ip_blkno); bail: if (status) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 6867eef2e06b..4f86ac0027b5 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -50,6 +50,8 @@ #include "reservations.h" +#include "filecheck.h" + /* Caching of metadata buffers */ /* Most user visible OCFS2 inodes will have very few pieces of @@ -472,6 +474,12 @@ struct ocfs2_super * workqueue and schedule on our own. */ struct workqueue_struct *ocfs2_wq; + + /* sysfs directory per partition */ + struct kset *osb_dev_kset; + + /* file check related stuff */ + struct ocfs2_filecheck_sysfs_entry osb_fc_ent; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index e2a11aaece10..2ee76a90ba8f 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -1311,11 +1311,11 @@ DEFINE_OCFS2_FILE_OPS(ocfs2_file_release); DEFINE_OCFS2_FILE_OPS(ocfs2_sync_file); -DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_write); +DEFINE_OCFS2_FILE_OPS(ocfs2_file_write_iter); DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write); -DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_read); +DEFINE_OCFS2_FILE_OPS(ocfs2_file_read_iter); DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file); @@ -1467,7 +1467,7 @@ TRACE_EVENT(ocfs2_prepare_inode_for_write, __entry->saved_pos, __entry->count, __entry->wait) ); -DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret); +DEFINE_OCFS2_INT_EVENT(generic_file_read_iter_ret); /* End of trace events for fs/ocfs2/file.c. 
*/ diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index ab156e35ec00..01c6b3894406 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -573,7 +573,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, BUG_ON(ocfs2_is_refcount_inode(inode)); trace_ocfs2_create_refcount_tree( - (unsigned long long)OCFS2_I(inode)->ip_blkno); + (unsigned long long)oi->ip_blkno); ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); if (ret) { @@ -3359,7 +3359,7 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) unsigned int ext_flags; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { + if (!ocfs2_refcount_tree(osb)) { return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", inode->i_ino); } @@ -3707,7 +3707,7 @@ int ocfs2_add_refcount_flag(struct inode *inode, trace_ocfs2_add_refcount_flag(ref_blocks, credits); if (ref_blocks) { - ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), + ret = ocfs2_reserve_new_metadata_blocks(osb, ref_blocks, &meta_ac); if (ret) { mlog_errno(ret); @@ -4766,8 +4766,8 @@ static int ocfs2_reflink_inodes_lock(struct inode *s_inode, *bh2 = *bh1; trace_ocfs2_double_lock_end( - (unsigned long long)OCFS2_I(inode1)->ip_blkno, - (unsigned long long)OCFS2_I(inode2)->ip_blkno); + (unsigned long long)oi1->ip_blkno, + (unsigned long long)oi2->ip_blkno); return 0; diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index dae9eb7c441e..d2fb97b173da 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -398,7 +398,7 @@ static int ocfs2_control_do_setnode_msg(struct file *file, static int ocfs2_control_do_setversion_msg(struct file *file, struct ocfs2_control_message_setv *msg) - { +{ long major, minor; char *ptr = NULL; struct ocfs2_control_private *p = file->private_data; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index d8f5f6ce99dc..f7c972fbed6a 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -79,8 +79,6 @@ static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res) return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset); } -static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); -static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); static int ocfs2_block_group_fill(handle_t *handle, struct inode *alloc_inode, @@ -387,7 +385,7 @@ static int ocfs2_block_group_fill(handle_t *handle, memset(bg, 0, sb->s_blocksize); strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); - bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); + bg->bg_generation = cpu_to_le32(osb->fs_generation); bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1, osb->s_feature_incompat)); bg->bg_chain = cpu_to_le16(my_chain); @@ -1521,7 +1519,7 @@ static int ocfs2_cluster_group_search(struct inode *inode, OCFS2_I(inode)->ip_clusters, max_bits); } - ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), + ret = ocfs2_block_group_find_clear_bits(osb, group_bh, bits_wanted, max_bits, res); if (ret) @@ -2626,53 +2624,6 @@ int ocfs2_release_clusters(handle_t *handle, _ocfs2_clear_bit); } -static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) -{ - printk("Block Group:\n"); - printk("bg_signature: %s\n", bg->bg_signature); - printk("bg_size: %u\n", bg->bg_size); - printk("bg_bits: %u\n", bg->bg_bits); - printk("bg_free_bits_count: %u\n", 
bg->bg_free_bits_count); - printk("bg_chain: %u\n", bg->bg_chain); - printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation)); - printk("bg_next_group: %llu\n", - (unsigned long long)bg->bg_next_group); - printk("bg_parent_dinode: %llu\n", - (unsigned long long)bg->bg_parent_dinode); - printk("bg_blkno: %llu\n", - (unsigned long long)bg->bg_blkno); -} - -static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe) -{ - int i; - - printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno); - printk("i_signature: %s\n", fe->i_signature); - printk("i_size: %llu\n", - (unsigned long long)fe->i_size); - printk("i_clusters: %u\n", fe->i_clusters); - printk("i_generation: %u\n", - le32_to_cpu(fe->i_generation)); - printk("id1.bitmap1.i_used: %u\n", - le32_to_cpu(fe->id1.bitmap1.i_used)); - printk("id1.bitmap1.i_total: %u\n", - le32_to_cpu(fe->id1.bitmap1.i_total)); - printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg); - printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc); - printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count); - printk("id2.i_chain.cl_next_free_rec: %u\n", - fe->id2.i_chain.cl_next_free_rec); - for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) { - printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i, - fe->id2.i_chain.cl_recs[i].c_free); - printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i, - fe->id2.i_chain.cl_recs[i].c_total); - printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i, - (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno); - } -} - /* * For a given allocation, determine which allocators will need to be * accessed, and lock them, reserving the appropriate number of bits. diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index ffa4952d432b..3415e0b09398 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -423,10 +423,10 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait) ocfs2_schedule_truncate_log_flush(osb, 0); } - if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal, + if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) { if (wait) - jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal, + jbd2_log_wait_commit(osb->journal->j_journal, target); } return 0; @@ -1161,6 +1161,23 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) ocfs2_complete_mount_recovery(osb); + osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL, + &ocfs2_kset->kobj); + if (!osb->osb_dev_kset) { + status = -ENOMEM; + mlog(ML_ERROR, "Unable to create device kset %s.\n", sb->s_id); + goto read_super_error; + } + + /* Create filecheck sysfs related directories/files at + * /sys/fs/ocfs2/<devname>/filecheck */ + if (ocfs2_filecheck_create_sysfs(osb)) { + status = -ENOMEM; + mlog(ML_ERROR, "Unable to create filecheck sysfs directory at " + "/sys/fs/ocfs2/%s/filecheck.\n", sb->s_id); + goto read_super_error; + } + if (ocfs2_mount_local(osb)) snprintf(nodestr, sizeof(nodestr), "local"); else @@ -1199,9 +1216,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) /* Start this when the mount is almost sure of being successful */ ocfs2_orphan_scan_start(osb); - /* Create filecheck sysfile /sys/fs/ocfs2/<devname>/filecheck */ - ocfs2_filecheck_create_sysfs(sb); - return status; read_super_error: @@ -1653,7 +1667,6 @@ static void ocfs2_put_super(struct super_block *sb) ocfs2_sync_blockdev(sb); ocfs2_dismount_volume(sb, 0); - ocfs2_filecheck_remove_sysfs(sb); } static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -1768,12 +1781,9 @@ static 
int ocfs2_initialize_mem_caches(void) NULL); if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep || !ocfs2_qf_chunk_cachep) { - if (ocfs2_inode_cachep) - kmem_cache_destroy(ocfs2_inode_cachep); - if (ocfs2_dquot_cachep) - kmem_cache_destroy(ocfs2_dquot_cachep); - if (ocfs2_qf_chunk_cachep) - kmem_cache_destroy(ocfs2_qf_chunk_cachep); + kmem_cache_destroy(ocfs2_inode_cachep); + kmem_cache_destroy(ocfs2_dquot_cachep); + kmem_cache_destroy(ocfs2_qf_chunk_cachep); return -ENOMEM; } @@ -1787,16 +1797,13 @@ static void ocfs2_free_mem_caches(void) * destroy cache. */ rcu_barrier(); - if (ocfs2_inode_cachep) - kmem_cache_destroy(ocfs2_inode_cachep); + kmem_cache_destroy(ocfs2_inode_cachep); ocfs2_inode_cachep = NULL; - if (ocfs2_dquot_cachep) - kmem_cache_destroy(ocfs2_dquot_cachep); + kmem_cache_destroy(ocfs2_dquot_cachep); ocfs2_dquot_cachep = NULL; - if (ocfs2_qf_chunk_cachep) - kmem_cache_destroy(ocfs2_qf_chunk_cachep); + kmem_cache_destroy(ocfs2_qf_chunk_cachep); ocfs2_qf_chunk_cachep = NULL; } @@ -1899,6 +1906,12 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) osb = OCFS2_SB(sb); BUG_ON(!osb); + /* Remove file check sysfs related directories/files, + * and wait for the pending file check operations */ + ocfs2_filecheck_remove_sysfs(osb); + + kset_unregister(osb->osb_dev_kset); + debugfs_remove(osb->osb_ctxt); /* Orphan scan should be stopped as early as possible */ diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 82e17b076ce7..78f09c76ab3c 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -633,6 +633,5 @@ int __init init_ocfs2_uptodate_cache(void) void exit_ocfs2_uptodate_cache(void) { - if (ocfs2_uptodate_cachep) - kmem_cache_destroy(ocfs2_uptodate_cachep); + kmem_cache_destroy(ocfs2_uptodate_cachep); } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index c261c1dfd374..3a24ce3deb01 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -3564,7 +3564,7 @@ int ocfs2_xattr_set(struct inode *inode, .not_found = -ENODATA, }; - if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + if (!ocfs2_supports_xattr(osb)) return -EOPNOTSUPP; /* diff --git a/fs/open.c b/fs/open.c index 7ea118471dce..c5ee7cd60424 100644 --- a/fs/open.c +++ b/fs/open.c @@ -128,7 +128,7 @@ out: } EXPORT_SYMBOL_GPL(vfs_truncate); -static long do_sys_truncate(const char __user *pathname, loff_t length) +long do_sys_truncate(const char __user *pathname, loff_t length) { unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; @@ -162,7 +162,7 @@ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length } #endif -static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) +long do_sys_ftruncate(unsigned int fd, loff_t length, int small) { struct inode *inode; struct dentry *dentry; @@ -333,7 +333,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) } EXPORT_SYMBOL_GPL(vfs_fallocate); -SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) +int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len) { struct fd f = fdget(fd); int error = -EBADF; @@ -345,12 +345,17 @@ SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) return error; } +SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) +{ + return ksys_fallocate(fd, mode, offset, len); +} + /* * access() needs to use the real uid/gid, not the effective uid/gid. * We do this by temporarily clearing all FS-related capabilities and * switching the fsuid/fsgid around to the real ones. 
*/ -SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) +long do_faccessat(int dfd, const char __user *filename, int mode) { const struct cred *old_cred; struct cred *override_cred; @@ -426,12 +431,17 @@ out: return res; } +SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) +{ + return do_faccessat(dfd, filename, mode); +} + SYSCALL_DEFINE2(access, const char __user *, filename, int, mode) { - return sys_faccessat(AT_FDCWD, filename, mode); + return do_faccessat(AT_FDCWD, filename, mode); } -SYSCALL_DEFINE1(chdir, const char __user *, filename) +int ksys_chdir(const char __user *filename) { struct path path; int error; @@ -457,6 +467,11 @@ out: return error; } +SYSCALL_DEFINE1(chdir, const char __user *, filename) +{ + return ksys_chdir(filename); +} + SYSCALL_DEFINE1(fchdir, unsigned int, fd) { struct fd f = fdget_raw(fd); @@ -479,7 +494,7 @@ out: return error; } -SYSCALL_DEFINE1(chroot, const char __user *, filename) +int ksys_chroot(const char __user *filename) { struct path path; int error; @@ -512,6 +527,11 @@ out: return error; } +SYSCALL_DEFINE1(chroot, const char __user *, filename) +{ + return ksys_chroot(filename); +} + static int chmod_common(const struct path *path, umode_t mode) { struct inode *inode = path->dentry->d_inode; @@ -541,7 +561,7 @@ out_unlock: return error; } -SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) +int ksys_fchmod(unsigned int fd, umode_t mode) { struct fd f = fdget(fd); int err = -EBADF; @@ -554,7 +574,12 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) return err; } -SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode) +SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) +{ + return ksys_fchmod(fd, mode); +} + +int do_fchmodat(int dfd, const char __user *filename, umode_t mode) { struct path path; int error; @@ -572,9 +597,15 @@ retry: return error; } +SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, + umode_t, mode) +{ + return do_fchmodat(dfd, filename, mode); +} + SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) { - return sys_fchmodat(AT_FDCWD, filename, mode); + return do_fchmodat(AT_FDCWD, filename, mode); } static int chown_common(const struct path *path, uid_t user, gid_t group) @@ -619,8 +650,8 @@ retry_deleg: return error; } -SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, - gid_t, group, int, flag) +int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, + int flag) { struct path path; int error = -EINVAL; @@ -651,18 +682,24 @@ out: return error; } +SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, + gid_t, group, int, flag) +{ + return do_fchownat(dfd, filename, user, group, flag); +} + SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group) { - return sys_fchownat(AT_FDCWD, filename, user, group, 0); + return do_fchownat(AT_FDCWD, filename, user, group, 0); } SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group) { - return sys_fchownat(AT_FDCWD, filename, user, group, - AT_SYMLINK_NOFOLLOW); + return do_fchownat(AT_FDCWD, filename, user, group, + AT_SYMLINK_NOFOLLOW); } -SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) +int ksys_fchown(unsigned int fd, uid_t user, gid_t group) { struct fd f = fdget(fd); int error = -EBADF; @@ -682,14 +719,9 @@ out: return error; } -int open_check_o_direct(struct file *f) +SYSCALL_DEFINE3(fchown, unsigned 
int, fd, uid_t, user, gid_t, group) { - /* NB: we're sure to have correct a_ops only after f_op->open */ - if (f->f_flags & O_DIRECT) { - if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) - return -EINVAL; - } - return 0; + return ksys_fchown(fd, user, group); } static int do_dentry_open(struct file *f, @@ -713,7 +745,7 @@ static int do_dentry_open(struct file *f, if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH; f->f_op = &empty_fops; - return 0; + goto done; } if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { @@ -766,7 +798,12 @@ static int do_dentry_open(struct file *f, f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); - +done: + /* NB: we're sure to have correct a_ops only after f_op->open */ + error = -EINVAL; + if ((f->f_flags & O_DIRECT) && + (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)) + goto out_fput; return 0; cleanup_all: @@ -781,6 +818,9 @@ cleanup_file: f->f_path.dentry = NULL; f->f_inode = NULL; return error; +out_fput: + fput(f); + return error; } /** @@ -878,20 +918,14 @@ struct file *dentry_open(const struct path *path, int flags, BUG_ON(!path->mnt); f = get_empty_filp(); - if (!IS_ERR(f)) { - f->f_flags = flags; - error = vfs_open(path, f, cred); - if (!error) { - /* from now on we need fput() to dispose of f */ - error = open_check_o_direct(f); - if (error) { - fput(f); - f = ERR_PTR(error); - } - } else { - put_filp(f); - f = ERR_PTR(error); - } + if (IS_ERR(f)) + return f; + + f->f_flags = flags; + error = vfs_open(path, f, cred); + if (error) { + put_filp(f); + return ERR_PTR(error); } return f; } @@ -1114,7 +1148,7 @@ COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, fla */ SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode) { - return sys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode); + return ksys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode); } #endif @@ -1163,7 +1197,6 @@ SYSCALL_DEFINE1(close, unsigned int, fd) return retval; } -EXPORT_SYMBOL(sys_close); /* * This routine simulates a hangup on the tty, to arrange that users diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index 480ea059a680..10587413b20e 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -9,7 +9,6 @@ #include "orangefs-kernel.h" #include "orangefs-bufmap.h" #include <linux/posix_acl_xattr.h> -#include <linux/fs_struct.h> struct posix_acl *orangefs_get_acl(struct inode *inode, int type) { diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index ea6256d136d1..00fadaf0da8f 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -500,7 +500,7 @@ int orangefs_normalize_to_errno(__s32 error_code) * server. */ } else if (error_code > 0) { - gossip_err("orangefs: error status receieved.\n"); + gossip_err("orangefs: error status received.\n"); gossip_err("orangefs: assuming error code is inverted.\n"); error_code = -error_code; } diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 406e72de88f6..ce6ff5a0a6e4 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -24,6 +24,8 @@ config OVERLAY_FS_REDIRECT_DIR an overlay which has redirects on a kernel that doesn't support this feature will have unexpected results. + If unsure, say N. 
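For readers following the overlayfs Kconfig hunks here: a build-time default such as OVERLAY_FS_REDIRECT_DIR is normally consumed as the initial value of a module parameter, so the setting stays tunable at modprobe time or through /sys/module. A minimal sketch of that idiom, written from memory of the fs/overlayfs/super.c conventions rather than taken from this diff:

#include <linux/module.h>

/* Sketch only: Kconfig supplies the build-time default, the module
 * parameter keeps it overridable at runtime. */
static bool ovl_redirect_dir_def = IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_DIR);
module_param_named(redirect_dir, ovl_redirect_dir_def, bool, 0644);
MODULE_PARM_DESC(redirect_dir,
                 "Default to on or off for the redirect_dir feature");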
+ config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW bool "Overlayfs: follow redirects even if redirects are turned off" default y @@ -32,8 +34,13 @@ config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW Disable this to get a possibly more secure configuration, but that might not be backward compatible with previous kernels. + If backward compatibility is not an issue, then it is safe and + recommended to say N here. + For more information, see Documentation/filesystems/overlayfs.txt + If unsure, say Y. + config OVERLAY_FS_INDEX bool "Overlayfs: turn on inodes index feature by default" depends on OVERLAY_FS @@ -51,6 +58,8 @@ config OVERLAY_FS_INDEX That is, mounting an overlay which has an inodes index on a kernel that doesn't support this feature will have unexpected results. + If unsure, say N. + config OVERLAY_FS_NFS_EXPORT bool "Overlayfs: turn on NFS export feature by default" depends on OVERLAY_FS @@ -72,3 +81,8 @@ config OVERLAY_FS_NFS_EXPORT Note, that the NFS export feature is not backward compatible. That is, mounting an overlay which has a full index on a kernel that doesn't support this feature will have unexpected results. + + Most users should say N here and enable this feature on a case-by- + case basis with the "nfs_export=on" mount option. + + Say N unless you fully understand the consequences. diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index bb94ce9da5c8..87bd4148f4fb 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -19,6 +19,142 @@ #include <linux/ratelimit.h> #include "overlayfs.h" +static int ovl_encode_maybe_copy_up(struct dentry *dentry) +{ + int err; + + if (ovl_dentry_upper(dentry)) + return 0; + + err = ovl_want_write(dentry); + if (!err) { + err = ovl_copy_up(dentry); + ovl_drop_write(dentry); + } + + if (err) { + pr_warn_ratelimited("overlayfs: failed to copy up on encode (%pd2, err=%i)\n", + dentry, err); + } + + return err; +} + +/* + * Before encoding a non-upper directory file handle from real layer N, we need + * to check if it will be possible to reconnect an overlay dentry from the real + * lower decoded dentry. This is done by following the overlay ancestry up to a + * "layer N connected" ancestor and verifying that all parents along the way are + * "layer N connectable". If an ancestor that is NOT "layer N connectable" is + * found, we need to copy up an ancestor, which is "layer N connectable", thus + * making that ancestor "layer N connected". For example: + * + * layer 1: /a + * layer 2: /a/b/c + * + * The overlay dentry /a is NOT "layer 2 connectable", because if dir /a is + * copied up and renamed, upper dir /a will be indexed by lower dir /a from + * layer 1. The dir /a from layer 2 will never be indexed, so the algorithm (*) + * in ovl_lookup_real_ancestor() will not be able to lookup a connected overlay + * dentry from the connected lower dentry /a/b/c. + * + * To avoid this problem on decode time, we need to copy up an ancestor of + * /a/b/c, which is "layer 2 connectable", on encode time. That ancestor is + * /a/b. After copy up (and index) of /a/b, it will become "layer 2 connected" + * and when the time comes to decode the file handle from lower dentry /a/b/c, + * ovl_lookup_real_ancestor() will find the indexed ancestor /a/b and decoding + * a connected overlay dentry will be accomplished. + * + * (*) the algorithm in ovl_lookup_real_ancestor() can be improved to lookup an + * entry /a in the lower layers above layer N and find the indexed dir /a from + * layer 1. 
If that improvement is made, then the check for "layer N connected" + * will need to verify there are no redirects in lower layers above N. In the + * example above, /a will be "layer 2 connectable". However, if layer 2 dir /a + * is a target of a layer 1 redirect, then /a will NOT be "layer 2 connectable": + * + * layer 1: /A (redirect = /a) + * layer 2: /a/b/c + */ + +/* Return the lowest layer for encoding a connectable file handle */ +static int ovl_connectable_layer(struct dentry *dentry) +{ + struct ovl_entry *oe = OVL_E(dentry); + + /* We can get overlay root from root of any layer */ + if (dentry == dentry->d_sb->s_root) + return oe->numlower; + + /* + * If it's an unindexed merge dir, then it's not connectable with any + * lower layer + */ + if (ovl_dentry_upper(dentry) && + !ovl_test_flag(OVL_INDEX, d_inode(dentry))) + return 0; + + /* We can get upper/overlay path from indexed/lower dentry */ + return oe->lowerstack[0].layer->idx; +} + +/* + * @dentry is "connected" if all ancestors up to root or a "connected" ancestor + * have the same uppermost lower layer as the origin's layer. We may need to + * copy up a "connectable" ancestor to make it "connected". A "connected" dentry + * cannot become non "connected", so cache the positive result in dentry flags. + * + * Return the connected origin layer or < 0 on error. + */ +static int ovl_connect_layer(struct dentry *dentry) +{ + struct dentry *next, *parent = NULL; + int origin_layer; + int err = 0; + + if (WARN_ON(dentry == dentry->d_sb->s_root) || + WARN_ON(!ovl_dentry_lower(dentry))) + return -EIO; + + origin_layer = OVL_E(dentry)->lowerstack[0].layer->idx; + if (ovl_dentry_test_flag(OVL_E_CONNECTED, dentry)) + return origin_layer; + + /* Find the topmost origin layer connectable ancestor of @dentry */ + next = dget(dentry); + for (;;) { + parent = dget_parent(next); + if (WARN_ON(parent == next)) { + err = -EIO; + break; + } + + /* + * If @parent is not origin layer connectable, then copy up + * @next which is origin layer connectable and we are done. + */ + if (ovl_connectable_layer(parent) < origin_layer) { + err = ovl_encode_maybe_copy_up(next); + break; + } + + /* If @parent is connected or indexed we are done */ + if (ovl_dentry_test_flag(OVL_E_CONNECTED, parent) || + ovl_test_flag(OVL_INDEX, d_inode(parent))) + break; + + dput(next); + next = parent; + } + + dput(parent); + dput(next); + + if (!err) + ovl_dentry_set_flag(OVL_E_CONNECTED, dentry); + + return err ?: origin_layer; +} + /* * We only need to encode origin if there is a chance that the same object was * encoded pre copy up and then we need to stay consistent with the same @@ -41,73 +177,59 @@ * L = lower file handle * * (*) Connecting an overlay dir from real lower dentry is not always - * possible when there are redirects in lower layers. To mitigate this case, - * we copy up the lower dir first and then encode an upper dir file handle. + * possible when there are redirects in lower layers and non-indexed merge dirs. + * To mitigate those cases, we may copy up the lower dir ancestor before encoding + * a lower dir file handle. + * + * Return 0 for upper file handle, > 0 for lower file handle or < 0 on error. 
*/ -static bool ovl_should_encode_origin(struct dentry *dentry) +static int ovl_check_encode_origin(struct dentry *dentry) { struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + /* Upper file handle for pure upper */ if (!ovl_dentry_lower(dentry)) - return false; + return 0; /* - * Decoding a merge dir, whose origin's parent is under a redirected - * lower dir is not always possible. As a simple aproximation, we do - * not encode lower dir file handles when overlay has multiple lower - * layers and origin is below the topmost lower layer. + * Upper file handle for non-indexed upper. * - * TODO: copy up only the parent that is under redirected lower. + * Root is never indexed, so if there's an upper layer, encode upper for + * root. */ - if (d_is_dir(dentry) && ofs->upper_mnt && - OVL_E(dentry)->lowerstack[0].layer->idx > 1) - return false; - - /* Decoding a non-indexed upper from origin is not implemented */ if (ovl_dentry_upper(dentry) && !ovl_test_flag(OVL_INDEX, d_inode(dentry))) - return false; - - return true; -} - -static int ovl_encode_maybe_copy_up(struct dentry *dentry) -{ - int err; - - if (ovl_dentry_upper(dentry)) return 0; - err = ovl_want_write(dentry); - if (err) - return err; - - err = ovl_copy_up(dentry); + /* + * Decoding a merge dir, whose origin's ancestor is under a redirected + * lower dir or under a non-indexed upper is not always possible. + * ovl_connect_layer() will try to make origin's layer "connected" by + * copying up a "connectable" ancestor. + */ + if (d_is_dir(dentry) && ofs->upper_mnt) + return ovl_connect_layer(dentry); - ovl_drop_write(dentry); - return err; + /* Lower file handle for indexed and non-upper dir/non-dir */ + return 1; } static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen) { - struct dentry *origin = ovl_dentry_lower(dentry); struct ovl_fh *fh = NULL; - int err; + int err, enc_lower; /* - * If we should not encode a lower dir file handle, copy up and encode - * an upper dir file handle. + * Check if we should encode a lower or upper file handle and maybe + * copy up an ancestor to make lower file handle connectable. */ - if (!ovl_should_encode_origin(dentry)) { - err = ovl_encode_maybe_copy_up(dentry); - if (err) - goto fail; - - origin = NULL; - } + err = enc_lower = ovl_check_encode_origin(dentry); + if (enc_lower < 0) + goto fail; - /* Encode an upper or origin file handle */ - fh = ovl_encode_fh(origin ?: ovl_dentry_upper(dentry), !origin); + /* Encode an upper or lower file handle */ + fh = ovl_encode_fh(enc_lower ? ovl_dentry_lower(dentry) : + ovl_dentry_upper(dentry), !enc_lower); err = PTR_ERR(fh); if (IS_ERR(fh)) goto fail; @@ -355,8 +477,8 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb, dput(upper); } - if (!this) - return NULL; + if (IS_ERR_OR_NULL(this)) + return this; if (WARN_ON(ovl_dentry_real_at(this, layer->idx) != real)) { dput(this); @@ -498,7 +620,7 @@ static struct dentry *ovl_lookup_real(struct super_block *sb, if (err == -ECHILD) { this = ovl_lookup_real_ancestor(sb, real, layer); - err = IS_ERR(this) ? PTR_ERR(this) : 0; + err = PTR_ERR_OR_ZERO(this); } if (!err) { dput(connected); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index fcd97b783fa1..3b1bd469accd 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -669,38 +669,59 @@ struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real, return inode; } +/* + * Does overlay inode need to be hashed by lower inode? 
+ */ +static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper, + struct dentry *lower, struct dentry *index) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + /* No, if pure upper */ + if (!lower) + return false; + + /* Yes, if already indexed */ + if (index) + return true; + + /* Yes, if won't be copied up */ + if (!ofs->upper_mnt) + return true; + + /* No, if lower hardlink is or will be broken on copy up */ + if ((upper || !ovl_indexdir(sb)) && + !d_is_dir(lower) && d_inode(lower)->i_nlink > 1) + return false; + + /* No, if non-indexed upper with NFS export */ + if (sb->s_export_op && upper) + return false; + + /* Otherwise, hash by lower inode for fsnotify */ + return true; +} + struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, struct dentry *lowerdentry, struct dentry *index, unsigned int numlower) { - struct ovl_fs *ofs = sb->s_fs_info; struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL; struct inode *inode; - /* Already indexed or could be indexed on copy up? */ - bool indexed = (index || (ovl_indexdir(sb) && !upperdentry)); - struct dentry *origin = indexed ? lowerdentry : NULL; + bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, index); bool is_dir; - if (WARN_ON(upperdentry && indexed && !lowerdentry)) - return ERR_PTR(-EIO); - if (!realinode) realinode = d_inode(lowerdentry); /* - * Copy up origin (lower) may exist for non-indexed non-dir upper, but - * we must not use lower as hash key in that case. - * Hash non-dir that is or could be indexed by origin inode. - * Hash dir that is or could be merged by origin inode. - * Hash pure upper and non-indexed non-dir by upper inode. - * Hash non-indexed dir by upper inode for NFS export. + * Copy up origin (lower) may exist for non-indexed upper, but we must + * not use lower as hash key if this is a broken hardlink. */ is_dir = S_ISDIR(realinode->i_mode); - if (is_dir && (indexed || !sb->s_export_op || !ofs->upper_mnt)) - origin = lowerdentry; - - if (upperdentry || origin) { - struct inode *key = d_inode(origin ?: upperdentry); + if (upperdentry || bylower) { + struct inode *key = d_inode(bylower ? lowerdentry : + upperdentry); unsigned int nlink = is_dir ? 1 : realinode->i_nlink; inode = iget5_locked(sb, (unsigned long) key, @@ -728,6 +749,7 @@ struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, nlink = ovl_get_nlink(lowerdentry, upperdentry, nlink); set_nlink(inode, nlink); } else { + /* Lower hardlink that will be broken on copy up */ inode = new_inode(sb); if (!inode) goto out_nomem; diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index de3e6da1d5a5..70fcfcc684cc 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -913,9 +913,6 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, stack[ctr].layer = lower.layer; ctr++; - if (d.stop) - break; - /* * Following redirects can have security consequences: it's like * a symlink into the lower layer without the permission checks. 
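For context on the ovl_get_inode() changes a few hunks above: once ovl_hash_bylower() has chosen whether the lower or the upper real inode serves as the hash key, the overlay inode is obtained through the standard iget5_locked() idiom, keyed by the real inode's address. Below is a minimal self-contained sketch of that idiom with hypothetical test/set callbacks; the real callbacks live in fs/overlayfs/inode.c and compare against the stacked real inode rather than i_private:

#include <linux/fs.h>

/* Hypothetical callbacks: record and match the key (real) inode */
static int ovl_inode_test_sketch(struct inode *inode, void *data)
{
        return inode->i_private == data;
}

static int ovl_inode_set_sketch(struct inode *inode, void *data)
{
        inode->i_private = data;
        return 0;
}

/* key is the lower inode when hashing by lower, else the upper inode */
static struct inode *ovl_iget_sketch(struct super_block *sb, struct inode *key)
{
        return iget5_locked(sb, (unsigned long) key,
                            ovl_inode_test_sketch, ovl_inode_set_sketch, key);
}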
@@ -933,6 +930,9 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, goto out_put; } + if (d.stop) + break; + if (d.redirect && d.redirect[0] == '/' && poe != roe) { poe = roe; /* Find the current layer on the root dentry */ diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 0df25a9c94bd..225ff1171147 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -40,6 +40,7 @@ enum ovl_inode_flag { enum ovl_entry_flag { OVL_E_UPPER_ALIAS, OVL_E_OPAQUE, + OVL_E_CONNECTED, }; /* diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 9ee37c76091d..7c24619ae7fc 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1359,6 +1359,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) /* Root is always merge -> can have whiteouts */ ovl_set_flag(OVL_WHITEOUTS, d_inode(root_dentry)); + ovl_dentry_set_flag(OVL_E_CONNECTED, root_dentry); ovl_inode_init(d_inode(root_dentry), upperpath.dentry, ovl_dentry_lower(root_dentry)); diff --git a/fs/pipe.c b/fs/pipe.c index 7b1954caf388..39d6f431da83 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -841,7 +841,7 @@ int do_pipe_flags(int *fd, int flags) * sys_pipe() is the normal C calling standard for creating * a pipe. It's not the way Unix traditionally does this, though. */ -SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags) +static int do_pipe2(int __user *fildes, int flags) { struct file *files[2]; int fd[2]; @@ -863,9 +863,14 @@ SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags) return error; } +SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags) +{ + return do_pipe2(fildes, flags); +} + SYSCALL_DEFINE1(pipe, int __user *, fildes) { - return sys_pipe2(fildes, 0); + return do_pipe2(fildes, 0); } static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) diff --git a/fs/proc/base.c b/fs/proc/base.c index 9298324325ed..d53246863cfb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -94,9 +94,6 @@ #include <linux/sched/stat.h> #include <linux/flex_array.h> #include <linux/posix-timers.h> -#ifdef CONFIG_HARDWALL -#include <asm/hardwall.h> -#endif #include <trace/events/oom.h> #include "internal.h" #include "fd.h" @@ -3002,9 +2999,6 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_TASK_IO_ACCOUNTING ONE("io", S_IRUSR, proc_tgid_io_accounting), #endif -#ifdef CONFIG_HARDWALL - ONE("hardwall", S_IRUGO, proc_pid_hardwall), -#endif #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), @@ -3393,9 +3387,6 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_TASK_IO_ACCOUNTING ONE("io", S_IRUSR, proc_tid_io_accounting), #endif -#ifdef CONFIG_HARDWALL - ONE("hardwall", S_IRUGO, proc_pid_hardwall), -#endif #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), diff --git a/fs/quota/compat.c b/fs/quota/compat.c index 779caed4f078..c30572857619 100644 --- a/fs/quota/compat.c +++ b/fs/quota/compat.c @@ -41,8 +41,9 @@ struct compat_fs_quota_stat { __u16 qs_iwarnlimit; }; -asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, - qid_t id, void __user *addr) +COMPAT_SYSCALL_DEFINE4(quotactl32, unsigned int, cmd, + const char __user *, special, qid_t, id, + void __user *, addr) { unsigned int cmds; struct if_dqblk __user *dqblk; @@ -59,7 +60,7 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, case Q_GETQUOTA: 
dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); compat_dqblk = addr; - ret = sys_quotactl(cmd, special, id, dqblk); + ret = kernel_quotactl(cmd, special, id, dqblk); if (ret) break; if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) || @@ -75,12 +76,12 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, get_user(data, &compat_dqblk->dqb_valid) || put_user(data, &dqblk->dqb_valid)) break; - ret = sys_quotactl(cmd, special, id, dqblk); + ret = kernel_quotactl(cmd, special, id, dqblk); break; case Q_XGETQSTAT: fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat)); compat_fsqstat = addr; - ret = sys_quotactl(cmd, special, id, fsqstat); + ret = kernel_quotactl(cmd, special, id, fsqstat); if (ret) break; ret = -EFAULT; @@ -113,7 +114,7 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, ret = 0; break; default: - ret = sys_quotactl(cmd, special, id, addr); + ret = kernel_quotactl(cmd, special, id, addr); } return ret; } diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 43612e2a73af..860bfbe7a07a 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -833,8 +833,8 @@ static struct super_block *quotactl_block(const char __user *special, int cmd) * calls. Maybe we need to add the process quotas etc. in the future, * but we probably should use rlimits for that. */ -SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, - qid_t, id, void __user *, addr) +int kernel_quotactl(unsigned int cmd, const char __user *special, + qid_t id, void __user *addr) { uint cmds, type; struct super_block *sb = NULL; @@ -885,3 +885,9 @@ out: path_put(pathp); return ret; } + +SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, + qid_t, id, void __user *, addr) +{ + return kernel_quotactl(cmd, special, id, addr); +} diff --git a/fs/read_write.c b/fs/read_write.c index f8547b82dfb3..c4eabbfc90df 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -301,7 +301,7 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence) } EXPORT_SYMBOL(vfs_llseek); -SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) +off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence) { off_t retval; struct fd f = fdget_pos(fd); @@ -319,10 +319,15 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) return retval; } +SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) +{ + return ksys_lseek(fd, offset, whence); +} + #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence) { - return sys_lseek(fd, offset, whence); + return ksys_lseek(fd, offset, whence); } #endif @@ -563,7 +568,7 @@ static inline void file_pos_write(struct file *file, loff_t pos) file->f_pos = pos; } -SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) +ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; @@ -578,8 +583,12 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) return ret; } -SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, - size_t, count) +SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) +{ + return ksys_read(fd, buf, count); +} + +ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; @@ -595,8 +604,14 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const 
char __user *, buf, return ret; } -SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, - size_t, count, loff_t, pos) +SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, + size_t, count) +{ + return ksys_write(fd, buf, count); +} + +ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count, + loff_t pos) { struct fd f; ssize_t ret = -EBADF; @@ -615,8 +630,14 @@ SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, return ret; } -SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, - size_t, count, loff_t, pos) +SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, + size_t, count, loff_t, pos) +{ + return ksys_pread64(fd, buf, count, pos); +} + +ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf, + size_t count, loff_t pos) { struct fd f; ssize_t ret = -EBADF; @@ -635,6 +656,12 @@ SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, return ret; } +SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, + size_t, count, loff_t, pos) +{ + return ksys_pwrite64(fd, buf, count, pos); +} + static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { diff --git a/fs/readdir.c b/fs/readdir.c index 1b83b0ad183b..d97f548e6323 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -292,8 +292,8 @@ efault: return -EFAULT; } -SYSCALL_DEFINE3(getdents64, unsigned int, fd, - struct linux_dirent64 __user *, dirent, unsigned int, count) +int ksys_getdents64(unsigned int fd, struct linux_dirent64 __user *dirent, + unsigned int count) { struct fd f; struct linux_dirent64 __user * lastdirent; @@ -326,6 +326,13 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, return error; } + +SYSCALL_DEFINE3(getdents64, unsigned int, fd, + struct linux_dirent64 __user *, dirent, unsigned int, count) +{ + return ksys_getdents64(fd, dirent, count); +} + #ifdef CONFIG_COMPAT struct compat_old_linux_dirent { compat_ulong_t d_ino; diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 48835a659948..ae4811fecc1f 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -1916,7 +1916,7 @@ struct reiserfs_de_head { /* empty directory contains two entries "." and ".." 
and their headers */ #define EMPTY_DIR_SIZE \ -(DEH_SIZE * 2 + ROUND_UP (strlen (".")) + ROUND_UP (strlen (".."))) +(DEH_SIZE * 2 + ROUND_UP (sizeof(".") - 1) + ROUND_UP (sizeof("..") - 1)) /* old format directories have this size when empty */ #define EMPTY_DIR_SIZE_V1 (DEH_SIZE * 2 + 3) diff --git a/fs/select.c b/fs/select.c index b6c36254028a..ba879c51288f 100644 --- a/fs/select.c +++ b/fs/select.c @@ -675,8 +675,8 @@ out_nofds: return ret; } -SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, - fd_set __user *, exp, struct timeval __user *, tvp) +static int kern_select(int n, fd_set __user *inp, fd_set __user *outp, + fd_set __user *exp, struct timeval __user *tvp) { struct timespec64 end_time, *to = NULL; struct timeval tv; @@ -699,6 +699,12 @@ SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, return ret; } +SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, + fd_set __user *, exp, struct timeval __user *, tvp) +{ + return kern_select(n, inp, outp, exp, tvp); +} + static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec __user *tsp, const sigset_t __user *sigmask, size_t sigsetsize) @@ -784,7 +790,7 @@ SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; - return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp); + return kern_select(a.n, a.inp, a.outp, a.exp, a.tvp); } #endif @@ -1259,9 +1265,9 @@ out_nofds: return ret; } -COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, - compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, - struct compat_timeval __user *, tvp) +static int do_compat_select(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + struct compat_timeval __user *tvp) { struct timespec64 end_time, *to = NULL; struct compat_timeval tv; @@ -1284,6 +1290,13 @@ COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, return ret; } +COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, + compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, + struct compat_timeval __user *, tvp) +{ + return do_compat_select(n, inp, outp, exp, tvp); +} + struct compat_sel_arg_struct { compat_ulong_t n; compat_uptr_t inp; @@ -1298,8 +1311,8 @@ COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; - return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), - compat_ptr(a.exp), compat_ptr(a.tvp)); + return do_compat_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), + compat_ptr(a.exp), compat_ptr(a.tvp)); } static long do_compat_pselect(int n, compat_ulong_t __user *inp, diff --git a/fs/signalfd.c b/fs/signalfd.c index 9990957264e3..d2187a813376 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -118,13 +118,22 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno); #endif #ifdef BUS_MCEERR_AO - /* + /* * Other callers might not initialize the si_lsb field, * so check explicitly for the right codes here. 
*/ if (kinfo->si_signo == SIGBUS && - (kinfo->si_code == BUS_MCEERR_AR || - kinfo->si_code == BUS_MCEERR_AO)) + kinfo->si_code == BUS_MCEERR_AO) + err |= __put_user((short) kinfo->si_addr_lsb, + &uinfo->ssi_addr_lsb); +#endif +#ifdef BUS_MCEERR_AR + /* + * Other callers might not initialize the si_lsb field, + * so check explicitly for the right codes here. + */ + if (kinfo->si_signo == SIGBUS && + kinfo->si_code == BUS_MCEERR_AR) err |= __put_user((short) kinfo->si_addr_lsb, &uinfo->ssi_addr_lsb); #endif @@ -247,8 +256,8 @@ static const struct file_operations signalfd_fops = { .llseek = noop_llseek, }; -SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask, - size_t, sizemask, int, flags) +static int do_signalfd4(int ufd, sigset_t __user *user_mask, size_t sizemask, + int flags) { sigset_t sigmask; struct signalfd_ctx *ctx; @@ -301,17 +310,22 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask, return ufd; } +SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask, + size_t, sizemask, int, flags) +{ + return do_signalfd4(ufd, user_mask, sizemask, flags); +} + SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask, size_t, sizemask) { - return sys_signalfd4(ufd, user_mask, sizemask, 0); + return do_signalfd4(ufd, user_mask, sizemask, 0); } #ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd, - const compat_sigset_t __user *,sigmask, - compat_size_t, sigsetsize, - int, flags) +static long do_compat_signalfd4(int ufd, + const compat_sigset_t __user *sigmask, + compat_size_t sigsetsize, int flags) { sigset_t tmp; sigset_t __user *ksigmask; @@ -324,13 +338,21 @@ COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd, if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t))) return -EFAULT; - return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags); + return do_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags); +} + +COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd, + const compat_sigset_t __user *, sigmask, + compat_size_t, sigsetsize, + int, flags) +{ + return do_compat_signalfd4(ufd, sigmask, sigsetsize, flags); } COMPAT_SYSCALL_DEFINE3(signalfd, int, ufd, const compat_sigset_t __user *,sigmask, compat_size_t, sigsetsize) { - return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0); + return do_compat_signalfd4(ufd, sigmask, sigsetsize, 0); } #endif diff --git a/fs/splice.c b/fs/splice.c index 39e2dc01ac12..005d09cf3fa8 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1331,8 +1331,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *uiov, * Currently we punt and implement it as a normal copy, see pipe_to_user(). 
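The churn in fs/quota, fs/read_write.c, fs/readdir.c, fs/select.c and fs/signalfd.c above is one refactor applied repeatedly: each syscall body moves into an in-kernel helper (an exported ksys_*() or a file-local do_*()), the SYSCALL_DEFINEn() stub shrinks to a one-line call, and the compat and legacy entry points call the helper instead of the sys_*() symbol. This looks like preparation for banning in-kernel calls to syscall entry points. A minimal sketch of the shape; do_frobnicate() and its arguments are hypothetical, not from the patch:

/* Hypothetical helper: the real work, callable from anywhere in the kernel. */
static long do_frobnicate(unsigned int fd, unsigned int flags)
{
	return 0;
}

SYSCALL_DEFINE2(frobnicate, unsigned int, fd, unsigned int, flags)
{
	return do_frobnicate(fd, flags);
}

/* Legacy variants reuse the helper rather than calling sys_frobnicate(). */
SYSCALL_DEFINE1(old_frobnicate, unsigned int, fd)
{
	return do_frobnicate(fd, 0);
}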
* */ -SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, - unsigned long, nr_segs, unsigned int, flags) +static long do_vmsplice(int fd, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) { struct fd f; long error; @@ -1358,6 +1358,12 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, return error; } +SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, + unsigned long, nr_segs, unsigned int, flags) +{ + return do_vmsplice(fd, iov, nr_segs, flags); +} + #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32, unsigned int, nr_segs, unsigned int, flags) @@ -1375,7 +1381,7 @@ COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, io put_user(v.iov_len, &iov[i].iov_len)) return -EFAULT; } - return sys_vmsplice(fd, iov, nr_segs, flags); + return do_vmsplice(fd, iov, nr_segs, flags); } #endif diff --git a/fs/stat.c b/fs/stat.c index 873785dae022..f8e6fb2c3657 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -379,8 +379,8 @@ SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf) return error; } -SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, - char __user *, buf, int, bufsiz) +static int do_readlinkat(int dfd, const char __user *pathname, + char __user *buf, int bufsiz) { struct path path; int error; @@ -415,10 +415,16 @@ retry: return error; } +SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, + char __user *, buf, int, bufsiz) +{ + return do_readlinkat(dfd, pathname, buf, bufsiz); +} + SYSCALL_DEFINE3(readlink, const char __user *, path, char __user *, buf, int, bufsiz) { - return sys_readlinkat(AT_FDCWD, path, buf, bufsiz); + return do_readlinkat(AT_FDCWD, path, buf, bufsiz); } diff --git a/fs/sync.c b/fs/sync.c index 6e0a2cbaf6de..b54e0541ad89 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -105,7 +105,7 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg) * just write metadata (such as inodes or bitmaps) to block device page cache * and do not sync it on their own in ->sync_fs(). */ -SYSCALL_DEFINE0(sync) +void ksys_sync(void) { int nowait = 0, wait = 1; @@ -117,6 +117,11 @@ SYSCALL_DEFINE0(sync) iterate_bdevs(fdatawait_one_bdev, NULL); if (unlikely(laptop_mode)) laptop_sync_completion(); +} + +SYSCALL_DEFINE0(sync) +{ + ksys_sync(); return 0; } @@ -187,12 +192,8 @@ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) if (!file->f_op->fsync) return -EINVAL; - if (!datasync && (inode->i_state & I_DIRTY_TIME)) { - spin_lock(&inode->i_lock); - inode->i_state &= ~I_DIRTY_TIME; - spin_unlock(&inode->i_lock); + if (!datasync && (inode->i_state & I_DIRTY_TIME)) mark_inode_dirty_sync(inode); - } return file->f_op->fsync(file, start, end, datasync); } EXPORT_SYMBOL(vfs_fsync_range); @@ -280,8 +281,8 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd) * already-instantiated disk blocks, there are no guarantees here that the data * will be available after a crash. 
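The sync_file_range() warning above deserves a usage note: the call starts or waits on writeback of already-instantiated pages only, with no metadata sync, so it is a throttling tool rather than an integrity one. A hedged userspace sketch using the glibc wrapper, with a flag set chosen for a write-and-wait pattern:

#define _GNU_SOURCE
#include <fcntl.h>

/* Push dirty pages of [off, off+len) and wait; metadata is NOT synced. */
static int flush_range(int fd, off64_t off, off64_t len)
{
	return sync_file_range(fd, off, len,
			       SYNC_FILE_RANGE_WAIT_BEFORE |
			       SYNC_FILE_RANGE_WRITE |
			       SYNC_FILE_RANGE_WAIT_AFTER);
}

The sync_file_range2 stub further down exists because, as the comment in that hunk puts it, not all the world's an i386: some ABIs (ARM, if memory serves) pass 64-bit arguments in aligned register pairs, so the flags argument is moved ahead of the two loff_t values.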
*/ -SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, - unsigned int, flags) +int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes, + unsigned int flags) { int ret; struct fd f; @@ -359,10 +360,16 @@ out: return ret; } +SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, + unsigned int, flags) +{ + return ksys_sync_file_range(fd, offset, nbytes, flags); +} + /* It would be nice if people remember that not all the world's an i386 when they introduce new system calls */ SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags, loff_t, offset, loff_t, nbytes) { - return sys_sync_file_range(fd, offset, nbytes, flags); + return ksys_sync_file_range(fd, offset, nbytes, flags); } diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 8664db25a9a6..215c225b2ca1 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -106,6 +106,7 @@ int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, { return sysfs_do_create_link(kobj, target, name, 0); } +EXPORT_SYMBOL_GPL(sysfs_create_link_nowarn); /** * sysfs_delete_link - remove symlink in object's directory. diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index cf348ba99238..1acb2ff505e6 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1256,7 +1256,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, * Inode length changed, so we have to make sure * @I_DIRTY_DATASYNC is set. */ - __mark_inode_dirty(inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); else mark_inode_dirty_sync(inode); mutex_unlock(&ui->ui_mutex); diff --git a/fs/udf/file.c b/fs/udf/file.c index 356c2bf148a5..cd31e4f6d6da 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -257,12 +257,22 @@ const struct file_operations udf_file_operations = { static int udf_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); + struct super_block *sb = inode->i_sb; int error; error = setattr_prepare(dentry, attr); if (error) return error; + if ((attr->ia_valid & ATTR_UID) && + UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET) && + !uid_eq(attr->ia_uid, UDF_SB(sb)->s_uid)) + return -EPERM; + if ((attr->ia_valid & ATTR_GID) && + UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET) && + !gid_eq(attr->ia_gid, UDF_SB(sb)->s_gid)) + return -EPERM; + if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { error = udf_setsize(inode, attr->ia_size); diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index b6e420c1bfeb..b7a0d4b4bda1 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -104,6 +104,10 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode) } inode_init_owner(inode, dir, mode); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET)) + inode->i_uid = sbi->s_uid; + if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET)) + inode->i_gid = sbi->s_gid; iinfo->i_location.logicalBlockNum = block; iinfo->i_location.partitionReferenceNum = diff --git a/fs/udf/inode.c b/fs/udf/inode.c index c23744d5ae5c..c80765d62f7e 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1275,6 +1275,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode) unsigned int indirections = 0; int bs = inode->i_sb->s_blocksize; int ret = -EIO; + uint32_t uid, gid; reread: if (iloc->partitionReferenceNum >= sbi->s_partitions) { @@ -1400,17 +1401,19 @@ reread: ret = -EIO; read_lock(&sbi->s_cred_lock); - i_uid_write(inode, le32_to_cpu(fe->uid)); - if (!uid_valid(inode->i_uid) || - UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) || + uid = le32_to_cpu(fe->uid); + if (uid == 
UDF_INVALID_ID || UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_SET)) - inode->i_uid = UDF_SB(inode->i_sb)->s_uid; + inode->i_uid = sbi->s_uid; + else + i_uid_write(inode, uid); - i_gid_write(inode, le32_to_cpu(fe->gid)); - if (!gid_valid(inode->i_gid) || - UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_IGNORE) || + gid = le32_to_cpu(fe->gid); + if (gid == UDF_INVALID_ID || UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET)) - inode->i_gid = UDF_SB(inode->i_sb)->s_gid; + inode->i_gid = sbi->s_gid; + else + i_gid_write(inode, gid); if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY && sbi->s_fmode != UDF_INVALID_MODE) @@ -1655,12 +1658,12 @@ static int udf_update_inode(struct inode *inode, int do_sync) } if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) - fe->uid = cpu_to_le32(-1); + fe->uid = cpu_to_le32(UDF_INVALID_ID); else fe->uid = cpu_to_le32(i_uid_read(inode)); if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_FORGET)) - fe->gid = cpu_to_le32(-1); + fe->gid = cpu_to_le32(UDF_INVALID_ID); else fe->gid = cpu_to_le32(i_gid_read(inode)); diff --git a/fs/udf/super.c b/fs/udf/super.c index f73239a9a97d..7949c338efa5 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -64,14 +64,13 @@ #include <linux/init.h> #include <linux/uaccess.h> -#define VDS_POS_PRIMARY_VOL_DESC 0 -#define VDS_POS_UNALLOC_SPACE_DESC 1 -#define VDS_POS_LOGICAL_VOL_DESC 2 -#define VDS_POS_PARTITION_DESC 3 -#define VDS_POS_IMP_USE_VOL_DESC 4 -#define VDS_POS_VOL_DESC_PTR 5 -#define VDS_POS_TERMINATING_DESC 6 -#define VDS_POS_LENGTH 7 +enum { + VDS_POS_PRIMARY_VOL_DESC, + VDS_POS_UNALLOC_SPACE_DESC, + VDS_POS_LOGICAL_VOL_DESC, + VDS_POS_IMP_USE_VOL_DESC, + VDS_POS_LENGTH +}; #define VSD_FIRST_SECTOR_OFFSET 32768 #define VSD_MAX_SECTOR_OFFSET 0x800000 @@ -223,10 +222,6 @@ struct udf_options { unsigned int session; unsigned int lastblock; unsigned int anchor; - unsigned int volume; - unsigned short partition; - unsigned int fileset; - unsigned int rootdir; unsigned int flags; umode_t umask; kgid_t gid; @@ -349,12 +344,8 @@ static int udf_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",shortad"); if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_FORGET)) seq_puts(seq, ",uid=forget"); - if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_IGNORE)) - seq_puts(seq, ",uid=ignore"); if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_FORGET)) seq_puts(seq, ",gid=forget"); - if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_IGNORE)) - seq_puts(seq, ",gid=ignore"); if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET)) seq_printf(seq, ",uid=%u", from_kuid(&init_user_ns, sbi->s_uid)); if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET)) @@ -371,10 +362,6 @@ static int udf_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",lastblock=%u", sbi->s_last_block); if (sbi->s_anchor != 0) seq_printf(seq, ",anchor=%u", sbi->s_anchor); - /* - * volume, partition, fileset and rootdir seem to be ignored - * currently - */ if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) seq_puts(seq, ",utf8"); if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP) && sbi->s_nls_map) @@ -487,14 +474,9 @@ static int udf_parse_options(char *options, struct udf_options *uopt, int option; uopt->novrs = 0; - uopt->partition = 0xFFFF; uopt->session = 0xFFFFFFFF; uopt->lastblock = 0; uopt->anchor = 0; - uopt->volume = 0xFFFFFFFF; - uopt->rootdir = 0xFFFFFFFF; - uopt->fileset = 0xFFFFFFFF; - uopt->nls_map = NULL; if (!options) return 1; @@ -582,42 +564,30 @@ static int udf_parse_options(char *options, struct udf_options *uopt, uopt->anchor = option; break; case Opt_volume: - if (match_int(args, &option)) - return 0; - uopt->volume = option; - 
break; case Opt_partition: - if (match_int(args, &option)) - return 0; - uopt->partition = option; - break; case Opt_fileset: - if (match_int(args, &option)) - return 0; - uopt->fileset = option; - break; case Opt_rootdir: - if (match_int(args, &option)) - return 0; - uopt->rootdir = option; + /* Ignored (never implemented properly) */ break; case Opt_utf8: uopt->flags |= (1 << UDF_FLAG_UTF8); break; #ifdef CONFIG_UDF_NLS case Opt_iocharset: - uopt->nls_map = load_nls(args[0].from); - uopt->flags |= (1 << UDF_FLAG_NLS_MAP); + if (!remount) { + if (uopt->nls_map) + unload_nls(uopt->nls_map); + uopt->nls_map = load_nls(args[0].from); + uopt->flags |= (1 << UDF_FLAG_NLS_MAP); + } break; #endif - case Opt_uignore: - uopt->flags |= (1 << UDF_FLAG_UID_IGNORE); - break; case Opt_uforget: uopt->flags |= (1 << UDF_FLAG_UID_FORGET); break; + case Opt_uignore: case Opt_gignore: - uopt->flags |= (1 << UDF_FLAG_GID_IGNORE); + /* These options are superseded by uid=<number> */ break; case Opt_gforget: uopt->flags |= (1 << UDF_FLAG_GID_FORGET); break; @@ -660,6 +630,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) uopt.umask = sbi->s_umask; uopt.fmode = sbi->s_fmode; uopt.dmode = sbi->s_dmode; + uopt.nls_map = NULL; if (!udf_parse_options(options, &uopt, true)) return -EINVAL; @@ -1592,6 +1563,60 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ sbi->s_lvid_bh = NULL; } +/* + * Step for reallocation of table of partition descriptor sequence numbers. + * Must be power of 2. + */ +#define PART_DESC_ALLOC_STEP 32 + +struct desc_seq_scan_data { + struct udf_vds_record vds[VDS_POS_LENGTH]; + unsigned int size_part_descs; + struct udf_vds_record *part_descs_loc; +}; + +static struct udf_vds_record *handle_partition_descriptor( + struct buffer_head *bh, + struct desc_seq_scan_data *data) +{ + struct partitionDesc *desc = (struct partitionDesc *)bh->b_data; + int partnum; + + partnum = le16_to_cpu(desc->partitionNumber); + if (partnum >= data->size_part_descs) { + struct udf_vds_record *new_loc; + unsigned int new_size = ALIGN(partnum, PART_DESC_ALLOC_STEP); + + new_loc = kzalloc(sizeof(*new_loc) * new_size, GFP_KERNEL); + if (!new_loc) + return ERR_PTR(-ENOMEM); + memcpy(new_loc, data->part_descs_loc, + data->size_part_descs * sizeof(*new_loc)); + kfree(data->part_descs_loc); + data->part_descs_loc = new_loc; + data->size_part_descs = new_size; + } + return &(data->part_descs_loc[partnum]); +} + + +static struct udf_vds_record *get_volume_descriptor_record(uint16_t ident, + struct buffer_head *bh, struct desc_seq_scan_data *data) +{ + switch (ident) { + case TAG_IDENT_PVD: /* ISO 13346 3/10.1 */ + return &(data->vds[VDS_POS_PRIMARY_VOL_DESC]); + case TAG_IDENT_IUVD: /* ISO 13346 3/10.4 */ + return &(data->vds[VDS_POS_IMP_USE_VOL_DESC]); + case TAG_IDENT_LVD: /* ISO 13346 3/10.6 */ + return &(data->vds[VDS_POS_LOGICAL_VOL_DESC]); + case TAG_IDENT_USD: /* ISO 13346 3/10.8 */ + return &(data->vds[VDS_POS_UNALLOC_SPACE_DESC]); + case TAG_IDENT_PD: /* ISO 13346 3/10.5 */ + return handle_partition_descriptor(bh, data); + } + return NULL; } /* * Process a main/reserve volume descriptor sequence.
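handle_partition_descriptor() above grows the per-partition record table on demand: round the required index up to a PART_DESC_ALLOC_STEP boundary, allocate a zeroed replacement, copy the old entries across, free the old table. A generic sketch of the same pattern follows, with hypothetical names throughout; it deviates from the patch in two flagged places — rounding up from idx + 1 sidesteps the case where ALIGN(partnum, step) returns partnum itself when partnum is an exact multiple of the step, and kcalloc() adds overflow checking to the count * size multiplication that the patch open-codes:

/* Grow-on-demand table; struct scan_state, struct record and ALLOC_STEP
 * are hypothetical. */
static int ensure_slot(struct scan_state *st, unsigned int idx)
{
	struct record *grown;
	unsigned int new_size;

	if (idx < st->size)
		return 0;
	/* idx + 1, so an idx that is a multiple of ALLOC_STEP still fits */
	new_size = ALIGN(idx + 1, ALLOC_STEP);
	grown = kcalloc(new_size, sizeof(*grown), GFP_KERNEL);
	if (!grown)
		return -ENOMEM;
	memcpy(grown, st->slots, st->size * sizeof(*grown));
	kfree(st->slots);
	st->slots = grown;
	st->size = new_size;
	return 0;
}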
@@ -1608,18 +1633,23 @@ static noinline int udf_process_sequence( struct kernel_lb_addr *fileset) { struct buffer_head *bh = NULL; - struct udf_vds_record vds[VDS_POS_LENGTH]; struct udf_vds_record *curr; struct generic_desc *gd; struct volDescPtr *vdp; bool done = false; uint32_t vdsn; uint16_t ident; - long next_s = 0, next_e = 0; int ret; unsigned int indirections = 0; - - memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH); + struct desc_seq_scan_data data; + unsigned int i; + + memset(data.vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH); + data.size_part_descs = PART_DESC_ALLOC_STEP; + data.part_descs_loc = kzalloc(sizeof(*data.part_descs_loc) * + data.size_part_descs, GFP_KERNEL); + if (!data.part_descs_loc) + return -ENOMEM; /* * Read the main descriptor sequence and find which descriptors @@ -1628,79 +1658,51 @@ static noinline int udf_process_sequence( for (; (!done && block <= lastblock); block++) { bh = udf_read_tagged(sb, block, block, &ident); - if (!bh) { - udf_err(sb, - "Block %llu of volume descriptor sequence is corrupted or we could not read it\n", - (unsigned long long)block); - return -EAGAIN; - } + if (!bh) + break; /* Process each descriptor (ISO 13346 3/8.3-8.4) */ gd = (struct generic_desc *)bh->b_data; vdsn = le32_to_cpu(gd->volDescSeqNum); switch (ident) { - case TAG_IDENT_PVD: /* ISO 13346 3/10.1 */ - curr = &vds[VDS_POS_PRIMARY_VOL_DESC]; - if (vdsn >= curr->volDescSeqNum) { - curr->volDescSeqNum = vdsn; - curr->block = block; - } - break; case TAG_IDENT_VDP: /* ISO 13346 3/10.3 */ - curr = &vds[VDS_POS_VOL_DESC_PTR]; - if (vdsn >= curr->volDescSeqNum) { - curr->volDescSeqNum = vdsn; - curr->block = block; - - vdp = (struct volDescPtr *)bh->b_data; - next_s = le32_to_cpu( - vdp->nextVolDescSeqExt.extLocation); - next_e = le32_to_cpu( - vdp->nextVolDescSeqExt.extLength); - next_e = next_e >> sb->s_blocksize_bits; - next_e += next_s; + if (++indirections > UDF_MAX_TD_NESTING) { + udf_err(sb, "too many Volume Descriptor " + "Pointers (max %u supported)\n", + UDF_MAX_TD_NESTING); + brelse(bh); + return -EIO; } + + vdp = (struct volDescPtr *)bh->b_data; + block = le32_to_cpu(vdp->nextVolDescSeqExt.extLocation); + lastblock = le32_to_cpu( + vdp->nextVolDescSeqExt.extLength) >> + sb->s_blocksize_bits; + lastblock += block - 1; + /* For loop is going to increment 'block' again */ + block--; break; + case TAG_IDENT_PVD: /* ISO 13346 3/10.1 */ case TAG_IDENT_IUVD: /* ISO 13346 3/10.4 */ - curr = &vds[VDS_POS_IMP_USE_VOL_DESC]; - if (vdsn >= curr->volDescSeqNum) { - curr->volDescSeqNum = vdsn; - curr->block = block; - } - break; - case TAG_IDENT_PD: /* ISO 13346 3/10.5 */ - curr = &vds[VDS_POS_PARTITION_DESC]; - if (!curr->block) - curr->block = block; - break; case TAG_IDENT_LVD: /* ISO 13346 3/10.6 */ - curr = &vds[VDS_POS_LOGICAL_VOL_DESC]; - if (vdsn >= curr->volDescSeqNum) { - curr->volDescSeqNum = vdsn; - curr->block = block; - } - break; case TAG_IDENT_USD: /* ISO 13346 3/10.8 */ - curr = &vds[VDS_POS_UNALLOC_SPACE_DESC]; + case TAG_IDENT_PD: /* ISO 13346 3/10.5 */ + curr = get_volume_descriptor_record(ident, bh, &data); + if (IS_ERR(curr)) { + brelse(bh); + return PTR_ERR(curr); + } + /* Descriptor we don't care about? 
*/ + if (!curr) + break; if (vdsn >= curr->volDescSeqNum) { curr->volDescSeqNum = vdsn; curr->block = block; } break; case TAG_IDENT_TD: /* ISO 13346 3/10.9 */ - if (++indirections > UDF_MAX_TD_NESTING) { - udf_err(sb, "too many TDs (max %u supported)\n", UDF_MAX_TD_NESTING); - brelse(bh); - return -EIO; - } - - vds[VDS_POS_TERMINATING_DESC].block = block; - if (next_e) { - block = next_s; - lastblock = next_e; - next_s = next_e = 0; - } else - done = true; + done = true; break; } brelse(bh); @@ -1709,31 +1711,27 @@ static noinline int udf_process_sequence( * Now read interesting descriptors again and process them * in a suitable order */ - if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) { + if (!data.vds[VDS_POS_PRIMARY_VOL_DESC].block) { udf_err(sb, "Primary Volume Descriptor not found!\n"); return -EAGAIN; } - ret = udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block); + ret = udf_load_pvoldesc(sb, data.vds[VDS_POS_PRIMARY_VOL_DESC].block); if (ret < 0) return ret; - if (vds[VDS_POS_LOGICAL_VOL_DESC].block) { + if (data.vds[VDS_POS_LOGICAL_VOL_DESC].block) { ret = udf_load_logicalvol(sb, - vds[VDS_POS_LOGICAL_VOL_DESC].block, - fileset); + data.vds[VDS_POS_LOGICAL_VOL_DESC].block, + fileset); if (ret < 0) return ret; } - if (vds[VDS_POS_PARTITION_DESC].block) { - /* - * We rescan the whole descriptor sequence to find - * partition descriptor blocks and process them. - */ - for (block = vds[VDS_POS_PARTITION_DESC].block; - block < vds[VDS_POS_TERMINATING_DESC].block; - block++) { - ret = udf_load_partdesc(sb, block); + /* Now handle prevailing Partition Descriptors */ + for (i = 0; i < data.size_part_descs; i++) { + if (data.part_descs_loc[i].block) { + ret = udf_load_partdesc(sb, + data.part_descs_loc[i].block); if (ret < 0) return ret; } @@ -1760,13 +1758,13 @@ static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh, main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation); main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength); main_e = main_e >> sb->s_blocksize_bits; - main_e += main_s; + main_e += main_s - 1; /* Locate the reserve sequence */ reserve_s = le32_to_cpu(anchor->reserveVolDescSeqExt.extLocation); reserve_e = le32_to_cpu(anchor->reserveVolDescSeqExt.extLength); reserve_e = reserve_e >> sb->s_blocksize_bits; - reserve_e += reserve_s; + reserve_e += reserve_s - 1; /* Process the main & reserve sequences */ /* responsible for finding the PartitionDesc(s) */ @@ -1994,7 +1992,10 @@ static void udf_open_lvid(struct super_block *sb) lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; ktime_get_real_ts(&ts); udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts); - lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN); + if (le32_to_cpu(lvid->integrityType) == LVID_INTEGRITY_TYPE_CLOSE) + lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN); + else + UDF_SET_FLAG(sb, UDF_FLAG_INCONSISTENT); lvid->descTag.descCRC = cpu_to_le16( crc_itu_t(0, (char *)lvid + sizeof(struct tag), @@ -2034,7 +2035,8 @@ static void udf_close_lvid(struct super_block *sb) lvidiu->minUDFReadRev = cpu_to_le16(sbi->s_udfrev); if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFWriteRev)) lvidiu->minUDFWriteRev = cpu_to_le16(sbi->s_udfrev); - lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE); + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_INCONSISTENT)) + lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE); lvid->descTag.descCRC = cpu_to_le16( crc_itu_t(0, (char *)lvid + sizeof(struct tag), @@ -2091,11 +2093,13 @@ static int udf_fill_super(struct super_block *sb, void 
*options, int silent) bool lvid_open = false; uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); - uopt.uid = INVALID_UID; - uopt.gid = INVALID_GID; + /* By default we'll use overflow[ug]id when UDF inode [ug]id == -1 */ + uopt.uid = make_kuid(current_user_ns(), overflowuid); + uopt.gid = make_kgid(current_user_ns(), overflowgid); uopt.umask = 0; uopt.fmode = UDF_INVALID_MODE; uopt.dmode = UDF_INVALID_MODE; + uopt.nls_map = NULL; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) @@ -2276,8 +2280,8 @@ error_out: iput(sbi->s_vat_inode); parse_options_failure: #ifdef CONFIG_UDF_NLS - if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) - unload_nls(sbi->s_nls_map); + if (uopt.nls_map) + unload_nls(uopt.nls_map); #endif if (lvid_open) udf_close_lvid(sb); diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 68c9f1d618f5..9dd3e1b9619e 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -23,14 +23,13 @@ #define UDF_FLAG_NLS_MAP 9 #define UDF_FLAG_UTF8 10 #define UDF_FLAG_UID_FORGET 11 /* save -1 for uid to disk */ -#define UDF_FLAG_UID_IGNORE 12 /* use sb uid instead of on disk uid */ -#define UDF_FLAG_GID_FORGET 13 -#define UDF_FLAG_GID_IGNORE 14 -#define UDF_FLAG_UID_SET 15 -#define UDF_FLAG_GID_SET 16 -#define UDF_FLAG_SESSION_SET 17 -#define UDF_FLAG_LASTBLOCK_SET 18 -#define UDF_FLAG_BLOCKSIZE_SET 19 +#define UDF_FLAG_GID_FORGET 12 +#define UDF_FLAG_UID_SET 13 +#define UDF_FLAG_GID_SET 14 +#define UDF_FLAG_SESSION_SET 15 +#define UDF_FLAG_LASTBLOCK_SET 16 +#define UDF_FLAG_BLOCKSIZE_SET 17 +#define UDF_FLAG_INCONSISTENT 18 #define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001 #define UDF_PART_FLAG_UNALLOC_TABLE 0x0002 diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index f5e0fe78979e..68e8a64d22e0 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -48,6 +48,8 @@ extern __printf(3, 4) void _udf_warn(struct super_block *sb, #define UDF_EXTENT_LENGTH_MASK 0x3FFFFFFF #define UDF_EXTENT_FLAG_MASK 0xC0000000 +#define UDF_INVALID_ID ((uint32_t)-1) + #define UDF_NAME_PAD 4 #define UDF_NAME_LEN 254 #define UDF_NAME_LEN_CS0 255 diff --git a/fs/utimes.c b/fs/utimes.c index e4b3d7c2c9f5..69d4b6ba1bfb 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -184,8 +184,8 @@ SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename, return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags); } -SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename, - struct timeval __user *, utimes) +static long do_futimesat(int dfd, const char __user *filename, + struct timeval __user *utimes) { struct timeval times[2]; struct timespec64 tstimes[2]; @@ -212,10 +212,17 @@ SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename, return do_utimes(dfd, filename, utimes ? tstimes : NULL, 0); } + +SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename, + struct timeval __user *, utimes) +{ + return do_futimesat(dfd, filename, utimes); +} + SYSCALL_DEFINE2(utimes, char __user *, filename, struct timeval __user *, utimes) { - return sys_futimesat(AT_FDCWD, filename, utimes); + return do_futimesat(AT_FDCWD, filename, utimes); } #ifdef CONFIG_COMPAT @@ -253,7 +260,8 @@ COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filena return do_utimes(dfd, filename, t ? 
tv : NULL, flags); } -COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t) +static long do_compat_futimesat(unsigned int dfd, const char __user *filename, + struct compat_timeval __user *t) { struct timespec64 tv[2]; @@ -272,8 +280,15 @@ COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filena return do_utimes(dfd, filename, t ? tv : NULL, 0); } +COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, + const char __user *, filename, + struct compat_timeval __user *, t) +{ + return do_compat_futimesat(dfd, filename, t); +} + COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t) { - return compat_sys_futimesat(AT_FDCWD, filename, t); + return do_compat_futimesat(AT_FDCWD, filename, t); } #endif diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 393b6849aeb3..7bace03dc9dc 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -46,13 +46,13 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) } void * -kmem_zalloc_large(size_t size, xfs_km_flags_t flags) +kmem_alloc_large(size_t size, xfs_km_flags_t flags) { unsigned nofs_flag = 0; void *ptr; gfp_t lflags; - ptr = kmem_zalloc(size, flags | KM_MAYFAIL); + ptr = kmem_alloc(size, flags | KM_MAYFAIL); if (ptr) return ptr; @@ -67,7 +67,7 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) nofs_flag = memalloc_nofs_save(); lflags = kmem_flags_convert(flags); - ptr = __vmalloc(size, lflags | __GFP_ZERO, PAGE_KERNEL); + ptr = __vmalloc(size, lflags, PAGE_KERNEL); if (flags & KM_NOFS) memalloc_nofs_restore(nofs_flag); diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 4b87472f35bc..6023b594ead7 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -71,7 +71,7 @@ kmem_flags_convert(xfs_km_flags_t flags) } extern void *kmem_alloc(size_t, xfs_km_flags_t); -extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t); +extern void *kmem_alloc_large(size_t size, xfs_km_flags_t); extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t); static inline void kmem_free(const void *ptr) { @@ -85,6 +85,12 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags) return kmem_alloc(size, flags | KM_ZERO); } +static inline void * +kmem_zalloc_large(size_t size, xfs_km_flags_t flags) +{ + return kmem_alloc_large(size, flags | KM_ZERO); +} + /* * Zone interfaces */ diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 2291f4224e24..03885a968de8 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -95,13 +95,13 @@ xfs_ag_resv_critical( switch (type) { case XFS_AG_RESV_METADATA: - avail = pag->pagf_freeblks - pag->pag_agfl_resv.ar_reserved; + avail = pag->pagf_freeblks - pag->pag_rmapbt_resv.ar_reserved; orig = pag->pag_meta_resv.ar_asked; break; - case XFS_AG_RESV_AGFL: + case XFS_AG_RESV_RMAPBT: avail = pag->pagf_freeblks + pag->pagf_flcount - pag->pag_meta_resv.ar_reserved; - orig = pag->pag_agfl_resv.ar_asked; + orig = pag->pag_rmapbt_resv.ar_asked; break; default: ASSERT(0); @@ -126,10 +126,10 @@ xfs_ag_resv_needed( { xfs_extlen_t len; - len = pag->pag_meta_resv.ar_reserved + pag->pag_agfl_resv.ar_reserved; + len = pag->pag_meta_resv.ar_reserved + pag->pag_rmapbt_resv.ar_reserved; switch (type) { case XFS_AG_RESV_METADATA: - case XFS_AG_RESV_AGFL: + case XFS_AG_RESV_RMAPBT: len -= xfs_perag_resv(pag, type)->ar_reserved; break; case XFS_AG_RESV_NONE: @@ -160,10 +160,11 @@ __xfs_ag_resv_free( if (pag->pag_agno == 0) pag->pag_mount->m_ag_max_usable += resv->ar_asked; /* - * AGFL blocks are always considered 
"free", so whatever - * was reserved at mount time must be given back at umount. + * RMAPBT blocks come from the AGFL and AGFL blocks are always + * considered "free", so whatever was reserved at mount time must be + * given back at umount. */ - if (type == XFS_AG_RESV_AGFL) + if (type == XFS_AG_RESV_RMAPBT) oldresv = resv->ar_orig_reserved; else oldresv = resv->ar_reserved; @@ -185,7 +186,7 @@ xfs_ag_resv_free( int error; int err2; - error = __xfs_ag_resv_free(pag, XFS_AG_RESV_AGFL); + error = __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT); err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA); if (err2 && !error) error = err2; @@ -284,15 +285,15 @@ xfs_ag_resv_init( } } - /* Create the AGFL metadata reservation */ - if (pag->pag_agfl_resv.ar_asked == 0) { + /* Create the RMAPBT metadata reservation */ + if (pag->pag_rmapbt_resv.ar_asked == 0) { ask = used = 0; error = xfs_rmapbt_calc_reserves(mp, agno, &ask, &used); if (error) goto out; - error = __xfs_ag_resv_init(pag, XFS_AG_RESV_AGFL, ask, used); + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used); if (error) goto out; } @@ -304,7 +305,7 @@ xfs_ag_resv_init( return error; ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + - xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <= + xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved <= pag->pagf_freeblks + pag->pagf_flcount); #endif out: @@ -325,8 +326,10 @@ xfs_ag_resv_alloc_extent( trace_xfs_ag_resv_alloc_extent(pag, type, args->len); switch (type) { - case XFS_AG_RESV_METADATA: case XFS_AG_RESV_AGFL: + return; + case XFS_AG_RESV_METADATA: + case XFS_AG_RESV_RMAPBT: resv = xfs_perag_resv(pag, type); break; default: @@ -341,7 +344,7 @@ xfs_ag_resv_alloc_extent( len = min_t(xfs_extlen_t, args->len, resv->ar_reserved); resv->ar_reserved -= len; - if (type == XFS_AG_RESV_AGFL) + if (type == XFS_AG_RESV_RMAPBT) return; /* Allocations of reserved blocks only need on-disk sb updates... */ xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len); @@ -365,8 +368,10 @@ xfs_ag_resv_free_extent( trace_xfs_ag_resv_free_extent(pag, type, len); switch (type) { - case XFS_AG_RESV_METADATA: case XFS_AG_RESV_AGFL: + return; + case XFS_AG_RESV_METADATA: + case XFS_AG_RESV_RMAPBT: resv = xfs_perag_resv(pag, type); break; default: @@ -379,7 +384,7 @@ xfs_ag_resv_free_extent( leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved); resv->ar_reserved += leftover; - if (type == XFS_AG_RESV_AGFL) + if (type == XFS_AG_RESV_RMAPBT) return; /* Freeing into the reserved pool only requires on-disk update... */ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len); diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h index 8d6c687deef3..938f2f96c5e8 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.h +++ b/fs/xfs/libxfs/xfs_ag_resv.h @@ -32,4 +32,35 @@ void xfs_ag_resv_alloc_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type, void xfs_ag_resv_free_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type, struct xfs_trans *tp, xfs_extlen_t len); +/* + * RMAPBT reservation accounting wrappers. Since rmapbt blocks are sourced from + * the AGFL, they are allocated one at a time and the reservation updates don't + * require a transaction. 
+ */ +static inline void +xfs_ag_resv_rmapbt_alloc( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_alloc_arg args = {0}; + struct xfs_perag *pag; + + args.len = 1; + pag = xfs_perag_get(mp, agno); + xfs_ag_resv_alloc_extent(pag, XFS_AG_RESV_RMAPBT, &args); + xfs_perag_put(pag); +} + +static inline void +xfs_ag_resv_rmapbt_free( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, agno); + xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1); + xfs_perag_put(pag); +} + #endif /* __XFS_AG_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index c02781a4c091..39387bdd225d 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -53,6 +53,23 @@ STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); +/* + * Size of the AGFL. For CRC-enabled filesystems we steal a couple of slots in + * the beginning of the block for a proper header with the location information + * and CRC. + */ +unsigned int +xfs_agfl_size( + struct xfs_mount *mp) +{ + unsigned int size = mp->m_sb.sb_sectsize; + + if (xfs_sb_version_hascrc(&mp->m_sb)) + size -= sizeof(struct xfs_agfl); + + return size / sizeof(xfs_agblock_t); +} + unsigned int xfs_refc_block( struct xfs_mount *mp) @@ -550,7 +567,7 @@ xfs_agfl_verify( if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno) return __this_address; - for (i = 0; i < XFS_AGFL_SIZE(mp); i++) { + for (i = 0; i < xfs_agfl_size(mp); i++) { if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK && be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) return __this_address; @@ -1564,7 +1581,6 @@ xfs_alloc_ag_vextent_small( int *stat) /* status: 0-freelist, 1-normal/none */ { struct xfs_owner_info oinfo; - struct xfs_perag *pag; int error; xfs_agblock_t fbno; xfs_extlen_t flen; @@ -1616,18 +1632,13 @@ xfs_alloc_ag_vextent_small( /* * If we're feeding an AGFL block to something that * doesn't live in the free space, we need to clear - * out the OWN_AG rmap and add the block back to - * the AGFL per-AG reservation. + * out the OWN_AG rmap. */ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1, &oinfo); if (error) goto error0; - pag = xfs_perag_get(args->mp, args->agno); - xfs_ag_resv_free_extent(pag, XFS_AG_RESV_AGFL, - args->tp, 1); - xfs_perag_put(pag); *stat = 0; return 0; @@ -1911,14 +1922,12 @@ xfs_free_ag_extent( XFS_STATS_INC(mp, xs_freex); XFS_STATS_ADD(mp, xs_freeb, len); - trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL, - haveleft, haveright); + trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright); return 0; error0: - trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL, - -1, -1); + trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1); if (bno_cur) xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); if (cnt_cur) @@ -2054,6 +2063,93 @@ xfs_alloc_space_available( } /* + * Check the agfl fields of the agf for inconsistency or corruption. The purpose + * is to detect an agfl header padding mismatch between current and early v5 + * kernels. This problem manifests as a 1-slot size difference between the + * on-disk flcount and the active [first, last] range of a wrapped agfl. This + * may also catch variants of agfl count corruption unrelated to padding. Either + * way, we'll reset the agfl and warn the user.
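To put numbers on xfs_agfl_size(), take the common 512-byte sector size (illustrative only; sb_sectsize varies per filesystem). A v4 superblock has no AGFL header; on v5, sizeof(struct xfs_agfl) is 36 bytes once the structure is packed, while early v5 kernels padded it to 40 (sizes quoted from memory, not from this patch):

v4, no header:       512 / 4        = 128 slots
v5, packed (36 B):  (512 - 36) / 4  = 119 slots
v5, padded (40 B):  (512 - 40) / 4  = 118 slots

That one-slot disagreement between kernels is the padding mismatch the reset logic below exists to catch.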
+ * + * Return true if a reset is required before the agfl can be used, false + * otherwise. + */ +static bool +xfs_agfl_needs_reset( + struct xfs_mount *mp, + struct xfs_agf *agf) +{ + uint32_t f = be32_to_cpu(agf->agf_flfirst); + uint32_t l = be32_to_cpu(agf->agf_fllast); + uint32_t c = be32_to_cpu(agf->agf_flcount); + int agfl_size = xfs_agfl_size(mp); + int active; + + /* no agfl header on v4 supers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + + /* + * The agf read verifier catches severe corruption of these fields. + * Repeat some sanity checks to cover a packed -> unpacked mismatch if + * the verifier allows it. + */ + if (f >= agfl_size || l >= agfl_size) + return true; + if (c > agfl_size) + return true; + + /* + * Check consistency between the on-disk count and the active range. An + * agfl padding mismatch manifests as an inconsistent flcount. + */ + if (c && l >= f) + active = l - f + 1; + else if (c) + active = agfl_size - f + l + 1; + else + active = 0; + + return active != c; +} + +/* + * Reset the agfl to an empty state. Ignore/drop any existing blocks since the + * agfl content cannot be trusted. Warn the user that a repair is required to + * recover leaked blocks. + * + * The purpose of this mechanism is to handle filesystems affected by the agfl + * header padding mismatch problem. A reset keeps the filesystem online with a + * relatively minor free space accounting inconsistency rather than suffer the + * inevitable crash from use of an invalid agfl block. + */ +static void +xfs_agfl_reset( + struct xfs_trans *tp, + struct xfs_buf *agbp, + struct xfs_perag *pag) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + + ASSERT(pag->pagf_agflreset); + trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_); + + xfs_warn(mp, + "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. " + "Please unmount and run xfs_repair.", + pag->pag_agno, pag->pagf_flcount); + + agf->agf_flfirst = 0; + agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1); + agf->agf_flcount = 0; + xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST | + XFS_AGF_FLCOUNT); + + pag->pagf_flcount = 0; + pag->pagf_agflreset = false; +} + +/* * Decide whether to use this allocation group for this allocation. * If so, fix up the btree freelist's size. */ @@ -2114,6 +2210,10 @@ xfs_alloc_fix_freelist( } } + /* reset a padding mismatched agfl before final free space check */ + if (pag->pagf_agflreset) + xfs_agfl_reset(tp, agbp, pag); + /* If there isn't enough total space or single-extent, reject it. 
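A worked pass through the xfs_agfl_needs_reset() check, with invented values: a padded-header kernel (118 slots) wraps the free list and records flfirst f = 100, fllast l = 2, flcount c = 118 - 100 + 2 + 1 = 21. A packed-header kernel (119 slots) later mounts the filesystem and, taking the wrapped branch since l < f, computes:

writer, 118 slots:  c      = 118 - 100 + 2 + 1 = 21   (stored on disk)
reader, 119 slots:  active = 119 - 100 + 2 + 1 = 22
active != c  ->  reset required

so the AGFL is emptied rather than risk handing out a slot the writer never considered part of the list.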
*/ need = xfs_alloc_min_freelist(mp, pag); if (!xfs_alloc_space_available(args, need, flags)) @@ -2266,10 +2366,11 @@ xfs_alloc_get_freelist( bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); be32_add_cpu(&agf->agf_flfirst, 1); xfs_trans_brelse(tp, agflbp); - if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp)) + if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp)) agf->agf_flfirst = 0; pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); + ASSERT(!pag->pagf_agflreset); be32_add_cpu(&agf->agf_flcount, -1); xfs_trans_agflist_delta(tp, -1); pag->pagf_flcount--; @@ -2377,10 +2478,11 @@ xfs_alloc_put_freelist( be32_to_cpu(agf->agf_seqno), &agflbp))) return error; be32_add_cpu(&agf->agf_fllast, 1); - if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp)) + if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp)) agf->agf_fllast = 0; pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); + ASSERT(!pag->pagf_agflreset); be32_add_cpu(&agf->agf_flcount, 1); xfs_trans_agflist_delta(tp, 1); pag->pagf_flcount++; @@ -2395,7 +2497,7 @@ xfs_alloc_put_freelist( xfs_alloc_log_agf(tp, agbp, logflags); - ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)); + ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)); agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)]; @@ -2428,9 +2530,9 @@ xfs_agf_verify( if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && - be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp))) + be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) && + be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) && + be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp))) return __this_address; if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || @@ -2588,6 +2690,7 @@ xfs_alloc_read_agf( pag->pagb_count = 0; pag->pagb_tree = RB_ROOT; pag->pagf_init = 1; + pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf); } #ifdef DEBUG else if (!XFS_FORCED_SHUTDOWN(mp)) { diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 65a0cafe06e4..a311a2414a6b 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -26,6 +26,8 @@ struct xfs_trans; extern struct workqueue_struct *xfs_alloc_wq; +unsigned int xfs_agfl_size(struct xfs_mount *mp); + /* * Freespace allocation types. Argument to xfs_alloc_[v]extent. */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 6840b588187e..b451649ba176 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -74,18 +74,13 @@ xfs_allocbt_alloc_block( int error; xfs_agblock_t bno; - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - /* Allocate the new block from the freelist. If we can't, give up. 
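The xfs_alloc_get_freelist()/xfs_alloc_put_freelist() hunks above show why the slot count has to be agreed on: the AGFL is a ring, with agf_flfirst advancing as blocks are consumed, agf_fllast advancing as blocks are returned, both wrapping to 0 at xfs_agfl_size(mp), and agf_flcount tracking occupancy. A stripped-down sketch of that index discipline — a hypothetical in-memory mirror, not the on-disk structures:

/* Hypothetical ring mirroring the agf_flfirst/agf_fllast handling. */
struct agfl_ring {
	unsigned int first;	/* next slot to hand out */
	unsigned int last;	/* most recently refilled slot */
	unsigned int count;	/* occupied slots */
	unsigned int size;	/* xfs_agfl_size(mp) */
};

static unsigned int agfl_ring_get(struct agfl_ring *r, const unsigned int *slot)
{
	unsigned int bno = slot[r->first];

	if (++r->first == r->size)	/* wrap, as xfs_alloc_get_freelist() does */
		r->first = 0;
	r->count--;
	return bno;
}

static void agfl_ring_put(struct agfl_ring *r, unsigned int *slot,
			  unsigned int bno)
{
	if (++r->last == r->size)	/* wrap, as xfs_alloc_put_freelist() does */
		r->last = 0;
	r->count++;
	slot[r->last] = bno;
}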
*/ error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, &bno, 1); - if (error) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + if (error) return error; - } if (bno == NULLAGBLOCK) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; } @@ -95,7 +90,6 @@ xfs_allocbt_alloc_block( xfs_trans_agbtree_delta(cur->bc_tp, 1); new->s = cpu_to_be32(bno); - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index daae00ed30c5..3b03d886df66 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1244,8 +1244,9 @@ xfs_iread_extents( xfs_warn(ip->i_mount, "corrupt dinode %Lu, (btree extents).", (unsigned long long) ip->i_ino); - XFS_CORRUPTION_ERROR(__func__, - XFS_ERRLEVEL_LOW, ip->i_mount, block); + xfs_inode_verifier_error(ip, -EFSCORRUPTED, + __func__, block, sizeof(*block), + __this_address); error = -EFSCORRUPTED; goto out_brelse; } @@ -1261,11 +1262,15 @@ xfs_iread_extents( */ frp = XFS_BMBT_REC_ADDR(mp, block, 1); for (j = 0; j < num_recs; j++, frp++, i++) { + xfs_failaddr_t fa; + xfs_bmbt_disk_get_all(frp, &new); - if (!xfs_bmbt_validate_extent(mp, whichfork, &new)) { - XFS_ERROR_REPORT("xfs_bmap_read_extents(2)", - XFS_ERRLEVEL_LOW, mp); + fa = xfs_bmap_validate_extent(ip, whichfork, &new); + if (fa) { error = -EFSCORRUPTED; + xfs_inode_verifier_error(ip, error, + "xfs_iread_extents(2)", + frp, sizeof(*frp), fa); goto out_brelse; } xfs_iext_insert(ip, &icur, &new, state); @@ -6154,3 +6159,39 @@ xfs_bmap_finish_one( return error; } + +/* Check that an inode's extent does not have invalid flags or bad ranges. */ +xfs_failaddr_t +xfs_bmap_validate_extent( + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fsblock_t endfsb; + bool isrt; + + isrt = XFS_IS_REALTIME_INODE(ip); + endfsb = irec->br_startblock + irec->br_blockcount - 1; + if (isrt) { + if (!xfs_verify_rtbno(mp, irec->br_startblock)) + return __this_address; + if (!xfs_verify_rtbno(mp, endfsb)) + return __this_address; + } else { + if (!xfs_verify_fsbno(mp, irec->br_startblock)) + return __this_address; + if (!xfs_verify_fsbno(mp, endfsb)) + return __this_address; + if (XFS_FSB_TO_AGNO(mp, irec->br_startblock) != + XFS_FSB_TO_AGNO(mp, endfsb)) + return __this_address; + } + if (irec->br_state != XFS_EXT_NORM) { + if (whichfork != XFS_DATA_FORK) + return __this_address; + if (!xfs_sb_version_hasextflgbit(&mp->m_sb)) + return __this_address; + } + return NULL; +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index e36d75799cd5..f3be6416260b 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -274,4 +274,7 @@ static inline int xfs_bmap_fork_to_state(int whichfork) } } +xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork, + struct xfs_bmbt_irec *irec); + #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 9faf479aba49..d89d06bea6e3 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -272,10 +272,10 @@ xfs_bmbt_alloc_block( cur->bc_private.b.dfops->dop_low = true; } if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; } + ASSERT(args.len == 1); cur->bc_private.b.firstblock = args.fsbno; cur->bc_private.b.allocated++; @@ -286,12 +286,10 @@ xfs_bmbt_alloc_block( new->l = cpu_to_be64(args.fsbno); - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; 
error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 135b8c56d23e..e4505746ccaa 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -118,18 +118,4 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); -/* - * Check that the extent does not contain an invalid unwritten extent flag. - */ -static inline bool xfs_bmbt_validate_extent(struct xfs_mount *mp, int whichfork, - struct xfs_bmbt_irec *irec) -{ - if (irec->br_state == XFS_EXT_NORM) - return true; - if (whichfork == XFS_DATA_FORK && - xfs_sb_version_hasextflgbit(&mp->m_sb)) - return true; - return false; -} - #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 79ee4a1951d1..edc0193358a5 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1438,8 +1438,6 @@ xfs_btree_log_keys( int first, int last) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); if (bp) { xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); @@ -1450,8 +1448,6 @@ xfs_btree_log_keys( xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, xfs_ilog_fbroot(cur->bc_private.b.whichfork)); } - - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); } /* @@ -1464,15 +1460,12 @@ xfs_btree_log_recs( int first, int last) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_rec_offset(cur, first), xfs_btree_rec_offset(cur, last + 1) - 1); - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); } /* @@ -1485,8 +1478,6 @@ xfs_btree_log_ptrs( int first, /* index of first pointer to log */ int last) /* index of last pointer to log */ { - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); if (bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); @@ -1501,7 +1492,6 @@ xfs_btree_log_ptrs( xfs_ilog_fbroot(cur->bc_private.b.whichfork)); } - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); } /* @@ -1543,9 +1533,6 @@ xfs_btree_log_block( XFS_BTREE_LBLOCK_CRC_LEN }; - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGBI(cur, bp, fields); - if (bp) { int nbits; @@ -1573,8 +1560,6 @@ xfs_btree_log_block( xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, xfs_ilog_fbroot(cur->bc_private.b.whichfork)); } - - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); } /* @@ -1593,9 +1578,6 @@ xfs_btree_increment( int error; /* error return value */ int lev; - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGI(cur, level); - ASSERT(level < cur->bc_nlevels); /* Read-ahead to the right at this level. */ @@ -1671,17 +1653,14 @@ xfs_btree_increment( cur->bc_ptrs[lev] = 1; } out1: - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; out0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } @@ -1701,9 +1680,6 @@ xfs_btree_decrement( int lev; union xfs_btree_ptr ptr; - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGI(cur, level); - ASSERT(level < cur->bc_nlevels); /* Read-ahead to the left at this level. 
*/ @@ -1769,17 +1745,14 @@ xfs_btree_decrement( cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block); } out1: - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; out0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } @@ -1881,9 +1854,6 @@ xfs_btree_lookup( union xfs_btree_ptr *pp; /* ptr to btree block */ union xfs_btree_ptr ptr; /* ptr to btree block */ - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGI(cur, dir); - XFS_BTREE_STATS_INC(cur, lookup); /* No such thing as a zero-level tree. */ @@ -1929,7 +1899,6 @@ xfs_btree_lookup( ASSERT(level == 0 && cur->bc_nlevels == 1); cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE; - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; } @@ -2004,7 +1973,6 @@ xfs_btree_lookup( if (error) goto error0; XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; } @@ -2019,11 +1987,9 @@ xfs_btree_lookup( *stat = 1; else *stat = 0; - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } @@ -2169,10 +2135,8 @@ __xfs_btree_updkeys( trace_xfs_btree_updkeys(cur, level, bp); #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); - if (error) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + if (error) return error; - } #endif ptr = cur->bc_ptrs[level]; nlkey = xfs_btree_key_addr(cur, ptr, block); @@ -2224,9 +2188,6 @@ xfs_btree_update_keys( if (cur->bc_flags & XFS_BTREE_OVERLAPPING) return __xfs_btree_updkeys(cur, level, block, bp, false); - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGIK(cur, level, keyp); - /* * Go up the tree from this level toward the root. * At each level, update the key value to the value input. @@ -2241,10 +2202,8 @@ xfs_btree_update_keys( block = xfs_btree_get_block(cur, level, &bp); #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); - if (error) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + if (error) return error; - } #endif ptr = cur->bc_ptrs[level]; kp = xfs_btree_key_addr(cur, ptr, block); @@ -2252,7 +2211,6 @@ xfs_btree_update_keys( xfs_btree_log_keys(cur, bp, ptr, ptr); } - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); return 0; } @@ -2272,9 +2230,6 @@ xfs_btree_update( int ptr; union xfs_btree_rec *rp; - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGR(cur, rec); - /* Pick up the current block. */ block = xfs_btree_get_block(cur, 0, &bp); @@ -2307,11 +2262,9 @@ xfs_btree_update( goto error0; } - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } @@ -2339,9 +2292,6 @@ xfs_btree_lshift( int error; /* error return value */ int i; - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGI(cur, level); - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && level == cur->bc_nlevels - 1) goto out0; @@ -2500,21 +2450,17 @@ xfs_btree_lshift( /* Slide the cursor value left one. 
*/ cur->bc_ptrs[level]--; - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; out0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; error1: - XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR); xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); return error; } @@ -2541,9 +2487,6 @@ xfs_btree_rshift( int error; /* error return value */ int i; /* loop counter */ - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGI(cur, level); - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && (level == cur->bc_nlevels - 1)) goto out0; @@ -2676,21 +2619,17 @@ xfs_btree_rshift( xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; out0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; error1: - XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR); xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); return error; } @@ -2726,9 +2665,6 @@ __xfs_btree_split( int i; #endif - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key); - XFS_BTREE_STATS_INC(cur, split); /* Set up left block (current one). */ @@ -2878,16 +2814,13 @@ __xfs_btree_split( (*curp)->bc_ptrs[level + 1]++; } *ptrp = rptr; - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; out0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } @@ -2994,7 +2927,6 @@ xfs_btree_new_iroot( int i; /* loop counter */ #endif - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); XFS_BTREE_STATS_INC(cur, newroot); ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); @@ -3008,10 +2940,9 @@ xfs_btree_new_iroot( error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat); if (error) goto error0; - if (*stat == 0) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + if (*stat == 0) return 0; - } + XFS_BTREE_STATS_INC(cur, alloc); /* Copy the root into a real block. */ @@ -3074,10 +3005,8 @@ xfs_btree_new_iroot( *logflags |= XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork); *stat = 1; - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } @@ -3102,7 +3031,6 @@ xfs_btree_new_root( union xfs_btree_ptr rptr; union xfs_btree_ptr lptr; - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); XFS_BTREE_STATS_INC(cur, newroot); /* initialise our start point from the cursor */ @@ -3202,14 +3130,11 @@ xfs_btree_new_root( xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); cur->bc_ptrs[cur->bc_nlevels] = nptr; cur->bc_nlevels++; - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; out0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; } @@ -3230,7 +3155,7 @@ xfs_btree_make_block_unfull( if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && level == cur->bc_nlevels - 1) { - struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_inode *ip = cur->bc_private.b.ip; if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { /* A root block that can be made bigger. */ @@ -3309,9 +3234,6 @@ xfs_btree_insrec( #endif xfs_daddr_t old_bn; - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec); - ncur = NULL; lkey = &nkey; @@ -3324,14 +3246,12 @@ xfs_btree_insrec( error = xfs_btree_new_root(cur, stat); xfs_btree_set_ptr_null(cur, ptrp); - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); return error; } /* If we're off the left edge, return failure. 
*/ ptr = cur->bc_ptrs[level]; if (ptr == 0) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; } @@ -3489,12 +3409,10 @@ xfs_btree_insrec( *curp = ncur; } - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } @@ -3572,11 +3490,9 @@ xfs_btree_insert( } } while (!xfs_btree_ptr_is_null(cur, &nptr)); - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = i; return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } @@ -3611,8 +3527,6 @@ xfs_btree_kill_iroot( int i; #endif - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); ASSERT(cur->bc_nlevels > 1); @@ -3670,19 +3584,15 @@ xfs_btree_kill_iroot( #ifdef DEBUG for (i = 0; i < numrecs; i++) { error = xfs_btree_check_ptr(cur, cpp, i, level - 1); - if (error) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + if (error) return error; - } } #endif xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); error = xfs_btree_free_block(cur, cbp); - if (error) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + if (error) return error; - } cur->bc_bufs[level - 1] = NULL; be16_add_cpu(&block->bb_level, -1); @@ -3690,7 +3600,6 @@ xfs_btree_kill_iroot( XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork)); cur->bc_nlevels--; out0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); return 0; } @@ -3706,7 +3615,6 @@ xfs_btree_kill_root( { int error; - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); XFS_BTREE_STATS_INC(cur, killroot); /* @@ -3716,16 +3624,13 @@ xfs_btree_kill_root( cur->bc_ops->set_root(cur, newroot, -1); error = xfs_btree_free_block(cur, bp); - if (error) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + if (error) return error; - } cur->bc_bufs[level] = NULL; cur->bc_ra[level] = 0; cur->bc_nlevels--; - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); return 0; } @@ -3744,7 +3649,6 @@ xfs_btree_dec_cursor( return error; } - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; } @@ -3780,15 +3684,11 @@ xfs_btree_delrec( struct xfs_btree_cur *tcur; /* temporary btree cursor */ int numrecs; /* temporary numrec count */ - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGI(cur, level); - tcur = NULL; /* Get the index of the entry being deleted, check for nothing there. */ ptr = cur->bc_ptrs[level]; if (ptr == 0) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; } @@ -3805,7 +3705,6 @@ xfs_btree_delrec( /* Fail if we're off the end of the block. */ if (ptr > numrecs) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; } @@ -4080,7 +3979,7 @@ xfs_btree_delrec( tcur = NULL; if (level == 0) cur->bc_ptrs[0]++; - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; return 0; } @@ -4250,13 +4149,11 @@ xfs_btree_delrec( * call updkeys directly. */ - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); /* Return value means the next level up has something to do. */ *stat = 2; return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); if (tcur) xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); return error; @@ -4277,8 +4174,6 @@ xfs_btree_delete( int i; bool joined = false; - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - /* * Go up the tree, starting at leaf level. 
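 *
 * The delete driver here follows a common btree pattern: a per-level
 * worker reports, via *stat, 0 (nothing deleted), 1 (done) or 2 (a
 * join removed an entry from the level above, so keep climbing). A
 * standalone sketch, with toy_delrec as a hypothetical stand-in for
 * xfs_btree_delrec:
 */

/* Stub worker: pretend the leaf-level delete is always sufficient. */
static int
toy_delrec(
	void	*cur,
	int	level,
	int	*stat)
{
	(void)cur;
	(void)level;
	*stat = 1;
	return 0;
}

static int
toy_delete(
	void	*cur,
	int	*stat)
{
	int	error;
	int	i;
	int	level;

	/* Keep climbing while the lower level reports a join (i == 2). */
	for (level = 0, i = 2; i == 2; level++) {
		error = toy_delrec(cur, level, &i);
		if (error)
			return error;
	}
	*stat = i;
	return 0;
}

/*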
* @@ -4314,11 +4209,9 @@ xfs_btree_delete( } } - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = i; return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 50440b5618e8..58e30c0975c3 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -473,25 +473,6 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block) #define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b)) #define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) -/* - * Trace hooks. Currently not implemented as they need to be ported - * over to the generic tracing functionality, which is some effort. - * - * i,j = integer (32 bit) - * b = btree block buffer (xfs_buf_t) - * p = btree ptr - * r = btree record - * k = btree key - */ -#define XFS_BTREE_TRACE_ARGBI(c, b, i) -#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) -#define XFS_BTREE_TRACE_ARGI(c, i) -#define XFS_BTREE_TRACE_ARGIPK(c, i, p, s) -#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r) -#define XFS_BTREE_TRACE_ARGIK(c, i, k) -#define XFS_BTREE_TRACE_ARGR(c, r) -#define XFS_BTREE_TRACE_CURSOR(c, t) - xfs_failaddr_t xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp); xfs_failaddr_t xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs); diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 388d67c5c903..989e95a53db2 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -173,7 +173,7 @@ extern void xfs_dir2_data_log_unused(struct xfs_da_args *args, extern void xfs_dir2_data_make_free(struct xfs_da_args *args, struct xfs_buf *bp, xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); -extern void xfs_dir2_data_use_free(struct xfs_da_args *args, +extern int xfs_dir2_data_use_free(struct xfs_da_args *args, struct xfs_buf *bp, struct xfs_dir2_data_unused *dup, xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 2da86a394bcf..875893ded514 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -451,15 +451,19 @@ xfs_dir2_block_addname( * No stale entries, will use enddup space to hold new leaf. */ if (!btp->stale) { + xfs_dir2_data_aoff_t aoff; + /* * Mark the space needed for the new leaf entry, now in use. */ - xfs_dir2_data_use_free(args, bp, enddup, - (xfs_dir2_data_aoff_t) - ((char *)enddup - (char *)hdr + be16_to_cpu(enddup->length) - - sizeof(*blp)), - (xfs_dir2_data_aoff_t)sizeof(*blp), - &needlog, &needscan); + aoff = (xfs_dir2_data_aoff_t)((char *)enddup - (char *)hdr + + be16_to_cpu(enddup->length) - sizeof(*blp)); + error = xfs_dir2_data_use_free(args, bp, enddup, aoff, + (xfs_dir2_data_aoff_t)sizeof(*blp), &needlog, + &needscan); + if (error) + return error; + /* * Update the tail (entry count). */ @@ -541,9 +545,11 @@ xfs_dir2_block_addname( /* * Mark space for the data entry used. */ - xfs_dir2_data_use_free(args, bp, dup, - (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), - (xfs_dir2_data_aoff_t)len, &needlog, &needscan); + error = xfs_dir2_data_use_free(args, bp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), + (xfs_dir2_data_aoff_t)len, &needlog, &needscan); + if (error) + return error; /* * Create the new data entry. */ @@ -997,8 +1003,10 @@ xfs_dir2_leaf_to_block( /* * Use up the space at the end of the block (blp/btp). 
*/ - xfs_dir2_data_use_free(args, dbp, dup, args->geo->blksize - size, size, - &needlog, &needscan); + error = xfs_dir2_data_use_free(args, dbp, dup, + args->geo->blksize - size, size, &needlog, &needscan); + if (error) + return error; /* * Initialize the block tail. */ @@ -1110,18 +1118,14 @@ xfs_dir2_sf_to_block( * Add block 0 to the inode. */ error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno); - if (error) { - kmem_free(sfp); - return error; - } + if (error) + goto out_free; /* * Initialize the data block, then convert it to block format. */ error = xfs_dir3_data_init(args, blkno, &bp); - if (error) { - kmem_free(sfp); - return error; - } + if (error) + goto out_free; xfs_dir3_block_init(mp, tp, bp, dp); hdr = bp->b_addr; @@ -1136,8 +1140,10 @@ xfs_dir2_sf_to_block( */ dup = dp->d_ops->data_unused_p(hdr); needlog = needscan = 0; - xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i, - i, &needlog, &needscan); + error = xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i, + i, &needlog, &needscan); + if (error) + goto out_free; ASSERT(needscan == 0); /* * Fill in the tail. @@ -1150,9 +1156,11 @@ xfs_dir2_sf_to_block( /* * Remove the freespace, we'll manage it. */ - xfs_dir2_data_use_free(args, bp, dup, - (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), - be16_to_cpu(dup->length), &needlog, &needscan); + error = xfs_dir2_data_use_free(args, bp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), + be16_to_cpu(dup->length), &needlog, &needscan); + if (error) + goto out_free; /* * Create entry for . */ @@ -1256,4 +1264,7 @@ xfs_dir2_sf_to_block( xfs_dir2_block_log_tail(tp, bp); xfs_dir3_data_check(dp, bp); return 0; +out_free: + kmem_free(sfp); + return error; } diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 920279485275..cb67ec730b9b 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -932,10 +932,51 @@ xfs_dir2_data_make_free( *needscanp = needscan; } +/* Check our free data for obvious signs of corruption. */ +static inline xfs_failaddr_t +xfs_dir2_data_check_free( + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_data_unused *dup, + xfs_dir2_data_aoff_t offset, + xfs_dir2_data_aoff_t len) +{ + if (hdr->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC) && + hdr->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC) && + hdr->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) && + hdr->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) + return __this_address; + if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG) + return __this_address; + if (offset < (char *)dup - (char *)hdr) + return __this_address; + if (offset + len > (char *)dup + be16_to_cpu(dup->length) - (char *)hdr) + return __this_address; + if ((char *)dup - (char *)hdr != + be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup))) + return __this_address; + return NULL; +} + +/* Sanity-check a new bestfree entry. */ +static inline xfs_failaddr_t +xfs_dir2_data_check_new_free( + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_data_free *dfp, + struct xfs_dir2_data_unused *newdup) +{ + if (dfp == NULL) + return __this_address; + if (dfp->length != newdup->length) + return __this_address; + if (be16_to_cpu(dfp->offset) != (char *)newdup - (char *)hdr) + return __this_address; + return NULL; +} + /* * Take a byte range out of an existing unused space and make it un-free. 
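 *
 * The containment test that xfs_dir2_data_check_free applies above,
 * reduced to plain integers in a standalone sketch (toy names; an
 * illustration of the rule, not the kernel API):
 */

#include <stdbool.h>
#include <stdint.h>

/* 'dup_off'/'dup_len' describe the existing unused region in the block. */
static bool
toy_range_fits(
	uint32_t	dup_off,
	uint32_t	dup_len,
	uint32_t	offset,
	uint32_t	len)
{
	if (offset < dup_off)				/* starts too early */
		return false;
	if (offset + len > dup_off + dup_len)		/* runs past the end */
		return false;
	return true;
}

/*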
*/ -void +int xfs_dir2_data_use_free( struct xfs_da_args *args, struct xfs_buf *bp, @@ -947,23 +988,19 @@ xfs_dir2_data_use_free( { xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_data_free_t *dfp; /* bestfree pointer */ + xfs_dir2_data_unused_t *newdup; /* new unused entry */ + xfs_dir2_data_unused_t *newdup2; /* another new unused entry */ + struct xfs_dir2_data_free *bf; + xfs_failaddr_t fa; int matchback; /* matches end of freespace */ int matchfront; /* matches start of freespace */ int needscan; /* need to regen bestfree */ - xfs_dir2_data_unused_t *newdup; /* new unused entry */ - xfs_dir2_data_unused_t *newdup2; /* another new unused entry */ int oldlen; /* old unused entry's length */ - struct xfs_dir2_data_free *bf; hdr = bp->b_addr; - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); - ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG); - ASSERT(offset >= (char *)dup - (char *)hdr); - ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)hdr); - ASSERT((char *)dup - (char *)hdr == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup))); + fa = xfs_dir2_data_check_free(hdr, dup, offset, len); + if (fa) + goto corrupt; /* * Look up the entry in the bestfree table. */ @@ -1008,9 +1045,9 @@ xfs_dir2_data_use_free( xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp); dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp); - ASSERT(dfp != NULL); - ASSERT(dfp->length == newdup->length); - ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr); + fa = xfs_dir2_data_check_new_free(hdr, dfp, newdup); + if (fa) + goto corrupt; /* * If we got inserted at the last slot, * that means we don't know if there was a better @@ -1036,9 +1073,9 @@ xfs_dir2_data_use_free( xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp); dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp); - ASSERT(dfp != NULL); - ASSERT(dfp->length == newdup->length); - ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr); + fa = xfs_dir2_data_check_new_free(hdr, dfp, newdup); + if (fa) + goto corrupt; /* * If we got inserted at the last slot, * that means we don't know if there was a better @@ -1084,6 +1121,11 @@ xfs_dir2_data_use_free( } } *needscanp = needscan; + return 0; +corrupt: + xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, args->dp->i_mount, + hdr, __FILE__, __LINE__, fa); + return -EFSCORRUPTED; } /* Find the end of the entry data in a data/block format dir block. */ diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index d7e630f41f9c..50fc9c0c5e2b 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -877,9 +877,13 @@ xfs_dir2_leaf_addname( /* * Mark the initial part of our freespace in use for the new entry. */ - xfs_dir2_data_use_free(args, dbp, dup, - (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length, - &needlog, &needscan); + error = xfs_dir2_data_use_free(args, dbp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), + length, &needlog, &needscan); + if (error) { + xfs_trans_brelse(tp, lbp); + return error; + } /* * Initialize our new entry (at last). 
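 *
 * The callers converted in this series all follow the same unwind
 * pattern once xfs_dir2_data_use_free can fail: release whatever
 * buffer is held, then bubble the error up. A standalone sketch with
 * hypothetical toy_* names (EFSCORRUPTED is mapped to EUCLEAN, the
 * same mapping the kernel uses):
 */

#include <errno.h>

#ifndef EFSCORRUPTED
#define EFSCORRUPTED	EUCLEAN
#endif

struct toy_buf;

static void
toy_brelse(
	struct toy_buf	*bp)
{
	(void)bp;	/* a real implementation releases the buffer here */
}

static int
toy_use_free(
	int	range_ok)
{
	return range_ok ? 0 : -EFSCORRUPTED;
}

static int
toy_caller(
	struct toy_buf	*held,
	int		range_ok)
{
	int		error = toy_use_free(range_ok);

	if (error) {
		toy_brelse(held);	/* unwind before returning */
		return error;
	}
	/* ...continue mutating the directory block... */
	return 0;
}

/*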
*/ @@ -1415,7 +1419,8 @@ xfs_dir2_leaf_removename( oldbest = be16_to_cpu(bf[0].length); ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); - ASSERT(be16_to_cpu(bestsp[db]) == oldbest); + if (be16_to_cpu(bestsp[db]) != oldbest) + return -EFSCORRUPTED; /* * Mark the former data entry unused. */ diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 239d97a64296..9df096cc3c37 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -387,8 +387,9 @@ xfs_dir2_leaf_to_node( dp->d_ops->free_hdr_from_disk(&freehdr, free); leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); - ASSERT(be32_to_cpu(ltp->bestcount) <= - (uint)dp->i_d.di_size / args->geo->blksize); + if (be32_to_cpu(ltp->bestcount) > + (uint)dp->i_d.di_size / args->geo->blksize) + return -EFSCORRUPTED; /* * Copy freespace entries from the leaf block to the new block. @@ -1728,6 +1729,7 @@ xfs_dir2_node_addname_int( __be16 *bests; struct xfs_dir3_icfree_hdr freehdr; struct xfs_dir2_data_free *bf; + xfs_dir2_data_aoff_t aoff; dp = args->dp; mp = dp->i_mount; @@ -2022,9 +2024,13 @@ xfs_dir2_node_addname_int( /* * Mark the first part of the unused space, inuse for us. */ - xfs_dir2_data_use_free(args, dbp, dup, - (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length, - &needlog, &needscan); + aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); + error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length, + &needlog, &needscan); + if (error) { + xfs_trans_brelse(tp, dbp); + return error; + } /* * Fill in the new entry and log it. */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 1acb584fc5f7..42956d8d95ed 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -803,24 +803,13 @@ typedef struct xfs_agi { &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \ (__be32 *)(bp)->b_addr) -/* - * Size of the AGFL. For CRC-enabled filesystes we steal a couple of - * slots in the beginning of the block for a proper header with the - * location information and CRC. - */ -#define XFS_AGFL_SIZE(mp) \ - (((mp)->m_sb.sb_sectsize - \ - (xfs_sb_version_hascrc(&((mp)->m_sb)) ? 
\ - sizeof(struct xfs_agfl) : 0)) / \ - sizeof(xfs_agblock_t)) - typedef struct xfs_agfl { __be32 agfl_magicnum; __be32 agfl_seqno; uuid_t agfl_uuid; __be64 agfl_lsn; __be32 agfl_crc; - __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */ + __be32 agfl_bno[]; /* actually xfs_agfl_size(mp) */ } __attribute__((packed)) xfs_agfl_t; #define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index af197a5f3a82..a2dd7f4a2719 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -93,8 +93,6 @@ __xfs_inobt_alloc_block( int error; /* error return value */ xfs_agblock_t sbno = be32_to_cpu(start->s); - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - memset(&args, 0, sizeof(args)); args.tp = cur->bc_tp; args.mp = cur->bc_mp; @@ -107,17 +105,14 @@ __xfs_inobt_alloc_block( args.resv = resv; error = xfs_alloc_vextent(&args); - if (error) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + if (error) return error; - } + if (args.fsbno == NULLFSBLOCK) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; } ASSERT(args.len == 1); - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno)); *stat = 1; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 4fe17b368316..ef68b1de006a 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -93,20 +93,26 @@ xfs_inode_buf_verify( bool readahead) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_agnumber_t agno; int i; int ni; /* * Validate the magic number and version of every inode in the buffer */ + agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp)); ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; for (i = 0; i < ni; i++) { int di_ok; xfs_dinode_t *dip; + xfs_agino_t unlinked_ino; dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); + unlinked_ino = be32_to_cpu(dip->di_next_unlinked); di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && - xfs_dinode_good_version(mp, dip->di_version); + xfs_dinode_good_version(mp, dip->di_version) && + (unlinked_ino == NULLAGINO || + xfs_verify_agino(mp, agno, unlinked_ino)); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP))) { if (readahead) { @@ -115,16 +121,18 @@ xfs_inode_buf_verify( return; } - xfs_verifier_error(bp, -EFSCORRUPTED, __this_address); #ifdef DEBUG xfs_alert(mp, "bad inode magic/vsn daddr %lld #%d (magic=%x)", (unsigned long long)bp->b_bn, i, be16_to_cpu(dip->di_magic)); #endif + xfs_buf_verifier_error(bp, -EFSCORRUPTED, + __func__, dip, sizeof(*dip), + NULL); + return; } } - xfs_inobp_check(mp, bp); } @@ -564,10 +572,7 @@ xfs_iread( /* initialise the on-disk inode core */ memset(&ip->i_d, 0, sizeof(ip->i_d)); VFS_I(ip)->i_generation = prandom_u32(); - if (xfs_sb_version_hascrc(&mp->m_sb)) - ip->i_d.di_version = 3; - else - ip->i_d.di_version = 2; + ip->i_d.di_version = 3; return 0; } @@ -649,3 +654,108 @@ xfs_iread( xfs_trans_brelse(tp, bp); return error; } + +/* + * Validate di_extsize hint. + * + * The rules are documented at xfs_ioctl_setattr_check_extsize(). + * These functions must be kept in sync with each other. 
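 *
 * How a caller consumes a failure-address validator like the function
 * that follows, in minimal standalone form. The single toy rule and
 * the toy_* types are assumptions for illustration; the real rule set
 * is the function below:
 */

#include <stddef.h>
#include <stdint.h>

typedef const void	*toy_failaddr_t;

static const char	toy_here;	/* stands in for a code address */

/* Toy rule: only regular files may carry a nonzero extent size hint. */
static toy_failaddr_t
toy_validate_extsize(
	uint32_t	extsize,
	int		is_reg)
{
	if (extsize != 0 && !is_reg)
		return &toy_here;
	return NULL;
}

static int
toy_scrub_extsize(
	uint32_t	extsize,
	int		is_reg)
{
	toy_failaddr_t	fa = toy_validate_extsize(extsize, is_reg);

	return fa ? -1 /* flag the inode corrupt */ : 0;
}

/*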
+ */ +xfs_failaddr_t +xfs_inode_validate_extsize( + struct xfs_mount *mp, + uint32_t extsize, + uint16_t mode, + uint16_t flags) +{ + bool rt_flag; + bool hint_flag; + bool inherit_flag; + uint32_t extsize_bytes; + uint32_t blocksize_bytes; + + rt_flag = (flags & XFS_DIFLAG_REALTIME); + hint_flag = (flags & XFS_DIFLAG_EXTSIZE); + inherit_flag = (flags & XFS_DIFLAG_EXTSZINHERIT); + extsize_bytes = XFS_FSB_TO_B(mp, extsize); + + if (rt_flag) + blocksize_bytes = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog; + else + blocksize_bytes = mp->m_sb.sb_blocksize; + + if ((hint_flag || inherit_flag) && !(S_ISDIR(mode) || S_ISREG(mode))) + return __this_address; + + if (hint_flag && !S_ISREG(mode)) + return __this_address; + + if (inherit_flag && !S_ISDIR(mode)) + return __this_address; + + if ((hint_flag || inherit_flag) && extsize == 0) + return __this_address; + + if (!(hint_flag || inherit_flag) && extsize != 0) + return __this_address; + + if (extsize_bytes % blocksize_bytes) + return __this_address; + + if (extsize > MAXEXTLEN) + return __this_address; + + if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2) + return __this_address; + + return NULL; +} + +/* + * Validate di_cowextsize hint. + * + * The rules are documented at xfs_ioctl_setattr_check_cowextsize(). + * These functions must be kept in sync with each other. + */ +xfs_failaddr_t +xfs_inode_validate_cowextsize( + struct xfs_mount *mp, + uint32_t cowextsize, + uint16_t mode, + uint16_t flags, + uint64_t flags2) +{ + bool rt_flag; + bool hint_flag; + uint32_t cowextsize_bytes; + + rt_flag = (flags & XFS_DIFLAG_REALTIME); + hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE); + cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize); + + if (hint_flag && !xfs_sb_version_hasreflink(&mp->m_sb)) + return __this_address; + + if (hint_flag && !(S_ISDIR(mode) || S_ISREG(mode))) + return __this_address; + + if (hint_flag && cowextsize == 0) + return __this_address; + + if (!hint_flag && cowextsize != 0) + return __this_address; + + if (hint_flag && rt_flag) + return __this_address; + + if (cowextsize_bytes % mp->m_sb.sb_blocksize) + return __this_address; + + if (cowextsize > MAXEXTLEN) + return __this_address; + + if (cowextsize > mp->m_sb.sb_agblocks / 2) + return __this_address; + + return NULL; +} diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 8a5e1da52d74..d9a376a78ee2 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -84,5 +84,10 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_dinode *dip); +xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp, + uint32_t extsize, uint16_t mode, uint16_t flags); +xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp, + uint32_t cowextsize, uint16_t mode, uint16_t flags, + uint64_t flags2); #endif /* __XFS_INODE_BUF_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 866d2861c625..701c42a28d05 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -195,8 +195,9 @@ xfs_iformat_local( "corrupt inode %Lu (bad size %d for local fork, size = %d).", (unsigned long long) ip->i_ino, size, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); - XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); + xfs_inode_verifier_error(ip, -EFSCORRUPTED, + "xfs_iformat_local", dip, sizeof(*dip), + __this_address); return -EFSCORRUPTED; } @@ -231,8 +232,9 @@ 
xfs_iformat_extents( if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) { xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).", (unsigned long long) ip->i_ino, nex); - XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, - mp, dip); + xfs_inode_verifier_error(ip, -EFSCORRUPTED, + "xfs_iformat_extents(1)", dip, sizeof(*dip), + __this_address); return -EFSCORRUPTED; } @@ -245,10 +247,14 @@ xfs_iformat_extents( xfs_iext_first(ifp, &icur); for (i = 0; i < nex; i++, dp++) { + xfs_failaddr_t fa; + xfs_bmbt_disk_get_all(dp, &new); - if (!xfs_bmbt_validate_extent(mp, whichfork, &new)) { - XFS_ERROR_REPORT("xfs_iformat_extents(2)", - XFS_ERRLEVEL_LOW, mp); + fa = xfs_bmap_validate_extent(ip, whichfork, &new); + if (fa) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, + "xfs_iformat_extents(2)", + dp, sizeof(*dp), fa); return -EFSCORRUPTED; } @@ -305,8 +311,9 @@ xfs_iformat_btree( level == 0 || level > XFS_BTREE_MAXLEVELS) { xfs_warn(mp, "corrupt inode %Lu (btree).", (unsigned long long) ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, - mp, dip); + xfs_inode_verifier_error(ip, -EFSCORRUPTED, + "xfs_iformat_btree", dfp, size, + __this_address); return -EFSCORRUPTED; } @@ -595,7 +602,7 @@ xfs_iextents_copy( for_each_xfs_iext(ifp, &icur, &rec) { if (isnullstartblock(rec.br_startblock)) continue; - ASSERT(xfs_bmbt_validate_extent(ip->i_mount, whichfork, &rec)); + ASSERT(xfs_bmap_validate_extent(ip, whichfork, &rec) == NULL); xfs_bmbt_disk_set_all(dp, &rec); trace_xfs_write_extent(ip, &icur, state, _RET_IP_); copied += sizeof(struct xfs_bmbt_rec); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 8479769e470d..265fdcefcbae 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -79,8 +79,6 @@ xfs_refcountbt_alloc_block( struct xfs_alloc_arg args; /* block allocation args */ int error; /* error return value */ - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - memset(&args, 0, sizeof(args)); args.tp = cur->bc_tp; args.mp = cur->bc_mp; @@ -98,7 +96,6 @@ xfs_refcountbt_alloc_block( trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno, args.agbno, 1); if (args.fsbno == NULLFSBLOCK) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; } @@ -109,12 +106,10 @@ xfs_refcountbt_alloc_block( be32_add_cpu(&agf->agf_refcount_blocks, 1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; out_error: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index e829c3e489ea..8b0d0de1cd11 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -104,20 +104,15 @@ xfs_rmapbt_alloc_block( int error; xfs_agblock_t bno; - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - /* Allocate the new block from the freelist. If we can't, give up. 
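 *
 * The ->alloc_block hooks in these files share a two-outcome
 * convention: a real failure comes back as a negative error, while a
 * clean "no space available" returns 0 with *stat = 0 so the caller
 * can decide what to do. A standalone sketch with a hypothetical
 * empty freelist:
 */

#include <stdint.h>

#define TOY_NULLBLOCK	((uint64_t)-1)

static uint64_t
toy_get_freelist(void)
{
	return TOY_NULLBLOCK;	/* pretend the freelist is empty */
}

static int
toy_alloc_block(
	uint64_t	*bno,
	int		*stat)
{
	*bno = toy_get_freelist();
	if (*bno == TOY_NULLBLOCK) {
		*stat = 0;	/* out of space: success, nothing done */
		return 0;
	}
	*stat = 1;		/* block allocated */
	return 0;
}

/*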
*/ error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, &bno, 1); - if (error) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + if (error) return error; - } trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno, bno, 1); if (bno == NULLAGBLOCK) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; } @@ -130,7 +125,8 @@ xfs_rmapbt_alloc_block( be32_add_cpu(&agf->agf_rmap_blocks, 1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_private.a.agno); + *stat = 1; return 0; } @@ -158,6 +154,8 @@ xfs_rmapbt_free_block( XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_trans_agbtree_delta(cur->bc_tp, -1); + xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_private.a.agno); + return 0; } diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a55f7a45fa78..53433cc024fd 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -731,7 +731,6 @@ xfs_sb_mount_common( struct xfs_sb *sbp) { mp->m_agfrotor = mp->m_agirotor = 0; - spin_lock_init(&mp->m_agirotor_lock); mp->m_maxagi = mp->m_sb.sb_agcount; mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG; mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT; diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index fd975524f460..018aabbd9394 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -80,7 +80,7 @@ xfs_scrub_walk_agfl( } /* first to the end */ - for (i = flfirst; i < XFS_AGFL_SIZE(mp); i++) { + for (i = flfirst; i < xfs_agfl_size(mp); i++) { error = fn(sc, be32_to_cpu(agfl_bno[i]), priv); if (error) return error; @@ -664,7 +664,7 @@ xfs_scrub_agf( if (agfl_last > agfl_first) fl_count = agfl_last - agfl_first + 1; else - fl_count = XFS_AGFL_SIZE(mp) - agfl_first + agfl_last + 1; + fl_count = xfs_agfl_size(mp) - agfl_first + agfl_last + 1; if (agfl_count != 0 && fl_count != agfl_count) xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); @@ -767,7 +767,7 @@ int xfs_scrub_agfl( struct xfs_scrub_context *sc) { - struct xfs_scrub_agfl_info sai = { 0 }; + struct xfs_scrub_agfl_info sai; struct xfs_agf *agf; xfs_agnumber_t agno; unsigned int agflcount; @@ -791,10 +791,11 @@ xfs_scrub_agfl( /* Allocate buffer to ensure uniqueness of AGFL entries. */ agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); agflcount = be32_to_cpu(agf->agf_flcount); - if (agflcount > XFS_AGFL_SIZE(sc->mp)) { + if (agflcount > xfs_agfl_size(sc->mp)) { xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); goto out; } + memset(&sai, 0, sizeof(sai)); sai.sz_entries = agflcount; sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount, KM_NOFS); if (!sai.entries) { diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 4ed80474f545..127575f0abfb 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -98,7 +98,7 @@ xfs_scrub_xattr_listent( if (flags & XFS_ATTR_INCOMPLETE) { /* Incomplete attr key, just mark the inode for preening. 
*/ - xfs_scrub_ino_set_preen(sx->sc, context->dp->i_ino, NULL); + xfs_scrub_ino_set_preen(sx->sc, context->dp->i_ino); return; } diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index d00282130492..639d14b51e90 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -37,6 +37,7 @@ #include "xfs_bmap_util.h" #include "xfs_bmap_btree.h" #include "xfs_rmap.h" +#include "xfs_rmap_btree.h" #include "xfs_refcount.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" @@ -423,6 +424,169 @@ xfs_scrub_bmap_btree( return error; } +struct xfs_scrub_bmap_check_rmap_info { + struct xfs_scrub_context *sc; + int whichfork; + struct xfs_iext_cursor icur; +}; + +/* Can we find bmaps that fit this rmap? */ +STATIC int +xfs_scrub_bmap_check_rmap( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_bmbt_irec irec; + struct xfs_scrub_bmap_check_rmap_info *sbcri = priv; + struct xfs_ifork *ifp; + struct xfs_scrub_context *sc = sbcri->sc; + bool have_map; + + /* Is this even the right fork? */ + if (rec->rm_owner != sc->ip->i_ino) + return 0; + if ((sbcri->whichfork == XFS_ATTR_FORK) ^ + !!(rec->rm_flags & XFS_RMAP_ATTR_FORK)) + return 0; + if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) + return 0; + + /* Now look up the bmbt record. */ + ifp = XFS_IFORK_PTR(sc->ip, sbcri->whichfork); + if (!ifp) { + xfs_scrub_fblock_set_corrupt(sc, sbcri->whichfork, + rec->rm_offset); + goto out; + } + have_map = xfs_iext_lookup_extent(sc->ip, ifp, rec->rm_offset, + &sbcri->icur, &irec); + if (!have_map) + xfs_scrub_fblock_set_corrupt(sc, sbcri->whichfork, + rec->rm_offset); + /* + * bmap extent record lengths are constrained to 2^21 blocks in length + * because of space constraints in the on-disk metadata structure. + * However, rmap extent record lengths are constrained only by AG + * length, so we have to loop through the bmbt to make sure that the + * entire rmap is covered by bmbt records. + */ + while (have_map) { + if (irec.br_startoff != rec->rm_offset) + xfs_scrub_fblock_set_corrupt(sc, sbcri->whichfork, + rec->rm_offset); + if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp, + cur->bc_private.a.agno, rec->rm_startblock)) + xfs_scrub_fblock_set_corrupt(sc, sbcri->whichfork, + rec->rm_offset); + if (irec.br_blockcount > rec->rm_blockcount) + xfs_scrub_fblock_set_corrupt(sc, sbcri->whichfork, + rec->rm_offset); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + break; + rec->rm_startblock += irec.br_blockcount; + rec->rm_offset += irec.br_blockcount; + rec->rm_blockcount -= irec.br_blockcount; + if (rec->rm_blockcount == 0) + break; + have_map = xfs_iext_next_extent(ifp, &sbcri->icur, &irec); + if (!have_map) + xfs_scrub_fblock_set_corrupt(sc, sbcri->whichfork, + rec->rm_offset); + } + +out: + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return XFS_BTREE_QUERY_RANGE_ABORT; + return 0; +} + +/* Make sure each rmap has a corresponding bmbt entry. 
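 *
 * The coverage loop in xfs_scrub_bmap_check_rmap above, reduced to a
 * standalone interval-tiling check: one long rmap record must be
 * exactly covered by a run of shorter bmbt records with matching file
 * offsets and disk addresses (toy records; an illustration only):
 */

#include <stdbool.h>
#include <stdint.h>

struct toy_ext {
	uint64_t	off;	/* file offset */
	uint64_t	bno;	/* disk block */
	uint64_t	len;	/* length in blocks */
};

static bool
toy_rmap_covered(
	struct toy_ext		rm,
	const struct toy_ext	*exts,
	int			n)
{
	for (int i = 0; i < n && rm.len > 0; i++) {
		if (exts[i].off != rm.off || exts[i].bno != rm.bno)
			return false;
		if (exts[i].len > rm.len)
			return false;
		rm.off += exts[i].len;
		rm.bno += exts[i].len;
		rm.len -= exts[i].len;
	}
	return rm.len == 0;	/* fully tiled, nothing left over */
}

/*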
*/ +STATIC int +xfs_scrub_bmap_check_ag_rmaps( + struct xfs_scrub_context *sc, + int whichfork, + xfs_agnumber_t agno) +{ + struct xfs_scrub_bmap_check_rmap_info sbcri; + struct xfs_btree_cur *cur; + struct xfs_buf *agf; + int error; + + error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf); + if (error) + return error; + + cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf, agno); + if (!cur) { + error = -ENOMEM; + goto out_agf; + } + + sbcri.sc = sc; + sbcri.whichfork = whichfork; + error = xfs_rmap_query_all(cur, xfs_scrub_bmap_check_rmap, &sbcri); + if (error == XFS_BTREE_QUERY_RANGE_ABORT) + error = 0; + + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); +out_agf: + xfs_trans_brelse(sc->tp, agf); + return error; +} + +/* Make sure each rmap has a corresponding bmbt entry. */ +STATIC int +xfs_scrub_bmap_check_rmaps( + struct xfs_scrub_context *sc, + int whichfork) +{ + loff_t size; + xfs_agnumber_t agno; + int error; + + if (!xfs_sb_version_hasrmapbt(&sc->mp->m_sb) || + whichfork == XFS_COW_FORK || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return 0; + + /* Don't support realtime rmap checks yet. */ + if (XFS_IS_REALTIME_INODE(sc->ip) && whichfork == XFS_DATA_FORK) + return 0; + + /* + * Only do this for complex maps that are in btree format, or for + * situations where we would seem to have a size but zero extents. + * The inode repair code can zap broken iforks, which means we have + * to flag this bmap as corrupt if there are rmaps that need to be + * reattached. + */ + switch (whichfork) { + case XFS_DATA_FORK: + size = i_size_read(VFS_I(sc->ip)); + break; + case XFS_ATTR_FORK: + size = XFS_IFORK_Q(sc->ip); + break; + default: + size = 0; + break; + } + if (XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_BTREE && + (size == 0 || XFS_IFORK_NEXTENTS(sc->ip, whichfork) > 0)) + return 0; + + for (agno = 0; agno < sc->mp->m_sb.sb_agcount; agno++) { + error = xfs_scrub_bmap_check_ag_rmaps(sc, whichfork, agno); + if (error) + return error; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + break; + } + + return 0; +} + /* * Scrub an inode fork's block mappings. * @@ -457,16 +621,16 @@ xfs_scrub_bmap( goto out; /* No CoW forks on non-reflink inodes/filesystems. */ if (!xfs_is_reflink_inode(ip)) { - xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL); + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino); goto out; } break; case XFS_ATTR_FORK: if (!ifp) - goto out; + goto out_check_rmap; if (!xfs_sb_version_hasattr(&mp->m_sb) && !xfs_sb_version_hasattr2(&mp->m_sb)) - xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL); + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino); break; default: ASSERT(whichfork == XFS_DATA_FORK); @@ -534,6 +698,10 @@ xfs_scrub_bmap( goto out; } +out_check_rmap: + error = xfs_scrub_bmap_check_rmaps(sc, whichfork); + if (!xfs_scrub_fblock_xref_process_error(sc, whichfork, 0, &error)) + goto out; out: return error; } diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 8033ab9d8f47..8ed91d5c868d 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -213,12 +213,10 @@ xfs_scrub_block_set_preen( void xfs_scrub_ino_set_preen( struct xfs_scrub_context *sc, - xfs_ino_t ino, - struct xfs_buf *bp) + xfs_ino_t ino) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN; - trace_xfs_scrub_ino_preen(sc, ino, bp ? bp->b_bn : 0, - __return_address); + trace_xfs_scrub_ino_preen(sc, ino, __return_address); } /* Record a corrupt block. 
*/ @@ -249,22 +247,20 @@ xfs_scrub_block_xref_set_corrupt( void xfs_scrub_ino_set_corrupt( struct xfs_scrub_context *sc, - xfs_ino_t ino, - struct xfs_buf *bp) + xfs_ino_t ino) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; - trace_xfs_scrub_ino_error(sc, ino, bp ? bp->b_bn : 0, __return_address); + trace_xfs_scrub_ino_error(sc, ino, __return_address); } /* Record a corruption while cross-referencing with an inode. */ void xfs_scrub_ino_xref_set_corrupt( struct xfs_scrub_context *sc, - xfs_ino_t ino, - struct xfs_buf *bp) + xfs_ino_t ino) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT; - trace_xfs_scrub_ino_error(sc, ino, bp ? bp->b_bn : 0, __return_address); + trace_xfs_scrub_ino_error(sc, ino, __return_address); } /* Record corruption in a block indexed by a file fork. */ @@ -296,12 +292,10 @@ xfs_scrub_fblock_xref_set_corrupt( void xfs_scrub_ino_set_warning( struct xfs_scrub_context *sc, - xfs_ino_t ino, - struct xfs_buf *bp) + xfs_ino_t ino) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING; - trace_xfs_scrub_ino_warning(sc, ino, bp ? bp->b_bn : 0, - __return_address); + trace_xfs_scrub_ino_warning(sc, ino, __return_address); } /* Warn about a block indexed by a file fork that needs review. */ @@ -619,7 +613,7 @@ xfs_scrub_checkpoint_log( { int error; - error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); + error = xfs_log_force(mp, XFS_LOG_SYNC); if (error) return error; xfs_ail_push_all_sync(mp->m_ail); diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index ddb65d22c76a..deaf60400981 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -63,25 +63,22 @@ bool xfs_scrub_fblock_xref_process_error(struct xfs_scrub_context *sc, void xfs_scrub_block_set_preen(struct xfs_scrub_context *sc, struct xfs_buf *bp); -void xfs_scrub_ino_set_preen(struct xfs_scrub_context *sc, xfs_ino_t ino, - struct xfs_buf *bp); +void xfs_scrub_ino_set_preen(struct xfs_scrub_context *sc, xfs_ino_t ino); void xfs_scrub_block_set_corrupt(struct xfs_scrub_context *sc, struct xfs_buf *bp); -void xfs_scrub_ino_set_corrupt(struct xfs_scrub_context *sc, xfs_ino_t ino, - struct xfs_buf *bp); +void xfs_scrub_ino_set_corrupt(struct xfs_scrub_context *sc, xfs_ino_t ino); void xfs_scrub_fblock_set_corrupt(struct xfs_scrub_context *sc, int whichfork, xfs_fileoff_t offset); void xfs_scrub_block_xref_set_corrupt(struct xfs_scrub_context *sc, struct xfs_buf *bp); -void xfs_scrub_ino_xref_set_corrupt(struct xfs_scrub_context *sc, xfs_ino_t ino, - struct xfs_buf *bp); +void xfs_scrub_ino_xref_set_corrupt(struct xfs_scrub_context *sc, + xfs_ino_t ino); void xfs_scrub_fblock_xref_set_corrupt(struct xfs_scrub_context *sc, int whichfork, xfs_fileoff_t offset); -void xfs_scrub_ino_set_warning(struct xfs_scrub_context *sc, xfs_ino_t ino, - struct xfs_buf *bp); +void xfs_scrub_ino_set_warning(struct xfs_scrub_context *sc, xfs_ino_t ino); void xfs_scrub_fblock_set_warning(struct xfs_scrub_context *sc, int whichfork, xfs_fileoff_t offset); diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 50b6a26b0299..38f29806eb54 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -781,7 +781,7 @@ xfs_scrub_directory( /* Plausible size? 
*/ if (sc->ip->i_d.di_size < xfs_dir2_sf_hdr_size(0)) { - xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL); + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino); goto out; } diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 63ab3f98430d..106ca4bd753f 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -259,7 +259,8 @@ xfs_scrub_iallocbt_check_freemask( error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &bp, 0, 0); - if (!xfs_scrub_btree_process_error(bs->sc, bs->cur, 0, &error)) + if (!xfs_scrub_btree_xref_process_error(bs->sc, bs->cur, 0, + &error)) continue; /* Which inodes are free? */ @@ -433,7 +434,7 @@ xfs_scrub_iallocbt_xref_rmap_inodes( if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur)) return; if (blocks != inode_blocks) - xfs_scrub_btree_set_corrupt(sc, sc->sa.ino_cur, 0); + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); } /* Scrub the inode btrees for some AG. */ diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 21297bef8df1..df14930e4fc5 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -89,67 +89,21 @@ out: /* Inode core */ -/* - * Validate di_extsize hint. - * - * The rules are documented at xfs_ioctl_setattr_check_extsize(). - * These functions must be kept in sync with each other. - */ +/* Validate di_extsize hint. */ STATIC void xfs_scrub_inode_extsize( struct xfs_scrub_context *sc, - struct xfs_buf *bp, struct xfs_dinode *dip, xfs_ino_t ino, uint16_t mode, uint16_t flags) { - struct xfs_mount *mp = sc->mp; - bool rt_flag; - bool hint_flag; - bool inherit_flag; - uint32_t extsize; - uint32_t extsize_bytes; - uint32_t blocksize_bytes; - - rt_flag = (flags & XFS_DIFLAG_REALTIME); - hint_flag = (flags & XFS_DIFLAG_EXTSIZE); - inherit_flag = (flags & XFS_DIFLAG_EXTSZINHERIT); - extsize = be32_to_cpu(dip->di_extsize); - extsize_bytes = XFS_FSB_TO_B(sc->mp, extsize); - - if (rt_flag) - blocksize_bytes = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog; - else - blocksize_bytes = mp->m_sb.sb_blocksize; - - if ((hint_flag || inherit_flag) && !(S_ISDIR(mode) || S_ISREG(mode))) - goto bad; - - if (hint_flag && !S_ISREG(mode)) - goto bad; - - if (inherit_flag && !S_ISDIR(mode)) - goto bad; - - if ((hint_flag || inherit_flag) && extsize == 0) - goto bad; - - if (!(hint_flag || inherit_flag) && extsize != 0) - goto bad; - - if (extsize_bytes % blocksize_bytes) - goto bad; - - if (extsize > MAXEXTLEN) - goto bad; + xfs_failaddr_t fa; - if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2) - goto bad; - - return; -bad: - xfs_scrub_ino_set_corrupt(sc, ino, bp); + fa = xfs_inode_validate_extsize(sc->mp, be32_to_cpu(dip->di_extsize), + mode, flags); + if (fa) + xfs_scrub_ino_set_corrupt(sc, ino); } /* @@ -161,58 +115,25 @@ bad: STATIC void xfs_scrub_inode_cowextsize( struct xfs_scrub_context *sc, - struct xfs_buf *bp, struct xfs_dinode *dip, xfs_ino_t ino, uint16_t mode, uint16_t flags, uint64_t flags2) { - struct xfs_mount *mp = sc->mp; - bool rt_flag; - bool hint_flag; - uint32_t extsize; - uint32_t extsize_bytes; - - rt_flag = (flags & XFS_DIFLAG_REALTIME); - hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE); - extsize = be32_to_cpu(dip->di_cowextsize); - extsize_bytes = XFS_FSB_TO_B(sc->mp, extsize); - - if (hint_flag && !xfs_sb_version_hasreflink(&mp->m_sb)) - goto bad; - - if (hint_flag && !(S_ISDIR(mode) || S_ISREG(mode))) - goto bad; - - if (hint_flag && extsize == 0) - goto bad; - - if (!hint_flag && extsize != 0) - goto bad; - - if (hint_flag && rt_flag) - goto bad; - - if (extsize_bytes % 
mp->m_sb.sb_blocksize) - goto bad; - - if (extsize > MAXEXTLEN) - goto bad; - - if (extsize > mp->m_sb.sb_agblocks / 2) - goto bad; + xfs_failaddr_t fa; - return; -bad: - xfs_scrub_ino_set_corrupt(sc, ino, bp); + fa = xfs_inode_validate_cowextsize(sc->mp, + be32_to_cpu(dip->di_cowextsize), mode, flags, + flags2); + if (fa) + xfs_scrub_ino_set_corrupt(sc, ino); } /* Make sure the di_flags make sense for the inode. */ STATIC void xfs_scrub_inode_flags( struct xfs_scrub_context *sc, - struct xfs_buf *bp, struct xfs_dinode *dip, xfs_ino_t ino, uint16_t mode, @@ -251,14 +172,13 @@ xfs_scrub_inode_flags( return; bad: - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); } /* Make sure the di_flags2 make sense for the inode. */ STATIC void xfs_scrub_inode_flags2( struct xfs_scrub_context *sc, - struct xfs_buf *bp, struct xfs_dinode *dip, xfs_ino_t ino, uint16_t mode, @@ -295,14 +215,13 @@ xfs_scrub_inode_flags2( return; bad: - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); } /* Scrub all the ondisk inode fields. */ STATIC void xfs_scrub_dinode( struct xfs_scrub_context *sc, - struct xfs_buf *bp, struct xfs_dinode *dip, xfs_ino_t ino) { @@ -333,7 +252,7 @@ xfs_scrub_dinode( /* mode is recognized */ break; default: - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); break; } @@ -344,22 +263,22 @@ xfs_scrub_dinode( * We autoconvert v1 inodes into v2 inodes on writeout, * so just mark this inode for preening. */ - xfs_scrub_ino_set_preen(sc, ino, bp); + xfs_scrub_ino_set_preen(sc, ino); break; case 2: case 3: if (dip->di_onlink != 0) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); if (dip->di_mode == 0 && sc->ip) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); if (dip->di_projid_hi != 0 && !xfs_sb_version_hasprojid32bit(&mp->m_sb)) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); break; default: - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); return; } @@ -369,40 +288,40 @@ xfs_scrub_dinode( */ if (dip->di_uid == cpu_to_be32(-1U) || dip->di_gid == cpu_to_be32(-1U)) - xfs_scrub_ino_set_warning(sc, ino, bp); + xfs_scrub_ino_set_warning(sc, ino); /* di_format */ switch (dip->di_format) { case XFS_DINODE_FMT_DEV: if (!S_ISCHR(mode) && !S_ISBLK(mode) && !S_ISFIFO(mode) && !S_ISSOCK(mode)) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); break; case XFS_DINODE_FMT_LOCAL: if (!S_ISDIR(mode) && !S_ISLNK(mode)) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); break; case XFS_DINODE_FMT_EXTENTS: if (!S_ISREG(mode) && !S_ISDIR(mode) && !S_ISLNK(mode)) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); break; case XFS_DINODE_FMT_BTREE: if (!S_ISREG(mode) && !S_ISDIR(mode)) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); break; case XFS_DINODE_FMT_UUID: default: - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); break; } /* di_[amc]time.nsec */ if (be32_to_cpu(dip->di_atime.t_nsec) >= NSEC_PER_SEC) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); if (be32_to_cpu(dip->di_mtime.t_nsec) >= NSEC_PER_SEC) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); if (be32_to_cpu(dip->di_ctime.t_nsec) >= NSEC_PER_SEC) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); /* * di_size. 
xfs_dinode_verify checks for things that screw up @@ -411,19 +330,19 @@ xfs_scrub_dinode( */ isize = be64_to_cpu(dip->di_size); if (isize & (1ULL << 63)) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); /* Devices, fifos, and sockets must have zero size */ if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); /* Directories can't be larger than the data section size (32G) */ if (S_ISDIR(mode) && (isize == 0 || isize >= XFS_DIR2_SPACE_SIZE)) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); /* Symlinks can't be larger than SYMLINK_MAXLEN */ if (S_ISLNK(mode) && (isize == 0 || isize >= XFS_SYMLINK_MAXLEN)) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); /* * Warn if the running kernel can't handle the kinds of offsets @@ -432,7 +351,7 @@ xfs_scrub_dinode( * overly large offsets, flag the inode for admin review. */ if (isize >= mp->m_super->s_maxbytes) - xfs_scrub_ino_set_warning(sc, ino, bp); + xfs_scrub_ino_set_warning(sc, ino); /* di_nblocks */ if (flags2 & XFS_DIFLAG2_REFLINK) { @@ -447,15 +366,15 @@ xfs_scrub_dinode( */ if (be64_to_cpu(dip->di_nblocks) >= mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); } else { if (be64_to_cpu(dip->di_nblocks) >= mp->m_sb.sb_dblocks) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); } - xfs_scrub_inode_flags(sc, bp, dip, ino, mode, flags); + xfs_scrub_inode_flags(sc, dip, ino, mode, flags); - xfs_scrub_inode_extsize(sc, bp, dip, ino, mode, flags); + xfs_scrub_inode_extsize(sc, dip, ino, mode, flags); /* di_nextents */ nextents = be32_to_cpu(dip->di_nextents); @@ -463,31 +382,31 @@ xfs_scrub_dinode( switch (dip->di_format) { case XFS_DINODE_FMT_EXTENTS: if (nextents > fork_recs) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); break; case XFS_DINODE_FMT_BTREE: if (nextents <= fork_recs) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); break; default: if (nextents != 0) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); break; } /* di_forkoff */ if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); if (dip->di_anextents != 0 && dip->di_forkoff == 0) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); if (dip->di_forkoff == 0 && dip->di_aformat != XFS_DINODE_FMT_EXTENTS) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); /* di_aformat */ if (dip->di_aformat != XFS_DINODE_FMT_LOCAL && dip->di_aformat != XFS_DINODE_FMT_EXTENTS && dip->di_aformat != XFS_DINODE_FMT_BTREE) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); /* di_anextents */ nextents = be16_to_cpu(dip->di_anextents); @@ -495,92 +414,26 @@ xfs_scrub_dinode( switch (dip->di_aformat) { case XFS_DINODE_FMT_EXTENTS: if (nextents > fork_recs) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); break; case XFS_DINODE_FMT_BTREE: if (nextents <= fork_recs) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); break; default: if (nextents != 0) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); } if (dip->di_version >= 3) { if (be32_to_cpu(dip->di_crtime.t_nsec) >= 
NSEC_PER_SEC) - xfs_scrub_ino_set_corrupt(sc, ino, bp); - xfs_scrub_inode_flags2(sc, bp, dip, ino, mode, flags, flags2); - xfs_scrub_inode_cowextsize(sc, bp, dip, ino, mode, flags, + xfs_scrub_ino_set_corrupt(sc, ino); + xfs_scrub_inode_flags2(sc, dip, ino, mode, flags, flags2); + xfs_scrub_inode_cowextsize(sc, dip, ino, mode, flags, flags2); } } -/* Map and read a raw inode. */ -STATIC int -xfs_scrub_inode_map_raw( - struct xfs_scrub_context *sc, - xfs_ino_t ino, - struct xfs_buf **bpp, - struct xfs_dinode **dipp) -{ - struct xfs_imap imap; - struct xfs_mount *mp = sc->mp; - struct xfs_buf *bp = NULL; - struct xfs_dinode *dip; - int error; - - error = xfs_imap(mp, sc->tp, ino, &imap, XFS_IGET_UNTRUSTED); - if (error == -EINVAL) { - /* - * Inode could have gotten deleted out from under us; - * just forget about it. - */ - error = -ENOENT; - goto out; - } - if (!xfs_scrub_process_error(sc, XFS_INO_TO_AGNO(mp, ino), - XFS_INO_TO_AGBNO(mp, ino), &error)) - goto out; - - error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, - imap.im_blkno, imap.im_len, XBF_UNMAPPED, &bp, - NULL); - if (!xfs_scrub_process_error(sc, XFS_INO_TO_AGNO(mp, ino), - XFS_INO_TO_AGBNO(mp, ino), &error)) - goto out; - - /* - * Is this really an inode? We disabled verifiers in the above - * xfs_trans_read_buf call because the inode buffer verifier - * fails on /any/ inode record in the inode cluster with a bad - * magic or version number, not just the one that we're - * checking. Therefore, grab the buffer unconditionally, attach - * the inode verifiers by hand, and run the inode verifier only - * on the one inode we want. - */ - bp->b_ops = &xfs_inode_buf_ops; - dip = xfs_buf_offset(bp, imap.im_boffset); - if (xfs_dinode_verify(mp, ino, dip) != NULL || - !xfs_dinode_good_version(mp, dip->di_version)) { - xfs_scrub_ino_set_corrupt(sc, ino, bp); - goto out_buf; - } - - /* ...and is it the one we asked for? */ - if (be32_to_cpu(dip->di_gen) != sc->sm->sm_gen) { - error = -ENOENT; - goto out_buf; - } - - *dipp = dip; - *bpp = bp; -out: - return error; -out_buf: - xfs_trans_brelse(sc->tp, bp); - return error; -} - /* * Make sure the finobt doesn't think this inode is free. * We don't have to check the inobt ourselves because we got the inode via @@ -645,18 +498,18 @@ xfs_scrub_inode_xref_bmap( if (!xfs_scrub_should_check_xref(sc, &error, NULL)) return; if (nextents < be32_to_cpu(dip->di_nextents)) - xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino, NULL); + xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino); error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, &nextents, &acount); if (!xfs_scrub_should_check_xref(sc, &error, NULL)) return; if (nextents != be16_to_cpu(dip->di_anextents)) - xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino, NULL); + xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino); /* Check nblocks against the inode. */ if (count + acount != be64_to_cpu(dip->di_nblocks)) - xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino, NULL); + xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino); } /* Cross-reference with the other btrees. 
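 *
 * The count cross-check from xfs_scrub_inode_xref_bmap above in
 * standalone form. The walked in-core extent count may legitimately
 * exceed the on-disk one (e.g. unflushed delalloc extents), so that
 * comparison is one-sided, while the block total must match exactly
 * (toy counters; an illustration only):
 */

#include <stdbool.h>
#include <stdint.h>

struct toy_counts {
	uint64_t	nextents;
	uint64_t	nblocks;
};

static bool
toy_inode_counts_ok(
	struct toy_counts	walked,
	struct toy_counts	ondisk)
{
	if (walked.nextents < ondisk.nextents)
		return false;
	return walked.nblocks == ondisk.nblocks;
}

/*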
*/ @@ -700,8 +553,7 @@ xfs_scrub_inode_xref( static void xfs_scrub_inode_check_reflink_iflag( struct xfs_scrub_context *sc, - xfs_ino_t ino, - struct xfs_buf *bp) + xfs_ino_t ino) { struct xfs_mount *mp = sc->mp; bool has_shared; @@ -716,9 +568,9 @@ xfs_scrub_inode_check_reflink_iflag( XFS_INO_TO_AGBNO(mp, ino), &error)) return; if (xfs_is_reflink_inode(sc->ip) && !has_shared) - xfs_scrub_ino_set_preen(sc, ino, bp); + xfs_scrub_ino_set_preen(sc, ino); else if (!xfs_is_reflink_inode(sc->ip) && has_shared) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); } /* Scrub an inode. */ @@ -727,43 +579,33 @@ xfs_scrub_inode( struct xfs_scrub_context *sc) { struct xfs_dinode di; - struct xfs_buf *bp = NULL; - struct xfs_dinode *dip; - xfs_ino_t ino; int error = 0; - /* Did we get the in-core inode, or are we doing this manually? */ - if (sc->ip) { - ino = sc->ip->i_ino; - xfs_inode_to_disk(sc->ip, &di, 0); - dip = &di; - } else { - /* Map & read inode. */ - ino = sc->sm->sm_ino; - error = xfs_scrub_inode_map_raw(sc, ino, &bp, &dip); - if (error || !bp) - goto out; + /* + * If sc->ip is NULL, that means that the setup function called + * xfs_iget to look up the inode. xfs_iget returned a EFSCORRUPTED + * and a NULL inode, so flag the corruption error and return. + */ + if (!sc->ip) { + xfs_scrub_ino_set_corrupt(sc, sc->sm->sm_ino); + return 0; } - xfs_scrub_dinode(sc, bp, dip, ino); + /* Scrub the inode core. */ + xfs_inode_to_disk(sc->ip, &di, 0); + xfs_scrub_dinode(sc, &di, sc->ip->i_ino); if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) goto out; - /* Now let's do the things that require a live inode. */ - if (!sc->ip) - goto out; - /* * Look for discrepancies between file's data blocks and the reflink * iflag. We already checked the iflag against the file mode when * we scrubbed the dinode. */ if (S_ISREG(VFS_I(sc->ip)->i_mode)) - xfs_scrub_inode_check_reflink_iflag(sc, ino, bp); + xfs_scrub_inode_check_reflink_iflag(sc, sc->ip->i_ino); - xfs_scrub_inode_xref(sc, ino, dip); + xfs_scrub_inode_xref(sc, sc->ip->i_ino, &di); out: - if (bp) - xfs_trans_brelse(sc->tp, bp); return error; } diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 0d3851410c74..1fb88c18d455 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -167,8 +167,18 @@ xfs_scrub_parent_validate( * if the parent pointer erroneously points to a file, we * can't use DONTCACHE here because DONTCACHE inodes can trigger * immediate inactive cleanup of the inode. + * + * If _iget returns -EINVAL then the parent inode number is garbage + * and the directory is corrupt. If the _iget returns -EFSCORRUPTED + * or -EFSBADCRC then the parent is corrupt which is a cross + * referencing error. Any other error is an operational error. */ - error = xfs_iget(mp, sc->tp, dnum, 0, 0, &dp); + error = xfs_iget(mp, sc->tp, dnum, XFS_IGET_UNTRUSTED, 0, &dp); + if (error == -EINVAL) { + error = -EFSCORRUPTED; + xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error); + goto out; + } if (!xfs_scrub_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) goto out; if (dp == sc->ip || !S_ISDIR(VFS_I(dp)->i_mode)) { diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 51daa4ae2627..6ba465e6c885 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -219,7 +219,7 @@ xfs_scrub_quota( /* Look for problem extents. 
*/ xfs_ilock(ip, XFS_ILOCK_EXCL); if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) { - xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL); + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino); goto out_unlock_inode; } max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk; diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 26390991369a..39c41dfe08ee 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -116,8 +116,7 @@ xfs_scrub_xref_is_used_rt_space( if (!xfs_scrub_should_check_xref(sc, &error, NULL)) goto out_unlock; if (is_free) - xfs_scrub_ino_xref_set_corrupt(sc, sc->mp->m_rbmip->i_ino, - NULL); + xfs_scrub_ino_xref_set_corrupt(sc, sc->mp->m_rbmip->i_ino); out_unlock: xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); } diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 4dc896852bf0..5d2b1c241be5 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -174,53 +174,32 @@ DEFINE_SCRUB_BLOCK_ERROR_EVENT(xfs_scrub_block_error); DEFINE_SCRUB_BLOCK_ERROR_EVENT(xfs_scrub_block_preen); DECLARE_EVENT_CLASS(xfs_scrub_ino_error_class, - TP_PROTO(struct xfs_scrub_context *sc, xfs_ino_t ino, xfs_daddr_t daddr, - void *ret_ip), - TP_ARGS(sc, ino, daddr, ret_ip), + TP_PROTO(struct xfs_scrub_context *sc, xfs_ino_t ino, void *ret_ip), + TP_ARGS(sc, ino, ret_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(unsigned int, type) - __field(xfs_agnumber_t, agno) - __field(xfs_agblock_t, bno) __field(void *, ret_ip) ), TP_fast_assign( - xfs_fsblock_t fsbno; - xfs_agnumber_t agno; - xfs_agblock_t bno; - - if (daddr) { - fsbno = XFS_DADDR_TO_FSB(sc->mp, daddr); - agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); - bno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); - } else { - agno = XFS_INO_TO_AGNO(sc->mp, ino); - bno = XFS_AGINO_TO_AGBNO(sc->mp, - XFS_INO_TO_AGINO(sc->mp, ino)); - } - __entry->dev = sc->mp->m_super->s_dev; __entry->ino = ino; __entry->type = sc->sm->sm_type; - __entry->agno = agno; - __entry->bno = bno; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx type %u agno %u agbno %u ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx type %u ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->type, - __entry->agno, - __entry->bno, __entry->ret_ip) ) #define DEFINE_SCRUB_INO_ERROR_EVENT(name) \ DEFINE_EVENT(xfs_scrub_ino_error_class, name, \ TP_PROTO(struct xfs_scrub_context *sc, xfs_ino_t ino, \ - xfs_daddr_t daddr, void *ret_ip), \ - TP_ARGS(sc, ino, daddr, ret_ip)) + void *ret_ip), \ + TP_ARGS(sc, ino, ret_ip)) DEFINE_SCRUB_INO_ERROR_EVENT(xfs_scrub_ino_error); DEFINE_SCRUB_INO_ERROR_EVENT(xfs_scrub_ino_preen); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 9c6a830da0ee..31f1f10eecd1 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -209,7 +209,8 @@ xfs_setfilesize_trans_alloc( struct xfs_trans *tp; int error; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, + XFS_TRANS_NOFS, &tp); if (error) return error; @@ -1330,21 +1331,20 @@ xfs_get_blocks( end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, - &imap, &nimaps, XFS_BMAPI_ENTIRE); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, + &nimaps, 0); if (error) goto out_unlock; - - if (nimaps) { - trace_xfs_get_blocks_found(ip, offset, size, - imap.br_state == XFS_EXT_UNWRITTEN ? 
- XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap); - xfs_iunlock(ip, lockmode); - } else { + if (!nimaps) { trace_xfs_get_blocks_notfound(ip, offset, size); goto out_unlock; } + trace_xfs_get_blocks_found(ip, offset, size, + imap.br_state == XFS_EXT_UNWRITTEN ? + XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap); + xfs_iunlock(ip, lockmode); + /* trim mapping down to size requested */ xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size); @@ -1390,7 +1390,7 @@ xfs_vm_bmap( /* * The swap code (ab-)uses ->bmap to get a block mapping and then - * bypasseÑ• the file system for actual I/O. We really can't allow + * bypasses the file system for actual I/O. We really can't allow * that on reflinks inodes, so we have to skip out here. And yes, * 0 is the magic code for a bmap error. * diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index c83f549dc17b..05dee8fdd895 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1208,18 +1208,15 @@ xfs_free_file_space( /* * Now that we've unmap all full blocks we'll have to zero out any - * partial block at the beginning and/or end. xfs_zero_range is - * smart enough to skip any holes, including those we just created, - * but we must take care not to zero beyond EOF and enlarge i_size. + * partial block at the beginning and/or end. iomap_zero_range is smart + * enough to skip any holes, including those we just created, but we + * must take care not to zero beyond EOF and enlarge i_size. */ - if (offset >= XFS_ISIZE(ip)) return 0; - if (offset + len > XFS_ISIZE(ip)) len = XFS_ISIZE(ip) - offset; - - return xfs_zero_range(ip, offset, len, NULL); + return iomap_zero_range(VFS_I(ip), offset, len, NULL, &xfs_iomap_ops); } /* @@ -1899,17 +1896,28 @@ xfs_swap_extents( * performed with log redo items! */ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + int w = XFS_DATA_FORK; + uint32_t ipnext = XFS_IFORK_NEXTENTS(ip, w); + uint32_t tipnext = XFS_IFORK_NEXTENTS(tip, w); + + /* + * Conceptually this shouldn't affect the shape of either bmbt, + * but since we atomically move extents one by one, we reserve + * enough space to rebuild both trees. + */ + resblks = XFS_SWAP_RMAP_SPACE_RES(mp, ipnext, w); + resblks += XFS_SWAP_RMAP_SPACE_RES(mp, tipnext, w); + /* - * Conceptually this shouldn't affect the shape of either - * bmbt, but since we atomically move extents one by one, - * we reserve enough space to rebuild both trees. + * Handle the corner case where either inode might straddle the + * btree format boundary. If so, the inode could bounce between + * btree <-> extent format on unmap -> remap cycles, freeing and + * allocating a bmapbt block each time. 
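With the xfs_zero_range() wrapper on its way out (see the xfs_file.c hunks further down), xfs_free_file_space() now calls iomap_zero_range() directly. The clamp-then-zero logic, condensed into a standalone sketch using the names from the hunk above:

static int example_zero_partial_blocks(struct xfs_inode *ip,
				       xfs_off_t offset, xfs_off_t len)
{
	/* Entirely beyond EOF: nothing to zero. */
	if (offset >= XFS_ISIZE(ip))
		return 0;
	/* Never zero past EOF, or i_size would effectively grow. */
	if (offset + len > XFS_ISIZE(ip))
		len = XFS_ISIZE(ip) - offset;
	/* iomap_zero_range() skips holes and unwritten extents itself. */
	return iomap_zero_range(VFS_I(ip), offset, len, NULL, &xfs_iomap_ops);
}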
*/ - resblks = XFS_SWAP_RMAP_SPACE_RES(mp, - XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK), - XFS_DATA_FORK) + - XFS_SWAP_RMAP_SPACE_RES(mp, - XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK), - XFS_DATA_FORK); + if (ipnext == (XFS_IFORK_MAXEXT(ip, w) + 1)) + resblks += XFS_IFORK_MAXEXT(ip, w); + if (tipnext == (XFS_IFORK_MAXEXT(tip, w) + 1)) + resblks += XFS_IFORK_MAXEXT(tip, w); } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); if (error) @@ -2003,11 +2011,11 @@ xfs_swap_extents( ip->i_cowfp = tip->i_cowfp; tip->i_cowfp = cowfp; - if (ip->i_cowfp && ip->i_cnextents) + if (ip->i_cowfp && ip->i_cowfp->if_bytes) xfs_inode_set_cowblocks_tag(ip); else xfs_inode_clear_cowblocks_tag(ip); - if (tip->i_cowfp && tip->i_cnextents) + if (tip->i_cowfp && tip->i_cowfp->if_bytes) xfs_inode_set_cowblocks_tag(tip); else xfs_inode_clear_cowblocks_tag(tip); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d1da2ee9e6db..ac669a10c62f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1708,7 +1708,7 @@ xfs_buftarg_isolate( * zero. If the value is already zero, we need to reclaim the * buffer, otherwise it gets another trip through the LRU. */ - if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) { + if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) { spin_unlock(&bp->b_lock); return LRU_ROTATE; } diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 270ddb4d2313..82ad270e390e 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -460,7 +460,7 @@ xfs_buf_item_unpin( list_del_init(&bp->b_li_list); bp->b_iodone = NULL; } else { - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); ASSERT(bp->b_log_item == NULL); @@ -1057,12 +1057,12 @@ xfs_buf_do_callbacks_fail( lip = list_first_entry(&bp->b_li_list, struct xfs_log_item, li_bio_list); ailp = lip->li_ailp; - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { if (lip->li_ops->iop_error) lip->li_ops->iop_error(lip, bp); } - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); } static bool @@ -1226,7 +1226,7 @@ xfs_buf_iodone( * * Either way, AIL is useless if we're forcing a shutdown. */ - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); xfs_buf_item_free(BUF_ITEM(lip)); } @@ -1246,7 +1246,7 @@ xfs_buf_resubmit_failed_buffers( /* * Clear XFS_LI_FAILED flag from all items before resubmit * - * XFS_LI_FAILED set/clear is protected by xa_lock, caller this + * XFS_LI_FAILED set/clear is protected by ail_lock, caller this * function already have it acquired */ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 43572f8a1b8e..a7daef9e16bf 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -394,8 +394,6 @@ xfs_qm_dqalloc( error1: xfs_defer_cancel(&dfops); error0: - xfs_iunlock(quotip, XFS_ILOCK_EXCL); - return error; } @@ -920,7 +918,7 @@ xfs_qm_dqflush_done( (lip->li_flags & XFS_LI_FAILED))) { /* xfs_trans_ail_delete() drops the AIL lock. 
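The xfs_buf.c change above is a straight logic inversion: atomic_add_unless(v, -1, 0) performs the decrement and returns true unless v was already zero, so a successful decrement must mean "rotate" and only a fully drained count means "reclaim". The aging rule in isolation (helper name invented):

static bool example_age_buffer(atomic_t *lru_ref)
{
	/* Decrements and returns true unless the count was already zero. */
	if (atomic_add_unless(lru_ref, -1, 0))
		return false;	/* still referenced: rotate on the LRU */
	return true;		/* drained to zero: safe to reclaim */
}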
*/ - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); if (lip->li_lsn == qip->qli_flush_lsn) { xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); } else { @@ -930,7 +928,7 @@ xfs_qm_dqflush_done( */ if (lip->li_flags & XFS_LI_FAILED) xfs_clear_li_failed(lip); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); } } diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 96eaa6933709..4b331e354da7 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -157,8 +157,9 @@ xfs_dquot_item_error( STATIC uint xfs_qm_dquot_logitem_push( struct xfs_log_item *lip, - struct list_head *buffer_list) __releases(&lip->li_ailp->xa_lock) - __acquires(&lip->li_ailp->xa_lock) + struct list_head *buffer_list) + __releases(&lip->li_ailp->ail_lock) + __acquires(&lip->li_ailp->ail_lock) { struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; struct xfs_buf *bp = lip->li_buf; @@ -205,7 +206,7 @@ xfs_qm_dquot_logitem_push( goto out_unlock; } - spin_unlock(&lip->li_ailp->xa_lock); + spin_unlock(&lip->li_ailp->ail_lock); error = xfs_qm_dqflush(dqp, &bp); if (error) { @@ -217,7 +218,7 @@ xfs_qm_dquot_logitem_push( xfs_buf_relse(bp); } - spin_lock(&lip->li_ailp->xa_lock); + spin_lock(&lip->li_ailp->ail_lock); out_unlock: xfs_dqunlock(dqp); return rval; @@ -400,7 +401,7 @@ xfs_qm_qoffend_logitem_committed( * Delete the qoff-start logitem from the AIL. * xfs_trans_ail_delete() drops the AIL lock. */ - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR); kmem_free(qfs->qql_item.li_lv_shadow); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index ccf520f0b00d..a63f5083f497 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -347,27 +347,32 @@ xfs_corruption_error( * values, and omit the stack trace unless the error level is tuned high. */ void -xfs_verifier_error( +xfs_buf_verifier_error( struct xfs_buf *bp, int error, + const char *name, + void *buf, + size_t bufsz, xfs_failaddr_t failaddr) { struct xfs_mount *mp = bp->b_target->bt_mount; xfs_failaddr_t fa; + int sz; fa = failaddr ? failaddr : __return_address; __xfs_buf_ioerror(bp, error, fa); - xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx", + xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx %s", bp->b_error == -EFSBADCRC ? "CRC error" : "corruption", - fa, bp->b_ops->name, bp->b_bn); + fa, bp->b_ops->name, bp->b_bn, name); xfs_alert(mp, "Unmount and run xfs_repair"); if (xfs_error_level >= XFS_ERRLEVEL_LOW) { + sz = min_t(size_t, XFS_CORRUPTION_DUMP_LEN, bufsz); xfs_alert(mp, "First %d bytes of corrupted metadata buffer:", - XFS_CORRUPTION_DUMP_LEN); - xfs_hex_dump(xfs_buf_offset(bp, 0), XFS_CORRUPTION_DUMP_LEN); + sz); + xfs_hex_dump(buf, sz); } if (xfs_error_level >= XFS_ERRLEVEL_HIGH) @@ -375,6 +380,20 @@ xfs_verifier_error( } /* + * Warnings specifically for verifier errors. Differentiate CRC vs. invalid + * values, and omit the stack trace unless the error level is tuned high. + */ +void +xfs_verifier_error( + struct xfs_buf *bp, + int error, + xfs_failaddr_t failaddr) +{ + return xfs_buf_verifier_error(bp, error, "", xfs_buf_offset(bp, 0), + XFS_CORRUPTION_DUMP_LEN, failaddr); +} + +/* * Warnings for inode corruption problems. Don't bother with the stack * trace unless the error level is turned up high. 
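xfs_buf_verifier_error() above takes an explicit name/buffer/size triple so a verifier can report on a sub-object (say, one inode core inside a cluster buffer) rather than the whole buffer, and the hex dump is clamped to what the object actually contains. A runnable userspace model of that clamp:

#include <stdio.h>
#include <stddef.h>

#define DUMP_LIMIT 128	/* stand-in for XFS_CORRUPTION_DUMP_LEN */

static void dump_first_bytes(const unsigned char *buf, size_t bufsz)
{
	/* Clamp like min_t(): never read past the object being reported. */
	size_t sz = bufsz < DUMP_LIMIT ? bufsz : DUMP_LIMIT;

	printf("First %zu bytes of corrupted metadata buffer:\n", sz);
	for (size_t i = 0; i < sz; i++)
		printf("%02x%c", buf[i], (i + 1) % 16 ? ' ' : '\n');
	if (sz % 16)
		putchar('\n');
}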
*/ diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 7e728c5a46b8..ce391349e78b 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -26,6 +26,9 @@ extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, extern void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp, void *p, const char *filename, int linenum, xfs_failaddr_t failaddr); +extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error, + const char *name, void *buf, size_t bufsz, + xfs_failaddr_t failaddr); extern void xfs_verifier_error(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error, diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index fe1bfee35898..eed698aa9f16 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -122,7 +122,7 @@ xfs_nfs_get_inode( struct super_block *sb, u64 ino, u32 generation) - { +{ xfs_mount_t *mp = XFS_M(sb); xfs_inode_t *ip; int error; @@ -237,7 +237,7 @@ xfs_fs_nfs_commit_metadata( if (!lsn) return 0; - return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } const struct export_operations xfs_export_operations = { diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 77760dbf0242..13e3d1a69e76 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -611,10 +611,9 @@ xfs_extent_busy_flush( unsigned busy_gen) { DEFINE_WAIT (wait); - int log_flushed = 0, error; + int error; - trace_xfs_log_force(mp, 0, _THIS_IP_); - error = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed); + error = xfs_log_force(mp, XFS_LOG_SYNC); if (error) return; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 9ea08326f876..299aee4b7b0b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -48,20 +48,6 @@ static const struct vm_operations_struct xfs_file_vm_ops; -/* - * Clear the specified ranges to zero through either the pagecache or DAX. - * Holes and unwritten extents will be left as-is as they already are zeroed. - */ -int -xfs_zero_range( - struct xfs_inode *ip, - xfs_off_t pos, - xfs_off_t count, - bool *did_zero) -{ - return iomap_zero_range(VFS_I(ip), pos, count, did_zero, &xfs_iomap_ops); -} - int xfs_update_prealloc_flags( struct xfs_inode *ip, @@ -122,7 +108,7 @@ xfs_dir_fsync( if (!lsn) return 0; - return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } STATIC int @@ -182,7 +168,7 @@ xfs_file_fsync( } if (lsn) { - error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); + error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); ip->i_itemp->ili_fsync_fields = 0; } xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -301,31 +287,6 @@ xfs_file_read_iter( } /* - * Zero any on disk space between the current EOF and the new, larger EOF. - * - * This handles the normal case of zeroing the remainder of the last block in - * the file and the unusual case of zeroing blocks out beyond the size of the - * file. This second case only happens with fixed size extents and when the - * system crashes before the inode size was updated but after blocks were - * allocated. - * - * Expects the iolock to be held exclusive, and will take the ilock internally. 
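The extent-busy hunk above shows the caller side of the log-force consolidation carried out in the xfs_log.c and xfs_log.h hunks below: the underscore-prefixed variants disappear and the surviving entry points return int. A hypothetical caller under the new API:

static void example_flush_busy_extents(struct xfs_mount *mp)
{
	/* Synchronous force; nonzero means the log is being shut down. */
	if (xfs_log_force(mp, XFS_LOG_SYNC))
		return;
	/* ... busy extents can be reused once the force completes ... */
}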
- */ -int /* error (positive) */ -xfs_zero_eof( - struct xfs_inode *ip, - xfs_off_t offset, /* starting I/O offset */ - xfs_fsize_t isize, /* current inode size */ - bool *did_zeroing) -{ - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - ASSERT(offset > isize); - - trace_xfs_zero_eof(ip, isize, offset - isize); - return xfs_zero_range(ip, isize, offset - isize, did_zeroing); -} - -/* * Common pre-write limit and setup checks. * * Called with the iolocked held either shared and exclusive according to @@ -344,6 +305,7 @@ xfs_file_aio_write_checks( ssize_t error = 0; size_t count = iov_iter_count(from); bool drained_dio = false; + loff_t isize; restart: error = generic_write_checks(iocb, from); @@ -380,7 +342,8 @@ restart: * and hence be able to correctly determine if we need to run zeroing. */ spin_lock(&ip->i_flags_lock); - if (iocb->ki_pos > i_size_read(inode)) { + isize = i_size_read(inode); + if (iocb->ki_pos > isize) { spin_unlock(&ip->i_flags_lock); if (!drained_dio) { if (*iolock == XFS_IOLOCK_SHARED) { @@ -401,7 +364,10 @@ restart: drained_dio = true; goto restart; } - error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), NULL); + + trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); + error = iomap_zero_range(inode, isize, iocb->ki_pos - isize, + NULL, &xfs_iomap_ops); if (error) return error; } else diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 8b4545623e25..523792768080 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -217,7 +217,7 @@ xfs_growfs_data_private( } agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); - for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) + for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++) agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); error = xfs_bwrite(bp); diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index d53a316162d6..9a18f69f6e96 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -483,7 +483,28 @@ xfs_iget_cache_miss( trace_xfs_iget_miss(ip); - if ((VFS_I(ip)->i_mode == 0) && !(flags & XFS_IGET_CREATE)) { + + /* + * If we are allocating a new inode, then check what was returned is + * actually a free, empty inode. If we are not allocating an inode, + * the check we didn't find a free inode. + */ + if (flags & XFS_IGET_CREATE) { + if (VFS_I(ip)->i_mode != 0) { + xfs_warn(mp, +"Corruption detected! Free inode 0x%llx not marked free on disk", + ino); + error = -EFSCORRUPTED; + goto out_destroy; + } + if (ip->i_d.di_nblocks != 0) { + xfs_warn(mp, +"Corruption detected! Free inode 0x%llx has blocks allocated!", + ino); + error = -EFSCORRUPTED; + goto out_destroy; + } + } else if (VFS_I(ip)->i_mode == 0) { error = -ENOENT; goto out_destroy; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 604ee384a00a..3e3aab3888fa 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1872,6 +1872,7 @@ xfs_inactive( xfs_inode_t *ip) { struct xfs_mount *mp; + struct xfs_ifork *cow_ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); int error; int truncate = 0; @@ -1892,6 +1893,10 @@ xfs_inactive( if (mp->m_flags & XFS_MOUNT_RDONLY) return; + /* Try to clean out the cow blocks if there are any. 
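The xfs_icache.c hunk above tightens xfs_iget_cache_miss(): when allocating, the on-disk inode we landed on must really be free and empty; when merely looking up, a free inode just means ENOENT. The same checks factored into a predicate (helper name invented):

static int example_check_miss(struct xfs_mount *mp, struct xfs_inode *ip,
			      xfs_ino_t ino, unsigned int flags)
{
	if (flags & XFS_IGET_CREATE) {
		/* Allocating: the slot must be genuinely free and empty. */
		if (VFS_I(ip)->i_mode != 0 || ip->i_d.di_nblocks != 0) {
			xfs_warn(mp,
	"Corruption detected! Inode 0x%llx not free on disk", ino);
			return -EFSCORRUPTED;
		}
		return 0;
	}
	/* Plain lookup: a free inode simply does not exist. */
	return VFS_I(ip)->i_mode == 0 ? -ENOENT : 0;
}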
*/ + if (xfs_is_reflink_inode(ip) && cow_ifp->if_bytes > 0) + xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); + if (VFS_I(ip)->i_nlink != 0) { /* * force is true because we are evicting an inode from the @@ -2470,6 +2475,10 @@ xfs_ifree( ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + + /* Don't attempt to replay owner changes for a deleted inode */ + ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER); + /* * Bump the generation count so no one will be confused * by reincarnations of this inode. @@ -2497,7 +2506,7 @@ xfs_iunpin( trace_xfs_inode_unpin_nowait(ip, _RET_IP_); /* Give the log a push to start the unpinning I/O */ - xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); + xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0, NULL); } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 3e8dc990d41c..132d8aa2afc4 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -443,10 +443,6 @@ enum xfs_prealloc_flags { int xfs_update_prealloc_flags(struct xfs_inode *ip, enum xfs_prealloc_flags flags); -int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, - xfs_fsize_t isize, bool *did_zeroing); -int xfs_zero_range(struct xfs_inode *ip, xfs_off_t pos, xfs_off_t count, - bool *did_zero); /* from xfs_iops.c */ extern void xfs_setup_inode(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index d5037f060d6f..34b91b789702 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -502,8 +502,8 @@ STATIC uint xfs_inode_item_push( struct xfs_log_item *lip, struct list_head *buffer_list) - __releases(&lip->li_ailp->xa_lock) - __acquires(&lip->li_ailp->xa_lock) + __releases(&lip->li_ailp->ail_lock) + __acquires(&lip->li_ailp->ail_lock) { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; @@ -562,7 +562,7 @@ xfs_inode_item_push( ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); - spin_unlock(&lip->li_ailp->xa_lock); + spin_unlock(&lip->li_ailp->ail_lock); error = xfs_iflush(ip, &bp); if (!error) { @@ -571,7 +571,7 @@ xfs_inode_item_push( xfs_buf_relse(bp); } - spin_lock(&lip->li_ailp->xa_lock); + spin_lock(&lip->li_ailp->ail_lock); out_unlock: xfs_iunlock(ip, XFS_ILOCK_SHARED); return rval; @@ -579,9 +579,6 @@ out_unlock: /* * Unlock the inode associated with the inode log item. - * Clear the fields of the inode and inode log item that - * are specific to the current transaction. If the - * hold flags is set, do not unlock the inode. */ STATIC void xfs_inode_item_unlock( @@ -637,10 +634,6 @@ xfs_inode_item_committed( return lsn; } -/* - * XXX rcc - this one really has to do something. Probably needs - * to stamp in a new field in the incore inode. 
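Clearing XFS_ILOG_DOWNER/XFS_ILOG_AOWNER in xfs_ifree() above pairs with the guard added to xlog_recover_inode_pass2() further down: owner changes logged for an extent swap must never be replayed against an inode that was subsequently freed. Both sides of that invariant, condensed (helper names invented):

static void example_ifree_clear_owner_bits(struct xfs_inode_log_item *iip)
{
	/* Runtime side: a freed inode carries no owner-change intent. */
	iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER);
}

static bool example_replay_owner_change(struct xfs_dinode *dip,
					unsigned int fields)
{
	/* Recovery side: skip the replay once the inode was deleted. */
	return (fields & (XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)) &&
	       dip->di_mode != 0;
}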
- */ STATIC void xfs_inode_item_committing( struct xfs_log_item *lip, @@ -759,7 +752,7 @@ xfs_iflush_done( bool mlip_changed = false; /* this is an opencoded batch version of xfs_trans_ail_delete */ - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); list_for_each_entry(blip, &tmp, li_bio_list) { if (INODE_ITEM(blip)->ili_logged && blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) @@ -770,15 +763,15 @@ xfs_iflush_done( } if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) - xlog_assign_tail_lsn_locked(ailp->xa_mount); - if (list_empty(&ailp->xa_ail)) - wake_up_all(&ailp->xa_empty); + if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount)) + xlog_assign_tail_lsn_locked(ailp->ail_mount); + if (list_empty(&ailp->ail_head)) + wake_up_all(&ailp->ail_empty); } - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); if (mlip_changed) - xfs_log_space_wake(ailp->xa_mount); + xfs_log_space_wake(ailp->ail_mount); } /* diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 66e1edbfb2b2..046469fcc1b8 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -955,15 +955,29 @@ static inline bool imap_needs_alloc(struct inode *inode, (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN); } +static inline bool needs_cow_for_zeroing(struct xfs_bmbt_irec *imap, int nimaps) +{ + return nimaps && + imap->br_startblock != HOLESTARTBLOCK && + imap->br_state != XFS_EXT_UNWRITTEN; +} + static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags) { /* - * COW writes will allocate delalloc space, so we need to make sure - * to take the lock exclusively here. + * COW writes may allocate delalloc space or convert unwritten COW + * extents, so we need to make sure to take the lock exclusively here. */ if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO))) return true; - if ((flags & IOMAP_DIRECT) && (flags & IOMAP_WRITE)) + + /* + * Extents not yet cached requires exclusive access, don't block. + * This is an opencoded xfs_ilock_data_map_shared() to cater for the + * non-blocking behaviour. + */ + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && + !(ip->i_df.if_flags & XFS_IFEXTENTS)) return true; return false; } @@ -993,16 +1007,18 @@ xfs_file_iomap_begin( return xfs_file_iomap_begin_delay(inode, offset, length, iomap); } - if (need_excl_ilock(ip, flags)) { + if (need_excl_ilock(ip, flags)) lockmode = XFS_ILOCK_EXCL; - xfs_ilock(ip, XFS_ILOCK_EXCL); - } else { - lockmode = xfs_ilock_data_map_shared(ip); - } + else + lockmode = XFS_ILOCK_SHARED; - if ((flags & IOMAP_NOWAIT) && !(ip->i_df.if_flags & XFS_IFEXTENTS)) { - error = -EAGAIN; - goto out_unlock; + if (flags & IOMAP_NOWAIT) { + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) + return -EAGAIN; + if (!xfs_ilock_nowait(ip, lockmode)) + return -EAGAIN; + } else { + xfs_ilock(ip, lockmode); } ASSERT(offset <= mp->m_super->s_maxbytes); @@ -1024,7 +1040,9 @@ xfs_file_iomap_begin( goto out_unlock; } - if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { + if (xfs_is_reflink_inode(ip) && + ((flags & IOMAP_WRITE) || + ((flags & IOMAP_ZERO) && needs_cow_for_zeroing(&imap, nimaps)))) { if (flags & IOMAP_DIRECT) { /* * A reflinked inode will result in CoW alloc. diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 56475fcd76f2..e0307fbff911 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -46,6 +46,7 @@ #include <linux/security.h> #include <linux/iomap.h> #include <linux/slab.h> +#include <linux/iversion.h> /* * Directories have different lock order w.r.t. 
mmap_sem compared to regular @@ -874,7 +875,9 @@ xfs_setattr_size( * truncate. */ if (newsize > oldsize) { - error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing); + trace_xfs_zero_eof(ip, oldsize, newsize - oldsize); + error = iomap_zero_range(inode, oldsize, newsize - oldsize, + &did_zeroing, &xfs_iomap_ops); } else { error = iomap_truncate_page(inode, newsize, &did_zeroing, &xfs_iomap_ops); @@ -1052,11 +1055,21 @@ xfs_vn_update_time( { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; + int log_flags = XFS_ILOG_TIMESTAMP; struct xfs_trans *tp; int error; trace_xfs_update_time(ip); + if (inode->i_sb->s_flags & SB_LAZYTIME) { + if (!((flags & S_VERSION) && + inode_maybe_inc_iversion(inode, false))) + return generic_update_time(inode, now, flags); + + /* Capture the iversion update that just occurred */ + log_flags |= XFS_ILOG_CORE; + } + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp); if (error) return error; @@ -1070,7 +1083,7 @@ xfs_vn_update_time( inode->i_atime = *now; xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); + xfs_trans_log_inode(tp, ip, log_flags); return xfs_trans_commit(tp); } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3e5ba1ecc080..b9c9c848146b 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -869,7 +869,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) return 0; } - error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); + error = xfs_log_force(mp, XFS_LOG_SYNC); ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); #ifdef DEBUG @@ -1149,7 +1149,7 @@ xlog_assign_tail_lsn_locked( struct xfs_log_item *lip; xfs_lsn_t tail_lsn; - assert_spin_locked(&mp->m_ail->xa_lock); + assert_spin_locked(&mp->m_ail->ail_lock); /* * To make sure we always have a valid LSN for the log tail we keep @@ -1172,9 +1172,9 @@ xlog_assign_tail_lsn( { xfs_lsn_t tail_lsn; - spin_lock(&mp->m_ail->xa_lock); + spin_lock(&mp->m_ail->ail_lock); tail_lsn = xlog_assign_tail_lsn_locked(mp); - spin_unlock(&mp->m_ail->xa_lock); + spin_unlock(&mp->m_ail->ail_lock); return tail_lsn; } @@ -3304,269 +3304,215 @@ xlog_state_switch_iclogs( * not in the active nor dirty state. */ int -_xfs_log_force( +xfs_log_force( struct xfs_mount *mp, - uint flags, - int *log_flushed) + uint flags) { struct xlog *log = mp->m_log; struct xlog_in_core *iclog; xfs_lsn_t lsn; XFS_STATS_INC(mp, xs_log_force); + trace_xfs_log_force(mp, 0, _RET_IP_); xlog_cil_force(log); spin_lock(&log->l_icloglock); - iclog = log->l_iclog; - if (iclog->ic_state & XLOG_STATE_IOERROR) { - spin_unlock(&log->l_icloglock); - return -EIO; - } + if (iclog->ic_state & XLOG_STATE_IOERROR) + goto out_error; - /* If the head iclog is not active nor dirty, we just attach - * ourselves to the head and go to sleep. - */ - if (iclog->ic_state == XLOG_STATE_ACTIVE || - iclog->ic_state == XLOG_STATE_DIRTY) { + if (iclog->ic_state == XLOG_STATE_DIRTY || + (iclog->ic_state == XLOG_STATE_ACTIVE && + atomic_read(&iclog->ic_refcnt) == 0 && iclog->ic_offset == 0)) { /* - * If the head is dirty or (active and empty), then - * we need to look at the previous iclog. If the previous - * iclog is active or dirty we are done. There is nothing - * to sync out. Otherwise, we attach ourselves to the + * If the head is dirty or (active and empty), then we need to + * look at the previous iclog. + * + * If the previous iclog is active or dirty we are done. There + * is nothing to sync out. Otherwise, we attach ourselves to the * previous iclog and go to sleep. 
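The xfs_vn_update_time() hunk above encodes the lazytime policy: pure timestamp updates can stay in the generic dirty-tracking path, and only an i_version bump forces a logged transaction (XFS_ILOG_CORE on top of XFS_ILOG_TIMESTAMP). The decision logic in isolation; the return convention here is invented for the sketch (0 meaning "defer to generic_update_time()"):

static int example_update_time_log_flags(struct inode *inode, int flags)
{
	if (inode->i_sb->s_flags & SB_LAZYTIME) {
		/* Timestamps alone can wait; only i_version must be logged. */
		if (!((flags & S_VERSION) &&
		      inode_maybe_inc_iversion(inode, false)))
			return 0;	/* defer to generic_update_time() */
		return XFS_ILOG_TIMESTAMP | XFS_ILOG_CORE;
	}
	return XFS_ILOG_TIMESTAMP;	/* eager timestamp logging */
}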
*/ - if (iclog->ic_state == XLOG_STATE_DIRTY || - (atomic_read(&iclog->ic_refcnt) == 0 - && iclog->ic_offset == 0)) { - iclog = iclog->ic_prev; - if (iclog->ic_state == XLOG_STATE_ACTIVE || - iclog->ic_state == XLOG_STATE_DIRTY) - goto no_sleep; - else - goto maybe_sleep; - } else { - if (atomic_read(&iclog->ic_refcnt) == 0) { - /* We are the only one with access to this - * iclog. Flush it out now. There should - * be a roundoff of zero to show that someone - * has already taken care of the roundoff from - * the previous sync. - */ - atomic_inc(&iclog->ic_refcnt); - lsn = be64_to_cpu(iclog->ic_header.h_lsn); - xlog_state_switch_iclogs(log, iclog, 0); - spin_unlock(&log->l_icloglock); + iclog = iclog->ic_prev; + if (iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_DIRTY) + goto out_unlock; + } else if (iclog->ic_state == XLOG_STATE_ACTIVE) { + if (atomic_read(&iclog->ic_refcnt) == 0) { + /* + * We are the only one with access to this iclog. + * + * Flush it out now. There should be a roundoff of zero + * to show that someone has already taken care of the + * roundoff from the previous sync. + */ + atomic_inc(&iclog->ic_refcnt); + lsn = be64_to_cpu(iclog->ic_header.h_lsn); + xlog_state_switch_iclogs(log, iclog, 0); + spin_unlock(&log->l_icloglock); - if (xlog_state_release_iclog(log, iclog)) - return -EIO; - - if (log_flushed) - *log_flushed = 1; - spin_lock(&log->l_icloglock); - if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn && - iclog->ic_state != XLOG_STATE_DIRTY) - goto maybe_sleep; - else - goto no_sleep; - } else { - /* Someone else is writing to this iclog. - * Use its call to flush out the data. However, - * the other thread may not force out this LR, - * so we mark it WANT_SYNC. - */ - xlog_state_switch_iclogs(log, iclog, 0); - goto maybe_sleep; - } - } - } + if (xlog_state_release_iclog(log, iclog)) + return -EIO; - /* By the time we come around again, the iclog could've been filled - * which would give it another lsn. If we have a new lsn, just - * return because the relevant data has been flushed. - */ -maybe_sleep: - if (flags & XFS_LOG_SYNC) { - /* - * We must check if we're shutting down here, before - * we wait, while we're holding the l_icloglock. - * Then we check again after waking up, in case our - * sleep was disturbed by a bad news. - */ - if (iclog->ic_state & XLOG_STATE_IOERROR) { - spin_unlock(&log->l_icloglock); - return -EIO; + spin_lock(&log->l_icloglock); + if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn || + iclog->ic_state == XLOG_STATE_DIRTY) + goto out_unlock; + } else { + /* + * Someone else is writing to this iclog. + * + * Use its call to flush out the data. However, the + * other thread may not force out this LR, so we mark + * it WANT_SYNC. + */ + xlog_state_switch_iclogs(log, iclog, 0); } - XFS_STATS_INC(mp, xs_log_force_sleep); - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); + } else { /* - * No need to grab the log lock here since we're - * only deciding whether or not to return EIO - * and the memory read should be atomic. + * If the head iclog is not active nor dirty, we just attach + * ourselves to the head and go to sleep if necessary. 
*/ - if (iclog->ic_state & XLOG_STATE_IOERROR) - return -EIO; - } else { - -no_sleep: - spin_unlock(&log->l_icloglock); + ; } + + if (!(flags & XFS_LOG_SYNC)) + goto out_unlock; + + if (iclog->ic_state & XLOG_STATE_IOERROR) + goto out_error; + XFS_STATS_INC(mp, xs_log_force_sleep); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); + if (iclog->ic_state & XLOG_STATE_IOERROR) + return -EIO; return 0; -} -/* - * Wrapper for _xfs_log_force(), to be used when caller doesn't care - * about errors or whether the log was flushed or not. This is the normal - * interface to use when trying to unpin items or move the log forward. - */ -void -xfs_log_force( - xfs_mount_t *mp, - uint flags) -{ - trace_xfs_log_force(mp, 0, _RET_IP_); - _xfs_log_force(mp, flags, NULL); +out_unlock: + spin_unlock(&log->l_icloglock); + return 0; +out_error: + spin_unlock(&log->l_icloglock); + return -EIO; } -/* - * Force the in-core log to disk for a specific LSN. - * - * Find in-core log with lsn. - * If it is in the DIRTY state, just return. - * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC - * state and go to sleep or return. - * If it is in any other state, go to sleep or return. - * - * Synchronous forces are implemented with a signal variable. All callers - * to force a given lsn to disk will wait on a the sv attached to the - * specific in-core log. When given in-core log finally completes its - * write to disk, that thread will wake up all threads waiting on the - * sv. - */ -int -_xfs_log_force_lsn( +static int +__xfs_log_force_lsn( struct xfs_mount *mp, xfs_lsn_t lsn, uint flags, - int *log_flushed) + int *log_flushed, + bool already_slept) { struct xlog *log = mp->m_log; struct xlog_in_core *iclog; - int already_slept = 0; - ASSERT(lsn != 0); - - XFS_STATS_INC(mp, xs_log_force); - - lsn = xlog_cil_force_lsn(log, lsn); - if (lsn == NULLCOMMITLSN) - return 0; - -try_again: spin_lock(&log->l_icloglock); iclog = log->l_iclog; - if (iclog->ic_state & XLOG_STATE_IOERROR) { - spin_unlock(&log->l_icloglock); - return -EIO; - } + if (iclog->ic_state & XLOG_STATE_IOERROR) + goto out_error; - do { - if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { - iclog = iclog->ic_next; - continue; - } + while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { + iclog = iclog->ic_next; + if (iclog == log->l_iclog) + goto out_unlock; + } - if (iclog->ic_state == XLOG_STATE_DIRTY) { - spin_unlock(&log->l_icloglock); - return 0; - } + if (iclog->ic_state == XLOG_STATE_DIRTY) + goto out_unlock; - if (iclog->ic_state == XLOG_STATE_ACTIVE) { - /* - * We sleep here if we haven't already slept (e.g. - * this is the first time we've looked at the correct - * iclog buf) and the buffer before us is going to - * be sync'ed. The reason for this is that if we - * are doing sync transactions here, by waiting for - * the previous I/O to complete, we can allow a few - * more transactions into this iclog before we close - * it down. - * - * Otherwise, we mark the buffer WANT_SYNC, and bump - * up the refcnt so we can release the log (which - * drops the ref count). The state switch keeps new - * transaction commits from using this buffer. When - * the current commits finish writing into the buffer, - * the refcount will drop to zero and the buffer will - * go out then. - */ - if (!already_slept && - (iclog->ic_prev->ic_state & - (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) { - ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); + if (iclog->ic_state == XLOG_STATE_ACTIVE) { + /* + * We sleep here if we haven't already slept (e.g. 
this is the + * first time we've looked at the correct iclog buf) and the + * buffer before us is going to be sync'ed. The reason for this + * is that if we are doing sync transactions here, by waiting + * for the previous I/O to complete, we can allow a few more + * transactions into this iclog before we close it down. + * + * Otherwise, we mark the buffer WANT_SYNC, and bump up the + * refcnt so we can release the log (which drops the ref count). + * The state switch keeps new transaction commits from using + * this buffer. When the current commits finish writing into + * the buffer, the refcount will drop to zero and the buffer + * will go out then. + */ + if (!already_slept && + (iclog->ic_prev->ic_state & + (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) { + ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); - XFS_STATS_INC(mp, xs_log_force_sleep); + XFS_STATS_INC(mp, xs_log_force_sleep); - xlog_wait(&iclog->ic_prev->ic_write_wait, - &log->l_icloglock); - already_slept = 1; - goto try_again; - } - atomic_inc(&iclog->ic_refcnt); - xlog_state_switch_iclogs(log, iclog, 0); - spin_unlock(&log->l_icloglock); - if (xlog_state_release_iclog(log, iclog)) - return -EIO; - if (log_flushed) - *log_flushed = 1; - spin_lock(&log->l_icloglock); + xlog_wait(&iclog->ic_prev->ic_write_wait, + &log->l_icloglock); + return -EAGAIN; } + atomic_inc(&iclog->ic_refcnt); + xlog_state_switch_iclogs(log, iclog, 0); + spin_unlock(&log->l_icloglock); + if (xlog_state_release_iclog(log, iclog)) + return -EIO; + if (log_flushed) + *log_flushed = 1; + spin_lock(&log->l_icloglock); + } - if ((flags & XFS_LOG_SYNC) && /* sleep */ - !(iclog->ic_state & - (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { - /* - * Don't wait on completion if we know that we've - * gotten a log write error. - */ - if (iclog->ic_state & XLOG_STATE_IOERROR) { - spin_unlock(&log->l_icloglock); - return -EIO; - } - XFS_STATS_INC(mp, xs_log_force_sleep); - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - /* - * No need to grab the log lock here since we're - * only deciding whether or not to return EIO - * and the memory read should be atomic. - */ - if (iclog->ic_state & XLOG_STATE_IOERROR) - return -EIO; - } else { /* just return */ - spin_unlock(&log->l_icloglock); - } + if (!(flags & XFS_LOG_SYNC) || + (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) + goto out_unlock; - return 0; - } while (iclog != log->l_iclog); + if (iclog->ic_state & XLOG_STATE_IOERROR) + goto out_error; + + XFS_STATS_INC(mp, xs_log_force_sleep); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); + if (iclog->ic_state & XLOG_STATE_IOERROR) + return -EIO; + return 0; +out_unlock: spin_unlock(&log->l_icloglock); return 0; +out_error: + spin_unlock(&log->l_icloglock); + return -EIO; } /* - * Wrapper for _xfs_log_force_lsn(), to be used when caller doesn't care - * about errors or whether the log was flushed or not. This is the normal - * interface to use when trying to unpin items or move the log forward. + * Force the in-core log to disk for a specific LSN. + * + * Find in-core log with lsn. + * If it is in the DIRTY state, just return. + * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC + * state and go to sleep or return. + * If it is in any other state, go to sleep or return. + * + * Synchronous forces are implemented with a wait queue. All callers trying + * to force a given lsn to disk must wait on the queue attached to the + * specific in-core log. 
When given in-core log finally completes its write + * to disk, that thread will wake up all threads waiting on the queue. */ -void +int xfs_log_force_lsn( - xfs_mount_t *mp, - xfs_lsn_t lsn, - uint flags) + struct xfs_mount *mp, + xfs_lsn_t lsn, + uint flags, + int *log_flushed) { + int ret; + ASSERT(lsn != 0); + + XFS_STATS_INC(mp, xs_log_force); trace_xfs_log_force(mp, lsn, _RET_IP_); - _xfs_log_force_lsn(mp, lsn, flags, NULL); + + lsn = xlog_cil_force_lsn(mp->m_log, lsn); + if (lsn == NULLCOMMITLSN) + return 0; + + ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, false); + if (ret == -EAGAIN) + ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, true); + return ret; } /* @@ -4035,7 +3981,7 @@ xfs_log_force_umount( * to guarantee this. */ if (!logerror) - _xfs_log_force(mp, XFS_LOG_SYNC, NULL); + xfs_log_force(mp, XFS_LOG_SYNC); /* * mark the filesystem and the as in a shutdown state and wake diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index bf212772595c..7e2d62922a16 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -129,18 +129,9 @@ xfs_lsn_t xfs_log_done(struct xfs_mount *mp, struct xlog_ticket *ticket, struct xlog_in_core **iclog, bool regrant); -int _xfs_log_force(struct xfs_mount *mp, - uint flags, - int *log_forced); -void xfs_log_force(struct xfs_mount *mp, - uint flags); -int _xfs_log_force_lsn(struct xfs_mount *mp, - xfs_lsn_t lsn, - uint flags, - int *log_forced); -void xfs_log_force_lsn(struct xfs_mount *mp, - xfs_lsn_t lsn, - uint flags); +int xfs_log_force(struct xfs_mount *mp, uint flags); +int xfs_log_force_lsn(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags, + int *log_forced); int xfs_log_mount(struct xfs_mount *mp, struct xfs_buftarg *log_target, xfs_daddr_t start_block, diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 43aa42a3a5d3..cb376ac8a595 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -202,7 +202,7 @@ xlog_cil_alloc_shadow_bufs( */ kmem_free(lip->li_lv_shadow); - lv = kmem_alloc(buf_size, KM_SLEEP|KM_NOFS); + lv = kmem_alloc_large(buf_size, KM_SLEEP | KM_NOFS); memset(lv, 0, xlog_cil_iovec_space(niovecs)); lv->lv_item = lip; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 00240c9ee72e..2b2383f1895e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3173,13 +3173,6 @@ xlog_recover_inode_pass2( /* recover the log dinode inode into the on disk inode */ xfs_log_dinode_to_disk(ldip, dip); - /* the rest is in on-disk format */ - if (item->ri_buf[1].i_len > isize) { - memcpy((char *)dip + isize, - item->ri_buf[1].i_addr + isize, - item->ri_buf[1].i_len - isize); - } - fields = in_f->ilf_fields; if (fields & XFS_ILOG_DEV) xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); @@ -3252,7 +3245,9 @@ xlog_recover_inode_pass2( } out_owner_change: - if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) + /* Recover the swapext owner change unless inode has been deleted */ + if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) && + (dip->di_mode != 0)) error = xfs_recover_inode_owner_change(mp, dip, in_f, buffer_list); /* re-generate the checksum. */ @@ -3434,7 +3429,7 @@ xlog_recover_efi_pass2( } atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); - spin_lock(&log->l_ailp->xa_lock); + spin_lock(&log->l_ailp->ail_lock); /* * The EFI has two references. One for the EFD and one for EFI to ensure * it makes it into the AIL. 
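The new xfs_log_force_lsn() wrapper above replaces the old goto try_again loop with an explicit retry-at-most-once protocol: __xfs_log_force_lsn() reports -EAGAIN when it slept waiting on the previous iclog, and the wrapper rescans exactly once with already_slept set. The shape of that contract:

static int example_force_lsn_once(struct xfs_mount *mp, xfs_lsn_t lsn,
				  uint flags, int *log_flushed)
{
	int ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, false);

	/* -EAGAIN: we slept on the previous iclog; rescan exactly once. */
	if (ret == -EAGAIN)
		ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, true);
	return ret;
}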
Insert the EFI into the AIL directly and @@ -3477,7 +3472,7 @@ xlog_recover_efd_pass2( * Search for the EFI with the id in the EFD format structure in the * AIL. */ - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); while (lip != NULL) { if (lip->li_type == XFS_LI_EFI) { @@ -3487,9 +3482,9 @@ xlog_recover_efd_pass2( * Drop the EFD reference to the EFI. This * removes the EFI from the AIL and frees it. */ - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); xfs_efi_release(efip); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); break; } } @@ -3497,7 +3492,7 @@ xlog_recover_efd_pass2( } xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); return 0; } @@ -3530,7 +3525,7 @@ xlog_recover_rui_pass2( } atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); - spin_lock(&log->l_ailp->xa_lock); + spin_lock(&log->l_ailp->ail_lock); /* * The RUI has two references. One for the RUD and one for RUI to ensure * it makes it into the AIL. Insert the RUI into the AIL directly and @@ -3570,7 +3565,7 @@ xlog_recover_rud_pass2( * Search for the RUI with the id in the RUD format structure in the * AIL. */ - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); while (lip != NULL) { if (lip->li_type == XFS_LI_RUI) { @@ -3580,9 +3575,9 @@ xlog_recover_rud_pass2( * Drop the RUD reference to the RUI. This * removes the RUI from the AIL and frees it. */ - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); xfs_rui_release(ruip); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); break; } } @@ -3590,7 +3585,7 @@ xlog_recover_rud_pass2( } xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); return 0; } @@ -3646,7 +3641,7 @@ xlog_recover_cui_pass2( } atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); - spin_lock(&log->l_ailp->xa_lock); + spin_lock(&log->l_ailp->ail_lock); /* * The CUI has two references. One for the CUD and one for CUI to ensure * it makes it into the AIL. Insert the CUI into the AIL directly and @@ -3687,7 +3682,7 @@ xlog_recover_cud_pass2( * Search for the CUI with the id in the CUD format structure in the * AIL. */ - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); while (lip != NULL) { if (lip->li_type == XFS_LI_CUI) { @@ -3697,9 +3692,9 @@ xlog_recover_cud_pass2( * Drop the CUD reference to the CUI. This * removes the CUI from the AIL and frees it. */ - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); xfs_cui_release(cuip); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); break; } } @@ -3707,7 +3702,7 @@ xlog_recover_cud_pass2( } xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); return 0; } @@ -3765,7 +3760,7 @@ xlog_recover_bui_pass2( } atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents); - spin_lock(&log->l_ailp->xa_lock); + spin_lock(&log->l_ailp->ail_lock); /* * The RUI has two references. One for the RUD and one for RUI to ensure * it makes it into the AIL. Insert the RUI into the AIL directly and @@ -3806,7 +3801,7 @@ xlog_recover_bud_pass2( * Search for the BUI with the id in the BUD format structure in the * AIL. 
*/ - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); while (lip != NULL) { if (lip->li_type == XFS_LI_BUI) { @@ -3816,9 +3811,9 @@ xlog_recover_bud_pass2( * Drop the BUD reference to the BUI. This * removes the BUI from the AIL and frees it. */ - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); xfs_bui_release(buip); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); break; } } @@ -3826,7 +3821,7 @@ xlog_recover_bud_pass2( } xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); return 0; } @@ -4659,9 +4654,9 @@ xlog_recover_process_efi( if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) return 0; - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); error = xfs_efi_recover(mp, efip); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); return error; } @@ -4677,9 +4672,9 @@ xlog_recover_cancel_efi( efip = container_of(lip, struct xfs_efi_log_item, efi_item); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); xfs_efi_release(efip); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); } /* Recover the RUI if necessary. */ @@ -4699,9 +4694,9 @@ xlog_recover_process_rui( if (test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags)) return 0; - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); error = xfs_rui_recover(mp, ruip); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); return error; } @@ -4717,9 +4712,9 @@ xlog_recover_cancel_rui( ruip = container_of(lip, struct xfs_rui_log_item, rui_item); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); xfs_rui_release(ruip); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); } /* Recover the CUI if necessary. */ @@ -4740,9 +4735,9 @@ xlog_recover_process_cui( if (test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)) return 0; - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); error = xfs_cui_recover(mp, cuip, dfops); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); return error; } @@ -4758,9 +4753,9 @@ xlog_recover_cancel_cui( cuip = container_of(lip, struct xfs_cui_log_item, cui_item); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); xfs_cui_release(cuip); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); } /* Recover the BUI if necessary. */ @@ -4781,9 +4776,9 @@ xlog_recover_process_bui( if (test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)) return 0; - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); error = xfs_bui_recover(mp, buip, dfops); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); return error; } @@ -4799,9 +4794,9 @@ xlog_recover_cancel_bui( buip = container_of(lip, struct xfs_bui_log_item, bui_item); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); xfs_bui_release(buip); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); } /* Is this log item a deferred action intent? 
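All four intent types above (EFI, RUI, CUI, BUI) recover and cancel with the same discipline: ail_lock is a spinlock, so it is dropped around the recovery or release work that may sleep and retaken before returning to the AIL walk. The pattern reduced to a sketch, with a hypothetical blocking helper standing in for xfs_efi_recover() and friends:

static int example_blocking_recovery(struct xfs_log_item *lip);

static int example_recover_intent(struct xfs_ail *ailp,
				  struct xfs_log_item *lip)
{
	int error;

	spin_unlock(&ailp->ail_lock);	/* cannot sleep under a spinlock */
	error = example_blocking_recovery(lip);
	spin_lock(&ailp->ail_lock);	/* resume the AIL cursor walk */
	return error;
}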
*/ @@ -4889,7 +4884,7 @@ xlog_recover_process_intents( #endif ailp = log->l_ailp; - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); #if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); @@ -4943,7 +4938,7 @@ xlog_recover_process_intents( } out: xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); if (error) xfs_defer_cancel(&dfops); else @@ -4966,7 +4961,7 @@ xlog_recover_cancel_intents( struct xfs_ail *ailp; ailp = log->l_ailp; - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); while (lip != NULL) { /* @@ -5000,7 +4995,7 @@ xlog_recover_cancel_intents( } xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); return error; } @@ -5127,16 +5122,9 @@ xlog_recover_process_iunlinks( xfs_agino_t agino; int bucket; int error; - uint mp_dmevmask; mp = log->l_mp; - /* - * Prevent any DMAPI event from being sent while in this function. - */ - mp_dmevmask = mp->m_dmevmask; - mp->m_dmevmask = 0; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { /* * Find the agi for this ag. @@ -5172,8 +5160,6 @@ xlog_recover_process_iunlinks( } xfs_buf_rele(agibp); } - - mp->m_dmevmask = mp_dmevmask; } STATIC int diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 98fd41cbb9e1..a901b86772f8 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -803,8 +803,6 @@ xfs_mountfs( get_unaligned_be16(&sbp->sb_uuid.b[4]); mp->m_fixedfsid[1] = get_unaligned_be32(&sbp->sb_uuid.b[0]); - mp->m_dmevmask = 0; /* not persistent; set after each mount */ - error = xfs_da_mount(mp); if (error) { xfs_warn(mp, "Failed dir/attr init: %d", error); @@ -819,8 +817,6 @@ xfs_mountfs( /* * Allocate and initialize the per-ag data. */ - spin_lock_init(&mp->m_perag_lock); - INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); if (error) { xfs_warn(mp, "Failed per-ag init: %d", error); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index e0792d036be2..10b90bbc5162 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -138,7 +138,6 @@ typedef struct xfs_mount { spinlock_t m_perag_lock; /* lock for m_perag_tree */ struct mutex m_growlock; /* growfs mutex */ int m_fixedfsid[2]; /* unchanged for life of FS */ - uint m_dmevmask; /* DMI events for this FS */ uint64_t m_flags; /* global mount flags */ bool m_inotbt_nores; /* no per-AG finobt resv. */ int m_ialloc_inos; /* inodes in inode allocation */ @@ -326,8 +325,9 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) /* per-AG block reservation data structures*/ enum xfs_ag_resv_type { XFS_AG_RESV_NONE = 0, - XFS_AG_RESV_METADATA, XFS_AG_RESV_AGFL, + XFS_AG_RESV_METADATA, + XFS_AG_RESV_RMAPBT, }; struct xfs_ag_resv { @@ -353,6 +353,7 @@ typedef struct xfs_perag { char pagi_inodeok; /* The agi is ok for inodes */ uint8_t pagf_levels[XFS_BTNUM_AGF]; /* # of levels in bno & cnt btree */ + bool pagf_agflreset; /* agfl requires reset before use */ uint32_t pagf_flcount; /* count of blocks in freelist */ xfs_extlen_t pagf_freeblks; /* total free blocks */ xfs_extlen_t pagf_longest; /* longest free space */ @@ -391,8 +392,8 @@ typedef struct xfs_perag { /* Blocks reserved for all kinds of metadata. */ struct xfs_ag_resv pag_meta_resv; - /* Blocks reserved for just AGFL-based metadata. 
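The xfs_mount.c hunk above removes the per-AG radix tree and lock setup from xfs_mountfs() because that work moves into the new xfs_mount_alloc() shown in the xfs_super.c hunk below: every lock and tree is valid before the mount is published via sb->s_fs_info. Condensed from that hunk:

static struct xfs_mount *example_mount_alloc(struct super_block *sb)
{
	struct xfs_mount *mp = kzalloc(sizeof(*mp), GFP_KERNEL);

	if (!mp)
		return NULL;
	/* All locks and trees are usable before anyone can see mp. */
	INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
	spin_lock_init(&mp->m_perag_lock);
	mp->m_super = sb;
	return mp;
}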
*/ - struct xfs_ag_resv pag_agfl_resv; + /* Blocks reserved for the reverse mapping btree. */ + struct xfs_ag_resv pag_rmapbt_resv; /* reference count */ uint8_t pagf_refcount_level; @@ -406,8 +407,8 @@ xfs_perag_resv( switch (type) { case XFS_AG_RESV_METADATA: return &pag->pag_meta_resv; - case XFS_AG_RESV_AGFL: - return &pag->pag_agfl_resv; + case XFS_AG_RESV_RMAPBT: + return &pag->pag_rmapbt_resv; default: return NULL; } diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 3a55d6fc271b..7a39f40645f7 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -23,6 +23,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" +#include "xfs_shared.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_trans.h" @@ -456,10 +457,12 @@ xfs_cui_recover( * transaction. Normally, any work that needs to be deferred * gets attached to the same defer_ops that scheduled the * refcount update. However, we're in log recovery here, so we - * we create our own defer_ops and use that to finish up any - * work that doesn't fit. + * we use the passed in defer_ops and to finish up any work that + * doesn't fit. We need to reserve enough blocks to handle a + * full btree split on either end of the refcount range. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, + mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp); if (error) return error; cudp = xfs_trans_get_cud(tp, cuip); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 270246943a06..cdbd342a5249 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -394,7 +394,7 @@ xfs_reflink_allocate_cow( retry: ASSERT(xfs_is_reflink_inode(ip)); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); /* * Even if the extent is not shared we might have a preallocation for @@ -668,7 +668,7 @@ xfs_reflink_cancel_cow_range( /* Start a rolling transaction to remove the mappings */ error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, - 0, 0, 0, &tp); + 0, 0, XFS_TRANS_NOFS, &tp); if (error) goto out; @@ -741,7 +741,7 @@ xfs_reflink_end_cow( (unsigned int)(end_fsb - offset_fsb), XFS_DATA_FORK); error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, - resblks, 0, XFS_TRANS_RESERVE, &tp); + resblks, 0, XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp); if (error) goto out; @@ -762,10 +762,8 @@ xfs_reflink_end_cow( xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb); /* Extent delete may have bumped ext forward */ - if (!del.br_blockcount) { - xfs_iext_prev(ifp, &icur); - goto next_extent; - } + if (!del.br_blockcount) + goto prev_extent; ASSERT(!isnullstartblock(got.br_startblock)); @@ -774,10 +772,8 @@ xfs_reflink_end_cow( * speculatively preallocated CoW extents that have been * allocated but have not yet been involved in a write. */ - if (got.br_state == XFS_EXT_UNWRITTEN) { - xfs_iext_prev(ifp, &icur); - goto next_extent; - } + if (got.br_state == XFS_EXT_UNWRITTEN) + goto prev_extent; /* Unmap the old blocks in the data fork. 
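xfs_cui_recover() above (and xfs_rui_recover() below) now size their replay transactions for a worst-case btree split at both ends of the range and pass XFS_TRANS_RESERVE so recovery can dip into the reserve block pool instead of failing on ENOSPC. A sketch of that sizing (helper name invented):

static int example_alloc_replay_trans(struct xfs_mount *mp, uint max_levels,
				      struct xfs_trans **tpp)
{
	/* Two ends of the range => up to two full split paths. */
	return xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
			       max_levels * 2, 0, XFS_TRANS_RESERVE, tpp);
}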
*/ xfs_defer_init(&dfops, &firstfsb); @@ -816,9 +812,12 @@ xfs_reflink_end_cow( error = xfs_defer_finish(&tp, &dfops); if (error) goto out_defer; -next_extent: if (!xfs_iext_get_extent(ifp, &icur, &got)) break; + continue; +prev_extent: + if (!xfs_iext_prev_extent(ifp, &icur, &got)) + break; } error = xfs_trans_commit(tp); @@ -1061,7 +1060,7 @@ xfs_reflink_ag_has_free_space( return 0; pag = xfs_perag_get(mp, agno); - if (xfs_ag_resv_critical(pag, XFS_AG_RESV_AGFL) || + if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) || xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA)) error = -ENOSPC; xfs_perag_put(pag); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index f3b139c9aa16..49d3124863a8 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -23,6 +23,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" +#include "xfs_shared.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_trans.h" @@ -470,7 +471,8 @@ xfs_rui_recover( } } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, + mp->m_rmap_maxlevels, 0, XFS_TRANS_RESERVE, &tp); if (error) return error; rudp = xfs_trans_get_rud(tp, ruip); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 7aba628dc527..612c1d5348b3 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -250,6 +250,7 @@ xfs_parseargs( return -EINVAL; break; case Opt_logdev: + kfree(mp->m_logname); mp->m_logname = match_strdup(args); if (!mp->m_logname) return -ENOMEM; @@ -258,6 +259,7 @@ xfs_parseargs( xfs_warn(mp, "%s option not allowed on this system", p); return -EINVAL; case Opt_rtdev: + kfree(mp->m_rtname); mp->m_rtname = match_strdup(args); if (!mp->m_rtname) return -ENOMEM; @@ -970,7 +972,6 @@ xfs_fs_destroy_inode( struct inode *inode) { struct xfs_inode *ip = XFS_I(inode); - int error; trace_xfs_destroy_inode(ip); @@ -978,14 +979,6 @@ xfs_fs_destroy_inode( XFS_STATS_INC(ip->i_mount, vn_rele); XFS_STATS_INC(ip->i_mount, vn_remove); - if (xfs_is_reflink_inode(ip)) { - error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); - if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) - xfs_warn(ip->i_mount, -"Error %d while evicting CoW blocks for inode %llu.", - error, ip->i_ino); - } - xfs_inactive(ip); ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); @@ -1007,6 +1000,28 @@ xfs_fs_destroy_inode( xfs_inode_set_reclaim_tag(ip); } +static void +xfs_fs_dirty_inode( + struct inode *inode, + int flag) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + + if (!(inode->i_sb->s_flags & SB_LAZYTIME)) + return; + if (flag != I_DIRTY_SYNC || !(inode->i_state & I_DIRTY_TIME)) + return; + + if (xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp)) + return; + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); + xfs_trans_commit(tp); +} + /* * Slab object creation initialisation for the XFS inode. 
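The xfs_super.c option-parsing fix above handles logdev/rtdev appearing more than once in a mount string: each match_strdup() allocates, so the previous value must be freed first, and kfree(NULL) being a no-op removes any first-time special case. A runnable userspace model; this variant duplicates before freeing so the old value also survives an allocation failure:

#include <stdlib.h>
#include <string.h>

static int set_option(char **slot, const char *value)
{
	char *dup = strdup(value);

	if (!dup)
		return -1;
	free(*slot);	/* no-op on the first assignment */
	*slot = dup;
	return 0;
}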
* This covers only the idempotent fields in the XFS inode; @@ -1564,29 +1579,48 @@ xfs_destroy_percpu_counters( percpu_counter_destroy(&mp->m_fdblocks); } -STATIC int -xfs_fs_fill_super( - struct super_block *sb, - void *data, - int silent) +static struct xfs_mount * +xfs_mount_alloc( + struct super_block *sb) { - struct inode *root; - struct xfs_mount *mp = NULL; - int flags = 0, error = -ENOMEM; + struct xfs_mount *mp; mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); if (!mp) - goto out; + return NULL; + mp->m_super = sb; spin_lock_init(&mp->m_sb_lock); + spin_lock_init(&mp->m_agirotor_lock); + INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); + spin_lock_init(&mp->m_perag_lock); mutex_init(&mp->m_growlock); atomic_set(&mp->m_active_trans, 0); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); mp->m_kobj.kobject.kset = xfs_kset; + return mp; +} - mp->m_super = sb; + +STATIC int +xfs_fs_fill_super( + struct super_block *sb, + void *data, + int silent) +{ + struct inode *root; + struct xfs_mount *mp = NULL; + int flags = 0, error = -ENOMEM; + + /* + * allocate mp and do all low-level struct initializations before we + * attach it to the super + */ + mp = xfs_mount_alloc(sb); + if (!mp) + goto out; sb->s_fs_info = mp; error = xfs_parseargs(mp, (char *)data); @@ -1787,6 +1821,7 @@ xfs_fs_free_cached_objects( static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, + .dirty_inode = xfs_fs_dirty_inode, .drop_inode = xfs_fs_drop_inode, .put_super = xfs_fs_put_super, .sync_fs = xfs_fs_sync_fs, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 945de08af7ba..a982c0b623d0 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1477,7 +1477,7 @@ TRACE_EVENT(xfs_extent_busy_trim, __entry->tlen) ); -TRACE_EVENT(xfs_agf, +DECLARE_EVENT_CLASS(xfs_agf_class, TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags, unsigned long caller_ip), TP_ARGS(mp, agf, flags, caller_ip), @@ -1533,6 +1533,13 @@ TRACE_EVENT(xfs_agf, __entry->longest, (void *)__entry->caller_ip) ); +#define DEFINE_AGF_EVENT(name) \ +DEFINE_EVENT(xfs_agf_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags, \ + unsigned long caller_ip), \ + TP_ARGS(mp, agf, flags, caller_ip)) +DEFINE_AGF_EVENT(xfs_agf); +DEFINE_AGF_EVENT(xfs_agfl_reset); TRACE_EVENT(xfs_free_extent, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 86f92df32c42..d6d8f9d129a7 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -119,8 +119,11 @@ xfs_trans_dup( /* We gave our writer reference to the new transaction */ tp->t_flags |= XFS_TRANS_NO_WRITECOUNT; ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); + + ASSERT(tp->t_blk_res >= tp->t_blk_res_used); ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used; tp->t_blk_res = tp->t_blk_res_used; + ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used; tp->t_rtx_res = tp->t_rtx_res_used; ntp->t_pflags = tp->t_pflags; @@ -344,13 +347,14 @@ xfs_trans_mod_sb( break; case XFS_TRANS_SB_FDBLOCKS: /* - * Track the number of blocks allocated in the - * transaction. Make sure it does not exceed the - * number reserved. + * Track the number of blocks allocated in the transaction. + * Make sure it does not exceed the number reserved. 
If so, + * shutdown as this can lead to accounting inconsistency. */ if (delta < 0) { tp->t_blk_res_used += (uint)-delta; - ASSERT(tp->t_blk_res_used <= tp->t_blk_res); + if (tp->t_blk_res_used > tp->t_blk_res) + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } tp->t_fdblocks_delta += delta; if (xfs_sb_version_haslazysbcount(&mp->m_sb)) @@ -803,8 +807,8 @@ xfs_log_item_batch_insert( { int i; - spin_lock(&ailp->xa_lock); - /* xfs_trans_ail_update_bulk drops ailp->xa_lock */ + spin_lock(&ailp->ail_lock); + /* xfs_trans_ail_update_bulk drops ailp->ail_lock */ xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn); for (i = 0; i < nr_items; i++) { @@ -847,9 +851,9 @@ xfs_trans_committed_bulk( struct xfs_ail_cursor cur; int i = 0; - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); xfs_trans_ail_cursor_last(ailp, &cur, commit_lsn); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); /* unpin all the log items */ for (lv = log_vector; lv; lv = lv->lv_next ) { @@ -869,7 +873,7 @@ xfs_trans_committed_bulk( * object into the AIL as we are in a shutdown situation. */ if (aborted) { - ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount)); + ASSERT(XFS_FORCED_SHUTDOWN(ailp->ail_mount)); lip->li_ops->iop_unpin(lip, 1); continue; } @@ -883,11 +887,11 @@ xfs_trans_committed_bulk( * not affect the AIL cursor the bulk insert path is * using. */ - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) xfs_trans_ail_update(ailp, lip, item_lsn); else - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); lip->li_ops->iop_unpin(lip, 0); continue; } @@ -905,9 +909,9 @@ xfs_trans_committed_bulk( if (i) xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); } /* @@ -966,7 +970,7 @@ __xfs_trans_commit( * log out now and wait for it. 
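[Note on the xfs_agf_class hunk above] Converting TRACE_EVENT(xfs_agf) into DECLARE_EVENT_CLASS(xfs_agf_class) is the standard ftrace pattern for deduplicating tracepoints: the class defines the prototype, record layout, and formatting once, and each DEFINE_EVENT stamps out a named event that shares them. Call sites stay one-liners; a sketch of how the two events would be emitted, where the xfs_agfl_reset call site is an assumption about the AGFL-reset path rather than something shown in this diff (the argument list comes straight from TP_PROTO above):

/* Sketch: emitting the two events defined from xfs_agf_class. The only
 * difference between them is the event name recorded in the trace buffer. */
trace_xfs_agf(mp, agf, flags, _RET_IP_);	/* pre-existing call sites */
trace_xfs_agfl_reset(mp, agf, flags, _RET_IP_);	/* assumed call site in the
						 * AGFL reset path */

A DEFINE_EVENT costs almost nothing per extra event, which is why the diff can add xfs_agfl_reset without duplicating the long TP_STRUCT__entry/TP_printk block.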
*/ if (sync) { - error = _xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL); + error = xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL); XFS_STATS_INC(mp, xs_trans_sync); } else { XFS_STATS_INC(mp, xs_trans_async); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index cef89f7127d3..d4a2445215e6 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -40,7 +40,7 @@ xfs_ail_check( { xfs_log_item_t *prev_lip; - if (list_empty(&ailp->xa_ail)) + if (list_empty(&ailp->ail_head)) return; /* @@ -48,11 +48,11 @@ xfs_ail_check( */ ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail); - if (&prev_lip->li_ail != &ailp->xa_ail) + if (&prev_lip->li_ail != &ailp->ail_head) ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail); - if (&prev_lip->li_ail != &ailp->xa_ail) + if (&prev_lip->li_ail != &ailp->ail_head) ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); @@ -69,10 +69,10 @@ static xfs_log_item_t * xfs_ail_max( struct xfs_ail *ailp) { - if (list_empty(&ailp->xa_ail)) + if (list_empty(&ailp->ail_head)) return NULL; - return list_entry(ailp->xa_ail.prev, xfs_log_item_t, li_ail); + return list_entry(ailp->ail_head.prev, xfs_log_item_t, li_ail); } /* @@ -84,7 +84,7 @@ xfs_ail_next( struct xfs_ail *ailp, xfs_log_item_t *lip) { - if (lip->li_ail.next == &ailp->xa_ail) + if (lip->li_ail.next == &ailp->ail_head) return NULL; return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail); @@ -105,11 +105,11 @@ xfs_ail_min_lsn( xfs_lsn_t lsn = 0; xfs_log_item_t *lip; - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); lip = xfs_ail_min(ailp); if (lip) lsn = lip->li_lsn; - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); return lsn; } @@ -124,11 +124,11 @@ xfs_ail_max_lsn( xfs_lsn_t lsn = 0; xfs_log_item_t *lip; - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); lip = xfs_ail_max(ailp); if (lip) lsn = lip->li_lsn; - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); return lsn; } @@ -146,7 +146,7 @@ xfs_trans_ail_cursor_init( struct xfs_ail_cursor *cur) { cur->item = NULL; - list_add_tail(&cur->list, &ailp->xa_cursors); + list_add_tail(&cur->list, &ailp->ail_cursors); } /* @@ -194,7 +194,7 @@ xfs_trans_ail_cursor_clear( { struct xfs_ail_cursor *cur; - list_for_each_entry(cur, &ailp->xa_cursors, list) { + list_for_each_entry(cur, &ailp->ail_cursors, list) { if (cur->item == lip) cur->item = (struct xfs_log_item *) ((uintptr_t)cur->item | 1); @@ -222,7 +222,7 @@ xfs_trans_ail_cursor_first( goto out; } - list_for_each_entry(lip, &ailp->xa_ail, li_ail) { + list_for_each_entry(lip, &ailp->ail_head, li_ail) { if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0) goto out; } @@ -241,7 +241,7 @@ __xfs_trans_ail_cursor_last( { xfs_log_item_t *lip; - list_for_each_entry_reverse(lip, &ailp->xa_ail, li_ail) { + list_for_each_entry_reverse(lip, &ailp->ail_head, li_ail) { if (XFS_LSN_CMP(lip->li_lsn, lsn) <= 0) return lip; } @@ -310,7 +310,7 @@ xfs_ail_splice( if (lip) list_splice(list, &lip->li_ail); else - list_splice(list, &ailp->xa_ail); + list_splice(list, &ailp->ail_head); } /* @@ -335,17 +335,17 @@ xfsaild_push_item( * If log item pinning is enabled, skip the push and track the item as * pinned. This can help induce head-behind-tail conditions. 
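[Note on the xfs_trans_ail.c hunks] Two things run through this and the following hunks. First, the wholesale xa_ -> ail_ field rename (ail_head, ail_lock, ...), presumably done to keep the xa_ prefix free for the then-new XArray API and to make call sites self-describing. Second, xfs_trans_ail_cursor_clear() above invalidates a cursor by setting bit 0 of the stored item pointer instead of keeping a separate "stale" flag; that works because log items are at least word-aligned, so a valid pointer never has bit 0 set. A sketch of the tagging idiom with hypothetical helper names (the diff open-codes the tag operation):

/* Hypothetical helpers illustrating the low-bit pointer tagging used above. */
static inline struct xfs_log_item *cursor_item_tag(struct xfs_log_item *lip)
{
	/* Item pointers are word-aligned, so bit 0 is free for a flag. */
	return (struct xfs_log_item *)((uintptr_t)lip | 1);
}

static inline bool cursor_item_is_stale(struct xfs_log_item *lip)
{
	return (uintptr_t)lip & 1;
}

static inline struct xfs_log_item *cursor_item_untag(struct xfs_log_item *lip)
{
	return (struct xfs_log_item *)((uintptr_t)lip & ~(uintptr_t)1);
}

Tagging the pointer in place means a deleted item is detected the next time any cursor is dereferenced, without growing struct xfs_ail_cursor.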
*/ - if (XFS_TEST_ERROR(false, ailp->xa_mount, XFS_ERRTAG_LOG_ITEM_PIN)) + if (XFS_TEST_ERROR(false, ailp->ail_mount, XFS_ERRTAG_LOG_ITEM_PIN)) return XFS_ITEM_PINNED; - return lip->li_ops->iop_push(lip, &ailp->xa_buf_list); + return lip->li_ops->iop_push(lip, &ailp->ail_buf_list); } static long xfsaild_push( struct xfs_ail *ailp) { - xfs_mount_t *mp = ailp->xa_mount; + xfs_mount_t *mp = ailp->ail_mount; struct xfs_ail_cursor cur; xfs_log_item_t *lip; xfs_lsn_t lsn; @@ -360,30 +360,30 @@ xfsaild_push( * buffers the last time we ran, force the log first and wait for it * before pushing again. */ - if (ailp->xa_log_flush && ailp->xa_last_pushed_lsn == 0 && - (!list_empty_careful(&ailp->xa_buf_list) || + if (ailp->ail_log_flush && ailp->ail_last_pushed_lsn == 0 && + (!list_empty_careful(&ailp->ail_buf_list) || xfs_ail_min_lsn(ailp))) { - ailp->xa_log_flush = 0; + ailp->ail_log_flush = 0; XFS_STATS_INC(mp, xs_push_ail_flush); xfs_log_force(mp, XFS_LOG_SYNC); } - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); - /* barrier matches the xa_target update in xfs_ail_push() */ + /* barrier matches the ail_target update in xfs_ail_push() */ smp_rmb(); - target = ailp->xa_target; - ailp->xa_target_prev = target; + target = ailp->ail_target; + ailp->ail_target_prev = target; - lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn); + lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->ail_last_pushed_lsn); if (!lip) { /* * If the AIL is empty or our push has reached the end we are * done now. */ xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); goto out_done; } @@ -404,7 +404,7 @@ xfsaild_push( XFS_STATS_INC(mp, xs_push_ail_success); trace_xfs_ail_push(lip); - ailp->xa_last_pushed_lsn = lsn; + ailp->ail_last_pushed_lsn = lsn; break; case XFS_ITEM_FLUSHING: @@ -423,7 +423,7 @@ xfsaild_push( trace_xfs_ail_flushing(lip); flushing++; - ailp->xa_last_pushed_lsn = lsn; + ailp->ail_last_pushed_lsn = lsn; break; case XFS_ITEM_PINNED: @@ -431,7 +431,7 @@ xfsaild_push( trace_xfs_ail_pinned(lip); stuck++; - ailp->xa_log_flush++; + ailp->ail_log_flush++; break; case XFS_ITEM_LOCKED: XFS_STATS_INC(mp, xs_push_ail_locked); @@ -468,10 +468,10 @@ xfsaild_push( lsn = lip->li_lsn; } xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); - if (xfs_buf_delwri_submit_nowait(&ailp->xa_buf_list)) - ailp->xa_log_flush++; + if (xfs_buf_delwri_submit_nowait(&ailp->ail_buf_list)) + ailp->ail_log_flush++; if (!count || XFS_LSN_CMP(lsn, target) >= 0) { out_done: @@ -481,7 +481,7 @@ out_done: * AIL before we start the next scan from the start of the AIL. */ tout = 50; - ailp->xa_last_pushed_lsn = 0; + ailp->ail_last_pushed_lsn = 0; } else if (((stuck + flushing) * 100) / count > 90) { /* * Either there is a lot of contention on the AIL or we are @@ -494,7 +494,7 @@ out_done: * the restart to issue a log force to unpin the stuck items. */ tout = 20; - ailp->xa_last_pushed_lsn = 0; + ailp->ail_last_pushed_lsn = 0; } else { /* * Assume we have more work to do in a short while. @@ -536,26 +536,26 @@ xfsaild( break; } - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); /* * Idle if the AIL is empty and we are not racing with a target * update. We check the AIL after we set the task to a sleep - * state to guarantee that we either catch an xa_target update + * state to guarantee that we either catch an ail_target update * or that a wake_up resets the state to TASK_RUNNING. 
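[Note on the xfsaild() idle logic, continued below] The comment being reworded here relies on the canonical kernel sleep/wakeup ordering: move to a sleep state first, then re-check the condition, so that a wake_up_process() racing with the check resets the task to TASK_RUNNING and the following schedule() returns immediately instead of sleeping forever. The smp_rmb() here pairs with the smp_wmb() around the ail_target update in xfs_ail_push() (below), so a woken pusher also sees the new target. A generic sketch of that ordering, with work_available() and publish_target() as stand-ins for the AIL-specific checks, not real functions:

/* Consumer (the xfsaild side): sleep state before the condition check. */
set_current_state(TASK_INTERRUPTIBLE);
smp_rmb();				/* pairs with the producer's smp_wmb() */
if (!work_available())			/* stand-in for AIL-empty/target test */
	schedule();			/* racing wakeup => returns immediately */
__set_current_state(TASK_RUNNING);

/* Producer (the xfs_ail_push() side): publish first, then wake. */
publish_target();			/* stand-in for the ail_target update */
smp_wmb();				/* order the update before the wakeup */
wake_up_process(worker_task);

xfsaild() actually sleeps via freezable_schedule() so the thread parks correctly across suspend, but the ordering constraint is the same.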
* Otherwise, we run the risk of sleeping indefinitely. * - * The barrier matches the xa_target update in xfs_ail_push(). + * The barrier matches the ail_target update in xfs_ail_push(). */ smp_rmb(); if (!xfs_ail_min(ailp) && - ailp->xa_target == ailp->xa_target_prev) { - spin_unlock(&ailp->xa_lock); + ailp->ail_target == ailp->ail_target_prev) { + spin_unlock(&ailp->ail_lock); freezable_schedule(); tout = 0; continue; } - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); if (tout) freezable_schedule_timeout(msecs_to_jiffies(tout)); @@ -592,8 +592,8 @@ xfs_ail_push( xfs_log_item_t *lip; lip = xfs_ail_min(ailp); - if (!lip || XFS_FORCED_SHUTDOWN(ailp->xa_mount) || - XFS_LSN_CMP(threshold_lsn, ailp->xa_target) <= 0) + if (!lip || XFS_FORCED_SHUTDOWN(ailp->ail_mount) || + XFS_LSN_CMP(threshold_lsn, ailp->ail_target) <= 0) return; /* @@ -601,10 +601,10 @@ xfs_ail_push( * the XFS_AIL_PUSHING_BIT. */ smp_wmb(); - xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn); + xfs_trans_ail_copy_lsn(ailp, &ailp->ail_target, &threshold_lsn); smp_wmb(); - wake_up_process(ailp->xa_task); + wake_up_process(ailp->ail_task); } /* @@ -630,18 +630,18 @@ xfs_ail_push_all_sync( struct xfs_log_item *lip; DEFINE_WAIT(wait); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); while ((lip = xfs_ail_max(ailp)) != NULL) { - prepare_to_wait(&ailp->xa_empty, &wait, TASK_UNINTERRUPTIBLE); - ailp->xa_target = lip->li_lsn; - wake_up_process(ailp->xa_task); - spin_unlock(&ailp->xa_lock); + prepare_to_wait(&ailp->ail_empty, &wait, TASK_UNINTERRUPTIBLE); + ailp->ail_target = lip->li_lsn; + wake_up_process(ailp->ail_task); + spin_unlock(&ailp->ail_lock); schedule(); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); } - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); - finish_wait(&ailp->xa_empty, &wait); + finish_wait(&ailp->ail_empty, &wait); } /* @@ -672,7 +672,7 @@ xfs_trans_ail_update_bulk( struct xfs_ail_cursor *cur, struct xfs_log_item **log_items, int nr_items, - xfs_lsn_t lsn) __releases(ailp->xa_lock) + xfs_lsn_t lsn) __releases(ailp->ail_lock) { xfs_log_item_t *mlip; int mlip_changed = 0; @@ -705,13 +705,13 @@ xfs_trans_ail_update_bulk( xfs_ail_splice(ailp, cur, &tmp, lsn); if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) - xlog_assign_tail_lsn_locked(ailp->xa_mount); - spin_unlock(&ailp->xa_lock); + if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount)) + xlog_assign_tail_lsn_locked(ailp->ail_mount); + spin_unlock(&ailp->ail_lock); - xfs_log_space_wake(ailp->xa_mount); + xfs_log_space_wake(ailp->ail_mount); } else { - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); } } @@ -756,13 +756,13 @@ void xfs_trans_ail_delete( struct xfs_ail *ailp, struct xfs_log_item *lip, - int shutdown_type) __releases(ailp->xa_lock) + int shutdown_type) __releases(ailp->ail_lock) { - struct xfs_mount *mp = ailp->xa_mount; + struct xfs_mount *mp = ailp->ail_mount; bool mlip_changed; if (!(lip->li_flags & XFS_LI_IN_AIL)) { - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); if (!XFS_FORCED_SHUTDOWN(mp)) { xfs_alert_tag(mp, XFS_PTAG_AILDELETE, "%s: attempting to delete a log item that is not in the AIL", @@ -776,13 +776,13 @@ xfs_trans_ail_delete( if (mlip_changed) { if (!XFS_FORCED_SHUTDOWN(mp)) xlog_assign_tail_lsn_locked(mp); - if (list_empty(&ailp->xa_ail)) - wake_up_all(&ailp->xa_empty); + if (list_empty(&ailp->ail_head)) + wake_up_all(&ailp->ail_empty); } - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); if (mlip_changed) - 
xfs_log_space_wake(ailp->xa_mount); + xfs_log_space_wake(ailp->ail_mount); } int @@ -795,16 +795,16 @@ xfs_trans_ail_init( if (!ailp) return -ENOMEM; - ailp->xa_mount = mp; - INIT_LIST_HEAD(&ailp->xa_ail); - INIT_LIST_HEAD(&ailp->xa_cursors); - spin_lock_init(&ailp->xa_lock); - INIT_LIST_HEAD(&ailp->xa_buf_list); - init_waitqueue_head(&ailp->xa_empty); + ailp->ail_mount = mp; + INIT_LIST_HEAD(&ailp->ail_head); + INIT_LIST_HEAD(&ailp->ail_cursors); + spin_lock_init(&ailp->ail_lock); + INIT_LIST_HEAD(&ailp->ail_buf_list); + init_waitqueue_head(&ailp->ail_empty); - ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", - ailp->xa_mount->m_fsname); - if (IS_ERR(ailp->xa_task)) + ailp->ail_task = kthread_run(xfsaild, ailp, "xfsaild/%s", + ailp->ail_mount->m_fsname); + if (IS_ERR(ailp->ail_task)) goto out_free_ailp; mp->m_ail = ailp; @@ -821,6 +821,6 @@ xfs_trans_ail_destroy( { struct xfs_ail *ailp = mp->m_ail; - kthread_stop(ailp->xa_task); + kthread_stop(ailp->ail_task); kmem_free(ailp); } diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 653ce379d36b..a5d9dfc45d98 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -431,8 +431,8 @@ xfs_trans_brelse( * If the fs has shutdown and we dropped the last reference, it may fall * on us to release a (possibly dirty) bli if it never made it to the * AIL (e.g., the aborted unpin already happened and didn't release it - * due to our reference). Since we're already shutdown and need xa_lock, - * just force remove from the AIL and release the bli here. + * due to our reference). Since we're already shutdown and need + * ail_lock, just force remove from the AIL and release the bli here. */ if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) { xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR); diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 4a89da4b6fe7..07cea592dc01 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -98,10 +98,24 @@ xfs_trans_log_inode( xfs_inode_t *ip, uint flags) { + struct inode *inode = VFS_I(ip); + ASSERT(ip->i_itemp != NULL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); /* + * Don't bother with i_lock for the I_DIRTY_TIME check here, as races + * don't matter - we either will need an extra transaction in 24 hours + * to log the timestamps, or will clear already cleared fields in the + * worst case. + */ + if (inode->i_state & (I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED)) { + spin_lock(&inode->i_lock); + inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED); + spin_unlock(&inode->i_lock); + } + + /* * Record the specific change for fdatasync optimisation. This * allows fdatasync to skip log forces for inodes that are only * timestamp dirty. We do this before the change count so that diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index b317a3644c00..be24b0c8a332 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -65,17 +65,17 @@ struct xfs_ail_cursor { * Eventually we need to drive the locking in here as well. 
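[Note on the xfs_trans_log_inode() hunk above] This closes the loop opened by xfs_fs_dirty_inode() earlier in the diff: once any change to the inode is being logged, the timestamps ride along in the logged inode core, so the VFS dirty-time state can be cleared to stop writeback from redundantly pushing the same timestamps later. The unlocked pre-check is deliberate; as the new comment says, losing the race costs at worst one extra transaction or a redundant clear. The life cycle, restated as a sketch with the locking made explicit (same flags as the hunk):

/*
 * Lazytime round trip as visible in this diff (sketch):
 *  1. with SB_LAZYTIME, a timestamp update only sets I_DIRTY_TIME;
 *  2. fsync/sync/expiry triggers ->dirty_inode(I_DIRTY_SYNC), and
 *     xfs_fs_dirty_inode() logs XFS_ILOG_TIMESTAMP;
 *  3. any xfs_trans_log_inode() call then clears the VFS dirty-time
 *     flags, because the logged inode core already carries the times.
 */
if (inode->i_state & (I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED)) {
	spin_lock(&inode->i_lock);	/* i_state is protected by i_lock */
	inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED);
	spin_unlock(&inode->i_lock);
}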
*/ struct xfs_ail { - struct xfs_mount *xa_mount; - struct task_struct *xa_task; - struct list_head xa_ail; - xfs_lsn_t xa_target; - xfs_lsn_t xa_target_prev; - struct list_head xa_cursors; - spinlock_t xa_lock; - xfs_lsn_t xa_last_pushed_lsn; - int xa_log_flush; - struct list_head xa_buf_list; - wait_queue_head_t xa_empty; + struct xfs_mount *ail_mount; + struct task_struct *ail_task; + struct list_head ail_head; + xfs_lsn_t ail_target; + xfs_lsn_t ail_target_prev; + struct list_head ail_cursors; + spinlock_t ail_lock; + xfs_lsn_t ail_last_pushed_lsn; + int ail_log_flush; + struct list_head ail_buf_list; + wait_queue_head_t ail_empty; }; /* @@ -84,7 +84,7 @@ struct xfs_ail { void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, struct xfs_ail_cursor *cur, struct xfs_log_item **log_items, int nr_items, - xfs_lsn_t lsn) __releases(ailp->xa_lock); + xfs_lsn_t lsn) __releases(ailp->ail_lock); /* * Return a pointer to the first item in the AIL. If the AIL is empty, then * return NULL. @@ -93,7 +93,7 @@ static inline struct xfs_log_item * xfs_ail_min( struct xfs_ail *ailp) { - return list_first_entry_or_null(&ailp->xa_ail, struct xfs_log_item, + return list_first_entry_or_null(&ailp->ail_head, struct xfs_log_item, li_ail); } @@ -101,14 +101,14 @@ static inline void xfs_trans_ail_update( struct xfs_ail *ailp, struct xfs_log_item *lip, - xfs_lsn_t lsn) __releases(ailp->xa_lock) + xfs_lsn_t lsn) __releases(ailp->ail_lock) { xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); } bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip, - int shutdown_type) __releases(ailp->xa_lock); + int shutdown_type) __releases(ailp->ail_lock); static inline void xfs_trans_ail_remove( @@ -117,12 +117,12 @@ xfs_trans_ail_remove( { struct xfs_ail *ailp = lip->li_ailp; - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); /* xfs_trans_ail_delete() drops the AIL lock */ if (lip->li_flags & XFS_LI_IN_AIL) xfs_trans_ail_delete(ailp, lip, shutdown_type); else - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); } void xfs_ail_push(struct xfs_ail *, xfs_lsn_t); @@ -149,9 +149,9 @@ xfs_trans_ail_copy_lsn( xfs_lsn_t *src) { ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */ - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); *dst = *src; - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); } #else static inline void @@ -172,7 +172,7 @@ xfs_clear_li_failed( struct xfs_buf *bp = lip->li_buf; ASSERT(lip->li_flags & XFS_LI_IN_AIL); - lockdep_assert_held(&lip->li_ailp->xa_lock); + lockdep_assert_held(&lip->li_ailp->ail_lock); if (lip->li_flags & XFS_LI_FAILED) { lip->li_flags &= ~XFS_LI_FAILED; @@ -186,7 +186,7 @@ xfs_set_li_failed( struct xfs_log_item *lip, struct xfs_buf *bp) { - lockdep_assert_held(&lip->li_ailp->xa_lock); + lockdep_assert_held(&lip->li_ailp->ail_lock); if (!(lip->li_flags & XFS_LI_FAILED)) { xfs_buf_hold(bp);
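[Note on the xfs_trans_priv.h prototypes above] A convention worth calling out: functions tagged __releases(ailp->ail_lock) are entered with the lock held and drop it on every return path, which is what lets xfs_trans_ail_remove() above hand the lock straight to xfs_trans_ail_delete(). sparse checks the annotation statically; lockdep_assert_held() (as in xfs_clear_li_failed()) checks the runtime side. A minimal sketch of the shape, with made-up function names rather than the real XFS ones:

/* Sketch of the unlock-on-return contract (not a real XFS function). */
static void example_delete(struct xfs_ail *ailp, struct xfs_log_item *lip)
	__releases(ailp->ail_lock)
{
	lockdep_assert_held(&ailp->ail_lock);
	/* ... unlink lip from the AIL ... */
	spin_unlock(&ailp->ail_lock);	/* every path must release the lock */
}

static void example_caller(struct xfs_log_item *lip)
{
	struct xfs_ail *ailp = lip->li_ailp;

	spin_lock(&ailp->ail_lock);
	if (lip->li_flags & XFS_LI_IN_AIL)
		example_delete(ailp, lip);	/* drops ail_lock */
	else
		spin_unlock(&ailp->ail_lock);
}

Relatedly, xfs_trans_ail_copy_lsn() takes ail_lock only to keep the 8-byte LSN copy atomic on targets where a 64-bit store is not a single instruction; the ASSERT and its "don't lock if it shrinks" comment pin down that assumption.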