Diffstat (limited to 'fs')
451 files changed, 16934 insertions, 16787 deletions
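The bulk of this diff converts 9p, AFS and cachefiles from the old fscache netfs/cookie-definition API to the rewritten volume-and-cookie API. As orientation only, here is a minimal sketch of the acquisition and release sequence the converted filesystems follow; it uses only calls that appear in the hunks below, and the my_fs_* name, the key/aux layout and the single-function structure are illustrative, not part of the patch:

	/* Illustrative sketch only: the volume/cookie lifecycle used by the
	 * converted 9p/afs code below, collapsed into one function for brevity.
	 */
	static int my_fs_cache_lifecycle(struct inode *inode, struct file *file)
	{
		struct fscache_volume *volume;
		struct fscache_cookie *cookie;
		__le64 key = cpu_to_le64(inode->i_ino);          /* cf. qid.path in 9p */
		__le32 aux = cpu_to_le32(inode->i_generation);   /* cf. qid.version in 9p */

		/* One volume per superblock/session (cf. v9fs_cache_session_get_cookie,
		 * afs_activate_volume); -EBUSY means the volume key is already in use. */
		volume = fscache_acquire_volume("myfs,example", NULL, NULL, 0);
		if (IS_ERR(volume))
			return volume == ERR_PTR(-EBUSY) ? 0 : PTR_ERR(volume);

		/* One data-storage cookie per inode, keyed by 'key' and kept
		 * coherent by 'aux' and the object size. */
		cookie = fscache_acquire_cookie(volume, 0 /* advice flags */,
						&key, sizeof(key),
						&aux, sizeof(aux),
						i_size_read(inode));

		/* Pin the cookie around each open file description. */
		fscache_use_cookie(cookie, file->f_mode & FMODE_WRITE);
		/* ... reads and writes go through the netfs/fscache helpers ... */
		fscache_unuse_cookie(cookie, &aux, NULL);

		/* On inode eviction and unmount. */
		fscache_relinquish_cookie(cookie, false /* retire? */);
		fscache_relinquish_volume(volume, NULL, false);
		return 0;
	}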
diff --git a/fs/9p/cache.c b/fs/9p/cache.c index f2ba131cede1..55e108e5e133 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -16,186 +16,61 @@ #include "v9fs.h" #include "cache.h" -#define CACHETAG_LEN 11 - -struct fscache_netfs v9fs_cache_netfs = { - .name = "9p", - .version = 0, -}; - -/* - * v9fs_random_cachetag - Generate a random tag to be associated - * with a new cache session. - * - * The value of jiffies is used for a fairly randomly cache tag. - */ - -static -int v9fs_random_cachetag(struct v9fs_session_info *v9ses) +int v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses, + const char *dev_name) { - v9ses->cachetag = kmalloc(CACHETAG_LEN, GFP_KERNEL); - if (!v9ses->cachetag) - return -ENOMEM; + struct fscache_volume *vcookie; + char *name, *p; - return scnprintf(v9ses->cachetag, CACHETAG_LEN, "%lu", jiffies); -} - -const struct fscache_cookie_def v9fs_cache_session_index_def = { - .name = "9P.session", - .type = FSCACHE_COOKIE_TYPE_INDEX, -}; + name = kasprintf(GFP_KERNEL, "9p,%s,%s", + dev_name, v9ses->cachetag ?: v9ses->aname); + if (!name) + return -ENOMEM; -void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) -{ - /* If no cache session tag was specified, we generate a random one. */ - if (!v9ses->cachetag) { - if (v9fs_random_cachetag(v9ses) < 0) { - v9ses->fscache = NULL; - kfree(v9ses->cachetag); - v9ses->cachetag = NULL; - return; + for (p = name; *p; p++) + if (*p == '/') + *p = ';'; + + vcookie = fscache_acquire_volume(name, NULL, NULL, 0); + p9_debug(P9_DEBUG_FSC, "session %p get volume %p (%s)\n", + v9ses, vcookie, name); + if (IS_ERR(vcookie)) { + if (vcookie != ERR_PTR(-EBUSY)) { + kfree(name); + return PTR_ERR(vcookie); } + pr_err("Cache volume key already in use (%s)\n", name); + vcookie = NULL; } - - v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index, - &v9fs_cache_session_index_def, - v9ses->cachetag, - strlen(v9ses->cachetag), - NULL, 0, - v9ses, 0, true); - p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n", - v9ses, v9ses->fscache); -} - -void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses) -{ - p9_debug(P9_DEBUG_FSC, "session %p put cookie %p\n", - v9ses, v9ses->fscache); - fscache_relinquish_cookie(v9ses->fscache, NULL, false); - v9ses->fscache = NULL; -} - -static enum -fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data, - const void *buffer, - uint16_t buflen, - loff_t object_size) -{ - const struct v9fs_inode *v9inode = cookie_netfs_data; - - if (buflen != sizeof(v9inode->qid.version)) - return FSCACHE_CHECKAUX_OBSOLETE; - - if (memcmp(buffer, &v9inode->qid.version, - sizeof(v9inode->qid.version))) - return FSCACHE_CHECKAUX_OBSOLETE; - - return FSCACHE_CHECKAUX_OKAY; + v9ses->fscache = vcookie; + kfree(name); + return 0; } -const struct fscache_cookie_def v9fs_cache_inode_index_def = { - .name = "9p.inode", - .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .check_aux = v9fs_cache_inode_check_aux, -}; - void v9fs_cache_inode_get_cookie(struct inode *inode) { struct v9fs_inode *v9inode; struct v9fs_session_info *v9ses; + __le32 version; + __le64 path; if (!S_ISREG(inode->i_mode)) return; v9inode = V9FS_I(inode); - if (v9inode->fscache) + if (WARN_ON(v9inode->fscache)) return; + version = cpu_to_le32(v9inode->qid.version); + path = cpu_to_le64(v9inode->qid.path); v9ses = v9fs_inode2v9ses(inode); - v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, - &v9fs_cache_inode_index_def, - &v9inode->qid.path, - sizeof(v9inode->qid.path), - &v9inode->qid.version, - 
sizeof(v9inode->qid.version), - v9inode, - i_size_read(&v9inode->vfs_inode), - true); + v9inode->fscache = + fscache_acquire_cookie(v9fs_session_cache(v9ses), + 0, + &path, sizeof(path), + &version, sizeof(version), + i_size_read(&v9inode->vfs_inode)); p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", inode, v9inode->fscache); } - -void v9fs_cache_inode_put_cookie(struct inode *inode) -{ - struct v9fs_inode *v9inode = V9FS_I(inode); - - if (!v9inode->fscache) - return; - p9_debug(P9_DEBUG_FSC, "inode %p put cookie %p\n", - inode, v9inode->fscache); - - fscache_relinquish_cookie(v9inode->fscache, &v9inode->qid.version, - false); - v9inode->fscache = NULL; -} - -void v9fs_cache_inode_flush_cookie(struct inode *inode) -{ - struct v9fs_inode *v9inode = V9FS_I(inode); - - if (!v9inode->fscache) - return; - p9_debug(P9_DEBUG_FSC, "inode %p flush cookie %p\n", - inode, v9inode->fscache); - - fscache_relinquish_cookie(v9inode->fscache, NULL, true); - v9inode->fscache = NULL; -} - -void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp) -{ - struct v9fs_inode *v9inode = V9FS_I(inode); - - if (!v9inode->fscache) - return; - - mutex_lock(&v9inode->fscache_lock); - - if ((filp->f_flags & O_ACCMODE) != O_RDONLY) - v9fs_cache_inode_flush_cookie(inode); - else - v9fs_cache_inode_get_cookie(inode); - - mutex_unlock(&v9inode->fscache_lock); -} - -void v9fs_cache_inode_reset_cookie(struct inode *inode) -{ - struct v9fs_inode *v9inode = V9FS_I(inode); - struct v9fs_session_info *v9ses; - struct fscache_cookie *old; - - if (!v9inode->fscache) - return; - - old = v9inode->fscache; - - mutex_lock(&v9inode->fscache_lock); - fscache_relinquish_cookie(v9inode->fscache, NULL, true); - - v9ses = v9fs_inode2v9ses(inode); - v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, - &v9fs_cache_inode_index_def, - &v9inode->qid.path, - sizeof(v9inode->qid.path), - &v9inode->qid.version, - sizeof(v9inode->qid.version), - v9inode, - i_size_read(&v9inode->vfs_inode), - true); - p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n", - inode, old, v9inode->fscache); - - mutex_unlock(&v9inode->fscache_lock); -} diff --git a/fs/9p/cache.h b/fs/9p/cache.h index 7480b4b49fea..1923affcdc62 100644 --- a/fs/9p/cache.h +++ b/fs/9p/cache.h @@ -7,26 +7,15 @@ #ifndef _9P_CACHE_H #define _9P_CACHE_H -#define FSCACHE_USE_NEW_IO_API + #include <linux/fscache.h> #ifdef CONFIG_9P_FSCACHE -extern struct fscache_netfs v9fs_cache_netfs; -extern const struct fscache_cookie_def v9fs_cache_session_index_def; -extern const struct fscache_cookie_def v9fs_cache_inode_index_def; - -extern void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses); -extern void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses); +extern int v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses, + const char *dev_name); extern void v9fs_cache_inode_get_cookie(struct inode *inode); -extern void v9fs_cache_inode_put_cookie(struct inode *inode); -extern void v9fs_cache_inode_flush_cookie(struct inode *inode); -extern void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp); -extern void v9fs_cache_inode_reset_cookie(struct inode *inode); - -extern int __v9fs_cache_register(void); -extern void __v9fs_cache_unregister(void); #else /* CONFIG_9P_FSCACHE */ @@ -34,13 +23,5 @@ static inline void v9fs_cache_inode_get_cookie(struct inode *inode) { } -static inline void v9fs_cache_inode_put_cookie(struct inode *inode) -{ -} - -static inline void v9fs_cache_inode_set_cookie(struct inode *inode, struct 
file *file) -{ -} - #endif /* CONFIG_9P_FSCACHE */ #endif /* _9P_CACHE_H */ diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 6aab046c98e2..79df61fe0e59 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -96,12 +96,8 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any) dentry, dentry, from_kuid(&init_user_ns, uid), any); ret = NULL; - - if (d_inode(dentry)) - ret = v9fs_fid_find_inode(d_inode(dentry), uid); - /* we'll recheck under lock if there's anything to look in */ - if (!ret && dentry->d_fsdata) { + if (dentry->d_fsdata) { struct hlist_head *h = (struct hlist_head *)&dentry->d_fsdata; spin_lock(&dentry->d_lock); @@ -113,6 +109,9 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any) } } spin_unlock(&dentry->d_lock); + } else { + if (dentry->d_inode) + ret = v9fs_fid_find_inode(dentry->d_inode, uid); } return ret; diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index e32dd5f7721b..08f65c40af4f 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -469,7 +469,11 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, #ifdef CONFIG_9P_FSCACHE /* register the session for caching */ - v9fs_cache_session_get_cookie(v9ses); + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + rc = v9fs_cache_session_get_cookie(v9ses, dev_name); + if (rc < 0) + goto err_clnt; + } #endif spin_lock(&v9fs_sessionlist_lock); list_add(&v9ses->slist, &v9fs_sessionlist); @@ -502,8 +506,7 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) } #ifdef CONFIG_9P_FSCACHE - if (v9ses->fscache) - v9fs_cache_session_put_cookie(v9ses); + fscache_relinquish_volume(v9fs_session_cache(v9ses), NULL, false); kfree(v9ses->cachetag); #endif kfree(v9ses->uname); @@ -665,20 +668,12 @@ static int v9fs_cache_register(void) ret = v9fs_init_inode_cache(); if (ret < 0) return ret; -#ifdef CONFIG_9P_FSCACHE - ret = fscache_register_netfs(&v9fs_cache_netfs); - if (ret < 0) - v9fs_destroy_inode_cache(); -#endif return ret; } static void v9fs_cache_unregister(void) { v9fs_destroy_inode_cache(); -#ifdef CONFIG_9P_FSCACHE - fscache_unregister_netfs(&v9fs_cache_netfs); -#endif } /** diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 1647a8e63671..bc8b30205d36 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -89,7 +89,7 @@ struct v9fs_session_info { unsigned int cache; #ifdef CONFIG_9P_FSCACHE char *cachetag; - struct fscache_cookie *fscache; + struct fscache_volume *fscache; #endif char *uname; /* user name to mount as */ @@ -109,7 +109,6 @@ struct v9fs_session_info { struct v9fs_inode { #ifdef CONFIG_9P_FSCACHE - struct mutex fscache_lock; struct fscache_cookie *fscache; #endif struct p9_qid qid; @@ -133,6 +132,16 @@ static inline struct fscache_cookie *v9fs_inode_cookie(struct v9fs_inode *v9inod #endif } +static inline struct fscache_volume *v9fs_session_cache(struct v9fs_session_info *v9ses) +{ +#ifdef CONFIG_9P_FSCACHE + return v9ses->fscache; +#else + return NULL; +#endif +} + + extern int v9fs_show_options(struct seq_file *m, struct dentry *root); struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index fac918ccb305..9a10e68c5f30 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -16,6 +16,7 @@ #include <linux/pagemap.h> #include <linux/idr.h> #include <linux/sched.h> +#include <linux/swap.h> #include <linux/uio.h> #include <linux/netfs.h> #include <net/9p/9p.h> @@ -42,6 +43,11 @@ static void v9fs_req_issue_op(struct netfs_read_subrequest *subreq) iov_iter_xarray(&to, READ, &rreq->mapping->i_pages, pos, 
len); total = p9_client_read(fid, pos, &to, &err); + + /* if we just extended the file size, any portion not in + * cache won't be on server and is zeroes */ + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + netfs_subreq_terminated(subreq, err ?: total, false); } @@ -78,7 +84,7 @@ static bool v9fs_is_cache_enabled(struct inode *inode) { struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(inode)); - return fscache_cookie_enabled(cookie) && !hlist_empty(&cookie->backing_objects); + return fscache_cookie_enabled(cookie) && cookie->cache_priv; } /** @@ -87,9 +93,13 @@ static bool v9fs_is_cache_enabled(struct inode *inode) */ static int v9fs_begin_cache_operation(struct netfs_read_request *rreq) { +#ifdef CONFIG_9P_FSCACHE struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(rreq->inode)); - return fscache_begin_read_operation(rreq, cookie); + return fscache_begin_read_operation(&rreq->cache_resources, cookie); +#else + return -ENOBUFS; +#endif } static const struct netfs_read_request_ops v9fs_req_ops = { @@ -133,16 +143,18 @@ static void v9fs_vfs_readahead(struct readahead_control *ractl) static int v9fs_release_page(struct page *page, gfp_t gfp) { struct folio *folio = page_folio(page); + struct inode *inode = folio_inode(folio); if (folio_test_private(folio)) return 0; #ifdef CONFIG_9P_FSCACHE if (folio_test_fscache(folio)) { - if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) + if (current_is_kswapd() || !(gfp & __GFP_FS)) return 0; folio_wait_fscache(folio); } #endif + fscache_note_page_release(v9fs_inode_cookie(V9FS_I(inode))); return 1; } @@ -161,10 +173,25 @@ static void v9fs_invalidate_page(struct page *page, unsigned int offset, folio_wait_fscache(folio); } +static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error, + bool was_async) +{ + struct v9fs_inode *v9inode = priv; + __le32 version; + + if (IS_ERR_VALUE(transferred_or_error) && + transferred_or_error != -ENOBUFS) { + version = cpu_to_le32(v9inode->qid.version); + fscache_invalidate(v9fs_inode_cookie(v9inode), &version, + i_size_read(&v9inode->vfs_inode), 0); + } +} + static int v9fs_vfs_write_folio_locked(struct folio *folio) { struct inode *inode = folio_inode(folio); struct v9fs_inode *v9inode = V9FS_I(inode); + struct fscache_cookie *cookie = v9fs_inode_cookie(v9inode); loff_t start = folio_pos(folio); loff_t i_size = i_size_read(inode); struct iov_iter from; @@ -181,10 +208,21 @@ static int v9fs_vfs_write_folio_locked(struct folio *folio) /* We should have writeback_fid always set */ BUG_ON(!v9inode->writeback_fid); + folio_wait_fscache(folio); folio_start_writeback(folio); p9_client_write(v9inode->writeback_fid, start, &from, &err); + if (err == 0 && + fscache_cookie_enabled(cookie) && + test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) { + folio_start_fscache(folio); + fscache_write_to_cache(v9fs_inode_cookie(v9inode), + folio_mapping(folio), start, len, i_size, + v9fs_write_to_cache_done, v9inode, + true); + } + folio_end_writeback(folio); return err; } @@ -303,6 +341,7 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, loff_t last_pos = pos + copied; struct folio *folio = page_folio(subpage); struct inode *inode = mapping->host; + struct v9fs_inode *v9inode = V9FS_I(inode); p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); @@ -322,6 +361,7 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, if (last_pos > inode->i_size) { inode_add_bytes(inode, last_pos - inode->i_size); i_size_write(inode, last_pos); + 
fscache_update_cookie(v9fs_inode_cookie(v9inode), NULL, &last_pos); } folio_mark_dirty(folio); out: @@ -331,11 +371,25 @@ out: return copied; } +#ifdef CONFIG_9P_FSCACHE +/* + * Mark a page as having been made dirty and thus needing writeback. We also + * need to pin the cache object to write back to. + */ +static int v9fs_set_page_dirty(struct page *page) +{ + struct v9fs_inode *v9inode = V9FS_I(page->mapping->host); + + return fscache_set_page_dirty(page, v9fs_inode_cookie(v9inode)); +} +#else +#define v9fs_set_page_dirty __set_page_dirty_nobuffers +#endif const struct address_space_operations v9fs_addr_operations = { .readpage = v9fs_vfs_readpage, .readahead = v9fs_vfs_readahead, - .set_page_dirty = __set_page_dirty_nobuffers, + .set_page_dirty = v9fs_set_page_dirty, .writepage = v9fs_vfs_writepage, .write_begin = v9fs_write_begin, .write_end = v9fs_write_end, diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 8c854d8cb0cd..958680f7f23e 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -17,6 +17,7 @@ #include <linux/idr.h> #include <linux/slab.h> #include <linux/uio.h> +#include <linux/fscache.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -205,7 +206,10 @@ static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx) int v9fs_dir_release(struct inode *inode, struct file *filp) { + struct v9fs_inode *v9inode = V9FS_I(inode); struct p9_fid *fid; + __le32 version; + loff_t i_size; fid = filp->private_data; p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n", @@ -216,6 +220,15 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) spin_unlock(&inode->i_lock); p9_client_clunk(fid); } + + if ((filp->f_mode & FMODE_WRITE)) { + version = cpu_to_le32(v9inode->qid.version); + i_size = i_size_read(inode); + fscache_unuse_cookie(v9fs_inode_cookie(v9inode), + &version, &i_size); + } else { + fscache_unuse_cookie(v9fs_inode_cookie(v9inode), NULL, NULL); + } return 0; } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 612e297f3763..2573c08f335c 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -93,7 +93,8 @@ int v9fs_file_open(struct inode *inode, struct file *file) } mutex_unlock(&v9inode->v_mutex); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - v9fs_cache_inode_set_cookie(inode, file); + fscache_use_cookie(v9fs_inode_cookie(v9inode), + file->f_mode & FMODE_WRITE); v9fs_open_fid_add(inode, fid); return 0; out_error: @@ -114,7 +115,6 @@ out_error: static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) { - int res = 0; struct inode *inode = file_inode(filp); p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); @@ -124,7 +124,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) invalidate_mapping_pages(&inode->i_data, 0, -1); } - return res; + return 0; } static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) @@ -139,8 +139,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) fid = filp->private_data; BUG_ON(fid == NULL); - if ((fl->fl_flags & FL_POSIX) != FL_POSIX) - BUG(); + BUG_ON((fl->fl_flags & FL_POSIX) != FL_POSIX); res = locks_lock_file_wait(filp, fl); if (res < 0) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 328c338ff304..2a10242c79c7 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -233,7 +233,6 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) return NULL; #ifdef CONFIG_9P_FSCACHE v9inode->fscache = NULL; - mutex_init(&v9inode->fscache_lock); #endif v9inode->writeback_fid = NULL; 
v9inode->cache_validity = 0; @@ -381,12 +380,16 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev) void v9fs_evict_inode(struct inode *inode) { struct v9fs_inode *v9inode = V9FS_I(inode); + __le32 version; truncate_inode_pages_final(&inode->i_data); + version = cpu_to_le32(v9inode->qid.version); + fscache_clear_inode_writeback(v9fs_inode_cookie(v9inode), inode, + &version); clear_inode(inode); filemap_fdatawrite(&inode->i_data); - v9fs_cache_inode_put_cookie(inode); + fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false); /* clunk the fid stashed in writeback_fid */ if (v9inode->writeback_fid) { p9_client_clunk(v9inode->writeback_fid); @@ -869,7 +872,8 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, file->private_data = fid; if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - v9fs_cache_inode_set_cookie(d_inode(dentry), file); + fscache_use_cookie(v9fs_inode_cookie(v9inode), + file->f_mode & FMODE_WRITE); v9fs_open_fid_add(inode, fid); file->f_mode |= FMODE_CREATED; @@ -1072,6 +1076,8 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *iattr) { int retval, use_dentry = 0; + struct inode *inode = d_inode(dentry); + struct v9fs_inode *v9inode = V9FS_I(inode); struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL; struct p9_wstat wstat; @@ -1117,7 +1123,7 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns, /* Write all dirty data */ if (d_is_reg(dentry)) - filemap_write_and_wait(d_inode(dentry)->i_mapping); + filemap_write_and_wait(inode->i_mapping); retval = p9_client_wstat(fid, &wstat); @@ -1128,13 +1134,15 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns, return retval; if ((iattr->ia_valid & ATTR_SIZE) && - iattr->ia_size != i_size_read(d_inode(dentry))) - truncate_setsize(d_inode(dentry), iattr->ia_size); + iattr->ia_size != i_size_read(inode)) { + truncate_setsize(inode, iattr->ia_size); + fscache_resize_cookie(v9fs_inode_cookie(v9inode), iattr->ia_size); + } - v9fs_invalidate_inode_attr(d_inode(dentry)); + v9fs_invalidate_inode_attr(inode); - setattr_copy(&init_user_ns, d_inode(dentry), iattr); - mark_inode_dirty(d_inode(dentry)); + setattr_copy(&init_user_ns, inode, iattr); + mark_inode_dirty(inode); return 0; } diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 7dee89ba32e7..d17502a738a9 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -344,7 +344,8 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, goto err_clunk_old_fid; file->private_data = ofid; if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - v9fs_cache_inode_set_cookie(inode, file); + fscache_use_cookie(v9fs_inode_cookie(v9inode), + file->f_mode & FMODE_WRITE); v9fs_open_fid_add(inode, ofid); file->f_mode |= FMODE_CREATED; out: @@ -551,7 +552,10 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns, { int retval, use_dentry = 0; struct p9_fid *fid = NULL; - struct p9_iattr_dotl p9attr; + struct p9_iattr_dotl p9attr = { + .uid = INVALID_UID, + .gid = INVALID_GID, + }; struct inode *inode = d_inode(dentry); p9_debug(P9_DEBUG_VFS, "\n"); @@ -561,14 +565,22 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns, return retval; p9attr.valid = v9fs_mapped_iattr_valid(iattr->ia_valid); - p9attr.mode = iattr->ia_mode; - p9attr.uid = iattr->ia_uid; - p9attr.gid = iattr->ia_gid; - p9attr.size = iattr->ia_size; - p9attr.atime_sec = iattr->ia_atime.tv_sec; - p9attr.atime_nsec = 
iattr->ia_atime.tv_nsec; - p9attr.mtime_sec = iattr->ia_mtime.tv_sec; - p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; + if (iattr->ia_valid & ATTR_MODE) + p9attr.mode = iattr->ia_mode; + if (iattr->ia_valid & ATTR_UID) + p9attr.uid = iattr->ia_uid; + if (iattr->ia_valid & ATTR_GID) + p9attr.gid = iattr->ia_gid; + if (iattr->ia_valid & ATTR_SIZE) + p9attr.size = iattr->ia_size; + if (iattr->ia_valid & ATTR_ATIME_SET) { + p9attr.atime_sec = iattr->ia_atime.tv_sec; + p9attr.atime_nsec = iattr->ia_atime.tv_nsec; + } + if (iattr->ia_valid & ATTR_MTIME_SET) { + p9attr.mtime_sec = iattr->ia_mtime.tv_sec; + p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; + } if (iattr->ia_valid & ATTR_FILE) { fid = iattr->ia_file->private_data; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index b739e02f5ef7..97e23b4e6982 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -20,6 +20,7 @@ #include <linux/slab.h> #include <linux/statfs.h> #include <linux/magic.h> +#include <linux/fscache.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -309,6 +310,7 @@ static int v9fs_write_inode(struct inode *inode, __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return ret; } + fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode)); return 0; } @@ -332,6 +334,7 @@ static int v9fs_write_inode_dotl(struct inode *inode, __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return ret; } + fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode)); return 0; } diff --git a/fs/Kconfig b/fs/Kconfig index a6313a969bc5..6c7dc1387beb 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -15,11 +15,11 @@ config VALIDATE_FS_PARSER Enable this to perform validation of the parameter description for a filesystem when it is registered. -if BLOCK - config FS_IOMAP bool +if BLOCK + source "fs/ext2/Kconfig" source "fs/ext4/Kconfig" source "fs/jbd2/Kconfig" @@ -42,6 +42,8 @@ source "fs/nilfs2/Kconfig" source "fs/f2fs/Kconfig" source "fs/zonefs/Kconfig" +endif # BLOCK + config FS_DAX bool "File system based Direct Access (DAX) support" depends on MMU @@ -89,8 +91,6 @@ config FS_DAX_PMD config FS_DAX_LIMITED bool -endif # BLOCK - # Posix ACL utility routines # # Note: Posix ACLs can be implemented without these helpers. Never use @@ -369,8 +369,8 @@ source "fs/ksmbd/Kconfig" config SMBFS_COMMON tristate - default y if CIFS=y - default m if CIFS=m + default y if CIFS=y || SMB_SERVER=y + default m if CIFS=m || SMB_SERVER=m source "fs/coda/Kconfig" source "fs/afs/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index 84c5e4cdfee5..208a74e0b00e 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -6,6 +6,8 @@ # Rewritten to use lists instead of if-statements. 
# +obj-$(CONFIG_SYSCTL) += sysctls.o + obj-y := open.o read_write.o file_table.o super.o \ char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ ioctl.o readdir.o select.o dcache.o inode.o \ @@ -94,7 +96,7 @@ obj-$(CONFIG_EXPORTFS) += exportfs/ obj-$(CONFIG_NFSD) += nfsd/ obj-$(CONFIG_LOCKD) += lockd/ obj-$(CONFIG_NLS) += nls/ -obj-$(CONFIG_UNICODE) += unicode/ +obj-y += unicode/ obj-$(CONFIG_SYSV_FS) += sysv/ obj-$(CONFIG_SMBFS_COMMON) += smbfs_common/ obj-$(CONFIG_CIFS) += cifs/ diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index adbb3a1edcbf..5156821bfe6a 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -355,7 +355,6 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct super_block *sb = inode->i_sb; struct object_info obj; - int ret; obj.indaddr = ADFS_I(inode)->indaddr; obj.name_len = 0; @@ -365,6 +364,5 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc) obj.attr = ADFS_I(inode)->attr; obj.size = inode->i_size; - ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL); - return ret; + return adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL); } diff --git a/fs/afs/Makefile b/fs/afs/Makefile index 75c4e4043d1d..e8956b65d7ff 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -3,10 +3,7 @@ # Makefile for Red Hat Linux AFS client. # -afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o - kafs-y := \ - $(afs-cache-y) \ addr_list.o \ callback.o \ cell.o \ diff --git a/fs/afs/cache.c b/fs/afs/cache.c deleted file mode 100644 index 037af93e3aba..000000000000 --- a/fs/afs/cache.c +++ /dev/null @@ -1,68 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* AFS caching stuff - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#include <linux/sched.h> -#include "internal.h" - -static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, - const void *buffer, - uint16_t buflen, - loff_t object_size); - -struct fscache_netfs afs_cache_netfs = { - .name = "afs", - .version = 2, -}; - -struct fscache_cookie_def afs_cell_cache_index_def = { - .name = "AFS.cell", - .type = FSCACHE_COOKIE_TYPE_INDEX, -}; - -struct fscache_cookie_def afs_volume_cache_index_def = { - .name = "AFS.volume", - .type = FSCACHE_COOKIE_TYPE_INDEX, -}; - -struct fscache_cookie_def afs_vnode_cache_index_def = { - .name = "AFS.vnode", - .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .check_aux = afs_vnode_cache_check_aux, -}; - -/* - * check that the auxiliary data indicates that the entry is still valid - */ -static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, - const void *buffer, - uint16_t buflen, - loff_t object_size) -{ - struct afs_vnode *vnode = cookie_netfs_data; - struct afs_vnode_cache_aux aux; - - _enter("{%llx,%x,%llx},%p,%u", - vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, - buffer, buflen); - - memcpy(&aux, buffer, sizeof(aux)); - - /* check the size of the data is what we're expecting */ - if (buflen != sizeof(aux)) { - _leave(" = OBSOLETE [len %hx != %zx]", buflen, sizeof(aux)); - return FSCACHE_CHECKAUX_OBSOLETE; - } - - if (vnode->status.data_version != aux.data_version) { - _leave(" = OBSOLETE [vers %llx != %llx]", - aux.data_version, vnode->status.data_version); - return FSCACHE_CHECKAUX_OBSOLETE; - } - - _leave(" = SUCCESS"); - return FSCACHE_CHECKAUX_OKAY; -} diff --git a/fs/afs/cell.c b/fs/afs/cell.c index d88407fb9bc0..07ad744eef77 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -680,13 +680,6 @@ static int 
afs_activate_cell(struct afs_net *net, struct afs_cell *cell) return ret; } -#ifdef CONFIG_AFS_FSCACHE - cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index, - &afs_cell_cache_index_def, - cell->name, strlen(cell->name), - NULL, 0, - cell, 0, true); -#endif ret = afs_proc_cell_setup(cell); if (ret < 0) return ret; @@ -723,11 +716,6 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell) afs_dynroot_rmdir(net, cell); mutex_unlock(&net->proc_cells_lock); -#ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(cell->cache, NULL, false); - cell->cache = NULL; -#endif - _leave(""); } diff --git a/fs/afs/file.c b/fs/afs/file.c index afe4b803f84b..720818a7c166 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -14,6 +14,7 @@ #include <linux/gfp.h> #include <linux/task_io_accounting_ops.h> #include <linux/mm.h> +#include <linux/swap.h> #include <linux/netfs.h> #include "internal.h" @@ -158,7 +159,9 @@ int afs_open(struct inode *inode, struct file *file) if (file->f_flags & O_TRUNC) set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); - + + fscache_use_cookie(afs_vnode_cache(vnode), file->f_mode & FMODE_WRITE); + file->private_data = af; _leave(" = 0"); return 0; @@ -177,8 +180,10 @@ error: */ int afs_release(struct inode *inode, struct file *file) { + struct afs_vnode_cache_aux aux; struct afs_vnode *vnode = AFS_FS_I(inode); struct afs_file *af = file->private_data; + loff_t i_size; int ret = 0; _enter("{%llx:%llu},", vnode->fid.vid, vnode->fid.vnode); @@ -189,6 +194,15 @@ int afs_release(struct inode *inode, struct file *file) file->private_data = NULL; if (af->wb) afs_put_wb_key(af->wb); + + if ((file->f_mode & FMODE_WRITE)) { + i_size = i_size_read(&vnode->vfs_inode); + afs_set_cache_aux(vnode, &aux); + fscache_unuse_cookie(afs_vnode_cache(vnode), &aux, &i_size); + } else { + fscache_unuse_cookie(afs_vnode_cache(vnode), NULL, NULL); + } + key_put(af->key); kfree(af); afs_prune_wb_keys(vnode); @@ -354,14 +368,19 @@ static bool afs_is_cache_enabled(struct inode *inode) { struct fscache_cookie *cookie = afs_vnode_cache(AFS_FS_I(inode)); - return fscache_cookie_enabled(cookie) && !hlist_empty(&cookie->backing_objects); + return fscache_cookie_enabled(cookie) && cookie->cache_priv; } static int afs_begin_cache_operation(struct netfs_read_request *rreq) { +#ifdef CONFIG_AFS_FSCACHE struct afs_vnode *vnode = AFS_FS_I(rreq->inode); - return fscache_begin_read_operation(rreq, afs_vnode_cache(vnode)); + return fscache_begin_read_operation(&rreq->cache_resources, + afs_vnode_cache(vnode)); +#else + return -ENOBUFS; +#endif } static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len, @@ -398,6 +417,12 @@ static void afs_readahead(struct readahead_control *ractl) netfs_readahead(ractl, &afs_req_ops, NULL); } +int afs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + fscache_unpin_writeback(wbc, afs_vnode_cache(AFS_FS_I(inode))); + return 0; +} + /* * Adjust the dirty region of the page on truncation or full invalidation, * getting rid of the markers altogether if the region is entirely invalidated. 
@@ -480,23 +505,24 @@ static void afs_invalidatepage(struct page *page, unsigned int offset, * release a page and clean up its private state if it's not busy * - return true if the page can now be released, false if not */ -static int afs_releasepage(struct page *page, gfp_t gfp_flags) +static int afs_releasepage(struct page *page, gfp_t gfp) { struct folio *folio = page_folio(page); struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); _enter("{{%llx:%llu}[%lu],%lx},%x", vnode->fid.vid, vnode->fid.vnode, folio_index(folio), folio->flags, - gfp_flags); + gfp); /* deny if page is being written to the cache and the caller hasn't * elected to wait */ #ifdef CONFIG_AFS_FSCACHE if (folio_test_fscache(folio)) { - if (!(gfp_flags & __GFP_DIRECT_RECLAIM) || !(gfp_flags & __GFP_FS)) + if (current_is_kswapd() || !(gfp & __GFP_FS)) return false; folio_wait_fscache(folio); } + fscache_note_page_release(afs_vnode_cache(vnode)); #endif if (folio_test_private(folio)) { diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 16906eb592d9..5964f8aee090 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -413,9 +413,9 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) { #ifdef CONFIG_AFS_FSCACHE struct { - u32 vnode_id; - u32 unique; - u32 vnode_id_ext[2]; /* Allow for a 96-bit key */ + __be32 vnode_id; + __be32 unique; + __be32 vnode_id_ext[2]; /* Allow for a 96-bit key */ } __packed key; struct afs_vnode_cache_aux aux; @@ -424,17 +424,18 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) return; } - key.vnode_id = vnode->fid.vnode; - key.unique = vnode->fid.unique; - key.vnode_id_ext[0] = vnode->fid.vnode >> 32; - key.vnode_id_ext[1] = vnode->fid.vnode_hi; - aux.data_version = vnode->status.data_version; - - vnode->cache = fscache_acquire_cookie(vnode->volume->cache, - &afs_vnode_cache_index_def, - &key, sizeof(key), - &aux, sizeof(aux), - vnode, vnode->status.size, true); + key.vnode_id = htonl(vnode->fid.vnode); + key.unique = htonl(vnode->fid.unique); + key.vnode_id_ext[0] = htonl(vnode->fid.vnode >> 32); + key.vnode_id_ext[1] = htonl(vnode->fid.vnode_hi); + afs_set_cache_aux(vnode, &aux); + + vnode->cache = fscache_acquire_cookie( + vnode->volume->cache, + vnode->status.type == AFS_FTYPE_FILE ? 
0 : FSCACHE_ADV_SINGLE_CHUNK, + &key, sizeof(key), + &aux, sizeof(aux), + vnode->status.size); #endif } @@ -563,9 +564,7 @@ static void afs_zap_data(struct afs_vnode *vnode) { _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); -#ifdef CONFIG_AFS_FSCACHE - fscache_invalidate(vnode->cache); -#endif + afs_invalidate_cache(vnode, 0); /* nuke all the non-dirty pages that aren't locked, mapped or being * written back in a regular file and completely discard the pages in a @@ -762,9 +761,8 @@ int afs_drop_inode(struct inode *inode) */ void afs_evict_inode(struct inode *inode) { - struct afs_vnode *vnode; - - vnode = AFS_FS_I(inode); + struct afs_vnode_cache_aux aux; + struct afs_vnode *vnode = AFS_FS_I(inode); _enter("{%llx:%llu.%d}", vnode->fid.vid, @@ -776,6 +774,9 @@ void afs_evict_inode(struct inode *inode) ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); truncate_inode_pages_final(&inode->i_data); + + afs_set_cache_aux(vnode, &aux); + fscache_clear_inode_writeback(afs_vnode_cache(vnode), inode, &aux); clear_inode(inode); while (!list_empty(&vnode->wb_keys)) { @@ -786,14 +787,9 @@ void afs_evict_inode(struct inode *inode) } #ifdef CONFIG_AFS_FSCACHE - { - struct afs_vnode_cache_aux aux; - - aux.data_version = vnode->status.data_version; - fscache_relinquish_cookie(vnode->cache, &aux, - test_bit(AFS_VNODE_DELETED, &vnode->flags)); - vnode->cache = NULL; - } + fscache_relinquish_cookie(vnode->cache, + test_bit(AFS_VNODE_DELETED, &vnode->flags)); + vnode->cache = NULL; #endif afs_prune_wb_keys(vnode); @@ -833,6 +829,9 @@ static void afs_setattr_edit_file(struct afs_operation *op) if (size < i_size) truncate_pagecache(inode, size); + if (size != i_size) + fscache_resize_cookie(afs_vnode_cache(vp->vnode), + vp->scb.status.size); } } @@ -849,40 +848,67 @@ static const struct afs_operation_ops afs_setattr_operation = { int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *attr) { + const unsigned int supported = + ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID | + ATTR_MTIME | ATTR_MTIME_SET | ATTR_TIMES_SET | ATTR_TOUCH; struct afs_operation *op; struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); + struct inode *inode = &vnode->vfs_inode; + loff_t i_size; int ret; _enter("{%llx:%llu},{n=%pd},%x", vnode->fid.vid, vnode->fid.vnode, dentry, attr->ia_valid); - if (!(attr->ia_valid & (ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID | - ATTR_MTIME | ATTR_MTIME_SET | ATTR_TIMES_SET | - ATTR_TOUCH))) { + if (!(attr->ia_valid & supported)) { _leave(" = 0 [unsupported]"); return 0; } + i_size = i_size_read(inode); if (attr->ia_valid & ATTR_SIZE) { - if (!S_ISREG(vnode->vfs_inode.i_mode)) + if (!S_ISREG(inode->i_mode)) return -EISDIR; - ret = inode_newsize_ok(&vnode->vfs_inode, attr->ia_size); + ret = inode_newsize_ok(inode, attr->ia_size); if (ret) return ret; - if (attr->ia_size == i_size_read(&vnode->vfs_inode)) + if (attr->ia_size == i_size) attr->ia_valid &= ~ATTR_SIZE; } - /* flush any dirty data outstanding on a regular file */ - if (S_ISREG(vnode->vfs_inode.i_mode)) - filemap_write_and_wait(vnode->vfs_inode.i_mapping); + fscache_use_cookie(afs_vnode_cache(vnode), true); /* Prevent any new writebacks from starting whilst we do this. 
*/ down_write(&vnode->validate_lock); + if ((attr->ia_valid & ATTR_SIZE) && S_ISREG(inode->i_mode)) { + loff_t size = attr->ia_size; + + /* Wait for any outstanding writes to the server to complete */ + loff_t from = min(size, i_size); + loff_t to = max(size, i_size); + ret = filemap_fdatawait_range(inode->i_mapping, from, to); + if (ret < 0) + goto out_unlock; + + /* Don't talk to the server if we're just shortening in-memory + * writes that haven't gone to the server yet. + */ + if (!(attr->ia_valid & (supported & ~ATTR_SIZE & ~ATTR_MTIME)) && + attr->ia_size < i_size && + attr->ia_size > vnode->status.size) { + truncate_pagecache(inode, attr->ia_size); + fscache_resize_cookie(afs_vnode_cache(vnode), + attr->ia_size); + i_size_write(inode, attr->ia_size); + ret = 0; + goto out_unlock; + } + } + op = afs_alloc_operation(((attr->ia_valid & ATTR_FILE) ? afs_file_key(attr->ia_file) : NULL), vnode->volume); @@ -907,6 +933,7 @@ int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, out_unlock: up_write(&vnode->validate_lock); + fscache_unuse_cookie(afs_vnode_cache(vnode), NULL, NULL); _leave(" = %d", ret); return ret; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index aa4c0d6c9780..b6f02321fc09 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -14,7 +14,6 @@ #include <linux/key.h> #include <linux/workqueue.h> #include <linux/sched.h> -#define FSCACHE_USE_NEW_IO_API #include <linux/fscache.h> #include <linux/backing-dev.h> #include <linux/uuid.h> @@ -364,9 +363,6 @@ struct afs_cell { struct key *anonymous_key; /* anonymous user key for this cell */ struct work_struct manager; /* Manager for init/deinit/dns */ struct hlist_node proc_link; /* /proc cell list link */ -#ifdef CONFIG_AFS_FSCACHE - struct fscache_cookie *cache; /* caching cookie */ -#endif time64_t dns_expiry; /* Time AFSDB/SRV record expires */ time64_t last_inactive; /* Time of last drop of usage count */ atomic_t ref; /* Struct refcount */ @@ -590,7 +586,7 @@ struct afs_volume { #define AFS_VOLUME_BUSY 5 /* - T if volume busy notice given */ #define AFS_VOLUME_MAYBE_NO_IBULK 6 /* - T if some servers don't have InlineBulkStatus */ #ifdef CONFIG_AFS_FSCACHE - struct fscache_cookie *cache; /* caching cookie */ + struct fscache_volume *cache; /* Caching cookie */ #endif struct afs_server_list __rcu *servers; /* List of servers on which volume resides */ rwlock_t servers_lock; /* Lock for ->servers */ @@ -872,9 +868,24 @@ struct afs_operation { * Cache auxiliary data. */ struct afs_vnode_cache_aux { - u64 data_version; + __be64 data_version; } __packed; +static inline void afs_set_cache_aux(struct afs_vnode *vnode, + struct afs_vnode_cache_aux *aux) +{ + aux->data_version = cpu_to_be64(vnode->status.data_version); +} + +static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int flags) +{ + struct afs_vnode_cache_aux aux; + + afs_set_cache_aux(vnode, &aux); + fscache_invalidate(afs_vnode_cache(vnode), &aux, + i_size_read(&vnode->vfs_inode), flags); +} + /* * We use folio->private to hold the amount of the folio that we've written to, * splitting the field into two parts. 
However, we need to represent a range @@ -962,13 +973,6 @@ extern void afs_merge_fs_addr6(struct afs_addr_list *, __be32 *, u16); */ #ifdef CONFIG_AFS_FSCACHE extern struct fscache_netfs afs_cache_netfs; -extern struct fscache_cookie_def afs_cell_cache_index_def; -extern struct fscache_cookie_def afs_volume_cache_index_def; -extern struct fscache_cookie_def afs_vnode_cache_index_def; -#else -#define afs_cell_cache_index_def (*(struct fscache_cookie_def *) NULL) -#define afs_volume_cache_index_def (*(struct fscache_cookie_def *) NULL) -#define afs_vnode_cache_index_def (*(struct fscache_cookie_def *) NULL) #endif /* @@ -1068,6 +1072,7 @@ extern int afs_release(struct inode *, struct file *); extern int afs_fetch_data(struct afs_vnode *, struct afs_read *); extern struct afs_read *afs_alloc_read(gfp_t); extern void afs_put_read(struct afs_read *); +extern int afs_write_inode(struct inode *, struct writeback_control *); static inline struct afs_read *afs_get_read(struct afs_read *req) { @@ -1506,7 +1511,7 @@ extern struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *, * volume.c */ extern struct afs_volume *afs_create_volume(struct afs_fs_context *); -extern void afs_activate_volume(struct afs_volume *); +extern int afs_activate_volume(struct afs_volume *); extern void afs_deactivate_volume(struct afs_volume *); extern struct afs_volume *afs_get_volume(struct afs_volume *, enum afs_volume_trace); extern void afs_put_volume(struct afs_net *, struct afs_volume *, enum afs_volume_trace); @@ -1515,7 +1520,11 @@ extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *); /* * write.c */ +#ifdef CONFIG_AFS_FSCACHE extern int afs_set_page_dirty(struct page *); +#else +#define afs_set_page_dirty __set_page_dirty_nobuffers +#endif extern int afs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata); diff --git a/fs/afs/main.c b/fs/afs/main.c index 179004b15566..eae288c8d40a 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -186,13 +186,6 @@ static int __init afs_init(void) if (!afs_lock_manager) goto error_lockmgr; -#ifdef CONFIG_AFS_FSCACHE - /* we want to be able to cache */ - ret = fscache_register_netfs(&afs_cache_netfs); - if (ret < 0) - goto error_cache; -#endif - ret = register_pernet_device(&afs_net_ops); if (ret < 0) goto error_net; @@ -215,10 +208,6 @@ error_proc: error_fs: unregister_pernet_device(&afs_net_ops); error_net: -#ifdef CONFIG_AFS_FSCACHE - fscache_unregister_netfs(&afs_cache_netfs); -error_cache: -#endif destroy_workqueue(afs_lock_manager); error_lockmgr: destroy_workqueue(afs_async_calls); @@ -245,9 +234,6 @@ static void __exit afs_exit(void) proc_remove(afs_proc_symlink); afs_fs_exit(); unregister_pernet_device(&afs_net_ops); -#ifdef CONFIG_AFS_FSCACHE - fscache_unregister_netfs(&afs_cache_netfs); -#endif destroy_workqueue(afs_lock_manager); destroy_workqueue(afs_async_calls); destroy_workqueue(afs_wq); diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 065a28bfa3f1..e1b863449296 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -227,7 +227,7 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) __acquires(cell->proc_lock) { - struct afs_cell *cell = PDE_DATA(file_inode(m->file)); + struct afs_cell *cell = pde_data(file_inode(m->file)); rcu_read_lock(); return seq_hlist_start_head_rcu(&cell->proc_volumes, *_pos); @@ -236,7 +236,7 @@ static void 
*afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) static void *afs_proc_cell_volumes_next(struct seq_file *m, void *v, loff_t *_pos) { - struct afs_cell *cell = PDE_DATA(file_inode(m->file)); + struct afs_cell *cell = pde_data(file_inode(m->file)); return seq_hlist_next_rcu(v, &cell->proc_volumes, _pos); } @@ -322,7 +322,7 @@ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos) { struct afs_vl_seq_net_private *priv = m->private; struct afs_vlserver_list *vllist; - struct afs_cell *cell = PDE_DATA(file_inode(m->file)); + struct afs_cell *cell = pde_data(file_inode(m->file)); loff_t pos = *_pos; rcu_read_lock(); diff --git a/fs/afs/super.c b/fs/afs/super.c index 34c68724c98b..5ec9fd97eccc 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -55,6 +55,7 @@ int afs_net_id; static const struct super_operations afs_super_ops = { .statfs = afs_statfs, .alloc_inode = afs_alloc_inode, + .write_inode = afs_write_inode, .drop_inode = afs_drop_inode, .destroy_inode = afs_destroy_inode, .free_inode = afs_free_inode, diff --git a/fs/afs/volume.c b/fs/afs/volume.c index f84194b791d3..94a3d247924b 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -268,15 +268,30 @@ void afs_put_volume(struct afs_net *net, struct afs_volume *volume, /* * Activate a volume. */ -void afs_activate_volume(struct afs_volume *volume) +int afs_activate_volume(struct afs_volume *volume) { #ifdef CONFIG_AFS_FSCACHE - volume->cache = fscache_acquire_cookie(volume->cell->cache, - &afs_volume_cache_index_def, - &volume->vid, sizeof(volume->vid), - NULL, 0, - volume, 0, true); + struct fscache_volume *vcookie; + char *name; + + name = kasprintf(GFP_KERNEL, "afs,%s,%llx", + volume->cell->name, volume->vid); + if (!name) + return -ENOMEM; + + vcookie = fscache_acquire_volume(name, NULL, NULL, 0); + if (IS_ERR(vcookie)) { + if (vcookie != ERR_PTR(-EBUSY)) { + kfree(name); + return PTR_ERR(vcookie); + } + pr_err("AFS: Cache volume key already in use (%s)\n", name); + vcookie = NULL; + } + volume->cache = vcookie; + kfree(name); #endif + return 0; } /* @@ -287,7 +302,7 @@ void afs_deactivate_volume(struct afs_volume *volume) _enter("%s", volume->name); #ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(volume->cache, NULL, + fscache_relinquish_volume(volume->cache, NULL, test_bit(AFS_VOLUME_DELETED, &volume->flags)); volume->cache = NULL; #endif diff --git a/fs/afs/write.c b/fs/afs/write.c index ca4909baf5e6..5e9157d0da29 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -12,17 +12,30 @@ #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/netfs.h> -#include <linux/fscache.h> #include "internal.h" +static void afs_write_to_cache(struct afs_vnode *vnode, loff_t start, size_t len, + loff_t i_size, bool caching); + +#ifdef CONFIG_AFS_FSCACHE /* - * mark a page as having been made dirty and thus needing writeback + * Mark a page as having been made dirty and thus needing writeback. We also + * need to pin the cache object to write back to. 
*/ int afs_set_page_dirty(struct page *page) { - _enter(""); - return __set_page_dirty_nobuffers(page); + return fscache_set_page_dirty(page, afs_vnode_cache(AFS_FS_I(page->mapping->host))); +} +static void afs_folio_start_fscache(bool caching, struct folio *folio) +{ + if (caching) + folio_start_fscache(folio); +} +#else +static void afs_folio_start_fscache(bool caching, struct folio *folio) +{ } +#endif /* * prepare to perform part of a write to a page @@ -114,7 +127,7 @@ int afs_write_end(struct file *file, struct address_space *mapping, unsigned long priv; unsigned int f, from = offset_in_folio(folio, pos); unsigned int t, to = from + copied; - loff_t i_size, maybe_i_size; + loff_t i_size, write_end_pos; _enter("{%llx:%llu},{%lx}", vnode->fid.vid, vnode->fid.vnode, folio_index(folio)); @@ -131,15 +144,16 @@ int afs_write_end(struct file *file, struct address_space *mapping, if (copied == 0) goto out; - maybe_i_size = pos + copied; + write_end_pos = pos + copied; i_size = i_size_read(&vnode->vfs_inode); - if (maybe_i_size > i_size) { + if (write_end_pos > i_size) { write_seqlock(&vnode->cb_lock); i_size = i_size_read(&vnode->vfs_inode); - if (maybe_i_size > i_size) - afs_set_i_size(vnode, maybe_i_size); + if (write_end_pos > i_size) + afs_set_i_size(vnode, write_end_pos); write_sequnlock(&vnode->cb_lock); + fscache_update_cookie(afs_vnode_cache(vnode), NULL, &write_end_pos); } if (folio_test_private(folio)) { @@ -418,6 +432,7 @@ static void afs_extend_writeback(struct address_space *mapping, loff_t start, loff_t max_len, bool new_content, + bool caching, unsigned int *_len) { struct pagevec pvec; @@ -464,7 +479,9 @@ static void afs_extend_writeback(struct address_space *mapping, folio_put(folio); break; } - if (!folio_test_dirty(folio) || folio_test_writeback(folio)) { + if (!folio_test_dirty(folio) || + folio_test_writeback(folio) || + folio_test_fscache(folio)) { folio_unlock(folio); folio_put(folio); break; @@ -512,6 +529,7 @@ static void afs_extend_writeback(struct address_space *mapping, BUG(); if (folio_start_writeback(folio)) BUG(); + afs_folio_start_fscache(caching, folio); *_count -= folio_nr_pages(folio); folio_unlock(folio); @@ -539,6 +557,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, unsigned int offset, to, len, max_len; loff_t i_size = i_size_read(&vnode->vfs_inode); bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); + bool caching = fscache_cookie_enabled(afs_vnode_cache(vnode)); long count = wbc->nr_to_write; int ret; @@ -546,6 +565,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, if (folio_start_writeback(folio)) BUG(); + afs_folio_start_fscache(caching, folio); count -= folio_nr_pages(folio); @@ -572,7 +592,8 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, if (len < max_len && (to == folio_size(folio) || new_content)) afs_extend_writeback(mapping, vnode, &count, - start, max_len, new_content, &len); + start, max_len, new_content, + caching, &len); len = min_t(loff_t, len, max_len); } @@ -585,12 +606,19 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, if (start < i_size) { _debug("write back %x @%llx [%llx]", len, start, i_size); + /* Speculatively write to the cache. We have to fix this up + * later if the store fails. 
+ */ + afs_write_to_cache(vnode, start, len, i_size, caching); + iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len); ret = afs_store_data(vnode, &iter, start, false); } else { _debug("write discard %x @%llx [%llx]", len, start, i_size); /* The dirty region was entirely beyond the EOF. */ + fscache_clear_page_bits(afs_vnode_cache(vnode), + mapping, start, len, caching); afs_pages_written_back(vnode, start, len); ret = 0; } @@ -649,6 +677,10 @@ int afs_writepage(struct page *subpage, struct writeback_control *wbc) _enter("{%lx},", folio_index(folio)); +#ifdef CONFIG_AFS_FSCACHE + folio_wait_fscache(folio); +#endif + start = folio_index(folio) * PAGE_SIZE; ret = afs_write_back_from_locked_folio(folio_mapping(folio), wbc, folio, start, LLONG_MAX - start); @@ -714,10 +746,15 @@ static int afs_writepages_region(struct address_space *mapping, continue; } - if (folio_test_writeback(folio)) { + if (folio_test_writeback(folio) || + folio_test_fscache(folio)) { folio_unlock(folio); - if (wbc->sync_mode != WB_SYNC_NONE) + if (wbc->sync_mode != WB_SYNC_NONE) { folio_wait_writeback(folio); +#ifdef CONFIG_AFS_FSCACHE + folio_wait_fscache(folio); +#endif + } folio_put(folio); continue; } @@ -970,3 +1007,28 @@ int afs_launder_page(struct page *subpage) folio_wait_fscache(folio); return ret; } + +/* + * Deal with the completion of writing the data to the cache. + */ +static void afs_write_to_cache_done(void *priv, ssize_t transferred_or_error, + bool was_async) +{ + struct afs_vnode *vnode = priv; + + if (IS_ERR_VALUE(transferred_or_error) && + transferred_or_error != -ENOBUFS) + afs_invalidate_cache(vnode, 0); +} + +/* + * Save the write to the cache also. + */ +static void afs_write_to_cache(struct afs_vnode *vnode, + loff_t start, size_t len, loff_t i_size, + bool caching) +{ + fscache_write_to_cache(afs_vnode_cache(vnode), + vnode->vfs_inode.i_mapping, start, len, i_size, + afs_write_to_cache_done, vnode, caching); +} @@ -220,9 +220,35 @@ struct aio_kiocb { /*------ sysctl variables----*/ static DEFINE_SPINLOCK(aio_nr_lock); -unsigned long aio_nr; /* current system wide number of aio requests */ -unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ +static unsigned long aio_nr; /* current system wide number of aio requests */ +static unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ /*----end sysctl variables---*/ +#ifdef CONFIG_SYSCTL +static struct ctl_table aio_sysctls[] = { + { + .procname = "aio-nr", + .data = &aio_nr, + .maxlen = sizeof(aio_nr), + .mode = 0444, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "aio-max-nr", + .data = &aio_max_nr, + .maxlen = sizeof(aio_max_nr), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + {} +}; + +static void __init aio_sysctl_init(void) +{ + register_sysctl_init("fs", aio_sysctls); +} +#else +#define aio_sysctl_init() do { } while (0) +#endif static struct kmem_cache *kiocb_cachep; static struct kmem_cache *kioctx_cachep; @@ -275,6 +301,7 @@ static int __init aio_setup(void) kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); + aio_sysctl_init(); return 0; } __initcall(aio_setup); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f8c7f26f1fbb..9e11e6f13e83 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1116,11 +1116,11 @@ out_free_interp: * independently randomized mmap region (0 load_bias * without MAP_FIXED nor MAP_FIXED_NOREPLACE). 
*/ - if (interpreter) { + alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); + if (interpreter || alignment > ELF_MIN_ALIGN) { load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); - alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); if (alignment) load_bias &= ~(alignment - 1); elf_flags |= MAP_FIXED_NOREPLACE; @@ -1585,7 +1585,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid)); SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid)); rcu_read_unlock(); - strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); + get_task_comm(psinfo->pr_fname, p); return 0; } diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 520a0f6a7d9e..183e5c4aed34 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -18,8 +18,7 @@ config BTRFS_FS select RAID6_PQ select XOR_BLOCKS select SRCU - depends on !PPC_256K_PAGES # powerpc - depends on !PAGE_SIZE_256KB # hexagon + depends on PAGE_SIZE_LESS_THAN_256KB help Btrfs is a general purpose copy-on-write filesystem with extents, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d6d48ecf823c..409bad3928db 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -12,7 +12,6 @@ #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/prefetch.h> -#include <linux/cleancache.h> #include <linux/fsverity.h> #include "misc.h" #include "extent_io.h" @@ -3578,15 +3577,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, goto out; } - if (!PageUptodate(page)) { - if (cleancache_get_page(page) == 0) { - BUG_ON(blocksize != PAGE_SIZE); - unlock_extent(tree, start, end); - unlock_page(page); - goto out; - } - } - if (page->index == last_byte >> PAGE_SHIFT) { size_t zero_offset = offset_in_page(last_byte); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1398d7b64c4e..8d47ec5fc4f4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3372,10 +3372,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, btrfs_inode_lock(inode, 0); err = btrfs_delete_subvolume(dir, dentry); btrfs_inode_unlock(inode, 0); - if (!err) { - fsnotify_rmdir(dir, dentry); - d_delete(dentry); - } + if (!err) + d_delete_notify(dir, dentry); out_dput: dput(dentry); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0ec09fe01be6..4d947ba32da9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -23,7 +23,6 @@ #include <linux/miscdevice.h> #include <linux/magic.h> #include <linux/slab.h> -#include <linux/cleancache.h> #include <linux/ratelimit.h> #include <linux/crc32c.h> #include <linux/btrfs.h> @@ -1374,7 +1373,6 @@ static int btrfs_fill_super(struct super_block *sb, goto fail_close; } - cleancache_init_fs(sb); sb->s_flags |= SB_ACTIVE; return 0; diff --git a/fs/buffer.c b/fs/buffer.c index 46bc589b7a03..8e112b6bd371 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1969,34 +1969,34 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, } } -int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, +int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block, const struct iomap *iomap) { unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; unsigned block_start, block_end; sector_t block; int err = 0; unsigned blocksize, bbits; struct buffer_head *bh, *head, *wait[2], 
**wait_bh=wait; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); BUG_ON(from > PAGE_SIZE); BUG_ON(to > PAGE_SIZE); BUG_ON(from > to); - head = create_page_buffers(page, inode, 0); + head = create_page_buffers(&folio->page, inode, 0); blocksize = head->b_size; bbits = block_size_bits(blocksize); - block = (sector_t)page->index << (PAGE_SHIFT - bbits); + block = (sector_t)folio->index << (PAGE_SHIFT - bbits); for(bh = head, block_start = 0; bh != head || !block_start; block++, block_start=block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); } @@ -2016,20 +2016,20 @@ int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, if (buffer_new(bh)) { clean_bdev_bh_alias(bh); - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { clear_buffer_new(bh); set_buffer_uptodate(bh); mark_buffer_dirty(bh); continue; } if (block_end > to || block_start < from) - zero_user_segments(page, + folio_zero_segments(folio, to, block_end, block_start, from); continue; } } - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); continue; @@ -2050,14 +2050,15 @@ int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, err = -EIO; } if (unlikely(err)) - page_zero_new_buffers(page, from, to); + page_zero_new_buffers(&folio->page, from, to); return err; } int __block_write_begin(struct page *page, loff_t pos, unsigned len, get_block_t *get_block) { - return __block_write_begin_int(page, pos, len, get_block, NULL); + return __block_write_begin_int(page_folio(page), pos, len, get_block, + NULL); } EXPORT_SYMBOL(__block_write_begin); diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig index 6827b40f7ddc..719faeeda168 100644 --- a/fs/cachefiles/Kconfig +++ b/fs/cachefiles/Kconfig @@ -19,3 +19,10 @@ config CACHEFILES_DEBUG caching on files module. If this is set, the debugging output may be enabled by setting bits in /sys/modules/cachefiles/parameter/debug or by including a debugging specifier in /etc/cachefilesd.conf. + +config CACHEFILES_ERROR_INJECTION + bool "Provide error injection for cachefiles" + depends on CACHEFILES && SYSCTL + help + This permits error injection to be enabled in cachefiles whilst a + cache is in service. diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile index 02fd17731769..16d811f1a2fa 100644 --- a/fs/cachefiles/Makefile +++ b/fs/cachefiles/Makefile @@ -4,15 +4,17 @@ # cachefiles-y := \ - bind.o \ + cache.o \ daemon.o \ interface.o \ io.o \ key.o \ main.o \ namei.o \ - rdwr.o \ security.o \ + volume.o \ xattr.o +cachefiles-$(CONFIG_CACHEFILES_ERROR_INJECTION) += error_inject.o + obj-$(CONFIG_CACHEFILES) := cachefiles.o diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c deleted file mode 100644 index d463d89f5db8..000000000000 --- a/fs/cachefiles/bind.c +++ /dev/null @@ -1,278 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* Bind and unbind a cache from the filesystem backing it - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 
- * Written by David Howells (dhowells@redhat.com) - */ - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/sched.h> -#include <linux/completion.h> -#include <linux/slab.h> -#include <linux/fs.h> -#include <linux/file.h> -#include <linux/namei.h> -#include <linux/mount.h> -#include <linux/statfs.h> -#include <linux/ctype.h> -#include <linux/xattr.h> -#include "internal.h" - -static int cachefiles_daemon_add_cache(struct cachefiles_cache *caches); - -/* - * bind a directory as a cache - */ -int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args) -{ - _enter("{%u,%u,%u,%u,%u,%u},%s", - cache->frun_percent, - cache->fcull_percent, - cache->fstop_percent, - cache->brun_percent, - cache->bcull_percent, - cache->bstop_percent, - args); - - /* start by checking things over */ - ASSERT(cache->fstop_percent >= 0 && - cache->fstop_percent < cache->fcull_percent && - cache->fcull_percent < cache->frun_percent && - cache->frun_percent < 100); - - ASSERT(cache->bstop_percent >= 0 && - cache->bstop_percent < cache->bcull_percent && - cache->bcull_percent < cache->brun_percent && - cache->brun_percent < 100); - - if (*args) { - pr_err("'bind' command doesn't take an argument\n"); - return -EINVAL; - } - - if (!cache->rootdirname) { - pr_err("No cache directory specified\n"); - return -EINVAL; - } - - /* don't permit already bound caches to be re-bound */ - if (test_bit(CACHEFILES_READY, &cache->flags)) { - pr_err("Cache already bound\n"); - return -EBUSY; - } - - /* make sure we have copies of the tag and dirname strings */ - if (!cache->tag) { - /* the tag string is released by the fops->release() - * function, so we don't release it on error here */ - cache->tag = kstrdup("CacheFiles", GFP_KERNEL); - if (!cache->tag) - return -ENOMEM; - } - - /* add the cache */ - return cachefiles_daemon_add_cache(cache); -} - -/* - * add a cache - */ -static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) -{ - struct cachefiles_object *fsdef; - struct path path; - struct kstatfs stats; - struct dentry *graveyard, *cachedir, *root; - const struct cred *saved_cred; - int ret; - - _enter(""); - - /* we want to work under the module's security ID */ - ret = cachefiles_get_security_ID(cache); - if (ret < 0) - return ret; - - cachefiles_begin_secure(cache, &saved_cred); - - /* allocate the root index object */ - ret = -ENOMEM; - - fsdef = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL); - if (!fsdef) - goto error_root_object; - - ASSERTCMP(fsdef->backer, ==, NULL); - - atomic_set(&fsdef->usage, 1); - fsdef->type = FSCACHE_COOKIE_TYPE_INDEX; - - /* look up the directory at the root of the cache */ - ret = kern_path(cache->rootdirname, LOOKUP_DIRECTORY, &path); - if (ret < 0) - goto error_open_root; - - cache->mnt = path.mnt; - root = path.dentry; - - ret = -EINVAL; - if (mnt_user_ns(path.mnt) != &init_user_ns) { - pr_warn("File cache on idmapped mounts not supported"); - goto error_unsupported; - } - - /* check parameters */ - ret = -EOPNOTSUPP; - if (d_is_negative(root) || - !d_backing_inode(root)->i_op->lookup || - !d_backing_inode(root)->i_op->mkdir || - !(d_backing_inode(root)->i_opflags & IOP_XATTR) || - !root->d_sb->s_op->statfs || - !root->d_sb->s_op->sync_fs) - goto error_unsupported; - - ret = -EROFS; - if (sb_rdonly(root->d_sb)) - goto error_unsupported; - - /* determine the security of the on-disk cache as this governs - * security ID of files we create */ - ret = cachefiles_determine_cache_security(cache, root, &saved_cred); - if (ret < 0) - goto 
error_unsupported; - - /* get the cache size and blocksize */ - ret = vfs_statfs(&path, &stats); - if (ret < 0) - goto error_unsupported; - - ret = -ERANGE; - if (stats.f_bsize <= 0) - goto error_unsupported; - - ret = -EOPNOTSUPP; - if (stats.f_bsize > PAGE_SIZE) - goto error_unsupported; - - cache->bsize = stats.f_bsize; - cache->bshift = 0; - if (stats.f_bsize < PAGE_SIZE) - cache->bshift = PAGE_SHIFT - ilog2(stats.f_bsize); - - _debug("blksize %u (shift %u)", - cache->bsize, cache->bshift); - - _debug("size %llu, avail %llu", - (unsigned long long) stats.f_blocks, - (unsigned long long) stats.f_bavail); - - /* set up caching limits */ - do_div(stats.f_files, 100); - cache->fstop = stats.f_files * cache->fstop_percent; - cache->fcull = stats.f_files * cache->fcull_percent; - cache->frun = stats.f_files * cache->frun_percent; - - _debug("limits {%llu,%llu,%llu} files", - (unsigned long long) cache->frun, - (unsigned long long) cache->fcull, - (unsigned long long) cache->fstop); - - stats.f_blocks >>= cache->bshift; - do_div(stats.f_blocks, 100); - cache->bstop = stats.f_blocks * cache->bstop_percent; - cache->bcull = stats.f_blocks * cache->bcull_percent; - cache->brun = stats.f_blocks * cache->brun_percent; - - _debug("limits {%llu,%llu,%llu} blocks", - (unsigned long long) cache->brun, - (unsigned long long) cache->bcull, - (unsigned long long) cache->bstop); - - /* get the cache directory and check its type */ - cachedir = cachefiles_get_directory(cache, root, "cache"); - if (IS_ERR(cachedir)) { - ret = PTR_ERR(cachedir); - goto error_unsupported; - } - - fsdef->dentry = cachedir; - fsdef->fscache.cookie = NULL; - - ret = cachefiles_check_object_type(fsdef); - if (ret < 0) - goto error_unsupported; - - /* get the graveyard directory */ - graveyard = cachefiles_get_directory(cache, root, "graveyard"); - if (IS_ERR(graveyard)) { - ret = PTR_ERR(graveyard); - goto error_unsupported; - } - - cache->graveyard = graveyard; - - /* publish the cache */ - fscache_init_cache(&cache->cache, - &cachefiles_cache_ops, - "%s", - fsdef->dentry->d_sb->s_id); - - fscache_object_init(&fsdef->fscache, &fscache_fsdef_index, - &cache->cache); - - ret = fscache_add_cache(&cache->cache, &fsdef->fscache, cache->tag); - if (ret < 0) - goto error_add_cache; - - /* done */ - set_bit(CACHEFILES_READY, &cache->flags); - dput(root); - - pr_info("File cache on %s registered\n", cache->cache.identifier); - - /* check how much space the cache has */ - cachefiles_has_space(cache, 0, 0); - cachefiles_end_secure(cache, saved_cred); - return 0; - -error_add_cache: - dput(cache->graveyard); - cache->graveyard = NULL; -error_unsupported: - mntput(cache->mnt); - cache->mnt = NULL; - dput(fsdef->dentry); - fsdef->dentry = NULL; - dput(root); -error_open_root: - kmem_cache_free(cachefiles_object_jar, fsdef); -error_root_object: - cachefiles_end_secure(cache, saved_cred); - pr_err("Failed to register: %d\n", ret); - return ret; -} - -/* - * unbind a cache on fd release - */ -void cachefiles_daemon_unbind(struct cachefiles_cache *cache) -{ - _enter(""); - - if (test_bit(CACHEFILES_READY, &cache->flags)) { - pr_info("File cache on %s unregistering\n", - cache->cache.identifier); - - fscache_withdraw_cache(&cache->cache); - } - - dput(cache->graveyard); - mntput(cache->mnt); - - kfree(cache->rootdirname); - kfree(cache->secctx); - kfree(cache->tag); - - _leave(""); -} diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c new file mode 100644 index 000000000000..7077f72e6f47 --- /dev/null +++ b/fs/cachefiles/cache.c @@ -0,0 
+1,383 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Manage high-level VFS aspects of a cache. + * + * Copyright (C) 2007, 2021 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/slab.h> +#include <linux/statfs.h> +#include <linux/namei.h> +#include "internal.h" + +/* + * Bring a cache online. + */ +int cachefiles_add_cache(struct cachefiles_cache *cache) +{ + struct fscache_cache *cache_cookie; + struct path path; + struct kstatfs stats; + struct dentry *graveyard, *cachedir, *root; + const struct cred *saved_cred; + int ret; + + _enter(""); + + cache_cookie = fscache_acquire_cache(cache->tag); + if (IS_ERR(cache_cookie)) + return PTR_ERR(cache_cookie); + + /* we want to work under the module's security ID */ + ret = cachefiles_get_security_ID(cache); + if (ret < 0) + goto error_getsec; + + cachefiles_begin_secure(cache, &saved_cred); + + /* look up the directory at the root of the cache */ + ret = kern_path(cache->rootdirname, LOOKUP_DIRECTORY, &path); + if (ret < 0) + goto error_open_root; + + cache->mnt = path.mnt; + root = path.dentry; + + ret = -EINVAL; + if (is_idmapped_mnt(path.mnt)) { + pr_warn("File cache on idmapped mounts not supported"); + goto error_unsupported; + } + + /* Check features of the backing filesystem: + * - Directories must support looking up and directory creation + * - We create tmpfiles to handle invalidation + * - We use xattrs to store metadata + * - We need to be able to query the amount of space available + * - We want to be able to sync the filesystem when stopping the cache + * - We use DIO to/from pages, so the blocksize mustn't be too big. + */ + ret = -EOPNOTSUPP; + if (d_is_negative(root) || + !d_backing_inode(root)->i_op->lookup || + !d_backing_inode(root)->i_op->mkdir || + !d_backing_inode(root)->i_op->tmpfile || + !(d_backing_inode(root)->i_opflags & IOP_XATTR) || + !root->d_sb->s_op->statfs || + !root->d_sb->s_op->sync_fs || + root->d_sb->s_blocksize > PAGE_SIZE) + goto error_unsupported; + + ret = -EROFS; + if (sb_rdonly(root->d_sb)) + goto error_unsupported; + + /* determine the security of the on-disk cache as this governs + * security ID of files we create */ + ret = cachefiles_determine_cache_security(cache, root, &saved_cred); + if (ret < 0) + goto error_unsupported; + + /* get the cache size and blocksize */ + ret = vfs_statfs(&path, &stats); + if (ret < 0) + goto error_unsupported; + + ret = -ERANGE; + if (stats.f_bsize <= 0) + goto error_unsupported; + + ret = -EOPNOTSUPP; + if (stats.f_bsize > PAGE_SIZE) + goto error_unsupported; + + cache->bsize = stats.f_bsize; + cache->bshift = ilog2(stats.f_bsize); + + _debug("blksize %u (shift %u)", + cache->bsize, cache->bshift); + + _debug("size %llu, avail %llu", + (unsigned long long) stats.f_blocks, + (unsigned long long) stats.f_bavail); + + /* set up caching limits */ + do_div(stats.f_files, 100); + cache->fstop = stats.f_files * cache->fstop_percent; + cache->fcull = stats.f_files * cache->fcull_percent; + cache->frun = stats.f_files * cache->frun_percent; + + _debug("limits {%llu,%llu,%llu} files", + (unsigned long long) cache->frun, + (unsigned long long) cache->fcull, + (unsigned long long) cache->fstop); + + do_div(stats.f_blocks, 100); + cache->bstop = stats.f_blocks * cache->bstop_percent; + cache->bcull = stats.f_blocks * cache->bcull_percent; + cache->brun = stats.f_blocks * cache->brun_percent; + + _debug("limits {%llu,%llu,%llu} blocks", + (unsigned long long) cache->brun, + (unsigned long long) cache->bcull, + 
(unsigned long long) cache->bstop); + + /* get the cache directory and check its type */ + cachedir = cachefiles_get_directory(cache, root, "cache", NULL); + if (IS_ERR(cachedir)) { + ret = PTR_ERR(cachedir); + goto error_unsupported; + } + + cache->store = cachedir; + + /* get the graveyard directory */ + graveyard = cachefiles_get_directory(cache, root, "graveyard", NULL); + if (IS_ERR(graveyard)) { + ret = PTR_ERR(graveyard); + goto error_unsupported; + } + + cache->graveyard = graveyard; + cache->cache = cache_cookie; + + ret = fscache_add_cache(cache_cookie, &cachefiles_cache_ops, cache); + if (ret < 0) + goto error_add_cache; + + /* done */ + set_bit(CACHEFILES_READY, &cache->flags); + dput(root); + + pr_info("File cache on %s registered\n", cache_cookie->name); + + /* check how much space the cache has */ + cachefiles_has_space(cache, 0, 0, cachefiles_has_space_check); + cachefiles_end_secure(cache, saved_cred); + _leave(" = 0 [%px]", cache->cache); + return 0; + +error_add_cache: + cachefiles_put_directory(cache->graveyard); + cache->graveyard = NULL; +error_unsupported: + cachefiles_put_directory(cache->store); + cache->store = NULL; + mntput(cache->mnt); + cache->mnt = NULL; + dput(root); +error_open_root: + cachefiles_end_secure(cache, saved_cred); +error_getsec: + fscache_relinquish_cache(cache_cookie); + cache->cache = NULL; + pr_err("Failed to register: %d\n", ret); + return ret; +} + +/* + * See if we have space for a number of pages and/or a number of files in the + * cache + */ +int cachefiles_has_space(struct cachefiles_cache *cache, + unsigned fnr, unsigned bnr, + enum cachefiles_has_space_for reason) +{ + struct kstatfs stats; + u64 b_avail, b_writing; + int ret; + + struct path path = { + .mnt = cache->mnt, + .dentry = cache->mnt->mnt_root, + }; + + //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u", + // (unsigned long long) cache->frun, + // (unsigned long long) cache->fcull, + // (unsigned long long) cache->fstop, + // (unsigned long long) cache->brun, + // (unsigned long long) cache->bcull, + // (unsigned long long) cache->bstop, + // fnr, bnr); + + /* find out how many pages of blockdev are available */ + memset(&stats, 0, sizeof(stats)); + + ret = vfs_statfs(&path, &stats); + if (ret < 0) { + trace_cachefiles_vfs_error(NULL, d_inode(path.dentry), ret, + cachefiles_trace_statfs_error); + if (ret == -EIO) + cachefiles_io_error(cache, "statfs failed"); + _leave(" = %d", ret); + return ret; + } + + b_avail = stats.f_bavail; + b_writing = atomic_long_read(&cache->b_writing); + if (b_avail > b_writing) + b_avail -= b_writing; + else + b_avail = 0; + + //_debug("avail %llu,%llu", + // (unsigned long long)stats.f_ffree, + // (unsigned long long)b_avail); + + /* see if there is sufficient space */ + if (stats.f_ffree > fnr) + stats.f_ffree -= fnr; + else + stats.f_ffree = 0; + + if (b_avail > bnr) + b_avail -= bnr; + else + b_avail = 0; + + ret = -ENOBUFS; + if (stats.f_ffree < cache->fstop || + b_avail < cache->bstop) + goto stop_and_begin_cull; + + ret = 0; + if (stats.f_ffree < cache->fcull || + b_avail < cache->bcull) + goto begin_cull; + + if (test_bit(CACHEFILES_CULLING, &cache->flags) && + stats.f_ffree >= cache->frun && + b_avail >= cache->brun && + test_and_clear_bit(CACHEFILES_CULLING, &cache->flags) + ) { + _debug("cease culling"); + cachefiles_state_changed(cache); + } + + //_leave(" = 0"); + return 0; + +stop_and_begin_cull: + switch (reason) { + case cachefiles_has_space_for_write: + fscache_count_no_write_space(); + break; + case 
cachefiles_has_space_for_create: + fscache_count_no_create_space(); + break; + default: + break; + } +begin_cull: + if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) { + _debug("### CULL CACHE ###"); + cachefiles_state_changed(cache); + } + + _leave(" = %d", ret); + return ret; +} + +/* + * Mark all the objects as being out of service and queue them all for cleanup. + */ +static void cachefiles_withdraw_objects(struct cachefiles_cache *cache) +{ + struct cachefiles_object *object; + unsigned int count = 0; + + _enter(""); + + spin_lock(&cache->object_list_lock); + + while (!list_empty(&cache->object_list)) { + object = list_first_entry(&cache->object_list, + struct cachefiles_object, cache_link); + cachefiles_see_object(object, cachefiles_obj_see_withdrawal); + list_del_init(&object->cache_link); + fscache_withdraw_cookie(object->cookie); + count++; + if ((count & 63) == 0) { + spin_unlock(&cache->object_list_lock); + cond_resched(); + spin_lock(&cache->object_list_lock); + } + } + + spin_unlock(&cache->object_list_lock); + _leave(" [%u objs]", count); +} + +/* + * Withdraw volumes. + */ +static void cachefiles_withdraw_volumes(struct cachefiles_cache *cache) +{ + _enter(""); + + for (;;) { + struct cachefiles_volume *volume = NULL; + + spin_lock(&cache->object_list_lock); + if (!list_empty(&cache->volumes)) { + volume = list_first_entry(&cache->volumes, + struct cachefiles_volume, cache_link); + list_del_init(&volume->cache_link); + } + spin_unlock(&cache->object_list_lock); + if (!volume) + break; + + cachefiles_withdraw_volume(volume); + } + + _leave(""); +} + +/* + * Sync a cache to backing disk. + */ +static void cachefiles_sync_cache(struct cachefiles_cache *cache) +{ + const struct cred *saved_cred; + int ret; + + _enter("%s", cache->cache->name); + + /* make sure all pages pinned by operations on behalf of the netfs are + * written to disc */ + cachefiles_begin_secure(cache, &saved_cred); + down_read(&cache->mnt->mnt_sb->s_umount); + ret = sync_filesystem(cache->mnt->mnt_sb); + up_read(&cache->mnt->mnt_sb->s_umount); + cachefiles_end_secure(cache, saved_cred); + + if (ret == -EIO) + cachefiles_io_error(cache, + "Attempt to sync backing fs superblock returned error %d", + ret); +} + +/* + * Withdraw cache objects. + */ +void cachefiles_withdraw_cache(struct cachefiles_cache *cache) +{ + struct fscache_cache *fscache = cache->cache; + + pr_info("File cache on %s unregistering\n", fscache->name); + + fscache_withdraw_cache(fscache); + + /* we now have to destroy all the active objects pertaining to this + * cache - which we do by passing them off to thread pool to be + * disposed of */ + cachefiles_withdraw_objects(cache); + fscache_wait_for_objects(fscache); + + cachefiles_withdraw_volumes(cache); + cachefiles_sync_cache(cache); + cache->cache = NULL; + fscache_relinquish_cache(fscache); +} diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 752c1e43416f..7ac04ee2c0a0 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* Daemon interface * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2007, 2021 Red Hat, Inc. All Rights Reserved. 
* Written by David Howells (dhowells@redhat.com) */ @@ -41,6 +41,8 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *, char *); static int cachefiles_daemon_inuse(struct cachefiles_cache *, char *); static int cachefiles_daemon_secctx(struct cachefiles_cache *, char *); static int cachefiles_daemon_tag(struct cachefiles_cache *, char *); +static int cachefiles_daemon_bind(struct cachefiles_cache *, char *); +static void cachefiles_daemon_unbind(struct cachefiles_cache *); static unsigned long cachefiles_open; @@ -78,7 +80,7 @@ static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = { /* - * do various checks + * Prepare a cache for caching. */ static int cachefiles_daemon_open(struct inode *inode, struct file *file) { @@ -102,9 +104,10 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file) } mutex_init(&cache->daemon_mutex); - cache->active_nodes = RB_ROOT; - rwlock_init(&cache->active_lock); init_waitqueue_head(&cache->daemon_pollwq); + INIT_LIST_HEAD(&cache->volumes); + INIT_LIST_HEAD(&cache->object_list); + spin_lock_init(&cache->object_list_lock); /* set default caching limits * - limit at 1% free space and/or free files @@ -124,7 +127,7 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file) } /* - * release a cache + * Release a cache. */ static int cachefiles_daemon_release(struct inode *inode, struct file *file) { @@ -138,8 +141,6 @@ static int cachefiles_daemon_release(struct inode *inode, struct file *file) cachefiles_daemon_unbind(cache); - ASSERT(!cache->active_nodes.rb_node); - /* clean up the control file interface */ cache->cachefilesd = NULL; file->private_data = NULL; @@ -152,7 +153,7 @@ static int cachefiles_daemon_release(struct inode *inode, struct file *file) } /* - * read the cache state + * Read the cache state. */ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, size_t buflen, loff_t *pos) @@ -169,7 +170,7 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, return 0; /* check how much space the cache has */ - cachefiles_has_space(cache, 0, 0); + cachefiles_has_space(cache, 0, 0, cachefiles_has_space_check); /* summarise */ f_released = atomic_xchg(&cache->f_released, 0); @@ -206,7 +207,7 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, } /* - * command the cache + * Take a command from cachefilesd, parse it and act on it. 
*/ static ssize_t cachefiles_daemon_write(struct file *file, const char __user *_data, @@ -225,7 +226,7 @@ static ssize_t cachefiles_daemon_write(struct file *file, if (test_bit(CACHEFILES_DEAD, &cache->flags)) return -EIO; - if (datalen < 0 || datalen > PAGE_SIZE - 1) + if (datalen > PAGE_SIZE - 1) return -EOPNOTSUPP; /* drag the command string into the kernel so we can parse it */ @@ -284,7 +285,7 @@ found_command: } /* - * poll for culling state + * Poll for culling state * - use EPOLLOUT to indicate culling state */ static __poll_t cachefiles_daemon_poll(struct file *file, @@ -306,7 +307,7 @@ static __poll_t cachefiles_daemon_poll(struct file *file, } /* - * give a range error for cache space constraints + * Give a range error for cache space constraints * - can be tail-called */ static int cachefiles_daemon_range_error(struct cachefiles_cache *cache, @@ -318,7 +319,7 @@ static int cachefiles_daemon_range_error(struct cachefiles_cache *cache, } /* - * set the percentage of files at which to stop culling + * Set the percentage of files at which to stop culling * - command: "frun <N>%" */ static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args) @@ -342,7 +343,7 @@ static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args) } /* - * set the percentage of files at which to start culling + * Set the percentage of files at which to start culling * - command: "fcull <N>%" */ static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args) @@ -366,7 +367,7 @@ static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args) } /* - * set the percentage of files at which to stop allocating + * Set the percentage of files at which to stop allocating * - command: "fstop <N>%" */ static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args) @@ -382,7 +383,7 @@ static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args) if (args[0] != '%' || args[1] != '\0') return -EINVAL; - if (fstop < 0 || fstop >= cache->fcull_percent) + if (fstop >= cache->fcull_percent) return cachefiles_daemon_range_error(cache, args); cache->fstop_percent = fstop; @@ -390,7 +391,7 @@ static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args) } /* - * set the percentage of blocks at which to stop culling + * Set the percentage of blocks at which to stop culling * - command: "brun <N>%" */ static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args) @@ -414,7 +415,7 @@ static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args) } /* - * set the percentage of blocks at which to start culling + * Set the percentage of blocks at which to start culling * - command: "bcull <N>%" */ static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args) @@ -438,7 +439,7 @@ static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args) } /* - * set the percentage of blocks at which to stop allocating + * Set the percentage of blocks at which to stop allocating * - command: "bstop <N>%" */ static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args) @@ -454,7 +455,7 @@ static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args) if (args[0] != '%' || args[1] != '\0') return -EINVAL; - if (bstop < 0 || bstop >= cache->bcull_percent) + if (bstop >= cache->bcull_percent) return cachefiles_daemon_range_error(cache, args); cache->bstop_percent = bstop; @@ -462,7 +463,7 @@ static int cachefiles_daemon_bstop(struct 
cachefiles_cache *cache, char *args) } /* - * set the cache directory + * Set the cache directory * - command: "dir <name>" */ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args) @@ -490,7 +491,7 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args) } /* - * set the cache security context + * Set the cache security context * - command: "secctx <ctx>" */ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) @@ -518,7 +519,7 @@ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) } /* - * set the cache tag + * Set the cache tag * - command: "tag <name>" */ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args) @@ -544,7 +545,7 @@ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args) } /* - * request a node in the cache be culled from the current working directory + * Request a node in the cache be culled from the current working directory * - command: "cull <name>" */ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) @@ -568,7 +569,6 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) return -EIO; } - /* extract the directory dentry from the cwd */ get_fs_pwd(current->fs, &path); if (!d_can_lookup(path.dentry)) @@ -593,7 +593,7 @@ inval: } /* - * set debugging mode + * Set debugging mode * - command: "debug <mask>" */ static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args) @@ -616,7 +616,7 @@ inval: } /* - * find out whether an object in the current working directory is in use or not + * Find out whether an object in the current working directory is in use or not * - command: "inuse <name>" */ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) @@ -640,7 +640,6 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) return -EIO; } - /* extract the directory dentry from the cwd */ get_fs_pwd(current->fs, &path); if (!d_can_lookup(path.dentry)) @@ -665,84 +664,76 @@ inval: } /* - * see if we have space for a number of pages and/or a number of files in the - * cache + * Bind a directory as a cache */ -int cachefiles_has_space(struct cachefiles_cache *cache, - unsigned fnr, unsigned bnr) +static int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args) { - struct kstatfs stats; - struct path path = { - .mnt = cache->mnt, - .dentry = cache->mnt->mnt_root, - }; - int ret; + _enter("{%u,%u,%u,%u,%u,%u},%s", + cache->frun_percent, + cache->fcull_percent, + cache->fstop_percent, + cache->brun_percent, + cache->bcull_percent, + cache->bstop_percent, + args); + + if (cache->fstop_percent >= cache->fcull_percent || + cache->fcull_percent >= cache->frun_percent || + cache->frun_percent >= 100) + return -ERANGE; + + if (cache->bstop_percent >= cache->bcull_percent || + cache->bcull_percent >= cache->brun_percent || + cache->brun_percent >= 100) + return -ERANGE; - //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u", - // (unsigned long long) cache->frun, - // (unsigned long long) cache->fcull, - // (unsigned long long) cache->fstop, - // (unsigned long long) cache->brun, - // (unsigned long long) cache->bcull, - // (unsigned long long) cache->bstop, - // fnr, bnr); - - /* find out how many pages of blockdev are available */ - memset(&stats, 0, sizeof(stats)); - - ret = vfs_statfs(&path, &stats); - if (ret < 0) { - if (ret == -EIO) - cachefiles_io_error(cache, "statfs failed"); - _leave(" = %d", ret); - return ret; + if 
(*args) { + pr_err("'bind' command doesn't take an argument\n"); + return -EINVAL; } - stats.f_bavail >>= cache->bshift; - - //_debug("avail %llu,%llu", - // (unsigned long long) stats.f_ffree, - // (unsigned long long) stats.f_bavail); - - /* see if there is sufficient space */ - if (stats.f_ffree > fnr) - stats.f_ffree -= fnr; - else - stats.f_ffree = 0; - - if (stats.f_bavail > bnr) - stats.f_bavail -= bnr; - else - stats.f_bavail = 0; - - ret = -ENOBUFS; - if (stats.f_ffree < cache->fstop || - stats.f_bavail < cache->bstop) - goto begin_cull; - - ret = 0; - if (stats.f_ffree < cache->fcull || - stats.f_bavail < cache->bcull) - goto begin_cull; - - if (test_bit(CACHEFILES_CULLING, &cache->flags) && - stats.f_ffree >= cache->frun && - stats.f_bavail >= cache->brun && - test_and_clear_bit(CACHEFILES_CULLING, &cache->flags) - ) { - _debug("cease culling"); - cachefiles_state_changed(cache); + if (!cache->rootdirname) { + pr_err("No cache directory specified\n"); + return -EINVAL; } - //_leave(" = 0"); - return 0; + /* Don't permit already bound caches to be re-bound */ + if (test_bit(CACHEFILES_READY, &cache->flags)) { + pr_err("Cache already bound\n"); + return -EBUSY; + } -begin_cull: - if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) { - _debug("### CULL CACHE ###"); - cachefiles_state_changed(cache); + /* Make sure we have copies of the tag string */ + if (!cache->tag) { + /* + * The tag string is released by the fops->release() + * function, so we don't release it on error here + */ + cache->tag = kstrdup("CacheFiles", GFP_KERNEL); + if (!cache->tag) + return -ENOMEM; } - _leave(" = %d", ret); - return ret; + return cachefiles_add_cache(cache); +} + +/* + * Unbind a cache. + */ +static void cachefiles_daemon_unbind(struct cachefiles_cache *cache) +{ + _enter(""); + + if (test_bit(CACHEFILES_READY, &cache->flags)) + cachefiles_withdraw_cache(cache); + + cachefiles_put_directory(cache->graveyard); + cachefiles_put_directory(cache->store); + mntput(cache->mnt); + + kfree(cache->rootdirname); + kfree(cache->secctx); + kfree(cache->tag); + + _leave(""); } diff --git a/fs/cachefiles/error_inject.c b/fs/cachefiles/error_inject.c new file mode 100644 index 000000000000..58f8aec964e4 --- /dev/null +++ b/fs/cachefiles/error_inject.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Error injection handling. + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/sysctl.h> +#include "internal.h" + +unsigned int cachefiles_error_injection_state; + +static struct ctl_table_header *cachefiles_sysctl; +static struct ctl_table cachefiles_sysctls[] = { + { + .procname = "error_injection", + .data = &cachefiles_error_injection_state, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec, + }, + {} +}; + +static struct ctl_table cachefiles_sysctls_root[] = { + { + .procname = "cachefiles", + .mode = 0555, + .child = cachefiles_sysctls, + }, + {} +}; + +int __init cachefiles_register_error_injection(void) +{ + cachefiles_sysctl = register_sysctl_table(cachefiles_sysctls_root); + if (!cachefiles_sysctl) + return -ENOMEM; + return 0; + +} + +void cachefiles_unregister_error_injection(void) +{ + unregister_sysctl_table(cachefiles_sysctl); +} diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index da28ac1fa225..51c968cd00a6 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -1,572 +1,445 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* FS-Cache interface to CacheFiles * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/slab.h> #include <linux/mount.h> +#include <linux/xattr.h> +#include <linux/file.h> +#include <linux/falloc.h> +#include <trace/events/fscache.h> #include "internal.h" -struct cachefiles_lookup_data { - struct cachefiles_xattr *auxdata; /* auxiliary data */ - char *key; /* key path */ -}; - -static int cachefiles_attr_changed(struct fscache_object *_object); +static atomic_t cachefiles_object_debug_id; /* - * allocate an object record for a cookie lookup and prepare the lookup data + * Allocate a cache object record. 
*/ -static struct fscache_object *cachefiles_alloc_object( - struct fscache_cache *_cache, - struct fscache_cookie *cookie) +static +struct cachefiles_object *cachefiles_alloc_object(struct fscache_cookie *cookie) { - struct cachefiles_lookup_data *lookup_data; + struct fscache_volume *vcookie = cookie->volume; + struct cachefiles_volume *volume = vcookie->cache_priv; struct cachefiles_object *object; - struct cachefiles_cache *cache; - struct cachefiles_xattr *auxdata; - unsigned keylen, auxlen; - void *buffer, *p; - char *key; - cache = container_of(_cache, struct cachefiles_cache, cache); + _enter("{%s},%x,", vcookie->key, cookie->debug_id); - _enter("{%s},%x,", cache->cache.identifier, cookie->debug_id); - - lookup_data = kmalloc(sizeof(*lookup_data), cachefiles_gfp); - if (!lookup_data) - goto nomem_lookup_data; - - /* create a new object record and a temporary leaf image */ - object = kmem_cache_alloc(cachefiles_object_jar, cachefiles_gfp); + object = kmem_cache_zalloc(cachefiles_object_jar, GFP_KERNEL); if (!object) - goto nomem_object; - - ASSERTCMP(object->backer, ==, NULL); + return NULL; - BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); - atomic_set(&object->usage, 1); + refcount_set(&object->ref, 1); - fscache_object_init(&object->fscache, cookie, &cache->cache); + spin_lock_init(&object->lock); + INIT_LIST_HEAD(&object->cache_link); + object->volume = volume; + object->debug_id = atomic_inc_return(&cachefiles_object_debug_id); + object->cookie = fscache_get_cookie(cookie, fscache_cookie_get_attach_object); - object->type = cookie->def->type; - - /* get hold of the raw key - * - stick the length on the front and leave space on the back for the - * encoder - */ - buffer = kmalloc((2 + 512) + 3, cachefiles_gfp); - if (!buffer) - goto nomem_buffer; - - keylen = cookie->key_len; - if (keylen <= sizeof(cookie->inline_key)) - p = cookie->inline_key; - else - p = cookie->key; - memcpy(buffer + 2, p, keylen); - - *(uint16_t *)buffer = keylen; - ((char *)buffer)[keylen + 2] = 0; - ((char *)buffer)[keylen + 3] = 0; - ((char *)buffer)[keylen + 4] = 0; - - /* turn the raw key into something that can work with as a filename */ - key = cachefiles_cook_key(buffer, keylen + 2, object->type); - if (!key) - goto nomem_key; - - /* get hold of the auxiliary data and prepend the object type */ - auxdata = buffer; - auxlen = cookie->aux_len; - if (auxlen) { - if (auxlen <= sizeof(cookie->inline_aux)) - p = cookie->inline_aux; - else - p = cookie->aux; - memcpy(auxdata->data, p, auxlen); - } - - auxdata->len = auxlen + 1; - auxdata->type = cookie->type; - - lookup_data->auxdata = auxdata; - lookup_data->key = key; - object->lookup_data = lookup_data; - - _leave(" = %x [%p]", object->fscache.debug_id, lookup_data); - return &object->fscache; - -nomem_key: - kfree(buffer); -nomem_buffer: - BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); - kmem_cache_free(cachefiles_object_jar, object); - fscache_object_destroyed(&cache->cache); -nomem_object: - kfree(lookup_data); -nomem_lookup_data: - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); + fscache_count_object(vcookie->cache); + trace_cachefiles_ref(object->debug_id, cookie->debug_id, 1, + cachefiles_obj_new); + return object; } /* - * attempt to look up the nominated node in this cache - * - return -ETIMEDOUT to be scheduled again + * Note that an object has been seen. 
*/ -static int cachefiles_lookup_object(struct fscache_object *_object) +void cachefiles_see_object(struct cachefiles_object *object, + enum cachefiles_obj_ref_trace why) { - struct cachefiles_lookup_data *lookup_data; - struct cachefiles_object *parent, *object; - struct cachefiles_cache *cache; - const struct cred *saved_cred; - int ret; - - _enter("{OBJ%x}", _object->debug_id); - - cache = container_of(_object->cache, struct cachefiles_cache, cache); - parent = container_of(_object->parent, - struct cachefiles_object, fscache); - object = container_of(_object, struct cachefiles_object, fscache); - lookup_data = object->lookup_data; - - ASSERTCMP(lookup_data, !=, NULL); - - /* look up the key, creating any missing bits */ - cachefiles_begin_secure(cache, &saved_cred); - ret = cachefiles_walk_to_object(parent, object, - lookup_data->key, - lookup_data->auxdata); - cachefiles_end_secure(cache, saved_cred); - - /* polish off by setting the attributes of non-index files */ - if (ret == 0 && - object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) - cachefiles_attr_changed(&object->fscache); - - if (ret < 0 && ret != -ETIMEDOUT) { - if (ret != -ENOBUFS) - pr_warn("Lookup failed error %d\n", ret); - fscache_object_lookup_error(&object->fscache); - } - - _leave(" [%d]", ret); - return ret; + trace_cachefiles_ref(object->debug_id, object->cookie->debug_id, + refcount_read(&object->ref), why); } /* - * indication of lookup completion + * Increment the usage count on an object; */ -static void cachefiles_lookup_complete(struct fscache_object *_object) +struct cachefiles_object *cachefiles_grab_object(struct cachefiles_object *object, + enum cachefiles_obj_ref_trace why) { - struct cachefiles_object *object; - - object = container_of(_object, struct cachefiles_object, fscache); - - _enter("{OBJ%x,%p}", object->fscache.debug_id, object->lookup_data); + int r; - if (object->lookup_data) { - kfree(object->lookup_data->key); - kfree(object->lookup_data->auxdata); - kfree(object->lookup_data); - object->lookup_data = NULL; - } + __refcount_inc(&object->ref, &r); + trace_cachefiles_ref(object->debug_id, object->cookie->debug_id, r, why); + return object; } /* - * increment the usage count on an inode object (may fail if unmounting) + * dispose of a reference to an object */ -static -struct fscache_object *cachefiles_grab_object(struct fscache_object *_object, - enum fscache_obj_ref_trace why) +void cachefiles_put_object(struct cachefiles_object *object, + enum cachefiles_obj_ref_trace why) { - struct cachefiles_object *object = - container_of(_object, struct cachefiles_object, fscache); - int u; + unsigned int object_debug_id = object->debug_id; + unsigned int cookie_debug_id = object->cookie->debug_id; + struct fscache_cache *cache; + bool done; + int r; + + done = __refcount_dec_and_test(&object->ref, &r); + trace_cachefiles_ref(object_debug_id, cookie_debug_id, r, why); + if (done) { + _debug("- kill object OBJ%x", object_debug_id); + + ASSERTCMP(object->file, ==, NULL); - _enter("{OBJ%x,%d}", _object->debug_id, atomic_read(&object->usage)); + kfree(object->d_name); -#ifdef CACHEFILES_DEBUG_SLAB - ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); -#endif + cache = object->volume->cache->cache; + fscache_put_cookie(object->cookie, fscache_cookie_put_object); + object->cookie = NULL; + kmem_cache_free(cachefiles_object_jar, object); + fscache_uncount_object(cache); + } - u = atomic_inc_return(&object->usage); - trace_cachefiles_ref(object, _object->cookie, - (enum 
cachefiles_obj_ref_trace)why, u); - return &object->fscache; + _leave(""); } /* - * update the auxiliary data for an object object on disk + * Adjust the size of a cache file if necessary to match the DIO size. We keep + * the EOF marker a multiple of DIO blocks so that we don't fall back to doing + * non-DIO for a partial block straddling the EOF, but we also have to be + * careful of someone expanding the file and accidentally accreting the + * padding. */ -static void cachefiles_update_object(struct fscache_object *_object) +static int cachefiles_adjust_size(struct cachefiles_object *object) { - struct cachefiles_object *object; - struct cachefiles_xattr *auxdata; - struct cachefiles_cache *cache; - struct fscache_cookie *cookie; - const struct cred *saved_cred; - const void *aux; - unsigned auxlen; + struct iattr newattrs; + struct file *file = object->file; + uint64_t ni_size; + loff_t oi_size; + int ret; - _enter("{OBJ%x}", _object->debug_id); + ni_size = object->cookie->object_size; + ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE); - object = container_of(_object, struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, struct cachefiles_cache, - cache); + _enter("{OBJ%x},[%llu]", + object->debug_id, (unsigned long long) ni_size); - if (!fscache_use_cookie(_object)) { - _leave(" [relinq]"); - return; - } + if (!file) + return -ENOBUFS; - cookie = object->fscache.cookie; - auxlen = cookie->aux_len; + oi_size = i_size_read(file_inode(file)); + if (oi_size == ni_size) + return 0; - if (!auxlen) { - fscache_unuse_cookie(_object); - _leave(" [no aux]"); - return; - } + inode_lock(file_inode(file)); - auxdata = kmalloc(2 + auxlen + 3, cachefiles_gfp); - if (!auxdata) { - fscache_unuse_cookie(_object); - _leave(" [nomem]"); - return; + /* if there's an extension to a partial page at the end of the backing + * file, we need to discard the partial page so that we pick up new + * data after it */ + if (oi_size & ~PAGE_MASK && ni_size > oi_size) { + _debug("discard tail %llx", oi_size); + newattrs.ia_valid = ATTR_SIZE; + newattrs.ia_size = oi_size & PAGE_MASK; + ret = cachefiles_inject_remove_error(); + if (ret == 0) + ret = notify_change(&init_user_ns, file->f_path.dentry, + &newattrs, NULL); + if (ret < 0) + goto truncate_failed; } - aux = (auxlen <= sizeof(cookie->inline_aux)) ? 
- cookie->inline_aux : cookie->aux; + newattrs.ia_valid = ATTR_SIZE; + newattrs.ia_size = ni_size; + ret = cachefiles_inject_write_error(); + if (ret == 0) + ret = notify_change(&init_user_ns, file->f_path.dentry, + &newattrs, NULL); - memcpy(auxdata->data, aux, auxlen); - fscache_unuse_cookie(_object); +truncate_failed: + inode_unlock(file_inode(file)); - auxdata->len = auxlen + 1; - auxdata->type = cookie->type; + if (ret < 0) + trace_cachefiles_io_error(NULL, file_inode(file), ret, + cachefiles_trace_notify_change_error); + if (ret == -EIO) { + cachefiles_io_error_obj(object, "Size set failed"); + ret = -ENOBUFS; + } - cachefiles_begin_secure(cache, &saved_cred); - cachefiles_update_object_xattr(object, auxdata); - cachefiles_end_secure(cache, saved_cred); - kfree(auxdata); - _leave(""); + _leave(" = %d", ret); + return ret; } /* - * discard the resources pinned by an object and effect retirement if - * requested + * Attempt to look up the nominated node in this cache */ -static void cachefiles_drop_object(struct fscache_object *_object) +static bool cachefiles_lookup_cookie(struct fscache_cookie *cookie) { struct cachefiles_object *object; - struct cachefiles_cache *cache; + struct cachefiles_cache *cache = cookie->volume->cache->cache_priv; const struct cred *saved_cred; - struct inode *inode; - blkcnt_t i_blocks = 0; + bool success; - ASSERT(_object); + object = cachefiles_alloc_object(cookie); + if (!object) + goto fail; - object = container_of(_object, struct cachefiles_object, fscache); + _enter("{OBJ%x}", object->debug_id); - _enter("{OBJ%x,%d}", - object->fscache.debug_id, atomic_read(&object->usage)); + if (!cachefiles_cook_key(object)) + goto fail_put; - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); + cookie->cache_priv = object; -#ifdef CACHEFILES_DEBUG_SLAB - ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); -#endif + cachefiles_begin_secure(cache, &saved_cred); - /* We need to tidy the object up if we did in fact manage to open it. - * It's possible for us to get here before the object is fully - * initialised if the parent goes away or the object gets retired - * before we set it up. 
- */ - if (object->dentry) { - /* delete retired objects */ - if (test_bit(FSCACHE_OBJECT_RETIRED, &object->fscache.flags) && - _object != cache->cache.fsdef - ) { - _debug("- retire object OBJ%x", object->fscache.debug_id); - inode = d_backing_inode(object->dentry); - if (inode) - i_blocks = inode->i_blocks; - - cachefiles_begin_secure(cache, &saved_cred); - cachefiles_delete_object(cache, object); - cachefiles_end_secure(cache, saved_cred); - } + success = cachefiles_look_up_object(object); + if (!success) + goto fail_withdraw; - /* close the filesystem stuff attached to the object */ - if (object->backer != object->dentry) - dput(object->backer); - object->backer = NULL; - } + cachefiles_see_object(object, cachefiles_obj_see_lookup_cookie); + + spin_lock(&cache->object_list_lock); + list_add(&object->cache_link, &cache->object_list); + spin_unlock(&cache->object_list_lock); + cachefiles_adjust_size(object); - /* note that the object is now inactive */ - if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) - cachefiles_mark_object_inactive(cache, object, i_blocks); + cachefiles_end_secure(cache, saved_cred); + _leave(" = t"); + return true; - dput(object->dentry); - object->dentry = NULL; +fail_withdraw: + cachefiles_end_secure(cache, saved_cred); + cachefiles_see_object(object, cachefiles_obj_see_lookup_failed); + fscache_caching_failed(cookie); + _debug("failed c=%08x o=%08x", cookie->debug_id, object->debug_id); + /* The caller holds an access count on the cookie, so we need them to + * drop it before we can withdraw the object. + */ + return false; - _leave(""); +fail_put: + cachefiles_put_object(object, cachefiles_obj_put_alloc_fail); +fail: + return false; } /* - * dispose of a reference to an object + * Shorten the backing object to discard any dirty data and free up + * any unused granules. 
*/ -void cachefiles_put_object(struct fscache_object *_object, - enum fscache_obj_ref_trace why) +static bool cachefiles_shorten_object(struct cachefiles_object *object, + struct file *file, loff_t new_size) { - struct cachefiles_object *object; - struct fscache_cache *cache; - int u; - - ASSERT(_object); - - object = container_of(_object, struct cachefiles_object, fscache); - - _enter("{OBJ%x,%d}", - object->fscache.debug_id, atomic_read(&object->usage)); - -#ifdef CACHEFILES_DEBUG_SLAB - ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); -#endif - - ASSERTIFCMP(object->fscache.parent, - object->fscache.parent->n_children, >, 0); - - u = atomic_dec_return(&object->usage); - trace_cachefiles_ref(object, _object->cookie, - (enum cachefiles_obj_ref_trace)why, u); - ASSERTCMP(u, !=, -1); - if (u == 0) { - _debug("- kill object OBJ%x", object->fscache.debug_id); + struct cachefiles_cache *cache = object->volume->cache; + struct inode *inode = file_inode(file); + loff_t i_size, dio_size; + int ret; - ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); - ASSERTCMP(object->fscache.parent, ==, NULL); - ASSERTCMP(object->backer, ==, NULL); - ASSERTCMP(object->dentry, ==, NULL); - ASSERTCMP(object->fscache.n_ops, ==, 0); - ASSERTCMP(object->fscache.n_children, ==, 0); + dio_size = round_up(new_size, CACHEFILES_DIO_BLOCK_SIZE); + i_size = i_size_read(inode); + + trace_cachefiles_trunc(object, inode, i_size, dio_size, + cachefiles_trunc_shrink); + ret = cachefiles_inject_remove_error(); + if (ret == 0) + ret = vfs_truncate(&file->f_path, dio_size); + if (ret < 0) { + trace_cachefiles_io_error(object, file_inode(file), ret, + cachefiles_trace_trunc_error); + cachefiles_io_error_obj(object, "Trunc-to-size failed %d", ret); + cachefiles_remove_object_xattr(cache, object, file->f_path.dentry); + return false; + } - if (object->lookup_data) { - kfree(object->lookup_data->key); - kfree(object->lookup_data->auxdata); - kfree(object->lookup_data); - object->lookup_data = NULL; + if (new_size < dio_size) { + trace_cachefiles_trunc(object, inode, dio_size, new_size, + cachefiles_trunc_dio_adjust); + ret = cachefiles_inject_write_error(); + if (ret == 0) + ret = vfs_fallocate(file, FALLOC_FL_ZERO_RANGE, + new_size, dio_size); + if (ret < 0) { + trace_cachefiles_io_error(object, file_inode(file), ret, + cachefiles_trace_fallocate_error); + cachefiles_io_error_obj(object, "Trunc-to-dio-size failed %d", ret); + cachefiles_remove_object_xattr(cache, object, file->f_path.dentry); + return false; } - - cache = object->fscache.cache; - fscache_object_destroy(&object->fscache); - kmem_cache_free(cachefiles_object_jar, object); - fscache_object_destroyed(cache); } - _leave(""); + return true; } /* - * sync a cache + * Resize the backing object. 
*/ -static void cachefiles_sync_cache(struct fscache_cache *_cache) +static void cachefiles_resize_cookie(struct netfs_cache_resources *cres, + loff_t new_size) { - struct cachefiles_cache *cache; + struct cachefiles_object *object = cachefiles_cres_object(cres); + struct cachefiles_cache *cache = object->volume->cache; + struct fscache_cookie *cookie = object->cookie; const struct cred *saved_cred; - int ret; + struct file *file = cachefiles_cres_file(cres); + loff_t old_size = cookie->object_size; - _enter("%s", _cache->tag->name); + _enter("%llu->%llu", old_size, new_size); - cache = container_of(_cache, struct cachefiles_cache, cache); - - /* make sure all pages pinned by operations on behalf of the netfs are - * written to disc */ - cachefiles_begin_secure(cache, &saved_cred); - down_read(&cache->mnt->mnt_sb->s_umount); - ret = sync_filesystem(cache->mnt->mnt_sb); - up_read(&cache->mnt->mnt_sb->s_umount); - cachefiles_end_secure(cache, saved_cred); + if (new_size < old_size) { + cachefiles_begin_secure(cache, &saved_cred); + cachefiles_shorten_object(object, file, new_size); + cachefiles_end_secure(cache, saved_cred); + object->cookie->object_size = new_size; + return; + } - if (ret == -EIO) - cachefiles_io_error(cache, - "Attempt to sync backing fs superblock" - " returned error %d", - ret); + /* The file is being expanded. We don't need to do anything + * particularly. cookie->initial_size doesn't change and so the point + * at which we have to download before doesn't change. + */ + cookie->object_size = new_size; } /* - * check if the backing cache is updated to FS-Cache - * - called by FS-Cache when evaluates if need to invalidate the cache + * Commit changes to the object as we drop it. */ -static int cachefiles_check_consistency(struct fscache_operation *op) +static void cachefiles_commit_object(struct cachefiles_object *object, + struct cachefiles_cache *cache) { - struct cachefiles_object *object; - struct cachefiles_cache *cache; - const struct cred *saved_cred; - int ret; + bool update = false; - _enter("{OBJ%x}", op->object->debug_id); + if (test_and_clear_bit(FSCACHE_COOKIE_LOCAL_WRITE, &object->cookie->flags)) + update = true; + if (test_and_clear_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &object->cookie->flags)) + update = true; + if (update) + cachefiles_set_object_xattr(object); - object = container_of(op->object, struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); + if (test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) + cachefiles_commit_tmpfile(cache, object); +} - cachefiles_begin_secure(cache, &saved_cred); - ret = cachefiles_check_auxdata(object); - cachefiles_end_secure(cache, saved_cred); +/* + * Finalise and object and close the VFS structs that we have. 
+ */ +static void cachefiles_clean_up_object(struct cachefiles_object *object, + struct cachefiles_cache *cache) +{ + if (test_bit(FSCACHE_COOKIE_RETIRED, &object->cookie->flags)) { + if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { + cachefiles_see_object(object, cachefiles_obj_see_clean_delete); + _debug("- inval object OBJ%x", object->debug_id); + cachefiles_delete_object(object, FSCACHE_OBJECT_WAS_RETIRED); + } else { + cachefiles_see_object(object, cachefiles_obj_see_clean_drop_tmp); + _debug("- inval object OBJ%x tmpfile", object->debug_id); + } + } else { + cachefiles_see_object(object, cachefiles_obj_see_clean_commit); + cachefiles_commit_object(object, cache); + } - _leave(" = %d", ret); - return ret; + cachefiles_unmark_inode_in_use(object, object->file); + if (object->file) { + fput(object->file); + object->file = NULL; + } } /* - * notification the attributes on an object have changed - * - called with reads/writes excluded by FS-Cache + * Withdraw caching for a cookie. */ -static int cachefiles_attr_changed(struct fscache_object *_object) +static void cachefiles_withdraw_cookie(struct fscache_cookie *cookie) { - struct cachefiles_object *object; - struct cachefiles_cache *cache; + struct cachefiles_object *object = cookie->cache_priv; + struct cachefiles_cache *cache = object->volume->cache; const struct cred *saved_cred; - struct iattr newattrs; - uint64_t ni_size; - loff_t oi_size; - int ret; - - ni_size = _object->store_limit_l; - - _enter("{OBJ%x},[%llu]", - _object->debug_id, (unsigned long long) ni_size); - - object = container_of(_object, struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); - - if (ni_size == object->i_size) - return 0; - - if (!object->backer) - return -ENOBUFS; - ASSERT(d_is_reg(object->backer)); + _enter("o=%x", object->debug_id); + cachefiles_see_object(object, cachefiles_obj_see_withdraw_cookie); - fscache_set_store_limit(&object->fscache, ni_size); - - oi_size = i_size_read(d_backing_inode(object->backer)); - if (oi_size == ni_size) - return 0; - - cachefiles_begin_secure(cache, &saved_cred); - inode_lock(d_inode(object->backer)); - - /* if there's an extension to a partial page at the end of the backing - * file, we need to discard the partial page so that we pick up new - * data after it */ - if (oi_size & ~PAGE_MASK && ni_size > oi_size) { - _debug("discard tail %llx", oi_size); - newattrs.ia_valid = ATTR_SIZE; - newattrs.ia_size = oi_size & PAGE_MASK; - ret = notify_change(&init_user_ns, object->backer, &newattrs, NULL); - if (ret < 0) - goto truncate_failed; + if (!list_empty(&object->cache_link)) { + spin_lock(&cache->object_list_lock); + cachefiles_see_object(object, cachefiles_obj_see_withdrawal); + list_del_init(&object->cache_link); + spin_unlock(&cache->object_list_lock); } - newattrs.ia_valid = ATTR_SIZE; - newattrs.ia_size = ni_size; - ret = notify_change(&init_user_ns, object->backer, &newattrs, NULL); - -truncate_failed: - inode_unlock(d_inode(object->backer)); - cachefiles_end_secure(cache, saved_cred); - - if (ret == -EIO) { - fscache_set_store_limit(&object->fscache, 0); - cachefiles_io_error_obj(object, "Size set failed"); - ret = -ENOBUFS; + if (object->file) { + cachefiles_begin_secure(cache, &saved_cred); + cachefiles_clean_up_object(object, cache); + cachefiles_end_secure(cache, saved_cred); } - _leave(" = %d", ret); - return ret; + cookie->cache_priv = NULL; + cachefiles_put_object(object, cachefiles_obj_put_detach); } /* - * Invalidate an 
object + * Invalidate the storage associated with a cookie. */ -static void cachefiles_invalidate_object(struct fscache_operation *op) +static bool cachefiles_invalidate_cookie(struct fscache_cookie *cookie) { - struct cachefiles_object *object; - struct cachefiles_cache *cache; - const struct cred *saved_cred; - struct path path; - uint64_t ni_size; - int ret; + struct cachefiles_object *object = cookie->cache_priv; + struct file *new_file, *old_file; + bool old_tmpfile; - object = container_of(op->object, struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); + _enter("o=%x,[%llu]", object->debug_id, object->cookie->object_size); - ni_size = op->object->store_limit_l; + old_tmpfile = test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags); - _enter("{OBJ%x},[%llu]", - op->object->debug_id, (unsigned long long)ni_size); + if (!object->file) { + fscache_resume_after_invalidation(cookie); + _leave(" = t [light]"); + return true; + } - if (object->backer) { - ASSERT(d_is_reg(object->backer)); + new_file = cachefiles_create_tmpfile(object); + if (IS_ERR(new_file)) + goto failed; - fscache_set_store_limit(&object->fscache, ni_size); + /* Substitute the VFS target */ + _debug("sub"); + spin_lock(&object->lock); - path.dentry = object->backer; - path.mnt = cache->mnt; + old_file = object->file; + object->file = new_file; + object->content_info = CACHEFILES_CONTENT_NO_DATA; + set_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags); + set_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &object->cookie->flags); - cachefiles_begin_secure(cache, &saved_cred); - ret = vfs_truncate(&path, 0); - if (ret == 0) - ret = vfs_truncate(&path, ni_size); - cachefiles_end_secure(cache, saved_cred); + spin_unlock(&object->lock); + _debug("subbed"); + + /* Allow I/O to take place again */ + fscache_resume_after_invalidation(cookie); + + if (old_file) { + if (!old_tmpfile) { + struct cachefiles_volume *volume = object->volume; + struct dentry *fan = volume->fanout[(u8)cookie->key_hash]; - if (ret != 0) { - fscache_set_store_limit(&object->fscache, 0); - if (ret == -EIO) - cachefiles_io_error_obj(object, - "Invalidate failed"); + inode_lock_nested(d_inode(fan), I_MUTEX_PARENT); + cachefiles_bury_object(volume->cache, object, fan, + old_file->f_path.dentry, + FSCACHE_OBJECT_INVALIDATED); } + fput(old_file); } - fscache_op_complete(op, true); - _leave(""); -} + _leave(" = t"); + return true; -/* - * dissociate a cache from all the pages it was backing - */ -static void cachefiles_dissociate_pages(struct fscache_cache *cache) -{ - _enter(""); +failed: + _leave(" = f"); + return false; } const struct fscache_cache_ops cachefiles_cache_ops = { .name = "cachefiles", - .alloc_object = cachefiles_alloc_object, - .lookup_object = cachefiles_lookup_object, - .lookup_complete = cachefiles_lookup_complete, - .grab_object = cachefiles_grab_object, - .update_object = cachefiles_update_object, - .invalidate_object = cachefiles_invalidate_object, - .drop_object = cachefiles_drop_object, - .put_object = cachefiles_put_object, - .sync_cache = cachefiles_sync_cache, - .attr_changed = cachefiles_attr_changed, - .read_or_alloc_page = cachefiles_read_or_alloc_page, - .read_or_alloc_pages = cachefiles_read_or_alloc_pages, - .allocate_page = cachefiles_allocate_page, - .allocate_pages = cachefiles_allocate_pages, - .write_page = cachefiles_write_page, - .uncache_page = cachefiles_uncache_page, - .dissociate_pages = cachefiles_dissociate_pages, - .check_consistency = 
cachefiles_check_consistency, - .begin_read_operation = cachefiles_begin_read_operation, + .acquire_volume = cachefiles_acquire_volume, + .free_volume = cachefiles_free_volume, + .lookup_cookie = cachefiles_lookup_cookie, + .withdraw_cookie = cachefiles_withdraw_cookie, + .invalidate_cookie = cachefiles_invalidate_cookie, + .begin_operation = cachefiles_begin_operation, + .resize_cookie = cachefiles_resize_cookie, + .prepare_to_write = cachefiles_prepare_to_write, }; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 0a511c36dab8..c793d33b0224 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* General netfs cache on cache files internal defs * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ @@ -13,58 +13,72 @@ #include <linux/fscache-cache.h> -#include <linux/timer.h> -#include <linux/wait_bit.h> #include <linux/cred.h> -#include <linux/workqueue.h> #include <linux/security.h> +#define CACHEFILES_DIO_BLOCK_SIZE 4096 + struct cachefiles_cache; struct cachefiles_object; -extern unsigned cachefiles_debug; -#define CACHEFILES_DEBUG_KENTER 1 -#define CACHEFILES_DEBUG_KLEAVE 2 -#define CACHEFILES_DEBUG_KDEBUG 4 +enum cachefiles_content { + /* These values are saved on disk */ + CACHEFILES_CONTENT_NO_DATA = 0, /* No content stored */ + CACHEFILES_CONTENT_SINGLE = 1, /* Content is monolithic, all is present */ + CACHEFILES_CONTENT_ALL = 2, /* Content is all present, no map */ + CACHEFILES_CONTENT_BACKFS_MAP = 3, /* Content is piecemeal, mapped through backing fs */ + CACHEFILES_CONTENT_DIRTY = 4, /* Content is dirty (only seen on disk) */ + nr__cachefiles_content +}; -#define cachefiles_gfp (__GFP_RECLAIM | __GFP_NORETRY | __GFP_NOMEMALLOC) +/* + * Cached volume representation. + */ +struct cachefiles_volume { + struct cachefiles_cache *cache; + struct list_head cache_link; /* Link in cache->volumes */ + struct fscache_volume *vcookie; /* The netfs's representation */ + struct dentry *dentry; /* The volume dentry */ + struct dentry *fanout[256]; /* Fanout subdirs */ +}; /* - * node records + * Backing file state. 
*/ struct cachefiles_object { - struct fscache_object fscache; /* fscache handle */ - struct cachefiles_lookup_data *lookup_data; /* cached lookup data */ - struct dentry *dentry; /* the file/dir representing this object */ - struct dentry *backer; /* backing file */ - loff_t i_size; /* object size */ + struct fscache_cookie *cookie; /* Netfs data storage object cookie */ + struct cachefiles_volume *volume; /* Cache volume that holds this object */ + struct list_head cache_link; /* Link in cache->*_list */ + struct file *file; /* The file representing this object */ + char *d_name; /* Backing file name */ + int debug_id; + spinlock_t lock; + refcount_t ref; + u8 d_name_len; /* Length of filename */ + enum cachefiles_content content_info:8; /* Info about content presence */ unsigned long flags; -#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ - atomic_t usage; /* object usage count */ - uint8_t type; /* object type */ - uint8_t new; /* T if object new */ - spinlock_t work_lock; - struct rb_node active_node; /* link in active tree (dentry is key) */ +#define CACHEFILES_OBJECT_USING_TMPFILE 0 /* Have an unlinked tmpfile */ }; -extern struct kmem_cache *cachefiles_object_jar; - /* * Cache files cache definition */ struct cachefiles_cache { - struct fscache_cache cache; /* FS-Cache record */ + struct fscache_cache *cache; /* Cache cookie */ struct vfsmount *mnt; /* mountpoint holding the cache */ + struct dentry *store; /* Directory into which live objects go */ struct dentry *graveyard; /* directory into which dead objects go */ struct file *cachefilesd; /* manager daemon handle */ + struct list_head volumes; /* List of volume objects */ + struct list_head object_list; /* List of active objects */ + spinlock_t object_list_lock; /* Lock for volumes and object_list */ const struct cred *cache_cred; /* security override for accessing cache */ struct mutex daemon_mutex; /* command serialisation mutex */ wait_queue_head_t daemon_pollwq; /* poll waitqueue for daemon */ - struct rb_root active_nodes; /* active nodes (can't be culled) */ - rwlock_t active_lock; /* lock for active_nodes */ atomic_t gravecounter; /* graveyard uniquifier */ atomic_t f_released; /* number of objects released lately */ atomic_long_t b_released; /* number of blocks released lately */ + atomic_long_t b_writing; /* Number of blocks being written */ unsigned frun_percent; /* when to stop culling (% files) */ unsigned fcull_percent; /* when to start culling (% files) */ unsigned fstop_percent; /* when to stop allocating (% files) */ @@ -72,7 +86,7 @@ struct cachefiles_cache { unsigned bcull_percent; /* when to start culling (% blocks) */ unsigned bstop_percent; /* when to stop allocating (% blocks) */ unsigned bsize; /* cache's block size */ - unsigned bshift; /* min(ilog2(PAGE_SIZE / bsize), 0) */ + unsigned bshift; /* ilog2(bsize) */ uint64_t frun; /* when to stop culling */ uint64_t fcull; /* when to start culling */ uint64_t fstop; /* when to stop allocating */ @@ -89,38 +103,19 @@ struct cachefiles_cache { char *tag; /* cache binding tag */ }; -/* - * backing file read tracking - */ -struct cachefiles_one_read { - wait_queue_entry_t monitor; /* link into monitored waitqueue */ - struct page *back_page; /* backing file page we're waiting for */ - struct page *netfs_page; /* netfs page we're going to fill */ - struct fscache_retrieval *op; /* retrieval op covering this */ - struct list_head op_link; /* link in op's todo list */ -}; - -/* - * backing file write tracking - */ -struct cachefiles_one_write { - struct 
page *netfs_page; /* netfs page to copy */ - struct cachefiles_object *object; - struct list_head obj_link; /* link in object's lists */ - fscache_rw_complete_t end_io_func; - void *context; -}; +#include <trace/events/cachefiles.h> -/* - * auxiliary data xattr buffer - */ -struct cachefiles_xattr { - uint16_t len; - uint8_t type; - uint8_t data[]; -}; +static inline +struct file *cachefiles_cres_file(struct netfs_cache_resources *cres) +{ + return cres->cache_priv2; +} -#include <trace/events/cachefiles.h> +static inline +struct cachefiles_object *cachefiles_cres_object(struct netfs_cache_resources *cres) +{ + return fscache_cres_cookie(cres)->cache_priv; +} /* * note change of state for daemon @@ -132,74 +127,118 @@ static inline void cachefiles_state_changed(struct cachefiles_cache *cache) } /* - * bind.c + * cache.c */ -extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args); -extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache); +extern int cachefiles_add_cache(struct cachefiles_cache *cache); +extern void cachefiles_withdraw_cache(struct cachefiles_cache *cache); + +enum cachefiles_has_space_for { + cachefiles_has_space_check, + cachefiles_has_space_for_write, + cachefiles_has_space_for_create, +}; +extern int cachefiles_has_space(struct cachefiles_cache *cache, + unsigned fnr, unsigned bnr, + enum cachefiles_has_space_for reason); /* * daemon.c */ extern const struct file_operations cachefiles_daemon_fops; -extern int cachefiles_has_space(struct cachefiles_cache *cache, - unsigned fnr, unsigned bnr); +/* + * error_inject.c + */ +#ifdef CONFIG_CACHEFILES_ERROR_INJECTION +extern unsigned int cachefiles_error_injection_state; +extern int cachefiles_register_error_injection(void); +extern void cachefiles_unregister_error_injection(void); + +#else +#define cachefiles_error_injection_state 0 + +static inline int cachefiles_register_error_injection(void) +{ + return 0; +} + +static inline void cachefiles_unregister_error_injection(void) +{ +} +#endif + + +static inline int cachefiles_inject_read_error(void) +{ + return cachefiles_error_injection_state & 2 ? -EIO : 0; +} + +static inline int cachefiles_inject_write_error(void) +{ + return cachefiles_error_injection_state & 2 ? -EIO : + cachefiles_error_injection_state & 1 ? -ENOSPC : + 0; +} + +static inline int cachefiles_inject_remove_error(void) +{ + return cachefiles_error_injection_state & 2 ? 
-EIO : 0; +} /* * interface.c */ extern const struct fscache_cache_ops cachefiles_cache_ops; +extern void cachefiles_see_object(struct cachefiles_object *object, + enum cachefiles_obj_ref_trace why); +extern struct cachefiles_object *cachefiles_grab_object(struct cachefiles_object *object, + enum cachefiles_obj_ref_trace why); +extern void cachefiles_put_object(struct cachefiles_object *object, + enum cachefiles_obj_ref_trace why); -void cachefiles_put_object(struct fscache_object *_object, - enum fscache_obj_ref_trace why); +/* + * io.c + */ +extern bool cachefiles_begin_operation(struct netfs_cache_resources *cres, + enum fscache_want_state want_state); /* * key.c */ -extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type); +extern bool cachefiles_cook_key(struct cachefiles_object *object); + +/* + * main.c + */ +extern struct kmem_cache *cachefiles_object_jar; /* * namei.c */ -extern void cachefiles_mark_object_inactive(struct cachefiles_cache *cache, - struct cachefiles_object *object, - blkcnt_t i_blocks); -extern int cachefiles_delete_object(struct cachefiles_cache *cache, - struct cachefiles_object *object); -extern int cachefiles_walk_to_object(struct cachefiles_object *parent, - struct cachefiles_object *object, - const char *key, - struct cachefiles_xattr *auxdata); +extern void cachefiles_unmark_inode_in_use(struct cachefiles_object *object, + struct file *file); +extern int cachefiles_bury_object(struct cachefiles_cache *cache, + struct cachefiles_object *object, + struct dentry *dir, + struct dentry *rep, + enum fscache_why_object_killed why); +extern int cachefiles_delete_object(struct cachefiles_object *object, + enum fscache_why_object_killed why); +extern bool cachefiles_look_up_object(struct cachefiles_object *object); extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, struct dentry *dir, - const char *name); + const char *name, + bool *_is_new); +extern void cachefiles_put_directory(struct dentry *dir); extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, char *filename); extern int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, char *filename); - -/* - * rdwr.c - */ -extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *, - struct page *, gfp_t); -extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *, - struct list_head *, unsigned *, - gfp_t); -extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *, - gfp_t); -extern int cachefiles_allocate_pages(struct fscache_retrieval *, - struct list_head *, unsigned *, gfp_t); -extern int cachefiles_write_page(struct fscache_storage *, struct page *); -extern void cachefiles_uncache_page(struct fscache_object *, struct page *); - -/* - * rdwr2.c - */ -extern int cachefiles_begin_read_operation(struct netfs_read_request *, - struct fscache_retrieval *); +extern struct file *cachefiles_create_tmpfile(struct cachefiles_object *object); +extern bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, + struct cachefiles_object *object); /* * security.c @@ -222,28 +261,32 @@ static inline void cachefiles_end_secure(struct cachefiles_cache *cache, } /* + * volume.c + */ +void cachefiles_acquire_volume(struct fscache_volume *volume); +void cachefiles_free_volume(struct fscache_volume *volume); +void cachefiles_withdraw_volume(struct cachefiles_volume *volume); + +/* * xattr.c */ -extern int cachefiles_check_object_type(struct cachefiles_object *object); -extern int 
cachefiles_set_object_xattr(struct cachefiles_object *object, - struct cachefiles_xattr *auxdata); -extern int cachefiles_update_object_xattr(struct cachefiles_object *object, - struct cachefiles_xattr *auxdata); -extern int cachefiles_check_auxdata(struct cachefiles_object *object); -extern int cachefiles_check_object_xattr(struct cachefiles_object *object, - struct cachefiles_xattr *auxdata); +extern int cachefiles_set_object_xattr(struct cachefiles_object *object); +extern int cachefiles_check_auxdata(struct cachefiles_object *object, + struct file *file); extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, + struct cachefiles_object *object, struct dentry *dentry); - +extern void cachefiles_prepare_to_write(struct fscache_cookie *cookie); +extern bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume); +extern int cachefiles_check_volume_xattr(struct cachefiles_volume *volume); /* - * error handling + * Error handling */ - #define cachefiles_io_error(___cache, FMT, ...) \ do { \ pr_err("I/O Error: " FMT"\n", ##__VA_ARGS__); \ - fscache_io_error(&(___cache)->cache); \ + fscache_io_error((___cache)->cache); \ set_bit(CACHEFILES_DEAD, &(___cache)->flags); \ } while (0) @@ -251,15 +294,20 @@ do { \ do { \ struct cachefiles_cache *___cache; \ \ - ___cache = container_of((object)->fscache.cache, \ - struct cachefiles_cache, cache); \ - cachefiles_io_error(___cache, FMT, ##__VA_ARGS__); \ + ___cache = (object)->volume->cache; \ + cachefiles_io_error(___cache, FMT " [o=%08x]", ##__VA_ARGS__, \ + (object)->debug_id); \ } while (0) /* - * debug tracing + * Debug tracing */ +extern unsigned cachefiles_debug; +#define CACHEFILES_DEBUG_KENTER 1 +#define CACHEFILES_DEBUG_KLEAVE 2 +#define CACHEFILES_DEBUG_KDEBUG 4 + #define dbgprintk(FMT, ...) 
\ printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index effe37ef8629..753986ea1583 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -9,8 +9,9 @@ #include <linux/slab.h> #include <linux/file.h> #include <linux/uio.h> +#include <linux/falloc.h> #include <linux/sched/mm.h> -#include <linux/netfs.h> +#include <trace/events/fscache.h> #include "internal.h" struct cachefiles_kiocb { @@ -21,14 +22,18 @@ struct cachefiles_kiocb { size_t skipped; size_t len; }; + struct cachefiles_object *object; netfs_io_terminated_t term_func; void *term_func_priv; bool was_async; + unsigned int inval_counter; /* Copy of cookie->inval_counter */ + u64 b_writing; }; static inline void cachefiles_put_kiocb(struct cachefiles_kiocb *ki) { if (refcount_dec_and_test(&ki->ki_refcnt)) { + cachefiles_put_object(ki->object, cachefiles_obj_put_ioreq); fput(ki->iocb.ki_filp); kfree(ki); } @@ -40,12 +45,22 @@ static inline void cachefiles_put_kiocb(struct cachefiles_kiocb *ki) static void cachefiles_read_complete(struct kiocb *iocb, long ret) { struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb); + struct inode *inode = file_inode(ki->iocb.ki_filp); _enter("%ld", ret); + if (ret < 0) + trace_cachefiles_io_error(ki->object, inode, ret, + cachefiles_trace_read_error); + if (ki->term_func) { - if (ret >= 0) - ret += ki->skipped; + if (ret >= 0) { + if (ki->object->cookie->inval_counter == ki->inval_counter) + ki->skipped += ret; + else + ret = -ESTALE; + } + ki->term_func(ki->term_func_priv, ret, ki->was_async); } @@ -58,16 +73,24 @@ static void cachefiles_read_complete(struct kiocb *iocb, long ret) static int cachefiles_read(struct netfs_cache_resources *cres, loff_t start_pos, struct iov_iter *iter, - bool seek_data, + enum netfs_read_from_hole read_hole, netfs_io_terminated_t term_func, void *term_func_priv) { + struct cachefiles_object *object; struct cachefiles_kiocb *ki; - struct file *file = cres->cache_priv2; + struct file *file; unsigned int old_nofs; ssize_t ret = -ENOBUFS; size_t len = iov_iter_count(iter), skipped = 0; + if (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ)) + goto presubmission_error; + + fscache_count_read(); + object = cachefiles_cres_object(cres); + file = cachefiles_cres_file(cres); + _enter("%pD,%li,%llx,%zx/%llx", file, file_inode(file)->i_ino, start_pos, len, i_size_read(file_inode(file))); @@ -75,10 +98,12 @@ static int cachefiles_read(struct netfs_cache_resources *cres, /* If the caller asked us to seek for data before doing the read, then * we should do that now. If we find a gap, we fill it with zeros. */ - if (seek_data) { + if (read_hole != NETFS_READ_HOLE_IGNORE) { loff_t off = start_pos, off2; - off2 = vfs_llseek(file, off, SEEK_DATA); + off2 = cachefiles_inject_read_error(); + if (off2 == 0) + off2 = vfs_llseek(file, off, SEEK_DATA); if (off2 < 0 && off2 >= (loff_t)-MAX_ERRNO && off2 != -ENXIO) { skipped = 0; ret = off2; @@ -90,6 +115,10 @@ static int cachefiles_read(struct netfs_cache_resources *cres, * in the region, so clear the rest of the buffer and * return success. 
*/ + ret = -ENODATA; + if (read_hole == NETFS_READ_HOLE_FAIL) + goto presubmission_error; + iov_iter_zero(len, iter); skipped = len; ret = 0; @@ -100,7 +129,7 @@ static int cachefiles_read(struct netfs_cache_resources *cres, iov_iter_zero(skipped, iter); } - ret = -ENOBUFS; + ret = -ENOMEM; ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL); if (!ki) goto presubmission_error; @@ -112,6 +141,8 @@ static int cachefiles_read(struct netfs_cache_resources *cres, ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file)); ki->iocb.ki_ioprio = get_current_ioprio(); ki->skipped = skipped; + ki->object = object; + ki->inval_counter = cres->inval_counter; ki->term_func = term_func; ki->term_func_priv = term_func_priv; ki->was_async = true; @@ -120,9 +151,13 @@ static int cachefiles_read(struct netfs_cache_resources *cres, ki->iocb.ki_complete = cachefiles_read_complete; get_file(ki->iocb.ki_filp); + cachefiles_grab_object(object, cachefiles_obj_get_ioreq); + trace_cachefiles_read(object, file_inode(file), ki->iocb.ki_pos, len - skipped); old_nofs = memalloc_nofs_save(); - ret = vfs_iocb_iter_read(file, &ki->iocb, iter); + ret = cachefiles_inject_read_error(); + if (ret == 0) + ret = vfs_iocb_iter_read(file, &ki->iocb, iter); memalloc_nofs_restore(old_nofs); switch (ret) { case -EIOCBQUEUED: @@ -157,11 +192,70 @@ presubmission_error: } /* + * Query the occupancy of the cache in a region, returning where the next chunk + * of data starts and how long it is. + */ +static int cachefiles_query_occupancy(struct netfs_cache_resources *cres, + loff_t start, size_t len, size_t granularity, + loff_t *_data_start, size_t *_data_len) +{ + struct cachefiles_object *object; + struct file *file; + loff_t off, off2; + + *_data_start = -1; + *_data_len = 0; + + if (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ)) + return -ENOBUFS; + + object = cachefiles_cres_object(cres); + file = cachefiles_cres_file(cres); + granularity = max_t(size_t, object->volume->cache->bsize, granularity); + + _enter("%pD,%li,%llx,%zx/%llx", + file, file_inode(file)->i_ino, start, len, + i_size_read(file_inode(file))); + + off = cachefiles_inject_read_error(); + if (off == 0) + off = vfs_llseek(file, start, SEEK_DATA); + if (off == -ENXIO) + return -ENODATA; /* Beyond EOF */ + if (off < 0 && off >= (loff_t)-MAX_ERRNO) + return -ENOBUFS; /* Error. */ + if (round_up(off, granularity) >= start + len) + return -ENODATA; /* No data in range */ + + off2 = cachefiles_inject_read_error(); + if (off2 == 0) + off2 = vfs_llseek(file, off, SEEK_HOLE); + if (off2 == -ENXIO) + return -ENODATA; /* Beyond EOF */ + if (off2 < 0 && off2 >= (loff_t)-MAX_ERRNO) + return -ENOBUFS; /* Error. */ + + /* Round away partial blocks */ + off = round_up(off, granularity); + off2 = round_down(off2, granularity); + if (off2 <= off) + return -ENODATA; + + *_data_start = off; + if (off2 > start + len) + *_data_len = len; + else + *_data_len = off2 - off; + return 0; +} + +/* * Handle completion of a write to the cache. 
*/ static void cachefiles_write_complete(struct kiocb *iocb, long ret) { struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb); + struct cachefiles_object *object = ki->object; struct inode *inode = file_inode(ki->iocb.ki_filp); _enter("%ld", ret); @@ -170,9 +264,14 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret) __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); __sb_end_write(inode->i_sb, SB_FREEZE_WRITE); + if (ret < 0) + trace_cachefiles_io_error(object, inode, ret, + cachefiles_trace_write_error); + + atomic_long_sub(ki->b_writing, &object->volume->cache->b_writing); + set_bit(FSCACHE_COOKIE_HAVE_DATA, &object->cookie->flags); if (ki->term_func) ki->term_func(ki->term_func_priv, ret, ki->was_async); - cachefiles_put_kiocb(ki); } @@ -185,17 +284,27 @@ static int cachefiles_write(struct netfs_cache_resources *cres, netfs_io_terminated_t term_func, void *term_func_priv) { + struct cachefiles_object *object; + struct cachefiles_cache *cache; struct cachefiles_kiocb *ki; struct inode *inode; - struct file *file = cres->cache_priv2; + struct file *file; unsigned int old_nofs; ssize_t ret = -ENOBUFS; size_t len = iov_iter_count(iter); + if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) + goto presubmission_error; + fscache_count_write(); + object = cachefiles_cres_object(cres); + cache = object->volume->cache; + file = cachefiles_cres_file(cres); + _enter("%pD,%li,%llx,%zx/%llx", file, file_inode(file)->i_ino, start_pos, len, i_size_read(file_inode(file))); + ret = -ENOMEM; ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL); if (!ki) goto presubmission_error; @@ -206,14 +315,18 @@ static int cachefiles_write(struct netfs_cache_resources *cres, ki->iocb.ki_flags = IOCB_DIRECT | IOCB_WRITE; ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file)); ki->iocb.ki_ioprio = get_current_ioprio(); + ki->object = object; + ki->inval_counter = cres->inval_counter; ki->start = start_pos; ki->len = len; ki->term_func = term_func; ki->term_func_priv = term_func_priv; ki->was_async = true; + ki->b_writing = (len + (1 << cache->bshift) - 1) >> cache->bshift; if (ki->term_func) ki->iocb.ki_complete = cachefiles_write_complete; + atomic_long_add(ki->b_writing, &cache->b_writing); /* Open-code file_start_write here to grab freeze protection, which * will be released by another thread in aio_complete_rw(). 
Fool @@ -225,9 +338,13 @@ static int cachefiles_write(struct netfs_cache_resources *cres, __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE); get_file(ki->iocb.ki_filp); + cachefiles_grab_object(object, cachefiles_obj_get_ioreq); + trace_cachefiles_write(object, inode, ki->iocb.ki_pos, len); old_nofs = memalloc_nofs_save(); - ret = vfs_iocb_iter_write(file, &ki->iocb, iter); + ret = cachefiles_inject_write_error(); + if (ret == 0) + ret = vfs_iocb_iter_write(file, &ki->iocb, iter); memalloc_nofs_restore(old_nofs); switch (ret) { case -EIOCBQUEUED: @@ -257,8 +374,8 @@ in_progress: presubmission_error: if (term_func) - term_func(term_func_priv, -ENOMEM, false); - return -ENOMEM; + term_func(term_func_priv, ret, false); + return ret; } /* @@ -268,47 +385,82 @@ presubmission_error: static enum netfs_read_source cachefiles_prepare_read(struct netfs_read_subrequest *subreq, loff_t i_size) { - struct fscache_retrieval *op = subreq->rreq->cache_resources.cache_priv; + enum cachefiles_prepare_read_trace why; + struct netfs_read_request *rreq = subreq->rreq; + struct netfs_cache_resources *cres = &rreq->cache_resources; struct cachefiles_object *object; struct cachefiles_cache *cache; + struct fscache_cookie *cookie = fscache_cres_cookie(cres); const struct cred *saved_cred; - struct file *file = subreq->rreq->cache_resources.cache_priv2; + struct file *file = cachefiles_cres_file(cres); + enum netfs_read_source ret = NETFS_DOWNLOAD_FROM_SERVER; loff_t off, to; + ino_t ino = file ? file_inode(file)->i_ino : 0; _enter("%zx @%llx/%llx", subreq->len, subreq->start, i_size); - object = container_of(op->op.object, - struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); + if (subreq->start >= i_size) { + ret = NETFS_FILL_WITH_ZEROES; + why = cachefiles_trace_read_after_eof; + goto out_no_object; + } - if (!file) - goto cache_fail_nosec; + if (test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) { + __set_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags); + why = cachefiles_trace_read_no_data; + goto out_no_object; + } - if (subreq->start >= i_size) - return NETFS_FILL_WITH_ZEROES; + /* The object and the file may be being created in the background. 
*/ + if (!file) { + why = cachefiles_trace_read_no_file; + if (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ)) + goto out_no_object; + file = cachefiles_cres_file(cres); + if (!file) + goto out_no_object; + ino = file_inode(file)->i_ino; + } + object = cachefiles_cres_object(cres); + cache = object->volume->cache; cachefiles_begin_secure(cache, &saved_cred); - off = vfs_llseek(file, subreq->start, SEEK_DATA); + off = cachefiles_inject_read_error(); + if (off == 0) + off = vfs_llseek(file, subreq->start, SEEK_DATA); if (off < 0 && off >= (loff_t)-MAX_ERRNO) { - if (off == (loff_t)-ENXIO) + if (off == (loff_t)-ENXIO) { + why = cachefiles_trace_read_seek_nxio; goto download_and_store; - goto cache_fail; + } + trace_cachefiles_io_error(object, file_inode(file), off, + cachefiles_trace_seek_error); + why = cachefiles_trace_read_seek_error; + goto out; } - if (off >= subreq->start + subreq->len) + if (off >= subreq->start + subreq->len) { + why = cachefiles_trace_read_found_hole; goto download_and_store; + } if (off > subreq->start) { off = round_up(off, cache->bsize); subreq->len = off - subreq->start; + why = cachefiles_trace_read_found_part; goto download_and_store; } - to = vfs_llseek(file, subreq->start, SEEK_HOLE); - if (to < 0 && to >= (loff_t)-MAX_ERRNO) - goto cache_fail; + to = cachefiles_inject_read_error(); + if (to == 0) + to = vfs_llseek(file, subreq->start, SEEK_HOLE); + if (to < 0 && to >= (loff_t)-MAX_ERRNO) { + trace_cachefiles_io_error(object, file_inode(file), to, + cachefiles_trace_seek_error); + why = cachefiles_trace_read_seek_error; + goto out; + } if (to < subreq->start + subreq->len) { if (subreq->start + subreq->len >= i_size) @@ -318,32 +470,119 @@ static enum netfs_read_source cachefiles_prepare_read(struct netfs_read_subreque subreq->len = to - subreq->start; } - cachefiles_end_secure(cache, saved_cred); - return NETFS_READ_FROM_CACHE; + why = cachefiles_trace_read_have_data; + ret = NETFS_READ_FROM_CACHE; + goto out; download_and_store: - if (cachefiles_has_space(cache, 0, (subreq->len + PAGE_SIZE - 1) / PAGE_SIZE) == 0) - __set_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags); -cache_fail: + __set_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags); +out: cachefiles_end_secure(cache, saved_cred); -cache_fail_nosec: - return NETFS_DOWNLOAD_FROM_SERVER; +out_no_object: + trace_cachefiles_prep_read(subreq, ret, why, ino); + return ret; } /* * Prepare for a write to occur. */ -static int cachefiles_prepare_write(struct netfs_cache_resources *cres, - loff_t *_start, size_t *_len, loff_t i_size) +static int __cachefiles_prepare_write(struct netfs_cache_resources *cres, + loff_t *_start, size_t *_len, loff_t i_size, + bool no_space_allocated_yet) { - loff_t start = *_start; + struct cachefiles_object *object = cachefiles_cres_object(cres); + struct cachefiles_cache *cache = object->volume->cache; + struct file *file = cachefiles_cres_file(cres); + loff_t start = *_start, pos; size_t len = *_len, down; + int ret; /* Round to DIO size */ down = start - round_down(start, PAGE_SIZE); *_start = start - down; *_len = round_up(down + len, PAGE_SIZE); - return 0; + + /* We need to work out whether there's sufficient disk space to perform + * the write - but we can skip that check if we have space already + * allocated. 
+ */ + if (no_space_allocated_yet) + goto check_space; + + pos = cachefiles_inject_read_error(); + if (pos == 0) + pos = vfs_llseek(file, *_start, SEEK_DATA); + if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) { + if (pos == -ENXIO) + goto check_space; /* Unallocated tail */ + trace_cachefiles_io_error(object, file_inode(file), pos, + cachefiles_trace_seek_error); + return pos; + } + if ((u64)pos >= (u64)*_start + *_len) + goto check_space; /* Unallocated region */ + + /* We have a block that's at least partially filled - if we're low on + * space, we need to see if it's fully allocated. If it's not, we may + * want to cull it. + */ + if (cachefiles_has_space(cache, 0, *_len / PAGE_SIZE, + cachefiles_has_space_check) == 0) + return 0; /* Enough space to simply overwrite the whole block */ + + pos = cachefiles_inject_read_error(); + if (pos == 0) + pos = vfs_llseek(file, *_start, SEEK_HOLE); + if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) { + trace_cachefiles_io_error(object, file_inode(file), pos, + cachefiles_trace_seek_error); + return pos; + } + if ((u64)pos >= (u64)*_start + *_len) + return 0; /* Fully allocated */ + + /* Partially allocated, but insufficient space: cull. */ + fscache_count_no_write_space(); + ret = cachefiles_inject_remove_error(); + if (ret == 0) + ret = vfs_fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + *_start, *_len); + if (ret < 0) { + trace_cachefiles_io_error(object, file_inode(file), ret, + cachefiles_trace_fallocate_error); + cachefiles_io_error_obj(object, + "CacheFiles: fallocate failed (%d)\n", ret); + ret = -EIO; + } + + return ret; + +check_space: + return cachefiles_has_space(cache, 0, *_len / PAGE_SIZE, + cachefiles_has_space_for_write); +} + +static int cachefiles_prepare_write(struct netfs_cache_resources *cres, + loff_t *_start, size_t *_len, loff_t i_size, + bool no_space_allocated_yet) +{ + struct cachefiles_object *object = cachefiles_cres_object(cres); + struct cachefiles_cache *cache = object->volume->cache; + const struct cred *saved_cred; + int ret; + + if (!cachefiles_cres_file(cres)) { + if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) + return -ENOBUFS; + if (!cachefiles_cres_file(cres)) + return -ENOBUFS; + } + + cachefiles_begin_secure(cache, &saved_cred); + ret = __cachefiles_prepare_write(cres, _start, _len, i_size, + no_space_allocated_yet); + cachefiles_end_secure(cache, saved_cred); + return ret; } /* @@ -351,19 +590,11 @@ static int cachefiles_prepare_write(struct netfs_cache_resources *cres, */ static void cachefiles_end_operation(struct netfs_cache_resources *cres) { - struct fscache_retrieval *op = cres->cache_priv; - struct file *file = cres->cache_priv2; - - _enter(""); + struct file *file = cachefiles_cres_file(cres); if (file) fput(file); - if (op) { - fscache_op_complete(&op->op, false); - fscache_put_retrieval(op); - } - - _leave(""); + fscache_end_cookie_access(fscache_cres_cookie(cres), fscache_access_io_end); } static const struct netfs_cache_ops cachefiles_netfs_cache_ops = { @@ -372,49 +603,31 @@ static const struct netfs_cache_ops cachefiles_netfs_cache_ops = { .write = cachefiles_write, .prepare_read = cachefiles_prepare_read, .prepare_write = cachefiles_prepare_write, + .query_occupancy = cachefiles_query_occupancy, }; /* * Open the cache file when beginning a cache operation. 
*/ -int cachefiles_begin_read_operation(struct netfs_read_request *rreq, - struct fscache_retrieval *op) +bool cachefiles_begin_operation(struct netfs_cache_resources *cres, + enum fscache_want_state want_state) { - struct cachefiles_object *object; - struct cachefiles_cache *cache; - struct path path; - struct file *file; - - _enter(""); - - object = container_of(op->op.object, - struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); - - path.mnt = cache->mnt; - path.dentry = object->backer; - file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT, - d_inode(object->backer), cache->cache_cred); - if (IS_ERR(file)) - return PTR_ERR(file); - if (!S_ISREG(file_inode(file)->i_mode)) - goto error_file; - if (unlikely(!file->f_op->read_iter) || - unlikely(!file->f_op->write_iter)) { - pr_notice("Cache does not support read_iter and write_iter\n"); - goto error_file; + struct cachefiles_object *object = cachefiles_cres_object(cres); + + if (!cachefiles_cres_file(cres)) { + cres->ops = &cachefiles_netfs_cache_ops; + if (object->file) { + spin_lock(&object->lock); + if (!cres->cache_priv2 && object->file) + cres->cache_priv2 = get_file(object->file); + spin_unlock(&object->lock); + } } - fscache_get_retrieval(op); - rreq->cache_resources.cache_priv = op; - rreq->cache_resources.cache_priv2 = file; - rreq->cache_resources.ops = &cachefiles_netfs_cache_ops; - rreq->cache_resources.debug_id = object->fscache.debug_id; - _leave(""); - return 0; + if (!cachefiles_cres_file(cres) && want_state != FSCACHE_WANT_PARAMS) { + pr_err("failed to get cres->file\n"); + return false; + } -error_file: - fput(file); - return -EIO; + return true; } diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c index 7f94efc97e23..bf935e25bdbe 100644 --- a/fs/cachefiles/key.c +++ b/fs/cachefiles/key.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* Key to pathname encoder * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ @@ -22,134 +22,117 @@ static const char cachefiles_filecharmap[256] = { [48 ... 127] = 1, /* '0' -> '~' */ }; +static inline unsigned int how_many_hex_digits(unsigned int x) +{ + return x ? 
round_up(ilog2(x) + 1, 4) / 4 : 0; +} + /* * turn the raw key into something cooked - * - the raw key should include the length in the two bytes at the front - * - the key may be up to 514 bytes in length (including the length word) + * - the key may be up to NAME_MAX in length (including the length word) * - "base64" encode the strange keys, mapping 3 bytes of raw to four of * cooked * - need to cut the cooked key into 252 char lengths (189 raw bytes) */ -char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type) +bool cachefiles_cook_key(struct cachefiles_object *object) { - unsigned char csum, ch; - unsigned int acc; - char *key; - int loop, len, max, seg, mark, print; + const u8 *key = fscache_get_key(object->cookie), *kend; + unsigned char ch; + unsigned int acc, i, n, nle, nbe, keylen = object->cookie->key_len; + unsigned int b64len, len, print, pad; + char *name, sep; - _enter(",%d", keylen); + _enter(",%u,%*phN", keylen, keylen, key); - BUG_ON(keylen < 2 || keylen > 514); + BUG_ON(keylen > NAME_MAX - 3); - csum = raw[0] + raw[1]; print = 1; - for (loop = 2; loop < keylen; loop++) { - ch = raw[loop]; - csum += ch; + for (i = 0; i < keylen; i++) { + ch = key[i]; print &= cachefiles_filecharmap[ch]; } + /* If the path is usable ASCII, then we render it directly */ if (print) { - /* if the path is usable ASCII, then we render it directly */ - max = keylen - 2; - max += 2; /* two base64'd length chars on the front */ - max += 5; /* @checksum/M */ - max += 3 * 2; /* maximum number of segment dividers (".../M") - * is ((514 + 251) / 252) = 3 - */ - max += 1; /* NUL on end */ - } else { - /* calculate the maximum length of the cooked key */ - keylen = (keylen + 2) / 3; - - max = keylen * 4; - max += 5; /* @checksum/M */ - max += 3 * 2; /* maximum number of segment dividers (".../M") - * is ((514 + 188) / 189) = 3 - */ - max += 1; /* NUL on end */ + len = 1 + keylen; + name = kmalloc(len + 1, GFP_KERNEL); + if (!name) + return false; + + name[0] = 'D'; /* Data object type, string encoding */ + memcpy(name + 1, key, keylen); + goto success; } - max += 1; /* 2nd NUL on end */ - - _debug("max: %d", max); - - key = kmalloc(max, cachefiles_gfp); - if (!key) - return NULL; - - len = 0; - - /* build the cooked key */ - sprintf(key, "@%02x%c+", (unsigned) csum, 0); - len = 5; - mark = len - 1; - - if (print) { - acc = *(uint16_t *) raw; - raw += 2; - - key[len + 1] = cachefiles_charmap[acc & 63]; - acc >>= 6; - key[len] = cachefiles_charmap[acc & 63]; - len += 2; - - seg = 250; - for (loop = keylen; loop > 0; loop--) { - if (seg <= 0) { - key[len++] = '\0'; - mark = len; - key[len++] = '+'; - seg = 252; - } - - key[len++] = *raw++; - ASSERT(len < max); - } - - switch (type) { - case FSCACHE_COOKIE_TYPE_INDEX: type = 'I'; break; - case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'D'; break; - default: type = 'S'; break; - } - } else { - seg = 252; - for (loop = keylen; loop > 0; loop--) { - if (seg <= 0) { - key[len++] = '\0'; - mark = len; - key[len++] = '+'; - seg = 252; - } - - acc = *raw++; - acc |= *raw++ << 8; - acc |= *raw++ << 16; - - _debug("acc: %06x", acc); - - key[len++] = cachefiles_charmap[acc & 63]; - acc >>= 6; - key[len++] = cachefiles_charmap[acc & 63]; - acc >>= 6; - key[len++] = cachefiles_charmap[acc & 63]; - acc >>= 6; - key[len++] = cachefiles_charmap[acc & 63]; - - ASSERT(len < max); - } + /* See if it makes sense to encode it as "hex,hex,hex" for each 32-bit + * chunk. We rely on the key having been padded out to a whole number + * of 32-bit words. 
+ */ + n = round_up(keylen, 4); + nbe = nle = 0; + for (i = 0; i < n; i += 4) { + u32 be = be32_to_cpu(*(__be32 *)(key + i)); + u32 le = le32_to_cpu(*(__le32 *)(key + i)); + + nbe += 1 + how_many_hex_digits(be); + nle += 1 + how_many_hex_digits(le); + } - switch (type) { - case FSCACHE_COOKIE_TYPE_INDEX: type = 'J'; break; - case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'E'; break; - default: type = 'T'; break; + b64len = DIV_ROUND_UP(keylen, 3); + pad = b64len * 3 - keylen; + b64len = 2 + b64len * 4; /* Length if we base64-encode it */ + _debug("len=%u nbe=%u nle=%u b64=%u", keylen, nbe, nle, b64len); + if (nbe < b64len || nle < b64len) { + unsigned int nlen = min(nbe, nle) + 1; + name = kmalloc(nlen, GFP_KERNEL); + if (!name) + return false; + sep = (nbe <= nle) ? 'S' : 'T'; /* Encoding indicator */ + len = 0; + for (i = 0; i < n; i += 4) { + u32 x; + if (nbe <= nle) + x = be32_to_cpu(*(__be32 *)(key + i)); + else + x = le32_to_cpu(*(__le32 *)(key + i)); + name[len++] = sep; + if (x != 0) + len += snprintf(name + len, nlen - len, "%x", x); + sep = ','; } + goto success; } - key[mark] = type; - key[len++] = 0; - key[len] = 0; + /* We need to base64-encode it */ + name = kmalloc(b64len + 1, GFP_KERNEL); + if (!name) + return false; + + name[0] = 'E'; + name[1] = '0' + pad; + len = 2; + kend = key + keylen; + do { + acc = *key++; + if (key < kend) { + acc |= *key++ << 8; + if (key < kend) + acc |= *key++ << 16; + } - _leave(" = %s %d", key, len); - return key; + name[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + name[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + name[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + name[len++] = cachefiles_charmap[acc & 63]; + } while (key < kend); + +success: + name[len] = 0; + object->d_name = name; + object->d_name_len = len; + _leave(" = %s", object->d_name); + return true; } diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c index 9c8d34c49b12..3f369c6f816d 100644 --- a/fs/cachefiles/main.c +++ b/fs/cachefiles/main.c @@ -2,7 +2,7 @@ /* Network filesystem caching backend to use cache files on a premounted * filesystem * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. 
* Written by David Howells (dhowells@redhat.com) */ @@ -18,6 +18,8 @@ #include <linux/statfs.h> #include <linux/sysctl.h> #include <linux/miscdevice.h> +#include <linux/netfs.h> +#include <trace/events/netfs.h> #define CREATE_TRACE_POINTS #include "internal.h" @@ -37,14 +39,6 @@ static struct miscdevice cachefiles_dev = { .fops = &cachefiles_daemon_fops, }; -static void cachefiles_object_init_once(void *_object) -{ - struct cachefiles_object *object = _object; - - memset(object, 0, sizeof(*object)); - spin_lock_init(&object->work_lock); -} - /* * initialise the fs caching module */ @@ -52,6 +46,9 @@ static int __init cachefiles_init(void) { int ret; + ret = cachefiles_register_error_injection(); + if (ret < 0) + goto error_einj; ret = misc_register(&cachefiles_dev); if (ret < 0) goto error_dev; @@ -61,9 +58,7 @@ static int __init cachefiles_init(void) cachefiles_object_jar = kmem_cache_create("cachefiles_object_jar", sizeof(struct cachefiles_object), - 0, - SLAB_HWCACHE_ALIGN, - cachefiles_object_init_once); + 0, SLAB_HWCACHE_ALIGN, NULL); if (!cachefiles_object_jar) { pr_notice("Failed to allocate an object jar\n"); goto error_object_jar; @@ -75,6 +70,8 @@ static int __init cachefiles_init(void) error_object_jar: misc_deregister(&cachefiles_dev); error_dev: + cachefiles_unregister_error_injection(); +error_einj: pr_err("failed to register: %d\n", ret); return ret; } @@ -90,6 +87,7 @@ static void __exit cachefiles_exit(void) kmem_cache_destroy(cachefiles_object_jar); misc_deregister(&cachefiles_dev); + cachefiles_unregister_error_injection(); } module_exit(cachefiles_exit); diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index a9aca5ab5970..f256c8aff7bb 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -1,295 +1,272 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* CacheFiles path walking and related routines * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/file.h> #include <linux/fs.h> -#include <linux/fsnotify.h> -#include <linux/quotaops.h> -#include <linux/xattr.h> -#include <linux/mount.h> #include <linux/namei.h> -#include <linux/security.h> -#include <linux/slab.h> #include "internal.h" -#define CACHEFILES_KEYBUF_SIZE 512 - /* - * dump debugging info about an object + * Mark the backing file as being a cache file if it's not already in use. The + * mark tells the culling request command that it's not allowed to cull the + * file or directory. The caller must hold the inode lock. */ -static noinline -void __cachefiles_printk_object(struct cachefiles_object *object, - const char *prefix) +static bool __cachefiles_mark_inode_in_use(struct cachefiles_object *object, + struct dentry *dentry) { - struct fscache_cookie *cookie; - const u8 *k; - unsigned loop; - - pr_err("%sobject: OBJ%x\n", prefix, object->fscache.debug_id); - pr_err("%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", - prefix, object->fscache.state->name, - object->fscache.flags, work_busy(&object->fscache.work), - object->fscache.events, object->fscache.event_mask); - pr_err("%sops=%u inp=%u exc=%u\n", - prefix, object->fscache.n_ops, object->fscache.n_in_progress, - object->fscache.n_exclusive); - pr_err("%sparent=%x\n", - prefix, object->fscache.parent ? 
object->fscache.parent->debug_id : 0); - - spin_lock(&object->fscache.lock); - cookie = object->fscache.cookie; - if (cookie) { - pr_err("%scookie=%x [pr=%x nd=%p fl=%lx]\n", - prefix, - cookie->debug_id, - cookie->parent ? cookie->parent->debug_id : 0, - cookie->netfs_data, - cookie->flags); - pr_err("%skey=[%u] '", prefix, cookie->key_len); - k = (cookie->key_len <= sizeof(cookie->inline_key)) ? - cookie->inline_key : cookie->key; - for (loop = 0; loop < cookie->key_len; loop++) - pr_cont("%02x", k[loop]); - pr_cont("'\n"); + struct inode *inode = d_backing_inode(dentry); + bool can_use = false; + + if (!(inode->i_flags & S_KERNEL_FILE)) { + inode->i_flags |= S_KERNEL_FILE; + trace_cachefiles_mark_active(object, inode); + can_use = true; } else { - pr_err("%scookie=NULL\n", prefix); + trace_cachefiles_mark_failed(object, inode); + pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", + dentry, inode->i_ino); } - spin_unlock(&object->fscache.lock); + + return can_use; } -/* - * dump debugging info about a pair of objects - */ -static noinline void cachefiles_printk_object(struct cachefiles_object *object, - struct cachefiles_object *xobject) +static bool cachefiles_mark_inode_in_use(struct cachefiles_object *object, + struct dentry *dentry) { - if (object) - __cachefiles_printk_object(object, ""); - if (xobject) - __cachefiles_printk_object(xobject, "x"); + struct inode *inode = d_backing_inode(dentry); + bool can_use; + + inode_lock(inode); + can_use = __cachefiles_mark_inode_in_use(object, dentry); + inode_unlock(inode); + return can_use; } /* - * mark the owner of a dentry, if there is one, to indicate that that dentry - * has been preemptively deleted - * - the caller must hold the i_mutex on the dentry's parent as required to - * call vfs_unlink(), vfs_rmdir() or vfs_rename() + * Unmark a backing inode. The caller must hold the inode lock. */ -static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, - struct dentry *dentry, - enum fscache_why_object_killed why) +static void __cachefiles_unmark_inode_in_use(struct cachefiles_object *object, + struct dentry *dentry) { - struct cachefiles_object *object; - struct rb_node *p; - - _enter(",'%pd'", dentry); + struct inode *inode = d_backing_inode(dentry); - write_lock(&cache->active_lock); + inode->i_flags &= ~S_KERNEL_FILE; + trace_cachefiles_mark_inactive(object, inode); +} - p = cache->active_nodes.rb_node; - while (p) { - object = rb_entry(p, struct cachefiles_object, active_node); - if (object->dentry > dentry) - p = p->rb_left; - else if (object->dentry < dentry) - p = p->rb_right; - else - goto found_dentry; +/* + * Unmark a backing inode and tell cachefilesd that there's something that can + * be culled. 
+ */ +void cachefiles_unmark_inode_in_use(struct cachefiles_object *object, + struct file *file) +{ + struct cachefiles_cache *cache = object->volume->cache; + struct inode *inode = file_inode(file); + + if (inode) { + inode_lock(inode); + __cachefiles_unmark_inode_in_use(object, file->f_path.dentry); + inode_unlock(inode); + + if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { + atomic_long_add(inode->i_blocks, &cache->b_released); + if (atomic_inc_return(&cache->f_released)) + cachefiles_state_changed(cache); + } } +} - write_unlock(&cache->active_lock); - trace_cachefiles_mark_buried(NULL, dentry, why); - _leave(" [no owner]"); - return; +/* + * get a subdirectory + */ +struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, + struct dentry *dir, + const char *dirname, + bool *_is_new) +{ + struct dentry *subdir; + struct path path; + int ret; - /* found the dentry for */ -found_dentry: - kdebug("preemptive burial: OBJ%x [%s] %pd", - object->fscache.debug_id, - object->fscache.state->name, - dentry); + _enter(",,%s", dirname); - trace_cachefiles_mark_buried(object, dentry, why); + /* search the current directory for the element name */ + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); - if (fscache_object_is_live(&object->fscache)) { - pr_err("\n"); - pr_err("Error: Can't preemptively bury live object\n"); - cachefiles_printk_object(object, NULL); - } else { - if (why != FSCACHE_OBJECT_IS_STALE) - fscache_object_mark_killed(&object->fscache, why); +retry: + ret = cachefiles_inject_read_error(); + if (ret == 0) + subdir = lookup_one_len(dirname, dir, strlen(dirname)); + else + subdir = ERR_PTR(ret); + trace_cachefiles_lookup(NULL, dir, subdir); + if (IS_ERR(subdir)) { + trace_cachefiles_vfs_error(NULL, d_backing_inode(dir), + PTR_ERR(subdir), + cachefiles_trace_lookup_error); + if (PTR_ERR(subdir) == -ENOMEM) + goto nomem_d_alloc; + goto lookup_error; } - write_unlock(&cache->active_lock); - _leave(" [owner marked]"); -} + _debug("subdir -> %pd %s", + subdir, d_backing_inode(subdir) ? 
"positive" : "negative"); -/* - * record the fact that an object is now active - */ -static int cachefiles_mark_object_active(struct cachefiles_cache *cache, - struct cachefiles_object *object) -{ - struct cachefiles_object *xobject; - struct rb_node **_p, *_parent = NULL; - struct dentry *dentry; + /* we need to create the subdir if it doesn't exist yet */ + if (d_is_negative(subdir)) { + ret = cachefiles_has_space(cache, 1, 0, + cachefiles_has_space_for_create); + if (ret < 0) + goto mkdir_error; - _enter(",%x", object->fscache.debug_id); + _debug("attempt mkdir"); -try_again: - write_lock(&cache->active_lock); + path.mnt = cache->mnt; + path.dentry = dir; + ret = security_path_mkdir(&path, subdir, 0700); + if (ret < 0) + goto mkdir_error; + ret = cachefiles_inject_write_error(); + if (ret == 0) + ret = vfs_mkdir(&init_user_ns, d_inode(dir), subdir, 0700); + if (ret < 0) { + trace_cachefiles_vfs_error(NULL, d_inode(dir), ret, + cachefiles_trace_mkdir_error); + goto mkdir_error; + } + trace_cachefiles_mkdir(dir, subdir); - dentry = object->dentry; - trace_cachefiles_mark_active(object, dentry); + if (unlikely(d_unhashed(subdir))) { + cachefiles_put_directory(subdir); + goto retry; + } + ASSERT(d_backing_inode(subdir)); - if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { - pr_err("Error: Object already active\n"); - cachefiles_printk_object(object, NULL); - BUG(); + _debug("mkdir -> %pd{ino=%lu}", + subdir, d_backing_inode(subdir)->i_ino); + if (_is_new) + *_is_new = true; } - _p = &cache->active_nodes.rb_node; - while (*_p) { - _parent = *_p; - xobject = rb_entry(_parent, - struct cachefiles_object, active_node); + /* Tell rmdir() it's not allowed to delete the subdir */ + inode_lock(d_inode(subdir)); + inode_unlock(d_inode(dir)); - ASSERT(xobject != object); + if (!__cachefiles_mark_inode_in_use(NULL, subdir)) + goto mark_error; - if (xobject->dentry > dentry) - _p = &(*_p)->rb_left; - else if (xobject->dentry < dentry) - _p = &(*_p)->rb_right; - else - goto wait_for_old_object; - } + inode_unlock(d_inode(subdir)); - rb_link_node(&object->active_node, _parent, _p); - rb_insert_color(&object->active_node, &cache->active_nodes); + /* we need to make sure the subdir is a directory */ + ASSERT(d_backing_inode(subdir)); - write_unlock(&cache->active_lock); - _leave(" = 0"); - return 0; + if (!d_can_lookup(subdir)) { + pr_err("%s is not a directory\n", dirname); + ret = -EIO; + goto check_error; + } - /* an old object from a previous incarnation is hogging the slot - we - * need to wait for it to be destroyed */ -wait_for_old_object: - trace_cachefiles_wait_active(object, dentry, xobject); - clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); - - if (fscache_object_is_live(&xobject->fscache)) { - pr_err("\n"); - pr_err("Error: Unexpected object collision\n"); - cachefiles_printk_object(object, xobject); - } - atomic_inc(&xobject->usage); - write_unlock(&cache->active_lock); - - if (test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) { - wait_queue_head_t *wq; - - signed long timeout = 60 * HZ; - wait_queue_entry_t wait; - bool requeue; - - /* if the object we're waiting for is queued for processing, - * then just put ourselves on the queue behind it */ - if (work_pending(&xobject->fscache.work)) { - _debug("queue OBJ%x behind OBJ%x immediately", - object->fscache.debug_id, - xobject->fscache.debug_id); - goto requeue; - } + ret = -EPERM; + if (!(d_backing_inode(subdir)->i_opflags & IOP_XATTR) || + !d_backing_inode(subdir)->i_op->lookup || + 
!d_backing_inode(subdir)->i_op->mkdir || + !d_backing_inode(subdir)->i_op->rename || + !d_backing_inode(subdir)->i_op->rmdir || + !d_backing_inode(subdir)->i_op->unlink) + goto check_error; - /* otherwise we sleep until either the object we're waiting for - * is done, or the fscache_object is congested */ - wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE); - init_wait(&wait); - requeue = false; - do { - prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); - if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) - break; - - requeue = fscache_object_sleep_till_congested(&timeout); - } while (timeout > 0 && !requeue); - finish_wait(wq, &wait); - - if (requeue && - test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) { - _debug("queue OBJ%x behind OBJ%x after wait", - object->fscache.debug_id, - xobject->fscache.debug_id); - goto requeue; - } + _leave(" = [%lu]", d_backing_inode(subdir)->i_ino); + return subdir; - if (timeout <= 0) { - pr_err("\n"); - pr_err("Error: Overlong wait for old active object to go away\n"); - cachefiles_printk_object(object, xobject); - goto requeue; - } - } +check_error: + cachefiles_put_directory(subdir); + _leave(" = %d [check]", ret); + return ERR_PTR(ret); - ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)); +mark_error: + inode_unlock(d_inode(subdir)); + dput(subdir); + return ERR_PTR(-EBUSY); - cache->cache.ops->put_object(&xobject->fscache, - (enum fscache_obj_ref_trace)cachefiles_obj_put_wait_retry); - goto try_again; +mkdir_error: + inode_unlock(d_inode(dir)); + dput(subdir); + pr_err("mkdir %s failed with error %d\n", dirname, ret); + return ERR_PTR(ret); + +lookup_error: + inode_unlock(d_inode(dir)); + ret = PTR_ERR(subdir); + pr_err("Lookup %s failed with error %d\n", dirname, ret); + return ERR_PTR(ret); -requeue: - cache->cache.ops->put_object(&xobject->fscache, - (enum fscache_obj_ref_trace)cachefiles_obj_put_wait_timeo); - _leave(" = -ETIMEDOUT"); - return -ETIMEDOUT; +nomem_d_alloc: + inode_unlock(d_inode(dir)); + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); } /* - * Mark an object as being inactive. + * Put a subdirectory. */ -void cachefiles_mark_object_inactive(struct cachefiles_cache *cache, - struct cachefiles_object *object, - blkcnt_t i_blocks) +void cachefiles_put_directory(struct dentry *dir) { - struct dentry *dentry = object->dentry; - struct inode *inode = d_backing_inode(dentry); - - trace_cachefiles_mark_inactive(object, dentry, inode); + if (dir) { + inode_lock(dir->d_inode); + __cachefiles_unmark_inode_in_use(NULL, dir); + inode_unlock(dir->d_inode); + dput(dir); + } +} - write_lock(&cache->active_lock); - rb_erase(&object->active_node, &cache->active_nodes); - clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); - write_unlock(&cache->active_lock); +/* + * Remove a regular file from the cache. + */ +static int cachefiles_unlink(struct cachefiles_cache *cache, + struct cachefiles_object *object, + struct dentry *dir, struct dentry *dentry, + enum fscache_why_object_killed why) +{ + struct path path = { + .mnt = cache->mnt, + .dentry = dir, + }; + int ret; - wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); + trace_cachefiles_unlink(object, d_inode(dentry)->i_ino, why); + ret = security_path_unlink(&path, dentry); + if (ret < 0) { + cachefiles_io_error(cache, "Unlink security error"); + return ret; + } - /* This object can now be culled, so we need to let the daemon know - * that there is something it can remove if it needs to. 
- */ - atomic_long_add(i_blocks, &cache->b_released); - if (atomic_inc_return(&cache->f_released)) - cachefiles_state_changed(cache); + ret = cachefiles_inject_remove_error(); + if (ret == 0) { + ret = vfs_unlink(&init_user_ns, d_backing_inode(dir), dentry, NULL); + if (ret == -EIO) + cachefiles_io_error(cache, "Unlink failed"); + } + if (ret != 0) + trace_cachefiles_vfs_error(object, d_backing_inode(dir), ret, + cachefiles_trace_unlink_error); + return ret; } /* - * delete an object representation from the cache - * - file backed objects are unlinked - * - directory backed objects are stuffed into the graveyard for userspace to + * Delete an object representation from the cache + * - File backed objects are unlinked + * - Directory backed objects are stuffed into the graveyard for userspace to * delete - * - unlocks the directory mutex */ -static int cachefiles_bury_object(struct cachefiles_cache *cache, - struct cachefiles_object *object, - struct dentry *dir, - struct dentry *rep, - bool preemptive, - enum fscache_why_object_killed why) +int cachefiles_bury_object(struct cachefiles_cache *cache, + struct cachefiles_object *object, + struct dentry *dir, + struct dentry *rep, + enum fscache_why_object_killed why) { struct dentry *grave, *trap; struct path path, path_to_graveyard; @@ -298,29 +275,21 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, _enter(",'%pd','%pd'", dir, rep); + if (rep->d_parent != dir) { + inode_unlock(d_inode(dir)); + _leave(" = -ESTALE"); + return -ESTALE; + } + /* non-directories can just be unlinked */ if (!d_is_dir(rep)) { - _debug("unlink stale object"); - - path.mnt = cache->mnt; - path.dentry = dir; - ret = security_path_unlink(&path, rep); - if (ret < 0) { - cachefiles_io_error(cache, "Unlink security error"); - } else { - trace_cachefiles_unlink(object, rep, why); - ret = vfs_unlink(&init_user_ns, d_inode(dir), rep, - NULL); - - if (preemptive) - cachefiles_mark_object_buried(cache, rep, why); - } + dget(rep); /* Stop the dentry being negated if it's only pinned + * by a file struct. 
+ */ + ret = cachefiles_unlink(cache, object, dir, rep, why); + dput(rep); inode_unlock(d_inode(dir)); - - if (ret == -EIO) - cachefiles_io_error(cache, "Unlink failed"); - _leave(" = %d", ret); return ret; } @@ -368,14 +337,16 @@ try_again: grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer)); if (IS_ERR(grave)) { unlock_rename(cache->graveyard, dir); + trace_cachefiles_vfs_error(object, d_inode(cache->graveyard), + PTR_ERR(grave), + cachefiles_trace_lookup_error); if (PTR_ERR(grave) == -ENOMEM) { _leave(" = -ENOMEM"); return -ENOMEM; } - cachefiles_io_error(cache, "Lookup error %ld", - PTR_ERR(grave)); + cachefiles_io_error(cache, "Lookup error %ld", PTR_ERR(grave)); return -EIO; } @@ -419,16 +390,19 @@ try_again: .new_dir = d_inode(cache->graveyard), .new_dentry = grave, }; - trace_cachefiles_rename(object, rep, grave, why); - ret = vfs_rename(&rd); + trace_cachefiles_rename(object, d_inode(rep)->i_ino, why); + ret = cachefiles_inject_read_error(); + if (ret == 0) + ret = vfs_rename(&rd); + if (ret != 0) + trace_cachefiles_vfs_error(object, d_inode(dir), ret, + cachefiles_trace_rename_error); if (ret != 0 && ret != -ENOMEM) cachefiles_io_error(cache, "Rename failed with error %d", ret); - - if (preemptive) - cachefiles_mark_object_buried(cache, rep, why); } + __cachefiles_unmark_inode_in_use(object, rep); unlock_rename(cache->graveyard, dir); dput(grave); _leave(" = 0"); @@ -436,493 +410,358 @@ try_again: } /* - * delete an object representation from the cache + * Delete a cache file. */ -int cachefiles_delete_object(struct cachefiles_cache *cache, - struct cachefiles_object *object) +int cachefiles_delete_object(struct cachefiles_object *object, + enum fscache_why_object_killed why) { - struct dentry *dir; + struct cachefiles_volume *volume = object->volume; + struct dentry *dentry = object->file->f_path.dentry; + struct dentry *fan = volume->fanout[(u8)object->cookie->key_hash]; int ret; - _enter(",OBJ%x{%pd}", object->fscache.debug_id, object->dentry); - - ASSERT(object->dentry); - ASSERT(d_backing_inode(object->dentry)); - ASSERT(object->dentry->d_parent); + _enter(",OBJ%x{%pD}", object->debug_id, object->file); - dir = dget_parent(object->dentry); + /* Stop the dentry being negated if it's only pinned by a file struct. */ + dget(dentry); - inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); - - if (test_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->fscache.flags)) { - /* object allocation for the same key preemptively deleted this - * object's file so that it could create its own file */ - _debug("object preemptively buried"); - inode_unlock(d_inode(dir)); - ret = 0; - } else { - /* we need to check that our parent is _still_ our parent - it - * may have been renamed */ - if (dir == object->dentry->d_parent) { - ret = cachefiles_bury_object(cache, object, dir, - object->dentry, false, - FSCACHE_OBJECT_WAS_RETIRED); - } else { - /* it got moved, presumably by cachefilesd culling it, - * so it's no longer in the key path and we can ignore - * it */ - inode_unlock(d_inode(dir)); - ret = 0; - } - } - - dput(dir); - _leave(" = %d", ret); + inode_lock_nested(d_backing_inode(fan), I_MUTEX_PARENT); + ret = cachefiles_unlink(volume->cache, object, fan, dentry, why); + inode_unlock(d_backing_inode(fan)); + dput(dentry); return ret; } /* - * walk from the parent object to the child object through the backing - * filesystem, creating directories as we go + * Create a temporary file and leave it unattached and un-xattr'd until the + * time comes to discard the object from memory. 
*/ -int cachefiles_walk_to_object(struct cachefiles_object *parent, - struct cachefiles_object *object, - const char *key, - struct cachefiles_xattr *auxdata) +struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) { - struct cachefiles_cache *cache; - struct dentry *dir, *next = NULL; - struct inode *inode; + struct cachefiles_volume *volume = object->volume; + struct cachefiles_cache *cache = volume->cache; + const struct cred *saved_cred; + struct dentry *fan = volume->fanout[(u8)object->cookie->key_hash]; + struct file *file; struct path path; - const char *name; - int ret, nlen; - - _enter("OBJ%x{%pd},OBJ%x,%s,", - parent->fscache.debug_id, parent->dentry, - object->fscache.debug_id, key); - - cache = container_of(parent->fscache.cache, - struct cachefiles_cache, cache); - path.mnt = cache->mnt; - - ASSERT(parent->dentry); - ASSERT(d_backing_inode(parent->dentry)); - - if (!(d_is_dir(parent->dentry))) { - // TODO: convert file to dir - _leave("looking up in none directory"); - return -ENOBUFS; - } - - dir = dget(parent->dentry); - -advance: - /* attempt to transit the first directory component */ - name = key; - nlen = strlen(key); - - /* key ends in a double NUL */ - key = key + nlen + 1; - if (!*key) - key = NULL; - -lookup_again: - /* search the current directory for the element name */ - _debug("lookup '%s'", name); - - inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); + uint64_t ni_size = object->cookie->object_size; + long ret; - next = lookup_one_len(name, dir, nlen); - if (IS_ERR(next)) { - trace_cachefiles_lookup(object, next, NULL); - goto lookup_error; - } + ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE); - inode = d_backing_inode(next); - trace_cachefiles_lookup(object, next, inode); - _debug("next -> %pd %s", next, inode ? 
"positive" : "negative"); - - if (!key) - object->new = !inode; - - /* if this element of the path doesn't exist, then the lookup phase - * failed, and we can release any readers in the certain knowledge that - * there's nothing for them to actually read */ - if (d_is_negative(next)) - fscache_object_lookup_negative(&object->fscache); - - /* we need to create the object if it's negative */ - if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) { - /* index objects and intervening tree levels must be subdirs */ - if (d_is_negative(next)) { - ret = cachefiles_has_space(cache, 1, 0); - if (ret < 0) - goto no_space_error; - - path.dentry = dir; - ret = security_path_mkdir(&path, next, 0); - if (ret < 0) - goto create_error; - ret = vfs_mkdir(&init_user_ns, d_inode(dir), next, 0); - if (!key) - trace_cachefiles_mkdir(object, next, ret); - if (ret < 0) - goto create_error; - - if (unlikely(d_unhashed(next))) { - dput(next); - inode_unlock(d_inode(dir)); - goto lookup_again; - } - ASSERT(d_backing_inode(next)); - - _debug("mkdir -> %pd{ino=%lu}", - next, d_backing_inode(next)->i_ino); - - } else if (!d_can_lookup(next)) { - pr_err("inode %lu is not a directory\n", - d_backing_inode(next)->i_ino); - ret = -ENOBUFS; - goto error; - } + cachefiles_begin_secure(cache, &saved_cred); - } else { - /* non-index objects start out life as files */ - if (d_is_negative(next)) { - ret = cachefiles_has_space(cache, 1, 0); - if (ret < 0) - goto no_space_error; - - path.dentry = dir; - ret = security_path_mknod(&path, next, S_IFREG, 0); - if (ret < 0) - goto create_error; - ret = vfs_create(&init_user_ns, d_inode(dir), next, - S_IFREG, true); - trace_cachefiles_create(object, next, ret); - if (ret < 0) - goto create_error; - - ASSERT(d_backing_inode(next)); - - _debug("create -> %pd{ino=%lu}", - next, d_backing_inode(next)->i_ino); - - } else if (!d_can_lookup(next) && - !d_is_reg(next) - ) { - pr_err("inode %lu is not a file or directory\n", - d_backing_inode(next)->i_ino); - ret = -ENOBUFS; - goto error; + path.mnt = cache->mnt; + ret = cachefiles_inject_write_error(); + if (ret == 0) + path.dentry = vfs_tmpfile(&init_user_ns, fan, S_IFREG, O_RDWR); + else + path.dentry = ERR_PTR(ret); + if (IS_ERR(path.dentry)) { + trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(path.dentry), + cachefiles_trace_tmpfile_error); + if (PTR_ERR(path.dentry) == -EIO) + cachefiles_io_error_obj(object, "Failed to create tmpfile"); + file = ERR_CAST(path.dentry); + goto out; + } + + trace_cachefiles_tmpfile(object, d_backing_inode(path.dentry)); + + if (!cachefiles_mark_inode_in_use(object, path.dentry)) { + file = ERR_PTR(-EBUSY); + goto out_dput; + } + + if (ni_size > 0) { + trace_cachefiles_trunc(object, d_backing_inode(path.dentry), 0, ni_size, + cachefiles_trunc_expand_tmpfile); + ret = cachefiles_inject_write_error(); + if (ret == 0) + ret = vfs_truncate(&path, ni_size); + if (ret < 0) { + trace_cachefiles_vfs_error( + object, d_backing_inode(path.dentry), ret, + cachefiles_trace_trunc_error); + file = ERR_PTR(ret); + goto out_dput; } } - /* process the next component */ - if (key) { - _debug("advance"); - inode_unlock(d_inode(dir)); - dput(dir); - dir = next; - next = NULL; - goto advance; + file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT, + d_backing_inode(path.dentry), cache->cache_cred); + if (IS_ERR(file)) { + trace_cachefiles_vfs_error(object, d_backing_inode(path.dentry), + PTR_ERR(file), + cachefiles_trace_open_error); + goto out_dput; + } + if (unlikely(!file->f_op->read_iter) || + 
unlikely(!file->f_op->write_iter)) { + fput(file); + pr_notice("Cache does not support read_iter and write_iter\n"); + file = ERR_PTR(-EINVAL); } - /* we've found the object we were looking for */ - object->dentry = next; - - /* if we've found that the terminal object exists, then we need to - * check its attributes and delete it if it's out of date */ - if (!object->new) { - _debug("validate '%pd'", next); - - ret = cachefiles_check_object_xattr(object, auxdata); - if (ret == -ESTALE) { - /* delete the object (the deleter drops the directory - * mutex) */ - object->dentry = NULL; +out_dput: + dput(path.dentry); +out: + cachefiles_end_secure(cache, saved_cred); + return file; +} - ret = cachefiles_bury_object(cache, object, dir, next, - true, - FSCACHE_OBJECT_IS_STALE); - dput(next); - next = NULL; +/* + * Create a new file. + */ +static bool cachefiles_create_file(struct cachefiles_object *object) +{ + struct file *file; + int ret; - if (ret < 0) - goto delete_error; + ret = cachefiles_has_space(object->volume->cache, 1, 0, + cachefiles_has_space_for_create); + if (ret < 0) + return false; - _debug("redo lookup"); - fscache_object_retrying_stale(&object->fscache); - goto lookup_again; - } - } + file = cachefiles_create_tmpfile(object); + if (IS_ERR(file)) + return false; - /* note that we're now using this object */ - ret = cachefiles_mark_object_active(cache, object); + set_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &object->cookie->flags); + set_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags); + _debug("create -> %pD{ino=%lu}", file, file_inode(file)->i_ino); + object->file = file; + return true; +} - inode_unlock(d_inode(dir)); - dput(dir); - dir = NULL; +/* + * Open an existing file, checking its attributes and replacing it if it is + * stale. + */ +static bool cachefiles_open_file(struct cachefiles_object *object, + struct dentry *dentry) +{ + struct cachefiles_cache *cache = object->volume->cache; + struct file *file; + struct path path; + int ret; - if (ret == -ETIMEDOUT) - goto mark_active_timed_out; + _enter("%pd", dentry); - _debug("=== OBTAINED_OBJECT ==="); + if (!cachefiles_mark_inode_in_use(object, dentry)) + return false; - if (object->new) { - /* attach data to a newly constructed terminal object */ - ret = cachefiles_set_object_xattr(object, auxdata); - if (ret < 0) - goto check_error; - } else { - /* always update the atime on an object we've just looked up - * (this is used to keep track of culling, and atimes are only - * updated by read, write and readdir but not lookup or - * open) */ - path.dentry = next; - touch_atime(&path); - } - - /* open a file interface onto a data file */ - if (object->type != FSCACHE_COOKIE_TYPE_INDEX) { - if (d_is_reg(object->dentry)) { - const struct address_space_operations *aops; - - ret = -EPERM; - aops = d_backing_inode(object->dentry)->i_mapping->a_ops; - if (!aops->bmap) - goto check_error; - if (object->dentry->d_sb->s_blocksize > PAGE_SIZE) - goto check_error; - - object->backer = object->dentry; - } else { - BUG(); // TODO: open file in data-class subdir - } + /* We need to open a file interface onto a data file now as we can't do + * it on demand because writeback called from do_exit() sees + * current->fs == NULL - which breaks d_path() called from ext4 open. 
+ */ + path.mnt = cache->mnt; + path.dentry = dentry; + file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT, + d_backing_inode(dentry), cache->cache_cred); + if (IS_ERR(file)) { + trace_cachefiles_vfs_error(object, d_backing_inode(dentry), + PTR_ERR(file), + cachefiles_trace_open_error); + goto error; } - object->new = 0; - fscache_obtained_object(&object->fscache); - - _leave(" = 0 [%lu]", d_backing_inode(object->dentry)->i_ino); - return 0; - -no_space_error: - fscache_object_mark_killed(&object->fscache, FSCACHE_OBJECT_NO_SPACE); -create_error: - _debug("create error %d", ret); - if (ret == -EIO) - cachefiles_io_error(cache, "Create/mkdir failed"); - goto error; + if (unlikely(!file->f_op->read_iter) || + unlikely(!file->f_op->write_iter)) { + pr_notice("Cache does not support read_iter and write_iter\n"); + goto error_fput; + } + _debug("file -> %pd positive", dentry); -mark_active_timed_out: - _debug("mark active timed out"); - goto release_dentry; + ret = cachefiles_check_auxdata(object, file); + if (ret < 0) + goto check_failed; -check_error: - _debug("check error %d", ret); - cachefiles_mark_object_inactive( - cache, object, d_backing_inode(object->dentry)->i_blocks); -release_dentry: - dput(object->dentry); - object->dentry = NULL; - goto error_out; - -delete_error: - _debug("delete error %d", ret); - goto error_out2; + object->file = file; -lookup_error: - _debug("lookup error %ld", PTR_ERR(next)); - ret = PTR_ERR(next); - if (ret == -EIO) - cachefiles_io_error(cache, "Lookup failed"); - next = NULL; + /* Always update the atime on an object we've just looked up (this is + * used to keep track of culling, and atimes are only updated by read, + * write and readdir but not lookup or open). + */ + touch_atime(&file->f_path); + dput(dentry); + return true; + +check_failed: + fscache_cookie_lookup_negative(object->cookie); + cachefiles_unmark_inode_in_use(object, file); + if (ret == -ESTALE) { + fput(file); + dput(dentry); + return cachefiles_create_file(object); + } +error_fput: + fput(file); error: - inode_unlock(d_inode(dir)); - dput(next); -error_out2: - dput(dir); -error_out: - _leave(" = error %d", -ret); - return ret; + dput(dentry); + return false; } /* - * get a subdirectory + * walk from the parent object to the child object through the backing + * filesystem, creating directories as we go */ -struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, - struct dentry *dir, - const char *dirname) +bool cachefiles_look_up_object(struct cachefiles_object *object) { - struct dentry *subdir; - struct path path; + struct cachefiles_volume *volume = object->volume; + struct dentry *dentry, *fan = volume->fanout[(u8)object->cookie->key_hash]; int ret; - _enter(",,%s", dirname); - - /* search the current directory for the element name */ - inode_lock(d_inode(dir)); - -retry: - subdir = lookup_one_len(dirname, dir, strlen(dirname)); - if (IS_ERR(subdir)) { - if (PTR_ERR(subdir) == -ENOMEM) - goto nomem_d_alloc; - goto lookup_error; + _enter("OBJ%x,%s,", object->debug_id, object->d_name); + + /* Look up path "cache/vol/fanout/file". 
*/ + ret = cachefiles_inject_read_error(); + if (ret == 0) + dentry = lookup_positive_unlocked(object->d_name, fan, + object->d_name_len); + else + dentry = ERR_PTR(ret); + trace_cachefiles_lookup(object, fan, dentry); + if (IS_ERR(dentry)) { + if (dentry == ERR_PTR(-ENOENT)) + goto new_file; + if (dentry == ERR_PTR(-EIO)) + cachefiles_io_error_obj(object, "Lookup failed"); + return false; + } + + if (!d_is_reg(dentry)) { + pr_err("%pd is not a file\n", dentry); + inode_lock_nested(d_inode(fan), I_MUTEX_PARENT); + ret = cachefiles_bury_object(volume->cache, object, fan, dentry, + FSCACHE_OBJECT_IS_WEIRD); + dput(dentry); + if (ret < 0) + return false; + goto new_file; } - _debug("subdir -> %pd %s", - subdir, d_backing_inode(subdir) ? "positive" : "negative"); + if (!cachefiles_open_file(object, dentry)) + return false; - /* we need to create the subdir if it doesn't exist yet */ - if (d_is_negative(subdir)) { - ret = cachefiles_has_space(cache, 1, 0); - if (ret < 0) - goto mkdir_error; + _leave(" = t [%lu]", file_inode(object->file)->i_ino); + return true; - _debug("attempt mkdir"); +new_file: + fscache_cookie_lookup_negative(object->cookie); + return cachefiles_create_file(object); +} - path.mnt = cache->mnt; - path.dentry = dir; - ret = security_path_mkdir(&path, subdir, 0700); - if (ret < 0) - goto mkdir_error; - ret = vfs_mkdir(&init_user_ns, d_inode(dir), subdir, 0700); - if (ret < 0) - goto mkdir_error; +/* + * Attempt to link a temporary file into its rightful place in the cache. + */ +bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, + struct cachefiles_object *object) +{ + struct cachefiles_volume *volume = object->volume; + struct dentry *dentry, *fan = volume->fanout[(u8)object->cookie->key_hash]; + bool success = false; + int ret; - if (unlikely(d_unhashed(subdir))) { - dput(subdir); - goto retry; + _enter(",%pD", object->file); + + inode_lock_nested(d_inode(fan), I_MUTEX_PARENT); + ret = cachefiles_inject_read_error(); + if (ret == 0) + dentry = lookup_one_len(object->d_name, fan, object->d_name_len); + else + dentry = ERR_PTR(ret); + if (IS_ERR(dentry)) { + trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(dentry), + cachefiles_trace_lookup_error); + _debug("lookup fail %ld", PTR_ERR(dentry)); + goto out_unlock; + } + + if (!d_is_negative(dentry)) { + if (d_backing_inode(dentry) == file_inode(object->file)) { + success = true; + goto out_dput; } - ASSERT(d_backing_inode(subdir)); - _debug("mkdir -> %pd{ino=%lu}", - subdir, d_backing_inode(subdir)->i_ino); - } - - inode_unlock(d_inode(dir)); - - /* we need to make sure the subdir is a directory */ - ASSERT(d_backing_inode(subdir)); + ret = cachefiles_unlink(volume->cache, object, fan, dentry, + FSCACHE_OBJECT_IS_STALE); + if (ret < 0) + goto out_dput; - if (!d_can_lookup(subdir)) { - pr_err("%s is not a directory\n", dirname); - ret = -EIO; - goto check_error; + dput(dentry); + ret = cachefiles_inject_read_error(); + if (ret == 0) + dentry = lookup_one_len(object->d_name, fan, object->d_name_len); + else + dentry = ERR_PTR(ret); + if (IS_ERR(dentry)) { + trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(dentry), + cachefiles_trace_lookup_error); + _debug("lookup fail %ld", PTR_ERR(dentry)); + goto out_unlock; + } } - ret = -EPERM; - if (!(d_backing_inode(subdir)->i_opflags & IOP_XATTR) || - !d_backing_inode(subdir)->i_op->lookup || - !d_backing_inode(subdir)->i_op->mkdir || - !d_backing_inode(subdir)->i_op->create || - !d_backing_inode(subdir)->i_op->rename || - !d_backing_inode(subdir)->i_op->rmdir 
|| - !d_backing_inode(subdir)->i_op->unlink) - goto check_error; - - _leave(" = [%lu]", d_backing_inode(subdir)->i_ino); - return subdir; - -check_error: - dput(subdir); - _leave(" = %d [check]", ret); - return ERR_PTR(ret); - -mkdir_error: - inode_unlock(d_inode(dir)); - dput(subdir); - pr_err("mkdir %s failed with error %d\n", dirname, ret); - return ERR_PTR(ret); - -lookup_error: - inode_unlock(d_inode(dir)); - ret = PTR_ERR(subdir); - pr_err("Lookup %s failed with error %d\n", dirname, ret); - return ERR_PTR(ret); - -nomem_d_alloc: - inode_unlock(d_inode(dir)); - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); + ret = cachefiles_inject_read_error(); + if (ret == 0) + ret = vfs_link(object->file->f_path.dentry, &init_user_ns, + d_inode(fan), dentry, NULL); + if (ret < 0) { + trace_cachefiles_vfs_error(object, d_inode(fan), ret, + cachefiles_trace_link_error); + _debug("link fail %d", ret); + } else { + trace_cachefiles_link(object, file_inode(object->file)); + spin_lock(&object->lock); + /* TODO: Do we want to switch the file pointer to the new dentry? */ + clear_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags); + spin_unlock(&object->lock); + success = true; + } + +out_dput: + dput(dentry); +out_unlock: + inode_unlock(d_inode(fan)); + _leave(" = %u", success); + return success; } /* - * find out if an object is in use or not - * - if finds object and it's not in use: - * - returns a pointer to the object and a reference on it - * - returns with the directory locked + * Look up an inode to be checked or culled. Return -EBUSY if the inode is + * marked in use. */ -static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, - struct dentry *dir, - char *filename) +static struct dentry *cachefiles_lookup_for_cull(struct cachefiles_cache *cache, + struct dentry *dir, + char *filename) { - struct cachefiles_object *object; - struct rb_node *_n; struct dentry *victim; - int ret; - - //_enter(",%pd/,%s", - // dir, filename); + int ret = -ENOENT; - /* look up the victim */ inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); victim = lookup_one_len(filename, dir, strlen(filename)); if (IS_ERR(victim)) goto lookup_error; - - //_debug("victim -> %pd %s", - // victim, d_backing_inode(victim) ? 
"positive" : "negative"); - - /* if the object is no longer there then we probably retired the object - * at the netfs's request whilst the cull was in progress - */ - if (d_is_negative(victim)) { - inode_unlock(d_inode(dir)); - dput(victim); - _leave(" = -ENOENT [absent]"); - return ERR_PTR(-ENOENT); - } - - /* check to see if we're using this object */ - read_lock(&cache->active_lock); - - _n = cache->active_nodes.rb_node; - - while (_n) { - object = rb_entry(_n, struct cachefiles_object, active_node); - - if (object->dentry > victim) - _n = _n->rb_left; - else if (object->dentry < victim) - _n = _n->rb_right; - else - goto object_in_use; - } - - read_unlock(&cache->active_lock); - - //_leave(" = %pd", victim); + if (d_is_negative(victim)) + goto lookup_put; + if (d_inode(victim)->i_flags & S_KERNEL_FILE) + goto lookup_busy; return victim; -object_in_use: - read_unlock(&cache->active_lock); +lookup_busy: + ret = -EBUSY; +lookup_put: inode_unlock(d_inode(dir)); dput(victim); - //_leave(" = -EBUSY [in use]"); - return ERR_PTR(-EBUSY); + return ERR_PTR(ret); lookup_error: inode_unlock(d_inode(dir)); ret = PTR_ERR(victim); - if (ret == -ENOENT) { - /* file or dir now absent - probably retired by netfs */ - _leave(" = -ESTALE [absent]"); - return ERR_PTR(-ESTALE); - } + if (ret == -ENOENT) + return ERR_PTR(-ESTALE); /* Probably got retired by the netfs */ if (ret == -EIO) { cachefiles_io_error(cache, "Lookup failed"); @@ -931,46 +770,46 @@ lookup_error: ret = -EIO; } - _leave(" = %d", ret); return ERR_PTR(ret); } /* - * cull an object if it's not in use + * Cull an object if it's not in use * - called only by cache manager daemon */ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, char *filename) { struct dentry *victim; + struct inode *inode; int ret; _enter(",%pd/,%s", dir, filename); - victim = cachefiles_check_active(cache, dir, filename); + victim = cachefiles_lookup_for_cull(cache, dir, filename); if (IS_ERR(victim)) return PTR_ERR(victim); - _debug("victim -> %pd %s", - victim, d_backing_inode(victim) ? "positive" : "negative"); - - /* okay... 
the victim is not being used so we can cull it - * - start by marking it as stale - */ - _debug("victim is cullable"); - - ret = cachefiles_remove_object_xattr(cache, victim); + /* check to see if someone is using this object */ + inode = d_inode(victim); + inode_lock(inode); + if (inode->i_flags & S_KERNEL_FILE) { + ret = -EBUSY; + } else { + /* Stop the cache from picking it back up */ + inode->i_flags |= S_KERNEL_FILE; + ret = 0; + } + inode_unlock(inode); if (ret < 0) goto error_unlock; - /* actually remove the victim (drops the dir mutex) */ - _debug("bury"); - - ret = cachefiles_bury_object(cache, NULL, dir, victim, false, + ret = cachefiles_bury_object(cache, NULL, dir, victim, FSCACHE_OBJECT_WAS_CULLED); if (ret < 0) goto error; + fscache_count_culled(); dput(victim); _leave(" = 0"); return 0; @@ -979,11 +818,8 @@ error_unlock: inode_unlock(d_inode(dir)); error: dput(victim); - if (ret == -ENOENT) { - /* file or dir now absent - probably retired by netfs */ - _leave(" = -ESTALE [absent]"); - return -ESTALE; - } + if (ret == -ENOENT) + return -ESTALE; /* Probably got retired by the netfs */ if (ret != -ENOMEM) { pr_err("Internal error: %d\n", ret); @@ -995,7 +831,7 @@ error: } /* - * find out if an object is in use or not + * Find out if an object is in use or not * - called only by cache manager daemon * - returns -EBUSY or 0 to indicate whether an object is in use or not */ @@ -1003,16 +839,13 @@ int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, char *filename) { struct dentry *victim; + int ret = 0; - //_enter(",%pd/,%s", - // dir, filename); - - victim = cachefiles_check_active(cache, dir, filename); + victim = cachefiles_lookup_for_cull(cache, dir, filename); if (IS_ERR(victim)) return PTR_ERR(victim); inode_unlock(d_inode(dir)); dput(victim); - //_leave(" = 0"); - return 0; + return ret; } diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c deleted file mode 100644 index fcf4f3b72923..000000000000 --- a/fs/cachefiles/rdwr.c +++ /dev/null @@ -1,972 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* Storage object read/write - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#include <linux/mount.h> -#include <linux/slab.h> -#include <linux/file.h> -#include <linux/swap.h> -#include "internal.h" - -/* - * detect wake up events generated by the unlocking of pages in which we're - * interested - * - we use this to detect read completion of backing pages - * - the caller holds the waitqueue lock - */ -static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode, - int sync, void *_key) -{ - struct cachefiles_one_read *monitor = - container_of(wait, struct cachefiles_one_read, monitor); - struct cachefiles_object *object; - struct fscache_retrieval *op = monitor->op; - struct wait_page_key *key = _key; - struct folio *folio = wait->private; - - ASSERT(key); - - _enter("{%lu},%u,%d,{%p,%u}", - monitor->netfs_page->index, mode, sync, - key->folio, key->bit_nr); - - if (key->folio != folio || key->bit_nr != PG_locked) - return 0; - - _debug("--- monitor %p %lx ---", folio, folio->flags); - - if (!folio_test_uptodate(folio) && !folio_test_error(folio)) { - /* unlocked, not uptodate and not erronous? 
*/ - _debug("page probably truncated"); - } - - /* remove from the waitqueue */ - list_del(&wait->entry); - - /* move onto the action list and queue for FS-Cache thread pool */ - ASSERT(op); - - /* We need to temporarily bump the usage count as we don't own a ref - * here otherwise cachefiles_read_copier() may free the op between the - * monitor being enqueued on the op->to_do list and the op getting - * enqueued on the work queue. - */ - fscache_get_retrieval(op); - - object = container_of(op->op.object, struct cachefiles_object, fscache); - spin_lock(&object->work_lock); - list_add_tail(&monitor->op_link, &op->to_do); - fscache_enqueue_retrieval(op); - spin_unlock(&object->work_lock); - - fscache_put_retrieval(op); - return 0; -} - -/* - * handle a probably truncated page - * - check to see if the page is still relevant and reissue the read if - * possible - * - return -EIO on error, -ENODATA if the page is gone, -EINPROGRESS if we - * must wait again and 0 if successful - */ -static int cachefiles_read_reissue(struct cachefiles_object *object, - struct cachefiles_one_read *monitor) -{ - struct address_space *bmapping = d_backing_inode(object->backer)->i_mapping; - struct page *backpage = monitor->back_page, *backpage2; - int ret; - - _enter("{ino=%lx},{%lx,%lx}", - d_backing_inode(object->backer)->i_ino, - backpage->index, backpage->flags); - - /* skip if the page was truncated away completely */ - if (backpage->mapping != bmapping) { - _leave(" = -ENODATA [mapping]"); - return -ENODATA; - } - - backpage2 = find_get_page(bmapping, backpage->index); - if (!backpage2) { - _leave(" = -ENODATA [gone]"); - return -ENODATA; - } - - if (backpage != backpage2) { - put_page(backpage2); - _leave(" = -ENODATA [different]"); - return -ENODATA; - } - - /* the page is still there and we already have a ref on it, so we don't - * need a second */ - put_page(backpage2); - - INIT_LIST_HEAD(&monitor->op_link); - folio_add_wait_queue(page_folio(backpage), &monitor->monitor); - - if (trylock_page(backpage)) { - ret = -EIO; - if (PageError(backpage)) - goto unlock_discard; - ret = 0; - if (PageUptodate(backpage)) - goto unlock_discard; - - _debug("reissue read"); - ret = bmapping->a_ops->readpage(NULL, backpage); - if (ret < 0) - goto discard; - } - - /* but the page may have been read before the monitor was installed, so - * the monitor may miss the event - so we have to ensure that we do get - * one in such a case */ - if (trylock_page(backpage)) { - _debug("jumpstart %p {%lx}", backpage, backpage->flags); - unlock_page(backpage); - } - - /* it'll reappear on the todo list */ - _leave(" = -EINPROGRESS"); - return -EINPROGRESS; - -unlock_discard: - unlock_page(backpage); -discard: - spin_lock_irq(&object->work_lock); - list_del(&monitor->op_link); - spin_unlock_irq(&object->work_lock); - _leave(" = %d", ret); - return ret; -} - -/* - * copy data from backing pages to netfs pages to complete a read operation - * - driven by FS-Cache's thread pool - */ -static void cachefiles_read_copier(struct fscache_operation *_op) -{ - struct cachefiles_one_read *monitor; - struct cachefiles_object *object; - struct fscache_retrieval *op; - int error, max; - - op = container_of(_op, struct fscache_retrieval, op); - object = container_of(op->op.object, - struct cachefiles_object, fscache); - - _enter("{ino=%lu}", d_backing_inode(object->backer)->i_ino); - - max = 8; - spin_lock_irq(&object->work_lock); - - while (!list_empty(&op->to_do)) { - monitor = list_entry(op->to_do.next, - struct cachefiles_one_read, op_link); - 
list_del(&monitor->op_link); - - spin_unlock_irq(&object->work_lock); - - _debug("- copy {%lu}", monitor->back_page->index); - - recheck: - if (test_bit(FSCACHE_COOKIE_INVALIDATING, - &object->fscache.cookie->flags)) { - error = -ESTALE; - } else if (PageUptodate(monitor->back_page)) { - copy_highpage(monitor->netfs_page, monitor->back_page); - fscache_mark_page_cached(monitor->op, - monitor->netfs_page); - error = 0; - } else if (!PageError(monitor->back_page)) { - /* the page has probably been truncated */ - error = cachefiles_read_reissue(object, monitor); - if (error == -EINPROGRESS) - goto next; - goto recheck; - } else { - cachefiles_io_error_obj( - object, - "Readpage failed on backing file %lx", - (unsigned long) monitor->back_page->flags); - error = -EIO; - } - - put_page(monitor->back_page); - - fscache_end_io(op, monitor->netfs_page, error); - put_page(monitor->netfs_page); - fscache_retrieval_complete(op, 1); - fscache_put_retrieval(op); - kfree(monitor); - - next: - /* let the thread pool have some air occasionally */ - max--; - if (max < 0 || need_resched()) { - if (!list_empty(&op->to_do)) - fscache_enqueue_retrieval(op); - _leave(" [maxed out]"); - return; - } - - spin_lock_irq(&object->work_lock); - } - - spin_unlock_irq(&object->work_lock); - _leave(""); -} - -/* - * read the corresponding page to the given set from the backing file - * - an uncertain page is simply discarded, to be tried again another time - */ -static int cachefiles_read_backing_file_one(struct cachefiles_object *object, - struct fscache_retrieval *op, - struct page *netpage) -{ - struct cachefiles_one_read *monitor; - struct address_space *bmapping; - struct page *newpage, *backpage; - int ret; - - _enter(""); - - _debug("read back %p{%lu,%d}", - netpage, netpage->index, page_count(netpage)); - - monitor = kzalloc(sizeof(*monitor), cachefiles_gfp); - if (!monitor) - goto nomem; - - monitor->netfs_page = netpage; - monitor->op = fscache_get_retrieval(op); - - init_waitqueue_func_entry(&monitor->monitor, cachefiles_read_waiter); - - /* attempt to get hold of the backing page */ - bmapping = d_backing_inode(object->backer)->i_mapping; - newpage = NULL; - - for (;;) { - backpage = find_get_page(bmapping, netpage->index); - if (backpage) - goto backing_page_already_present; - - if (!newpage) { - newpage = __page_cache_alloc(cachefiles_gfp); - if (!newpage) - goto nomem_monitor; - } - - ret = add_to_page_cache_lru(newpage, bmapping, - netpage->index, cachefiles_gfp); - if (ret == 0) - goto installed_new_backing_page; - if (ret != -EEXIST) - goto nomem_page; - } - - /* we've installed a new backing page, so now we need to start - * it reading */ -installed_new_backing_page: - _debug("- new %p", newpage); - - backpage = newpage; - newpage = NULL; - -read_backing_page: - ret = bmapping->a_ops->readpage(NULL, backpage); - if (ret < 0) - goto read_error; - - /* set the monitor to transfer the data across */ -monitor_backing_page: - _debug("- monitor add"); - - /* install the monitor */ - get_page(monitor->netfs_page); - get_page(backpage); - monitor->back_page = backpage; - monitor->monitor.private = backpage; - folio_add_wait_queue(page_folio(backpage), &monitor->monitor); - monitor = NULL; - - /* but the page may have been read before the monitor was installed, so - * the monitor may miss the event - so we have to ensure that we do get - * one in such a case */ - if (trylock_page(backpage)) { - _debug("jumpstart %p {%lx}", backpage, backpage->flags); - unlock_page(backpage); - } - goto success; - - /* if the 
backing page is already present, it can be in one of - * three states: read in progress, read failed or read okay */ -backing_page_already_present: - _debug("- present"); - - if (newpage) { - put_page(newpage); - newpage = NULL; - } - - if (PageError(backpage)) - goto io_error; - - if (PageUptodate(backpage)) - goto backing_page_already_uptodate; - - if (!trylock_page(backpage)) - goto monitor_backing_page; - _debug("read %p {%lx}", backpage, backpage->flags); - goto read_backing_page; - - /* the backing page is already up to date, attach the netfs - * page to the pagecache and LRU and copy the data across */ -backing_page_already_uptodate: - _debug("- uptodate"); - - fscache_mark_page_cached(op, netpage); - - copy_highpage(netpage, backpage); - fscache_end_io(op, netpage, 0); - fscache_retrieval_complete(op, 1); - -success: - _debug("success"); - ret = 0; - -out: - if (backpage) - put_page(backpage); - if (monitor) { - fscache_put_retrieval(monitor->op); - kfree(monitor); - } - _leave(" = %d", ret); - return ret; - -read_error: - _debug("read error %d", ret); - if (ret == -ENOMEM) { - fscache_retrieval_complete(op, 1); - goto out; - } -io_error: - cachefiles_io_error_obj(object, "Page read error on backing file"); - fscache_retrieval_complete(op, 1); - ret = -ENOBUFS; - goto out; - -nomem_page: - put_page(newpage); -nomem_monitor: - fscache_put_retrieval(monitor->op); - kfree(monitor); -nomem: - fscache_retrieval_complete(op, 1); - _leave(" = -ENOMEM"); - return -ENOMEM; -} - -/* - * read a page from the cache or allocate a block in which to store it - * - cache withdrawal is prevented by the caller - * - returns -EINTR if interrupted - * - returns -ENOMEM if ran out of memory - * - returns -ENOBUFS if no buffers can be made available - * - returns -ENOBUFS if page is beyond EOF - * - if the page is backed by a block in the cache: - * - a read will be started which will call the callback on completion - * - 0 will be returned - * - else if the page is unbacked: - * - the metadata will be retained - * - -ENODATA will be returned - */ -int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, - struct page *page, - gfp_t gfp) -{ - struct cachefiles_object *object; - struct cachefiles_cache *cache; - struct inode *inode; - sector_t block; - unsigned shift; - int ret, ret2; - - object = container_of(op->op.object, - struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); - - _enter("{%p},{%lx},,,", object, page->index); - - if (!object->backer) - goto enobufs; - - inode = d_backing_inode(object->backer); - ASSERT(S_ISREG(inode->i_mode)); - - /* calculate the shift required to use bmap */ - shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; - - op->op.flags &= FSCACHE_OP_KEEP_FLAGS; - op->op.flags |= FSCACHE_OP_ASYNC; - op->op.processor = cachefiles_read_copier; - - /* we assume the absence or presence of the first block is a good - * enough indication for the page as a whole - * - TODO: don't use bmap() for this as it is _not_ actually good - * enough for this as it doesn't indicate errors, but it's all we've - * got for the moment - */ - block = page->index; - block <<= shift; - - ret2 = bmap(inode, &block); - ASSERT(ret2 == 0); - - _debug("%llx -> %llx", - (unsigned long long) (page->index << shift), - (unsigned long long) block); - - if (block) { - /* submit the apparently valid page to the backing fs to be - * read from disk */ - ret = cachefiles_read_backing_file_one(object, op, page); - } else if 
(cachefiles_has_space(cache, 0, 1) == 0) { - /* there's space in the cache we can use */ - fscache_mark_page_cached(op, page); - fscache_retrieval_complete(op, 1); - ret = -ENODATA; - } else { - goto enobufs; - } - - _leave(" = %d", ret); - return ret; - -enobufs: - fscache_retrieval_complete(op, 1); - _leave(" = -ENOBUFS"); - return -ENOBUFS; -} - -/* - * read the corresponding pages to the given set from the backing file - * - any uncertain pages are simply discarded, to be tried again another time - */ -static int cachefiles_read_backing_file(struct cachefiles_object *object, - struct fscache_retrieval *op, - struct list_head *list) -{ - struct cachefiles_one_read *monitor = NULL; - struct address_space *bmapping = d_backing_inode(object->backer)->i_mapping; - struct page *newpage = NULL, *netpage, *_n, *backpage = NULL; - int ret = 0; - - _enter(""); - - list_for_each_entry_safe(netpage, _n, list, lru) { - list_del(&netpage->lru); - - _debug("read back %p{%lu,%d}", - netpage, netpage->index, page_count(netpage)); - - if (!monitor) { - monitor = kzalloc(sizeof(*monitor), cachefiles_gfp); - if (!monitor) - goto nomem; - - monitor->op = fscache_get_retrieval(op); - init_waitqueue_func_entry(&monitor->monitor, - cachefiles_read_waiter); - } - - for (;;) { - backpage = find_get_page(bmapping, netpage->index); - if (backpage) - goto backing_page_already_present; - - if (!newpage) { - newpage = __page_cache_alloc(cachefiles_gfp); - if (!newpage) - goto nomem; - } - - ret = add_to_page_cache_lru(newpage, bmapping, - netpage->index, - cachefiles_gfp); - if (ret == 0) - goto installed_new_backing_page; - if (ret != -EEXIST) - goto nomem; - } - - /* we've installed a new backing page, so now we need - * to start it reading */ - installed_new_backing_page: - _debug("- new %p", newpage); - - backpage = newpage; - newpage = NULL; - - reread_backing_page: - ret = bmapping->a_ops->readpage(NULL, backpage); - if (ret < 0) - goto read_error; - - /* add the netfs page to the pagecache and LRU, and set the - * monitor to transfer the data across */ - monitor_backing_page: - _debug("- monitor add"); - - ret = add_to_page_cache_lru(netpage, op->mapping, - netpage->index, cachefiles_gfp); - if (ret < 0) { - if (ret == -EEXIST) { - put_page(backpage); - backpage = NULL; - put_page(netpage); - netpage = NULL; - fscache_retrieval_complete(op, 1); - continue; - } - goto nomem; - } - - /* install a monitor */ - get_page(netpage); - monitor->netfs_page = netpage; - - get_page(backpage); - monitor->back_page = backpage; - monitor->monitor.private = backpage; - folio_add_wait_queue(page_folio(backpage), &monitor->monitor); - monitor = NULL; - - /* but the page may have been read before the monitor was - * installed, so the monitor may miss the event - so we have to - * ensure that we do get one in such a case */ - if (trylock_page(backpage)) { - _debug("2unlock %p {%lx}", backpage, backpage->flags); - unlock_page(backpage); - } - - put_page(backpage); - backpage = NULL; - - put_page(netpage); - netpage = NULL; - continue; - - /* if the backing page is already present, it can be in one of - * three states: read in progress, read failed or read okay */ - backing_page_already_present: - _debug("- present %p", backpage); - - if (PageError(backpage)) - goto io_error; - - if (PageUptodate(backpage)) - goto backing_page_already_uptodate; - - _debug("- not ready %p{%lx}", backpage, backpage->flags); - - if (!trylock_page(backpage)) - goto monitor_backing_page; - - if (PageError(backpage)) { - _debug("error %lx", 
backpage->flags); - unlock_page(backpage); - goto io_error; - } - - if (PageUptodate(backpage)) - goto backing_page_already_uptodate_unlock; - - /* we've locked a page that's neither up to date nor erroneous, - * so we need to attempt to read it again */ - goto reread_backing_page; - - /* the backing page is already up to date, attach the netfs - * page to the pagecache and LRU and copy the data across */ - backing_page_already_uptodate_unlock: - _debug("uptodate %lx", backpage->flags); - unlock_page(backpage); - backing_page_already_uptodate: - _debug("- uptodate"); - - ret = add_to_page_cache_lru(netpage, op->mapping, - netpage->index, cachefiles_gfp); - if (ret < 0) { - if (ret == -EEXIST) { - put_page(backpage); - backpage = NULL; - put_page(netpage); - netpage = NULL; - fscache_retrieval_complete(op, 1); - continue; - } - goto nomem; - } - - copy_highpage(netpage, backpage); - - put_page(backpage); - backpage = NULL; - - fscache_mark_page_cached(op, netpage); - - /* the netpage is unlocked and marked up to date here */ - fscache_end_io(op, netpage, 0); - put_page(netpage); - netpage = NULL; - fscache_retrieval_complete(op, 1); - continue; - } - - netpage = NULL; - - _debug("out"); - -out: - /* tidy up */ - if (newpage) - put_page(newpage); - if (netpage) - put_page(netpage); - if (backpage) - put_page(backpage); - if (monitor) { - fscache_put_retrieval(op); - kfree(monitor); - } - - list_for_each_entry_safe(netpage, _n, list, lru) { - list_del(&netpage->lru); - put_page(netpage); - fscache_retrieval_complete(op, 1); - } - - _leave(" = %d", ret); - return ret; - -nomem: - _debug("nomem"); - ret = -ENOMEM; - goto record_page_complete; - -read_error: - _debug("read error %d", ret); - if (ret == -ENOMEM) - goto record_page_complete; -io_error: - cachefiles_io_error_obj(object, "Page read error on backing file"); - ret = -ENOBUFS; -record_page_complete: - fscache_retrieval_complete(op, 1); - goto out; -} - -/* - * read a list of pages from the cache or allocate blocks in which to store - * them - */ -int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, - struct list_head *pages, - unsigned *nr_pages, - gfp_t gfp) -{ - struct cachefiles_object *object; - struct cachefiles_cache *cache; - struct list_head backpages; - struct pagevec pagevec; - struct inode *inode; - struct page *page, *_n; - unsigned shift, nrbackpages; - int ret, ret2, space; - - object = container_of(op->op.object, - struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); - - _enter("{OBJ%x,%d},,%d,,", - object->fscache.debug_id, atomic_read(&op->op.usage), - *nr_pages); - - if (!object->backer) - goto all_enobufs; - - space = 1; - if (cachefiles_has_space(cache, 0, *nr_pages) < 0) - space = 0; - - inode = d_backing_inode(object->backer); - ASSERT(S_ISREG(inode->i_mode)); - - /* calculate the shift required to use bmap */ - shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; - - pagevec_init(&pagevec); - - op->op.flags &= FSCACHE_OP_KEEP_FLAGS; - op->op.flags |= FSCACHE_OP_ASYNC; - op->op.processor = cachefiles_read_copier; - - INIT_LIST_HEAD(&backpages); - nrbackpages = 0; - - ret = space ? 
-ENODATA : -ENOBUFS; - list_for_each_entry_safe(page, _n, pages, lru) { - sector_t block; - - /* we assume the absence or presence of the first block is a - * good enough indication for the page as a whole - * - TODO: don't use bmap() for this as it is _not_ actually - * good enough for this as it doesn't indicate errors, but - * it's all we've got for the moment - */ - block = page->index; - block <<= shift; - - ret2 = bmap(inode, &block); - ASSERT(ret2 == 0); - - _debug("%llx -> %llx", - (unsigned long long) (page->index << shift), - (unsigned long long) block); - - if (block) { - /* we have data - add it to the list to give to the - * backing fs */ - list_move(&page->lru, &backpages); - (*nr_pages)--; - nrbackpages++; - } else if (space && pagevec_add(&pagevec, page) == 0) { - fscache_mark_pages_cached(op, &pagevec); - fscache_retrieval_complete(op, 1); - ret = -ENODATA; - } else { - fscache_retrieval_complete(op, 1); - } - } - - if (pagevec_count(&pagevec) > 0) - fscache_mark_pages_cached(op, &pagevec); - - if (list_empty(pages)) - ret = 0; - - /* submit the apparently valid pages to the backing fs to be read from - * disk */ - if (nrbackpages > 0) { - ret2 = cachefiles_read_backing_file(object, op, &backpages); - if (ret2 == -ENOMEM || ret2 == -EINTR) - ret = ret2; - } - - _leave(" = %d [nr=%u%s]", - ret, *nr_pages, list_empty(pages) ? " empty" : ""); - return ret; - -all_enobufs: - fscache_retrieval_complete(op, *nr_pages); - return -ENOBUFS; -} - -/* - * allocate a block in the cache in which to store a page - * - cache withdrawal is prevented by the caller - * - returns -EINTR if interrupted - * - returns -ENOMEM if ran out of memory - * - returns -ENOBUFS if no buffers can be made available - * - returns -ENOBUFS if page is beyond EOF - * - otherwise: - * - the metadata will be retained - * - 0 will be returned - */ -int cachefiles_allocate_page(struct fscache_retrieval *op, - struct page *page, - gfp_t gfp) -{ - struct cachefiles_object *object; - struct cachefiles_cache *cache; - int ret; - - object = container_of(op->op.object, - struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); - - _enter("%p,{%lx},", object, page->index); - - ret = cachefiles_has_space(cache, 0, 1); - if (ret == 0) - fscache_mark_page_cached(op, page); - else - ret = -ENOBUFS; - - fscache_retrieval_complete(op, 1); - _leave(" = %d", ret); - return ret; -} - -/* - * allocate blocks in the cache in which to store a set of pages - * - cache withdrawal is prevented by the caller - * - returns -EINTR if interrupted - * - returns -ENOMEM if ran out of memory - * - returns -ENOBUFS if some buffers couldn't be made available - * - returns -ENOBUFS if some pages are beyond EOF - * - otherwise: - * - -ENODATA will be returned - * - metadata will be retained for any page marked - */ -int cachefiles_allocate_pages(struct fscache_retrieval *op, - struct list_head *pages, - unsigned *nr_pages, - gfp_t gfp) -{ - struct cachefiles_object *object; - struct cachefiles_cache *cache; - struct pagevec pagevec; - struct page *page; - int ret; - - object = container_of(op->op.object, - struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); - - _enter("%p,,,%d,", object, *nr_pages); - - ret = cachefiles_has_space(cache, 0, *nr_pages); - if (ret == 0) { - pagevec_init(&pagevec); - - list_for_each_entry(page, pages, lru) { - if (pagevec_add(&pagevec, page) == 0) - fscache_mark_pages_cached(op, &pagevec); - } 
- - if (pagevec_count(&pagevec) > 0) - fscache_mark_pages_cached(op, &pagevec); - ret = -ENODATA; - } else { - ret = -ENOBUFS; - } - - fscache_retrieval_complete(op, *nr_pages); - _leave(" = %d", ret); - return ret; -} - -/* - * request a page be stored in the cache - * - cache withdrawal is prevented by the caller - * - this request may be ignored if there's no cache block available, in which - * case -ENOBUFS will be returned - * - if the op is in progress, 0 will be returned - */ -int cachefiles_write_page(struct fscache_storage *op, struct page *page) -{ - struct cachefiles_object *object; - struct cachefiles_cache *cache; - struct file *file; - struct path path; - loff_t pos, eof; - size_t len; - void *data; - int ret = -ENOBUFS; - - ASSERT(op != NULL); - ASSERT(page != NULL); - - object = container_of(op->op.object, - struct cachefiles_object, fscache); - - _enter("%p,%p{%lx},,,", object, page, page->index); - - if (!object->backer) { - _leave(" = -ENOBUFS"); - return -ENOBUFS; - } - - ASSERT(d_is_reg(object->backer)); - - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); - - pos = (loff_t)page->index << PAGE_SHIFT; - - /* We mustn't write more data than we have, so we have to beware of a - * partial page at EOF. - */ - eof = object->fscache.store_limit_l; - if (pos >= eof) - goto error; - - /* write the page to the backing filesystem and let it store it in its - * own time */ - path.mnt = cache->mnt; - path.dentry = object->backer; - file = dentry_open(&path, O_RDWR | O_LARGEFILE, cache->cache_cred); - if (IS_ERR(file)) { - ret = PTR_ERR(file); - goto error_2; - } - - len = PAGE_SIZE; - if (eof & ~PAGE_MASK) { - if (eof - pos < PAGE_SIZE) { - _debug("cut short %llx to %llx", - pos, eof); - len = eof - pos; - ASSERTCMP(pos + len, ==, eof); - } - } - - data = kmap(page); - ret = kernel_write(file, data, len, &pos); - kunmap(page); - fput(file); - if (ret != len) - goto error_eio; - - _leave(" = 0"); - return 0; - -error_eio: - ret = -EIO; -error_2: - if (ret == -EIO) - cachefiles_io_error_obj(object, - "Write page to backing file failed"); -error: - _leave(" = -ENOBUFS [%d]", ret); - return -ENOBUFS; -} - -/* - * detach a backing block from a page - * - cache withdrawal is prevented by the caller - */ -void cachefiles_uncache_page(struct fscache_object *_object, struct page *page) - __releases(&object->fscache.cookie->lock) -{ - struct cachefiles_object *object; - - object = container_of(_object, struct cachefiles_object, fscache); - - _enter("%p,{%lu}", object, page->index); - - spin_unlock(&object->fscache.cookie->lock); -} diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c index aec13fd94692..fe777164f1d8 100644 --- a/fs/cachefiles/security.c +++ b/fs/cachefiles/security.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* CacheFiles security management * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2007, 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ diff --git a/fs/cachefiles/volume.c b/fs/cachefiles/volume.c new file mode 100644 index 000000000000..89df0ba8ba5e --- /dev/null +++ b/fs/cachefiles/volume.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Volume handling. + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/fs.h> +#include <linux/slab.h> +#include "internal.h" +#include <trace/events/fscache.h> + +/* + * Allocate and set up a volume representation. We make sure all the fanout + * directories are created and pinned. + */ +void cachefiles_acquire_volume(struct fscache_volume *vcookie) +{ + struct cachefiles_volume *volume; + struct cachefiles_cache *cache = vcookie->cache->cache_priv; + const struct cred *saved_cred; + struct dentry *vdentry, *fan; + size_t len; + char *name; + bool is_new = false; + int ret, n_accesses, i; + + _enter(""); + + volume = kzalloc(sizeof(struct cachefiles_volume), GFP_KERNEL); + if (!volume) + return; + volume->vcookie = vcookie; + volume->cache = cache; + INIT_LIST_HEAD(&volume->cache_link); + + cachefiles_begin_secure(cache, &saved_cred); + + len = vcookie->key[0]; + name = kmalloc(len + 3, GFP_NOFS); + if (!name) + goto error_vol; + name[0] = 'I'; + memcpy(name + 1, vcookie->key + 1, len); + name[len + 1] = 0; + +retry: + vdentry = cachefiles_get_directory(cache, cache->store, name, &is_new); + if (IS_ERR(vdentry)) + goto error_name; + volume->dentry = vdentry; + + if (is_new) { + if (!cachefiles_set_volume_xattr(volume)) + goto error_dir; + } else { + ret = cachefiles_check_volume_xattr(volume); + if (ret < 0) { + if (ret != -ESTALE) + goto error_dir; + inode_lock_nested(d_inode(cache->store), I_MUTEX_PARENT); + cachefiles_bury_object(cache, NULL, cache->store, vdentry, + FSCACHE_VOLUME_IS_WEIRD); + cachefiles_put_directory(volume->dentry); + cond_resched(); + goto retry; + } + } + + for (i = 0; i < 256; i++) { + sprintf(name, "@%02x", i); + fan = cachefiles_get_directory(cache, vdentry, name, NULL); + if (IS_ERR(fan)) + goto error_fan; + volume->fanout[i] = fan; + } + + cachefiles_end_secure(cache, saved_cred); + + vcookie->cache_priv = volume; + n_accesses = atomic_inc_return(&vcookie->n_accesses); /* Stop wakeups on dec-to-0 */ + trace_fscache_access_volume(vcookie->debug_id, 0, + refcount_read(&vcookie->ref), + n_accesses, fscache_access_cache_pin); + + spin_lock(&cache->object_list_lock); + list_add(&volume->cache_link, &volume->cache->volumes); + spin_unlock(&cache->object_list_lock); + + kfree(name); + return; + +error_fan: + for (i = 0; i < 256; i++) + cachefiles_put_directory(volume->fanout[i]); +error_dir: + cachefiles_put_directory(volume->dentry); +error_name: + kfree(name); +error_vol: + kfree(volume); + cachefiles_end_secure(cache, saved_cred); +} + +/* + * Release a volume representation. 
+ */ +static void __cachefiles_free_volume(struct cachefiles_volume *volume) +{ + int i; + + _enter(""); + + volume->vcookie->cache_priv = NULL; + + for (i = 0; i < 256; i++) + cachefiles_put_directory(volume->fanout[i]); + cachefiles_put_directory(volume->dentry); + kfree(volume); +} + +void cachefiles_free_volume(struct fscache_volume *vcookie) +{ + struct cachefiles_volume *volume = vcookie->cache_priv; + + if (volume) { + spin_lock(&volume->cache->object_list_lock); + list_del_init(&volume->cache_link); + spin_unlock(&volume->cache->object_list_lock); + __cachefiles_free_volume(volume); + } +} + +void cachefiles_withdraw_volume(struct cachefiles_volume *volume) +{ + fscache_withdraw_volume(volume->vcookie); + cachefiles_set_volume_xattr(volume); + __cachefiles_free_volume(volume); +} diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 9e82de668595..83f41bd0c3a9 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* CacheFiles extended attribute management * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ @@ -15,310 +15,245 @@ #include <linux/slab.h> #include "internal.h" +#define CACHEFILES_COOKIE_TYPE_DATA 1 + +struct cachefiles_xattr { + __be64 object_size; /* Actual size of the object */ + __be64 zero_point; /* Size after which server has no data not written by us */ + __u8 type; /* Type of object */ + __u8 content; /* Content presence (enum cachefiles_content) */ + __u8 data[]; /* netfs coherency data */ +} __packed; + static const char cachefiles_xattr_cache[] = XATTR_USER_PREFIX "CacheFiles.cache"; /* - * check the type label on an object - * - done using xattrs + * set the state xattr on a cache file */ -int cachefiles_check_object_type(struct cachefiles_object *object) +int cachefiles_set_object_xattr(struct cachefiles_object *object) { - struct dentry *dentry = object->dentry; - char type[3], xtype[3]; + struct cachefiles_xattr *buf; + struct dentry *dentry; + struct file *file = object->file; + unsigned int len = object->cookie->aux_len; int ret; - ASSERT(dentry); - ASSERT(d_backing_inode(dentry)); - - if (!object->fscache.cookie) - strcpy(type, "C3"); - else - snprintf(type, 3, "%02x", object->fscache.cookie->def->type); - - _enter("%x{%s}", object->fscache.debug_id, type); + if (!file) + return -ESTALE; + dentry = file->f_path.dentry; - /* attempt to install a type label directly */ - ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, type, - 2, XATTR_CREATE); - if (ret == 0) { - _debug("SET"); /* we succeeded */ - goto error; - } + _enter("%x,#%d", object->debug_id, len); - if (ret != -EEXIST) { - pr_err("Can't set xattr on %pd [%lu] (err %d)\n", - dentry, d_backing_inode(dentry)->i_ino, - -ret); - goto error; - } + buf = kmalloc(sizeof(struct cachefiles_xattr) + len, GFP_KERNEL); + if (!buf) + return -ENOMEM; - /* read the current type label */ - ret = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, xtype, - 3); + buf->object_size = cpu_to_be64(object->cookie->object_size); + buf->zero_point = 0; + buf->type = CACHEFILES_COOKIE_TYPE_DATA; + buf->content = object->content_info; + if (test_bit(FSCACHE_COOKIE_LOCAL_WRITE, &object->cookie->flags)) + buf->content = CACHEFILES_CONTENT_DIRTY; + if (len > 0) + memcpy(buf->data, fscache_get_aux(object->cookie), len); + + ret = cachefiles_inject_write_error(); + if (ret == 0) + ret = 
vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, + buf, sizeof(struct cachefiles_xattr) + len, 0); if (ret < 0) { - if (ret == -ERANGE) - goto bad_type_length; - - pr_err("Can't read xattr on %pd [%lu] (err %d)\n", - dentry, d_backing_inode(dentry)->i_ino, - -ret); - goto error; + trace_cachefiles_vfs_error(object, file_inode(file), ret, + cachefiles_trace_setxattr_error); + trace_cachefiles_coherency(object, file_inode(file)->i_ino, + buf->content, + cachefiles_coherency_set_fail); + if (ret != -ENOMEM) + cachefiles_io_error_obj( + object, + "Failed to set xattr with error %d", ret); + } else { + trace_cachefiles_coherency(object, file_inode(file)->i_ino, + buf->content, + cachefiles_coherency_set_ok); } - /* check the type is what we're expecting */ - if (ret != 2) - goto bad_type_length; - - if (xtype[0] != type[0] || xtype[1] != type[1]) - goto bad_type; - - ret = 0; - -error: + kfree(buf); _leave(" = %d", ret); return ret; - -bad_type_length: - pr_err("Cache object %lu type xattr length incorrect\n", - d_backing_inode(dentry)->i_ino); - ret = -EIO; - goto error; - -bad_type: - xtype[2] = 0; - pr_err("Cache object %pd [%lu] type %s not %s\n", - dentry, d_backing_inode(dentry)->i_ino, - xtype, type); - ret = -EIO; - goto error; } /* - * set the state xattr on a cache file + * check the consistency between the backing cache and the FS-Cache cookie */ -int cachefiles_set_object_xattr(struct cachefiles_object *object, - struct cachefiles_xattr *auxdata) +int cachefiles_check_auxdata(struct cachefiles_object *object, struct file *file) { - struct dentry *dentry = object->dentry; - int ret; - - ASSERT(dentry); - - _enter("%p,#%d", object, auxdata->len); + struct cachefiles_xattr *buf; + struct dentry *dentry = file->f_path.dentry; + unsigned int len = object->cookie->aux_len, tlen; + const void *p = fscache_get_aux(object->cookie); + enum cachefiles_coherency_trace why; + ssize_t xlen; + int ret = -ESTALE; - /* attempt to install the cache metadata directly */ - _debug("SET #%u", auxdata->len); + tlen = sizeof(struct cachefiles_xattr) + len; + buf = kmalloc(tlen, GFP_KERNEL); + if (!buf) + return -ENOMEM; - clear_bit(FSCACHE_COOKIE_AUX_UPDATED, &object->fscache.cookie->flags); - ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, - &auxdata->type, auxdata->len, XATTR_CREATE); - if (ret < 0 && ret != -ENOMEM) - cachefiles_io_error_obj( - object, - "Failed to set xattr with error %d", ret); + xlen = cachefiles_inject_read_error(); + if (xlen == 0) + xlen = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, buf, tlen); + if (xlen != tlen) { + if (xlen < 0) + trace_cachefiles_vfs_error(object, file_inode(file), xlen, + cachefiles_trace_getxattr_error); + if (xlen == -EIO) + cachefiles_io_error_obj( + object, + "Failed to read aux with error %zd", xlen); + why = cachefiles_coherency_check_xattr; + } else if (buf->type != CACHEFILES_COOKIE_TYPE_DATA) { + why = cachefiles_coherency_check_type; + } else if (memcmp(buf->data, p, len) != 0) { + why = cachefiles_coherency_check_aux; + } else if (be64_to_cpu(buf->object_size) != object->cookie->object_size) { + why = cachefiles_coherency_check_objsize; + } else if (buf->content == CACHEFILES_CONTENT_DIRTY) { + // TODO: Begin conflict resolution + pr_warn("Dirty object in cache\n"); + why = cachefiles_coherency_check_dirty; + } else { + why = cachefiles_coherency_check_ok; + ret = 0; + } - _leave(" = %d", ret); + trace_cachefiles_coherency(object, file_inode(file)->i_ino, + buf->content, why); + kfree(buf); return ret; } 
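/*
 * A minimal, hypothetical userspace sketch (not part of this patch) of how
 * the coherency xattr written by cachefiles_set_object_xattr() above could
 * be decoded for inspection: object_size and zero_point are stored
 * big-endian, followed by the type byte, the content-presence byte and the
 * netfs aux data.  The attribute name "user.CacheFiles.cache" and the
 * on-disk layout follow the definitions earlier in this file; the helper
 * name and buffer size are illustrative assumptions only.
 */
#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>

struct cachefiles_xattr_ondisk {
	uint64_t object_size;		/* __be64 on disk */
	uint64_t zero_point;		/* __be64 on disk */
	uint8_t  type;			/* CACHEFILES_COOKIE_TYPE_DATA == 1 */
	uint8_t  content;		/* enum cachefiles_content */
	uint8_t  data[];		/* netfs coherency (aux) data */
} __attribute__((packed));

static int dump_cachefiles_xattr(const char *cache_file)
{
	unsigned char buf[512];
	const struct cachefiles_xattr_ondisk *x = (const void *)buf;
	ssize_t n = getxattr(cache_file, "user.CacheFiles.cache",
			     buf, sizeof(buf));

	if (n < (ssize_t)sizeof(*x))
		return -1;	/* marker missing or truncated */
	printf("size=%llu zero=%llu type=%u content=%u aux=%zd bytes\n",
	       (unsigned long long)be64toh(x->object_size),
	       (unsigned long long)be64toh(x->zero_point),
	       x->type, x->content, n - (ssize_t)sizeof(*x));
	return 0;
}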
/* - * update the state xattr on a cache file + * remove the object's xattr to mark it stale */ -int cachefiles_update_object_xattr(struct cachefiles_object *object, - struct cachefiles_xattr *auxdata) +int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, + struct cachefiles_object *object, + struct dentry *dentry) { - struct dentry *dentry = object->dentry; int ret; - if (!dentry) - return -ESTALE; - - _enter("%x,#%d", object->fscache.debug_id, auxdata->len); - - /* attempt to install the cache metadata directly */ - _debug("SET #%u", auxdata->len); - - clear_bit(FSCACHE_COOKIE_AUX_UPDATED, &object->fscache.cookie->flags); - ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, - &auxdata->type, auxdata->len, XATTR_REPLACE); - if (ret < 0 && ret != -ENOMEM) - cachefiles_io_error_obj( - object, - "Failed to update xattr with error %d", ret); + ret = cachefiles_inject_remove_error(); + if (ret == 0) + ret = vfs_removexattr(&init_user_ns, dentry, cachefiles_xattr_cache); + if (ret < 0) { + trace_cachefiles_vfs_error(object, d_inode(dentry), ret, + cachefiles_trace_remxattr_error); + if (ret == -ENOENT || ret == -ENODATA) + ret = 0; + else if (ret != -ENOMEM) + cachefiles_io_error(cache, + "Can't remove xattr from %lu" + " (error %d)", + d_backing_inode(dentry)->i_ino, -ret); + } _leave(" = %d", ret); return ret; } /* - * check the consistency between the backing cache and the FS-Cache cookie + * Stick a marker on the cache object to indicate that it's dirty. */ -int cachefiles_check_auxdata(struct cachefiles_object *object) +void cachefiles_prepare_to_write(struct fscache_cookie *cookie) { - struct cachefiles_xattr *auxbuf; - enum fscache_checkaux validity; - struct dentry *dentry = object->dentry; - ssize_t xlen; - int ret; - - ASSERT(dentry); - ASSERT(d_backing_inode(dentry)); - ASSERT(object->fscache.cookie->def->check_aux); - - auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL); - if (!auxbuf) - return -ENOMEM; + const struct cred *saved_cred; + struct cachefiles_object *object = cookie->cache_priv; + struct cachefiles_cache *cache = object->volume->cache; - xlen = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, - &auxbuf->type, 512 + 1); - ret = -ESTALE; - if (xlen < 1 || - auxbuf->type != object->fscache.cookie->def->type) - goto error; + _enter("c=%08x", object->cookie->debug_id); - xlen--; - validity = fscache_check_aux(&object->fscache, &auxbuf->data, xlen, - i_size_read(d_backing_inode(dentry))); - if (validity != FSCACHE_CHECKAUX_OKAY) - goto error; - - ret = 0; -error: - kfree(auxbuf); - return ret; + if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { + cachefiles_begin_secure(cache, &saved_cred); + cachefiles_set_object_xattr(object); + cachefiles_end_secure(cache, saved_cred); + } } /* - * check the state xattr on a cache file - * - return -ESTALE if the object should be deleted + * Set the state xattr on a volume directory. 
*/ -int cachefiles_check_object_xattr(struct cachefiles_object *object, - struct cachefiles_xattr *auxdata) +bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume) { - struct cachefiles_xattr *auxbuf; - struct dentry *dentry = object->dentry; + unsigned int len = volume->vcookie->coherency_len; + const void *p = volume->vcookie->coherency; + struct dentry *dentry = volume->dentry; int ret; - _enter("%p,#%d", object, auxdata->len); - - ASSERT(dentry); - ASSERT(d_backing_inode(dentry)); - - auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp); - if (!auxbuf) { - _leave(" = -ENOMEM"); - return -ENOMEM; - } + _enter("%x,#%d", volume->vcookie->debug_id, len); - /* read the current type label */ - ret = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, - &auxbuf->type, 512 + 1); + ret = cachefiles_inject_write_error(); + if (ret == 0) + ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, + p, len, 0); if (ret < 0) { - if (ret == -ENODATA) - goto stale; /* no attribute - power went off - * mid-cull? */ - - if (ret == -ERANGE) - goto bad_type_length; - - cachefiles_io_error_obj(object, - "Can't read xattr on %lu (err %d)", - d_backing_inode(dentry)->i_ino, -ret); - goto error; + trace_cachefiles_vfs_error(NULL, d_inode(dentry), ret, + cachefiles_trace_setxattr_error); + trace_cachefiles_vol_coherency(volume, d_inode(dentry)->i_ino, + cachefiles_coherency_vol_set_fail); + if (ret != -ENOMEM) + cachefiles_io_error( + volume->cache, "Failed to set xattr with error %d", ret); + } else { + trace_cachefiles_vol_coherency(volume, d_inode(dentry)->i_ino, + cachefiles_coherency_vol_set_ok); } - /* check the on-disk object */ - if (ret < 1) - goto bad_type_length; - - if (auxbuf->type != auxdata->type) - goto stale; - - auxbuf->len = ret; - - /* consult the netfs */ - if (object->fscache.cookie->def->check_aux) { - enum fscache_checkaux result; - unsigned int dlen; - - dlen = auxbuf->len - 1; - - _debug("checkaux %s #%u", - object->fscache.cookie->def->name, dlen); - - result = fscache_check_aux(&object->fscache, - &auxbuf->data, dlen, - i_size_read(d_backing_inode(dentry))); - - switch (result) { - /* entry okay as is */ - case FSCACHE_CHECKAUX_OKAY: - goto okay; - - /* entry requires update */ - case FSCACHE_CHECKAUX_NEEDS_UPDATE: - break; - - /* entry requires deletion */ - case FSCACHE_CHECKAUX_OBSOLETE: - goto stale; - - default: - BUG(); - } - - /* update the current label */ - ret = vfs_setxattr(&init_user_ns, dentry, - cachefiles_xattr_cache, &auxdata->type, - auxdata->len, XATTR_REPLACE); - if (ret < 0) { - cachefiles_io_error_obj(object, - "Can't update xattr on %lu" - " (error %d)", - d_backing_inode(dentry)->i_ino, -ret); - goto error; - } - } - -okay: - ret = 0; - -error: - kfree(auxbuf); _leave(" = %d", ret); - return ret; - -bad_type_length: - pr_err("Cache object %lu xattr length incorrect\n", - d_backing_inode(dentry)->i_ino); - ret = -EIO; - goto error; - -stale: - ret = -ESTALE; - goto error; + return ret == 0; } /* - * remove the object's xattr to mark it stale + * Check the consistency between the backing cache and the volume cookie. 
*/ -int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, - struct dentry *dentry) +int cachefiles_check_volume_xattr(struct cachefiles_volume *volume) { - int ret; + struct cachefiles_xattr *buf; + struct dentry *dentry = volume->dentry; + unsigned int len = volume->vcookie->coherency_len; + const void *p = volume->vcookie->coherency; + enum cachefiles_coherency_trace why; + ssize_t xlen; + int ret = -ESTALE; - ret = vfs_removexattr(&init_user_ns, dentry, cachefiles_xattr_cache); - if (ret < 0) { - if (ret == -ENOENT || ret == -ENODATA) - ret = 0; - else if (ret != -ENOMEM) - cachefiles_io_error(cache, - "Can't remove xattr from %lu" - " (error %d)", - d_backing_inode(dentry)->i_ino, -ret); + _enter(""); + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + xlen = cachefiles_inject_read_error(); + if (xlen == 0) + xlen = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, buf, len); + if (xlen != len) { + if (xlen < 0) { + trace_cachefiles_vfs_error(NULL, d_inode(dentry), xlen, + cachefiles_trace_getxattr_error); + if (xlen == -EIO) + cachefiles_io_error( + volume->cache, + "Failed to read xattr with error %zd", xlen); + } + why = cachefiles_coherency_vol_check_xattr; + } else if (memcmp(buf->data, p, len) != 0) { + why = cachefiles_coherency_vol_check_cmp; + } else { + why = cachefiles_coherency_vol_check_ok; + ret = 0; } + trace_cachefiles_vol_coherency(volume, d_inode(dentry)->i_ino, why); + kfree(buf); _leave(" = %d", ret); return ret; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e53c8541f5b2..c98e5238a1b6 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -4,8 +4,8 @@ #include <linux/backing-dev.h> #include <linux/fs.h> #include <linux/mm.h> +#include <linux/swap.h> #include <linux/pagemap.h> -#include <linux/writeback.h> /* generic_writepages */ #include <linux/slab.h> #include <linux/pagevec.h> #include <linux/task_io_accounting_ops.h> @@ -126,7 +126,7 @@ static int ceph_set_page_dirty(struct page *page) BUG_ON(PagePrivate(page)); attach_page_private(page, snapc); - return __set_page_dirty_nobuffers(page); + return ceph_fscache_set_page_dirty(page); } /* @@ -141,8 +141,6 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, struct ceph_inode_info *ci; struct ceph_snap_context *snapc; - wait_on_page_fscache(page); - inode = page->mapping->host; ci = ceph_inode(inode); @@ -153,28 +151,36 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, } WARN_ON(!PageLocked(page)); - if (!PagePrivate(page)) - return; + if (PagePrivate(page)) { + dout("%p invalidatepage %p idx %lu full dirty page\n", + inode, page, page->index); - dout("%p invalidatepage %p idx %lu full dirty page\n", - inode, page, page->index); + snapc = detach_page_private(page); + ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + ceph_put_snap_context(snapc); + } - snapc = detach_page_private(page); - ceph_put_wrbuffer_cap_refs(ci, 1, snapc); - ceph_put_snap_context(snapc); + wait_on_page_fscache(page); } static int ceph_releasepage(struct page *page, gfp_t gfp) { - dout("%p releasepage %p idx %lu (%sdirty)\n", page->mapping->host, - page, page->index, PageDirty(page) ? "" : "not "); + struct inode *inode = page->mapping->host; + + dout("%llx:%llx releasepage %p idx %lu (%sdirty)\n", + ceph_vinop(inode), page, + page->index, PageDirty(page) ? 
"" : "not "); + + if (PagePrivate(page)) + return 0; if (PageFsCache(page)) { - if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) + if (current_is_kswapd() || !(gfp & __GFP_FS)) return 0; wait_on_page_fscache(page); } - return !PagePrivate(page); + ceph_fscache_note_page_release(inode); + return 1; } static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq) @@ -291,10 +297,6 @@ out: dout("%s: result %d\n", __func__, err); } -static void ceph_init_rreq(struct netfs_read_request *rreq, struct file *file) -{ -} - static void ceph_readahead_cleanup(struct address_space *mapping, void *priv) { struct inode *inode = mapping->host; @@ -306,7 +308,6 @@ static void ceph_readahead_cleanup(struct address_space *mapping, void *priv) } static const struct netfs_read_request_ops ceph_netfs_read_ops = { - .init_rreq = ceph_init_rreq, .is_cache_enabled = ceph_is_cache_enabled, .begin_cache_operation = ceph_begin_cache_operation, .issue_op = ceph_netfs_issue_op, @@ -378,6 +379,38 @@ static void ceph_readahead(struct readahead_control *ractl) netfs_readahead(ractl, &ceph_netfs_read_ops, (void *)(uintptr_t)got); } +#ifdef CONFIG_CEPH_FSCACHE +static void ceph_set_page_fscache(struct page *page) +{ + set_page_fscache(page); +} + +static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async) +{ + struct inode *inode = priv; + + if (IS_ERR_VALUE(error) && error != -ENOBUFS) + ceph_fscache_invalidate(inode, false); +} + +static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct fscache_cookie *cookie = ceph_fscache_cookie(ci); + + fscache_write_to_cache(cookie, inode->i_mapping, off, len, i_size_read(inode), + ceph_fscache_write_terminated, inode, caching); +} +#else +static inline void ceph_set_page_fscache(struct page *page) +{ +} + +static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching) +{ +} +#endif /* CONFIG_CEPH_FSCACHE */ + struct ceph_writeback_ctl { loff_t i_size; @@ -493,6 +526,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) struct ceph_writeback_ctl ceph_wbc; struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_osd_request *req; + bool caching = ceph_is_cache_enabled(inode); dout("writepage %p idx %lu\n", page, page->index); @@ -531,16 +565,17 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); - set_page_writeback(page); req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, ceph_wbc.truncate_seq, ceph_wbc.truncate_size, true); - if (IS_ERR(req)) { - redirty_page_for_writepage(wbc, page); - end_page_writeback(page); + if (IS_ERR(req)) return PTR_ERR(req); - } + + set_page_writeback(page); + if (caching) + ceph_set_page_fscache(page); + ceph_fscache_write_to_cache(inode, page_off, len, caching); /* it may be a short write due to an object boundary */ WARN_ON_ONCE(len > thp_size(page)); @@ -599,6 +634,9 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) struct inode *inode = page->mapping->host; BUG_ON(!inode); ihold(inode); + + wait_on_page_fscache(page); + err = writepage_nounlock(page, wbc); if (err == -ERESTARTSYS) { /* direct memory reclaimer was killed by SIGKILL. 
return 0 @@ -720,6 +758,7 @@ static int ceph_writepages_start(struct address_space *mapping, struct ceph_writeback_ctl ceph_wbc; bool should_loop, range_whole = false; bool done = false; + bool caching = ceph_is_cache_enabled(inode); dout("writepages_start %p (mode=%s)\n", inode, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : @@ -843,7 +882,7 @@ get_more_pages: unlock_page(page); break; } - if (PageWriteback(page)) { + if (PageWriteback(page) || PageFsCache(page)) { if (wbc->sync_mode == WB_SYNC_NONE) { dout("%p under writeback\n", page); unlock_page(page); @@ -851,6 +890,7 @@ get_more_pages: } dout("waiting on writeback %p\n", page); wait_on_page_writeback(page); + wait_on_page_fscache(page); } if (!clear_page_dirty_for_io(page)) { @@ -983,9 +1023,19 @@ new_request: op_idx = 0; for (i = 0; i < locked_pages; i++) { u64 cur_offset = page_offset(pages[i]); + /* + * Discontinuity in page range? Ceph can handle that by just passing + * multiple extents in the write op. + */ if (offset + len != cur_offset) { + /* If it's full, stop here */ if (op_idx + 1 == req->r_num_ops) break; + + /* Kick off an fscache write with what we have so far. */ + ceph_fscache_write_to_cache(inode, offset, len, caching); + + /* Start a new extent */ osd_req_op_extent_dup_last(req, op_idx, cur_offset - offset); dout("writepages got pages at %llu~%llu\n", @@ -996,14 +1046,17 @@ new_request: osd_req_op_extent_update(req, op_idx, len); len = 0; - offset = cur_offset; + offset = cur_offset; data_pages = pages + i; op_idx++; } set_page_writeback(pages[i]); + if (caching) + ceph_set_page_fscache(pages[i]); len += thp_size(page); } + ceph_fscache_write_to_cache(inode, offset, len, caching); if (ceph_wbc.size_stable) { len = min(len, ceph_wbc.i_size - offset); diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 457afda5498a..7d22850623ef 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -12,199 +12,99 @@ #include "super.h" #include "cache.h" -struct fscache_netfs ceph_cache_netfs = { - .name = "ceph", - .version = 0, -}; - -static DEFINE_MUTEX(ceph_fscache_lock); -static LIST_HEAD(ceph_fscache_list); - -struct ceph_fscache_entry { - struct list_head list; - struct fscache_cookie *fscache; - size_t uniq_len; - /* The following members must be last */ - struct ceph_fsid fsid; - char uniquifier[]; -}; - -static const struct fscache_cookie_def ceph_fscache_fsid_object_def = { - .name = "CEPH.fsid", - .type = FSCACHE_COOKIE_TYPE_INDEX, -}; - -int __init ceph_fscache_register(void) -{ - return fscache_register_netfs(&ceph_cache_netfs); -} - -void ceph_fscache_unregister(void) -{ - fscache_unregister_netfs(&ceph_cache_netfs); -} - -int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc) +void ceph_fscache_register_inode_cookie(struct inode *inode) { - const struct ceph_fsid *fsid = &fsc->client->fsid; - const char *fscache_uniq = fsc->mount_options->fscache_uniq; - size_t uniq_len = fscache_uniq ? 
strlen(fscache_uniq) : 0; - struct ceph_fscache_entry *ent; - int err = 0; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - mutex_lock(&ceph_fscache_lock); - list_for_each_entry(ent, &ceph_fscache_list, list) { - if (memcmp(&ent->fsid, fsid, sizeof(*fsid))) - continue; - if (ent->uniq_len != uniq_len) - continue; - if (uniq_len && memcmp(ent->uniquifier, fscache_uniq, uniq_len)) - continue; - - errorfc(fc, "fscache cookie already registered for fsid %pU, use fsc=<uniquifier> option", - fsid); - err = -EBUSY; - goto out_unlock; - } + /* No caching for filesystem? */ + if (!fsc->fscache) + return; - ent = kzalloc(sizeof(*ent) + uniq_len, GFP_KERNEL); - if (!ent) { - err = -ENOMEM; - goto out_unlock; - } + /* Regular files only */ + if (!S_ISREG(inode->i_mode)) + return; - memcpy(&ent->fsid, fsid, sizeof(*fsid)); - if (uniq_len > 0) { - memcpy(&ent->uniquifier, fscache_uniq, uniq_len); - ent->uniq_len = uniq_len; - } + /* Only new inodes! */ + if (!(inode->i_state & I_NEW)) + return; - fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index, - &ceph_fscache_fsid_object_def, - &ent->fsid, sizeof(ent->fsid) + uniq_len, - NULL, 0, - fsc, 0, true); + WARN_ON_ONCE(ci->fscache); - if (fsc->fscache) { - ent->fscache = fsc->fscache; - list_add_tail(&ent->list, &ceph_fscache_list); - } else { - kfree(ent); - errorfc(fc, "unable to register fscache cookie for fsid %pU", - fsid); - /* all other fs ignore this error */ - } -out_unlock: - mutex_unlock(&ceph_fscache_lock); - return err; + ci->fscache = fscache_acquire_cookie(fsc->fscache, 0, + &ci->i_vino, sizeof(ci->i_vino), + &ci->i_version, sizeof(ci->i_version), + i_size_read(inode)); } -static enum fscache_checkaux ceph_fscache_inode_check_aux( - void *cookie_netfs_data, const void *data, uint16_t dlen, - loff_t object_size) +void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) { - struct ceph_inode_info* ci = cookie_netfs_data; - struct inode* inode = &ci->vfs_inode; + struct fscache_cookie *cookie = ci->fscache; - if (dlen != sizeof(ci->i_version) || - i_size_read(inode) != object_size) - return FSCACHE_CHECKAUX_OBSOLETE; + fscache_relinquish_cookie(cookie, false); +} - if (*(u64 *)data != ci->i_version) - return FSCACHE_CHECKAUX_OBSOLETE; +void ceph_fscache_use_cookie(struct inode *inode, bool will_modify) +{ + struct ceph_inode_info *ci = ceph_inode(inode); - dout("ceph inode 0x%p cached okay\n", ci); - return FSCACHE_CHECKAUX_OKAY; + fscache_use_cookie(ci->fscache, will_modify); } -static const struct fscache_cookie_def ceph_fscache_inode_object_def = { - .name = "CEPH.inode", - .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .check_aux = ceph_fscache_inode_check_aux, -}; - -void ceph_fscache_register_inode_cookie(struct inode *inode) +void ceph_fscache_unuse_cookie(struct inode *inode, bool update) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - - /* No caching for filesystem */ - if (!fsc->fscache) - return; - /* Only cache for regular files that are read only */ - if (!S_ISREG(inode->i_mode)) - return; + if (update) { + loff_t i_size = i_size_read(inode); - inode_lock_nested(inode, I_MUTEX_CHILD); - if (!ci->fscache) { - ci->fscache = fscache_acquire_cookie(fsc->fscache, - &ceph_fscache_inode_object_def, - &ci->i_vino, sizeof(ci->i_vino), - &ci->i_version, sizeof(ci->i_version), - ci, i_size_read(inode), false); + fscache_unuse_cookie(ci->fscache, &ci->i_version, &i_size); + } else { + 
fscache_unuse_cookie(ci->fscache, NULL, NULL); } - inode_unlock(inode); } -void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +void ceph_fscache_update(struct inode *inode) { - struct fscache_cookie* cookie; - - if ((cookie = ci->fscache) == NULL) - return; - - ci->fscache = NULL; + struct ceph_inode_info *ci = ceph_inode(inode); + loff_t i_size = i_size_read(inode); - fscache_relinquish_cookie(cookie, &ci->i_vino, false); + fscache_update_cookie(ci->fscache, &ci->i_version, &i_size); } -static bool ceph_fscache_can_enable(void *data) +void ceph_fscache_invalidate(struct inode *inode, bool dio_write) { - struct inode *inode = data; - return !inode_is_open_for_write(inode); + struct ceph_inode_info *ci = ceph_inode(inode); + + fscache_invalidate(ceph_inode(inode)->fscache, + &ci->i_version, i_size_read(inode), + dio_write ? FSCACHE_INVAL_DIO_WRITE : 0); } -void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) +int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc) { - struct ceph_inode_info *ci = ceph_inode(inode); + const struct ceph_fsid *fsid = &fsc->client->fsid; + const char *fscache_uniq = fsc->mount_options->fscache_uniq; + size_t uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0; + char *name; + int err = 0; - if (!fscache_cookie_valid(ci->fscache)) - return; + name = kasprintf(GFP_KERNEL, "ceph,%pU%s%s", fsid, uniq_len ? "," : "", + uniq_len ? fscache_uniq : ""); + if (!name) + return -ENOMEM; - if (inode_is_open_for_write(inode)) { - dout("fscache_file_set_cookie %p %p disabling cache\n", - inode, filp); - fscache_disable_cookie(ci->fscache, &ci->i_vino, false); - } else { - fscache_enable_cookie(ci->fscache, &ci->i_vino, i_size_read(inode), - ceph_fscache_can_enable, inode); - if (fscache_cookie_enabled(ci->fscache)) { - dout("fscache_file_set_cookie %p %p enabling cache\n", - inode, filp); - } + fsc->fscache = fscache_acquire_volume(name, NULL, NULL, 0); + if (IS_ERR_OR_NULL(fsc->fscache)) { + errorfc(fc, "Unable to register fscache cookie for %s", name); + err = fsc->fscache ? 
PTR_ERR(fsc->fscache) : -EOPNOTSUPP; + fsc->fscache = NULL; } + kfree(name); + return err; } void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) { - if (fscache_cookie_valid(fsc->fscache)) { - struct ceph_fscache_entry *ent; - bool found = false; - - mutex_lock(&ceph_fscache_lock); - list_for_each_entry(ent, &ceph_fscache_list, list) { - if (ent->fscache == fsc->fscache) { - list_del(&ent->list); - kfree(ent); - found = true; - break; - } - } - WARN_ON_ONCE(!found); - mutex_unlock(&ceph_fscache_lock); - - __fscache_relinquish_cookie(fsc->fscache, NULL, false); - } - fsc->fscache = NULL; + fscache_relinquish_volume(fsc->fscache, NULL, false); } diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index 058ea2a04376..09164389fa66 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -12,19 +12,19 @@ #include <linux/netfs.h> #ifdef CONFIG_CEPH_FSCACHE - -extern struct fscache_netfs ceph_cache_netfs; - -int ceph_fscache_register(void); -void ceph_fscache_unregister(void); +#include <linux/fscache.h> int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc); void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc); void ceph_fscache_register_inode_cookie(struct inode *inode); void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci); -void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp); -void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci); + +void ceph_fscache_use_cookie(struct inode *inode, bool will_modify); +void ceph_fscache_unuse_cookie(struct inode *inode, bool update); + +void ceph_fscache_update(struct inode *inode); +void ceph_fscache_invalidate(struct inode *inode, bool dio_write); static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) { @@ -36,37 +36,51 @@ static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info return ci->fscache; } -static inline void ceph_fscache_invalidate(struct inode *inode) +static inline void ceph_fscache_resize(struct inode *inode, loff_t to) { - fscache_invalidate(ceph_inode(inode)->fscache); + struct ceph_inode_info *ci = ceph_inode(inode); + struct fscache_cookie *cookie = ceph_fscache_cookie(ci); + + if (cookie) { + ceph_fscache_use_cookie(inode, true); + fscache_resize_cookie(cookie, to); + ceph_fscache_unuse_cookie(inode, true); + } } -static inline bool ceph_is_cache_enabled(struct inode *inode) +static inline void ceph_fscache_unpin_writeback(struct inode *inode, + struct writeback_control *wbc) { - struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(inode)); + fscache_unpin_writeback(wbc, ceph_fscache_cookie(ceph_inode(inode))); +} + +static inline int ceph_fscache_set_page_dirty(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); - if (!cookie) - return false; - return fscache_cookie_enabled(cookie); + return fscache_set_page_dirty(page, ceph_fscache_cookie(ci)); } static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq) { struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode)); - return fscache_begin_read_operation(rreq, cookie); + return fscache_begin_read_operation(&rreq->cache_resources, cookie); } -#else -static inline int ceph_fscache_register(void) +static inline bool ceph_is_cache_enabled(struct inode *inode) { - return 0; + return fscache_cookie_enabled(ceph_fscache_cookie(ceph_inode(inode))); } -static inline void ceph_fscache_unregister(void) +static inline void ceph_fscache_note_page_release(struct inode 
*inode) { -} + struct ceph_inode_info *ci = ceph_inode(inode); + fscache_note_page_release(ceph_fscache_cookie(ci)); +} +#else /* CONFIG_CEPH_FSCACHE */ static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc) { @@ -81,28 +95,49 @@ static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) { } -static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci) +static inline void ceph_fscache_register_inode_cookie(struct inode *inode) { - return NULL; } -static inline void ceph_fscache_register_inode_cookie(struct inode *inode) +static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) { } -static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +static inline void ceph_fscache_use_cookie(struct inode *inode, bool will_modify) { } -static inline void ceph_fscache_file_set_cookie(struct inode *inode, - struct file *filp) +static inline void ceph_fscache_unuse_cookie(struct inode *inode, bool update) { } -static inline void ceph_fscache_invalidate(struct inode *inode) +static inline void ceph_fscache_update(struct inode *inode) { } +static inline void ceph_fscache_invalidate(struct inode *inode, bool dio_write) +{ +} + +static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci) +{ + return NULL; +} + +static inline void ceph_fscache_resize(struct inode *inode, loff_t to) +{ +} + +static inline void ceph_fscache_unpin_writeback(struct inode *inode, + struct writeback_control *wbc) +{ +} + +static inline int ceph_fscache_set_page_dirty(struct page *page) +{ + return __set_page_dirty_nobuffers(page); +} + static inline bool ceph_is_cache_enabled(struct inode *inode) { return false; @@ -112,6 +147,10 @@ static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq) { return -ENOBUFS; } -#endif -#endif /* _CEPH_CACHE_H */ +static inline void ceph_fscache_note_page_release(struct inode *inode) +{ +} +#endif /* CONFIG_CEPH_FSCACHE */ + +#endif diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index c447fa2e2d1f..b472cd066d1c 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1856,7 +1856,7 @@ static int try_nonblocking_invalidate(struct inode *inode) u32 invalidating_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); - ceph_fscache_invalidate(inode); + ceph_fscache_invalidate(inode, false); invalidate_mapping_pages(&inode->i_data, 0, -1); spin_lock(&ci->i_ceph_lock); @@ -2218,6 +2218,7 @@ static int unsafe_request_wait(struct inode *inode) struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req1 = NULL, *req2 = NULL; + unsigned int max_sessions; int ret, err = 0; spin_lock(&ci->i_unsafe_lock); @@ -2236,36 +2237,44 @@ static int unsafe_request_wait(struct inode *inode) spin_unlock(&ci->i_unsafe_lock); /* + * The mdsc->max_sessions is unlikely to be changed + * mostly, here we will retry it by reallocating the + * sessions array memory to get rid of the mdsc->mutex + * lock. + */ +retry: + max_sessions = mdsc->max_sessions; + + /* * Trigger to flush the journal logs in all the relevant MDSes * manually, or in the worst case we must wait at most 5 seconds * to wait the journal logs to be flushed by the MDSes periodically. 
*/ - if (req1 || req2) { + if ((req1 || req2) && likely(max_sessions)) { struct ceph_mds_session **sessions = NULL; struct ceph_mds_session *s; struct ceph_mds_request *req; - unsigned int max; int i; - /* - * The mdsc->max_sessions is unlikely to be changed - * mostly, here we will retry it by reallocating the - * sessions arrary memory to get rid of the mdsc->mutex - * lock. - */ -retry: - max = mdsc->max_sessions; - sessions = krealloc(sessions, max * sizeof(s), __GFP_ZERO); - if (!sessions) - return -ENOMEM; + sessions = kzalloc(max_sessions * sizeof(s), GFP_KERNEL); + if (!sessions) { + err = -ENOMEM; + goto out; + } spin_lock(&ci->i_unsafe_lock); if (req1) { list_for_each_entry(req, &ci->i_unsafe_dirops, r_unsafe_dir_item) { s = req->r_session; - if (unlikely(s->s_mds >= max)) { + if (unlikely(s->s_mds >= max_sessions)) { spin_unlock(&ci->i_unsafe_lock); + for (i = 0; i < max_sessions; i++) { + s = sessions[i]; + if (s) + ceph_put_mds_session(s); + } + kfree(sessions); goto retry; } if (!sessions[s->s_mds]) { @@ -2278,8 +2287,14 @@ retry: list_for_each_entry(req, &ci->i_unsafe_iops, r_unsafe_target_item) { s = req->r_session; - if (unlikely(s->s_mds >= max)) { + if (unlikely(s->s_mds >= max_sessions)) { spin_unlock(&ci->i_unsafe_lock); + for (i = 0; i < max_sessions; i++) { + s = sessions[i]; + if (s) + ceph_put_mds_session(s); + } + kfree(sessions); goto retry; } if (!sessions[s->s_mds]) { @@ -2300,7 +2315,7 @@ retry: spin_unlock(&ci->i_ceph_lock); /* send flush mdlog request to MDSes */ - for (i = 0; i < max; i++) { + for (i = 0; i < max_sessions; i++) { s = sessions[i]; if (s) { send_flush_mdlog(s); @@ -2317,15 +2332,19 @@ retry: ceph_timeout_jiffies(req1->r_timeout)); if (ret) err = -EIO; - ceph_mdsc_put_request(req1); } if (req2) { ret = !wait_for_completion_timeout(&req2->r_safe_completion, ceph_timeout_jiffies(req2->r_timeout)); if (ret) err = -EIO; - ceph_mdsc_put_request(req2); } + +out: + if (req1) + ceph_mdsc_put_request(req1); + if (req2) + ceph_mdsc_put_request(req2); return err; } @@ -2388,6 +2407,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync); dout("write_inode %p wait=%d\n", inode, wait); + ceph_fscache_unpin_writeback(inode, wbc); if (wait) { dirty = try_flush_caps(inode, &flush_tid); if (dirty) @@ -3375,8 +3395,7 @@ static void handle_cap_grant(struct inode *inode, if ((newcaps & CEPH_CAP_LINK_SHARED) && (extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) { set_nlink(inode, le32_to_cpu(grant->nlink)); - if (inode->i_nlink == 0 && - (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) + if (inode->i_nlink == 0) deleted_inode = true; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index c138e8126286..bbed3224ad68 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -204,6 +204,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, int fmode, bool isdir) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mount_options *opt = + ceph_inode_to_client(&ci->vfs_inode)->mount_options; struct ceph_file_info *fi; dout("%s %p %p 0%o (%s)\n", __func__, inode, file, @@ -225,6 +227,9 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, if (!fi) return -ENOMEM; + if (opt->flags & CEPH_MOUNT_OPT_NOPAGECACHE) + fi->flags |= CEPH_F_SYNC; + file->private_data = fi; } @@ -248,8 +253,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) switch (inode->i_mode & S_IFMT) { case S_IFREG: - 
ceph_fscache_register_inode_cookie(inode); - ceph_fscache_file_set_cookie(inode, file); + ceph_fscache_use_cookie(inode, file->f_mode & FMODE_WRITE); fallthrough; case S_IFDIR: ret = ceph_init_file_info(inode, file, fmode, @@ -579,6 +583,7 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, struct ceph_inode_info *ci = ceph_inode(dir); struct inode *inode; struct timespec64 now; + struct ceph_string *pool_ns; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_vino vino = { .ino = req->r_deleg_ino, .snap = CEPH_NOSNAP }; @@ -628,6 +633,12 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, in.max_size = cpu_to_le64(lo->stripe_unit); ceph_file_layout_to_legacy(lo, &in.layout); + /* lo is private, so pool_ns can't change */ + pool_ns = rcu_dereference_raw(lo->pool_ns); + if (pool_ns) { + iinfo.pool_ns_len = pool_ns->len; + iinfo.pool_ns_data = pool_ns->str; + } down_read(&mdsc->snap_rwsem); ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session, @@ -746,8 +757,10 @@ retry: restore_deleg_ino(dir, req->r_deleg_ino); ceph_mdsc_put_request(req); try_async = false; + ceph_put_string(rcu_dereference_raw(lo.pool_ns)); goto retry; } + ceph_put_string(rcu_dereference_raw(lo.pool_ns)); goto out_req; } } @@ -822,6 +835,7 @@ int ceph_release(struct inode *inode, struct file *file) dout("release inode %p regular file %p\n", inode, file); WARN_ON(!list_empty(&fi->rw_contexts)); + ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE); ceph_put_fmode(ci, fi->fmode, 1); kmem_cache_free(ceph_file_cachep, fi); @@ -1218,7 +1232,11 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, snapc, snapc ? snapc->seq : 0); if (write) { - int ret2 = invalidate_inode_pages2_range(inode->i_mapping, + int ret2; + + ceph_fscache_invalidate(inode, true); + + ret2 = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_SHIFT, (pos + count - 1) >> PAGE_SHIFT); if (ret2 < 0) @@ -1429,6 +1447,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, if (ret < 0) return ret; + ceph_fscache_invalidate(inode, false); ret = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_SHIFT, (pos + count - 1) >> PAGE_SHIFT); @@ -1536,7 +1555,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) struct ceph_inode_info *ci = ceph_inode(inode); bool direct_lock = iocb->ki_flags & IOCB_DIRECT; ssize_t ret; - int want, got = 0; + int want = 0, got = 0; int retry_op = 0, read = 0; again: @@ -1551,13 +1570,14 @@ again: else ceph_start_io_read(inode); + if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) + want |= CEPH_CAP_FILE_CACHE; if (fi->fmode & CEPH_FILE_MODE_LAZY) - want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; - else - want = CEPH_CAP_FILE_CACHE; + want |= CEPH_CAP_FILE_LAZYIO; + ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got); if (ret < 0) { - if (iocb->ki_flags & IOCB_DIRECT) + if (direct_lock) ceph_end_io_direct(inode); else ceph_end_io_read(inode); @@ -1691,7 +1711,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_cap_flush *prealloc_cf; ssize_t count, written = 0; - int err, want, got; + int err, want = 0, got; bool direct_lock = false; u32 map_flags; u64 pool_flags; @@ -1766,10 +1786,10 @@ retry_snap: dout("aio_write %p %llx.%llx %llu~%zd getting caps. 
i_size %llu\n", inode, ceph_vinop(inode), pos, count, i_size_read(inode)); + if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) + want |= CEPH_CAP_FILE_BUFFER; if (fi->fmode & CEPH_FILE_MODE_LAZY) - want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; - else - want = CEPH_CAP_FILE_BUFFER; + want |= CEPH_CAP_FILE_LAZYIO; got = 0; err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got); if (err < 0) @@ -2113,6 +2133,7 @@ static long ceph_fallocate(struct file *file, int mode, goto unlock; filemap_invalidate_lock(inode->i_mapping); + ceph_fscache_invalidate(inode, false); ceph_zero_pagecache_range(inode, offset, length); ret = ceph_zero_objects(inode, offset, length); @@ -2437,6 +2458,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, goto out_caps; /* Drop dst file cached pages */ + ceph_fscache_invalidate(dst_inode, false); ret = invalidate_inode_pages2_range(dst_inode->i_mapping, dst_off >> PAGE_SHIFT, (dst_off + len) >> PAGE_SHIFT); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index e3322fcb2e8d..ef4a980a7bf3 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -564,6 +564,8 @@ void ceph_evict_inode(struct inode *inode) percpu_counter_dec(&mdsc->metric.total_inodes); truncate_inode_pages_final(&inode->i_data); + if (inode->i_state & I_PINNING_FSCACHE_WB) + ceph_fscache_unuse_cookie(inode, true); clear_inode(inode); ceph_fscache_unregister_inode_cookie(ci); @@ -634,6 +636,12 @@ int ceph_fill_file_size(struct inode *inode, int issued, } i_size_write(inode, size); inode->i_blocks = calc_inode_blocks(size); + /* + * If we're expanding, then we should be able to just update + * the existing cookie. + */ + if (size > isize) + ceph_fscache_update(inode); ci->i_reported_size = size; if (truncate_seq != ci->i_truncate_seq) { dout("truncate_seq %u -> %u\n", @@ -666,10 +674,6 @@ int ceph_fill_file_size(struct inode *inode, int issued, truncate_size); ci->i_truncate_size = truncate_size; } - - if (queue_trunc) - ceph_fscache_invalidate(inode); - return queue_trunc; } @@ -1053,6 +1057,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, spin_unlock(&ci->i_ceph_lock); + ceph_fscache_register_inode_cookie(inode); + if (fill_inline) ceph_fill_inline_data(inode, locked_page, iinfo->inline_data, iinfo->inline_len); @@ -1814,11 +1820,13 @@ bool ceph_inode_set_size(struct inode *inode, loff_t size) spin_lock(&ci->i_ceph_lock); dout("set_size %p %llu -> %llu\n", inode, i_size_read(inode), size); i_size_write(inode, size); + ceph_fscache_update(inode); inode->i_blocks = calc_inode_blocks(size); ret = __ceph_should_report_size(ci); spin_unlock(&ci->i_ceph_lock); + return ret; } @@ -1844,6 +1852,8 @@ static void ceph_do_invalidate_pages(struct inode *inode) u32 orig_gen; int check = 0; + ceph_fscache_invalidate(inode, false); + mutex_lock(&ci->i_truncate_mutex); if (ceph_inode_is_shutdown(inode)) { @@ -1868,7 +1878,7 @@ static void ceph_do_invalidate_pages(struct inode *inode) orig_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); - ceph_fscache_invalidate(inode); + ceph_fscache_invalidate(inode, false); if (invalidate_inode_pages2(inode->i_mapping) < 0) { pr_err("invalidate_inode_pages2 %llx.%llx failed\n", ceph_vinop(inode)); @@ -1937,6 +1947,7 @@ retry: ci->i_truncate_pending, to); spin_unlock(&ci->i_ceph_lock); + ceph_fscache_resize(inode, to); truncate_pagecache(inode, to); spin_lock(&ci->i_ceph_lock); @@ -2184,7 +2195,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) if (inode_dirty_flags) __mark_inode_dirty(inode, 
inode_dirty_flags); - if (mask) { req->r_inode = inode; ihold(inode); diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index c57699d8408d..0fcba68f9a99 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -160,8 +160,6 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, msg->hdr.version = cpu_to_le16(1); msg->hdr.compat_version = cpu_to_le16(1); msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - dout("client%llu send metrics to mds%d\n", - ceph_client_gid(mdsc->fsc->client), s->s_mds); ceph_con_send(&s->s_con, msg); return true; diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 620c691af40e..a338a3ec0dc4 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -30,6 +30,9 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode) /* if root is the real CephFS root, we don't have quota realms */ if (root && ceph_ino(root) == CEPH_INO_ROOT) return false; + /* MDS stray dirs have no quota realms */ + if (ceph_vino_is_reserved(ceph_inode(inode)->i_vino)) + return false; /* otherwise, we can't know for sure */ return true; } @@ -494,10 +497,24 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) if (ci->i_max_bytes) { total = ci->i_max_bytes >> CEPH_BLOCK_SHIFT; used = ci->i_rbytes >> CEPH_BLOCK_SHIFT; + /* For quota size less than 4MB, use 4KB block size */ + if (!total) { + total = ci->i_max_bytes >> CEPH_4K_BLOCK_SHIFT; + used = ci->i_rbytes >> CEPH_4K_BLOCK_SHIFT; + buf->f_frsize = 1 << CEPH_4K_BLOCK_SHIFT; + } /* It is possible for a quota to be exceeded. * Report 'zero' in that case */ free = total > used ? total - used : 0; + /* For quota size less than 4KB, report the + * total=used=4KB,free=0 when quota is full + * and total=free=4KB, used=0 otherwise */ + if (!total) { + total = 1; + free = ci->i_max_bytes > ci->i_rbytes ? 1 : 0; + buf->f_frsize = 1 << CEPH_4K_BLOCK_SHIFT; + } } spin_unlock(&ci->i_ceph_lock); if (total) { diff --git a/fs/ceph/super.c b/fs/ceph/super.c index bab61232dc5a..bf79f369aec6 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -27,6 +27,8 @@ #include <linux/ceph/auth.h> #include <linux/ceph/debugfs.h> +#include <uapi/linux/magic.h> + static DEFINE_SPINLOCK(ceph_fsc_lock); static LIST_HEAD(ceph_fsc_list); @@ -146,6 +148,7 @@ enum { Opt_mds_namespace, Opt_recover_session, Opt_source, + Opt_mon_addr, /* string args above */ Opt_dirstat, Opt_rbytes, @@ -159,6 +162,7 @@ enum { Opt_quotadf, Opt_copyfrom, Opt_wsync, + Opt_pagecache, }; enum ceph_recover_session_mode { @@ -197,8 +201,10 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = { fsparam_u32 ("rsize", Opt_rsize), fsparam_string ("snapdirname", Opt_snapdirname), fsparam_string ("source", Opt_source), + fsparam_string ("mon_addr", Opt_mon_addr), fsparam_u32 ("wsize", Opt_wsize), fsparam_flag_no ("wsync", Opt_wsync), + fsparam_flag_no ("pagecache", Opt_pagecache), {} }; @@ -228,9 +234,92 @@ static void canonicalize_path(char *path) } /* - * Parse the source parameter. Distinguish the server list from the path. + * Check if the mds namespace in ceph_mount_options matches + * the passed in namespace string. First time match (when + * ->mds_namespace is NULL) is treated specially, since + * ->mds_namespace needs to be initialized by the caller. 
+ */ +static int namespace_equals(struct ceph_mount_options *fsopt, + const char *namespace, size_t len) +{ + return !(fsopt->mds_namespace && + (strlen(fsopt->mds_namespace) != len || + strncmp(fsopt->mds_namespace, namespace, len))); +} + +static int ceph_parse_old_source(const char *dev_name, const char *dev_name_end, + struct fs_context *fc) +{ + int r; + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + + if (*dev_name_end != ':') + return invalfc(fc, "separator ':' missing in source"); + + r = ceph_parse_mon_ips(dev_name, dev_name_end - dev_name, + pctx->copts, fc->log.log, ','); + if (r) + return r; + + fsopt->new_dev_syntax = false; + return 0; +} + +static int ceph_parse_new_source(const char *dev_name, const char *dev_name_end, + struct fs_context *fc) +{ + size_t len; + struct ceph_fsid fsid; + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + char *fsid_start, *fs_name_start; + + if (*dev_name_end != '=') { + dout("separator '=' missing in source"); + return -EINVAL; + } + + fsid_start = strchr(dev_name, '@'); + if (!fsid_start) + return invalfc(fc, "missing cluster fsid"); + ++fsid_start; /* start of cluster fsid */ + + fs_name_start = strchr(fsid_start, '.'); + if (!fs_name_start) + return invalfc(fc, "missing file system name"); + + if (ceph_parse_fsid(fsid_start, &fsid)) + return invalfc(fc, "Invalid FSID"); + + ++fs_name_start; /* start of file system name */ + len = dev_name_end - fs_name_start; + + if (!namespace_equals(fsopt, fs_name_start, len)) + return invalfc(fc, "Mismatching mds_namespace"); + kfree(fsopt->mds_namespace); + fsopt->mds_namespace = kstrndup(fs_name_start, len, GFP_KERNEL); + if (!fsopt->mds_namespace) + return -ENOMEM; + dout("file system (mds namespace) '%s'\n", fsopt->mds_namespace); + + fsopt->new_dev_syntax = true; + return 0; +} + +/* + * Parse the source parameter for new device format. Distinguish the device + * spec from the path. Try parsing new device format and fallback to old + * format if needed. 
* - * The source will look like: + * New device syntax will looks like: + * <device_spec>=/<path> + * where + * <device_spec> is name@fsid.fsname + * <path> is optional, but if present must begin with '/' + * (monitor addresses are passed via mount option) + * + * Old device syntax is: * <server_spec>[,<server_spec>...]:[<path>] * where * <server_spec> is <ip>[:<port>] @@ -263,24 +352,44 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc) dev_name_end = dev_name + strlen(dev_name); } - dev_name_end--; /* back up to ':' separator */ - if (dev_name_end < dev_name || *dev_name_end != ':') - return invalfc(fc, "No path or : separator in source"); + dev_name_end--; /* back up to separator */ + if (dev_name_end < dev_name) + return invalfc(fc, "Path missing in source"); dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); if (fsopt->server_path) dout("server path '%s'\n", fsopt->server_path); - ret = ceph_parse_mon_ips(param->string, dev_name_end - dev_name, - pctx->copts, fc->log.log); - if (ret) - return ret; + dout("trying new device syntax"); + ret = ceph_parse_new_source(dev_name, dev_name_end, fc); + if (ret) { + if (ret != -EINVAL) + return ret; + dout("trying old device syntax"); + ret = ceph_parse_old_source(dev_name, dev_name_end, fc); + if (ret) + return ret; + } fc->source = param->string; param->string = NULL; return 0; } +static int ceph_parse_mon_addr(struct fs_parameter *param, + struct fs_context *fc) +{ + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + + kfree(fsopt->mon_addr); + fsopt->mon_addr = param->string; + param->string = NULL; + + return ceph_parse_mon_ips(fsopt->mon_addr, strlen(fsopt->mon_addr), + pctx->copts, fc->log.log, '/'); +} + static int ceph_parse_mount_param(struct fs_context *fc, struct fs_parameter *param) { @@ -306,6 +415,8 @@ static int ceph_parse_mount_param(struct fs_context *fc, param->string = NULL; break; case Opt_mds_namespace: + if (!namespace_equals(fsopt, param->string, strlen(param->string))) + return invalfc(fc, "Mismatching mds_namespace"); kfree(fsopt->mds_namespace); fsopt->mds_namespace = param->string; param->string = NULL; @@ -323,6 +434,8 @@ static int ceph_parse_mount_param(struct fs_context *fc, if (fc->source) return invalfc(fc, "Multiple sources specified"); return ceph_parse_source(param, fc); + case Opt_mon_addr: + return ceph_parse_mon_addr(param, fc); case Opt_wsize: if (result.uint_32 < PAGE_SIZE || result.uint_32 > CEPH_MAX_WRITE_SIZE) @@ -455,6 +568,12 @@ static int ceph_parse_mount_param(struct fs_context *fc, else fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS; break; + case Opt_pagecache: + if (result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_NOPAGECACHE; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE; + break; default: BUG(); } @@ -474,6 +593,7 @@ static void destroy_mount_options(struct ceph_mount_options *args) kfree(args->mds_namespace); kfree(args->server_path); kfree(args->fscache_uniq); + kfree(args->mon_addr); kfree(args); } @@ -517,6 +637,10 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, if (ret) return ret; + ret = strcmp_null(fsopt1->mon_addr, fsopt2->mon_addr); + if (ret) + return ret; + return ceph_compare_options(new_opt, fsc->client); } @@ -572,15 +696,22 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if ((fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM) == 0) seq_puts(m, ",copyfrom"); - if (fsopt->mds_namespace) + /* dump mds_namespace when old device 
syntax is in use */ + if (fsopt->mds_namespace && !fsopt->new_dev_syntax) seq_show_option(m, "mds_namespace", fsopt->mds_namespace); + if (fsopt->mon_addr) + seq_printf(m, ",mon_addr=%s", fsopt->mon_addr); + if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER) seq_show_option(m, "recover_session", "clean"); if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)) seq_puts(m, ",wsync"); + if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE) + seq_puts(m, ",nopagecache"); + if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) seq_printf(m, ",wsize=%u", fsopt->wsize); if (fsopt->rsize != CEPH_MAX_READ_SIZE) @@ -787,16 +918,10 @@ static int __init init_caches(void) if (!ceph_wb_pagevec_pool) goto bad_pagevec_pool; - error = ceph_fscache_register(); - if (error) - goto bad_fscache; - return 0; -bad_fscache: - kmem_cache_destroy(ceph_mds_request_cachep); bad_pagevec_pool: - mempool_destroy(ceph_wb_pagevec_pool); + kmem_cache_destroy(ceph_mds_request_cachep); bad_mds_req: kmem_cache_destroy(ceph_dir_file_cachep); bad_dir_file: @@ -828,8 +953,6 @@ static void destroy_caches(void) kmem_cache_destroy(ceph_dir_file_cachep); kmem_cache_destroy(ceph_mds_request_cachep); mempool_destroy(ceph_wb_pagevec_pool); - - ceph_fscache_unregister(); } static void __ceph_umount_begin(struct ceph_fs_client *fsc) @@ -1060,6 +1183,7 @@ static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc) static int ceph_get_tree(struct fs_context *fc) { struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; struct super_block *sb; struct ceph_fs_client *fsc; struct dentry *res; @@ -1071,6 +1195,8 @@ static int ceph_get_tree(struct fs_context *fc) if (!fc->source) return invalfc(fc, "No source"); + if (fsopt->new_dev_syntax && !fsopt->mon_addr) + return invalfc(fc, "No monitor address"); /* create client (which we may/may not use) */ fsc = create_fs_client(pctx->opts, pctx->copts); @@ -1156,6 +1282,13 @@ static int ceph_reconfigure_fc(struct fs_context *fc) else ceph_clear_mount_opt(fsc, ASYNC_DIROPS); + if (strcmp_null(fsc->mount_options->mon_addr, fsopt->mon_addr)) { + kfree(fsc->mount_options->mon_addr); + fsc->mount_options->mon_addr = fsopt->mon_addr; + fsopt->mon_addr = NULL; + pr_notice("ceph: monitor addresses recorded, but not used for reconnection"); + } + sync_filesystem(fc->root->d_sb); return 0; } @@ -1333,6 +1466,14 @@ bool disable_send_metrics = false; module_param_cb(disable_send_metrics, ¶m_ops_metrics, &disable_send_metrics, 0644); MODULE_PARM_DESC(disable_send_metrics, "Enable sending perf metrics to ceph cluster (default: on)"); +/* for both v1 and v2 syntax */ +static bool mount_support = true; +static const struct kernel_param_ops param_ops_mount_syntax = { + .get = param_get_bool, +}; +module_param_cb(mount_syntax_v1, ¶m_ops_mount_syntax, &mount_support, 0444); +module_param_cb(mount_syntax_v2, ¶m_ops_mount_syntax, &mount_support, 0444); + module_init(init_ceph); module_exit(exit_ceph); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index ac331aa07cfa..67f145e1ae7a 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -21,17 +21,14 @@ #include <linux/ceph/libceph.h> #ifdef CONFIG_CEPH_FSCACHE -#define FSCACHE_USE_NEW_IO_API #include <linux/fscache.h> #endif -/* f_type in struct statfs */ -#define CEPH_SUPER_MAGIC 0x00c36400 - /* large granularity for statfs utilization stats to facilitate * large volume sizes on 32-bit machines. 
*/ #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) +#define CEPH_4K_BLOCK_SHIFT 12 /* 4 KB */ #define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blocklisted */ #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ @@ -45,6 +42,7 @@ #define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */ #define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */ #define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */ +#define CEPH_MOUNT_OPT_NOPAGECACHE (1<<16) /* bypass pagecache altogether */ #define CEPH_MOUNT_OPT_DEFAULT \ (CEPH_MOUNT_OPT_DCACHE | \ @@ -89,6 +87,8 @@ struct ceph_mount_options { unsigned int max_readdir; /* max readdir result (entries) */ unsigned int max_readdir_bytes; /* max readdir result (bytes) */ + bool new_dev_syntax; + /* * everything above this point can be memcmp'd; everything below * is handled in compare_mount_options() @@ -98,6 +98,7 @@ struct ceph_mount_options { char *mds_namespace; /* default NULL */ char *server_path; /* default NULL (means "/") */ char *fscache_uniq; /* default NULL */ + char *mon_addr; }; struct ceph_fs_client { @@ -135,7 +136,7 @@ struct ceph_fs_client { #endif #ifdef CONFIG_CEPH_FSCACHE - struct fscache_cookie *fscache; + struct fscache_volume *fscache; #endif }; @@ -535,19 +536,23 @@ static inline int ceph_ino_compare(struct inode *inode, void *data) * * These come from src/mds/mdstypes.h in the ceph sources. */ -#define CEPH_MAX_MDS 0x100 -#define CEPH_NUM_STRAY 10 +#define CEPH_MAX_MDS 0x100 +#define CEPH_NUM_STRAY 10 #define CEPH_MDS_INO_MDSDIR_OFFSET (1 * CEPH_MAX_MDS) +#define CEPH_MDS_INO_LOG_OFFSET (2 * CEPH_MAX_MDS) #define CEPH_INO_SYSTEM_BASE ((6*CEPH_MAX_MDS) + (CEPH_MAX_MDS * CEPH_NUM_STRAY)) static inline bool ceph_vino_is_reserved(const struct ceph_vino vino) { - if (vino.ino < CEPH_INO_SYSTEM_BASE && - vino.ino >= CEPH_MDS_INO_MDSDIR_OFFSET) { - WARN_RATELIMIT(1, "Attempt to access reserved inode number 0x%llx", vino.ino); - return true; - } - return false; + if (vino.ino >= CEPH_INO_SYSTEM_BASE || + vino.ino < CEPH_MDS_INO_MDSDIR_OFFSET) + return false; + + /* Don't warn on mdsdirs */ + WARN_RATELIMIT(vino.ino >= CEPH_MDS_INO_LOG_OFFSET, + "Attempt to access reserved inode number 0x%llx", + vino.ino); + return true; } static inline struct inode *ceph_find_inode(struct super_block *sb, diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 87fcacdf3de7..cc8fdcb35b71 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -25,7 +25,7 @@ cifs-$(CONFIG_CIFS_DFS_UPCALL) += cifs_dfs_ref.o dfs_cache.o cifs-$(CONFIG_CIFS_SWN_UPCALL) += netlink.o cifs_swn.o -cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o +cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cifs-$(CONFIG_CIFS_SMB_DIRECT) += smbdirect.o diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c deleted file mode 100644 index 8be57aaedab6..000000000000 --- a/fs/cifs/cache.c +++ /dev/null @@ -1,105 +0,0 @@ -// SPDX-License-Identifier: LGPL-2.1 -/* - * CIFS filesystem cache index structure definitions - * - * Copyright (c) 2010 Novell, Inc. 
- * Authors(s): Suresh Jayaraman (sjayaraman@suse.de> - * - */ -#include "fscache.h" -#include "cifs_debug.h" - -/* - * CIFS filesystem definition for FS-Cache - */ -struct fscache_netfs cifs_fscache_netfs = { - .name = "cifs", - .version = 0, -}; - -/* - * Register CIFS for caching with FS-Cache - */ -int cifs_fscache_register(void) -{ - return fscache_register_netfs(&cifs_fscache_netfs); -} - -/* - * Unregister CIFS for caching - */ -void cifs_fscache_unregister(void) -{ - fscache_unregister_netfs(&cifs_fscache_netfs); -} - -/* - * Server object for FS-Cache - */ -const struct fscache_cookie_def cifs_fscache_server_index_def = { - .name = "CIFS.server", - .type = FSCACHE_COOKIE_TYPE_INDEX, -}; - -static enum -fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data, - const void *data, - uint16_t datalen, - loff_t object_size) -{ - struct cifs_fscache_super_auxdata auxdata; - const struct cifs_tcon *tcon = cookie_netfs_data; - - if (datalen != sizeof(auxdata)) - return FSCACHE_CHECKAUX_OBSOLETE; - - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.resource_id = tcon->resource_id; - auxdata.vol_create_time = tcon->vol_create_time; - auxdata.vol_serial_number = tcon->vol_serial_number; - - if (memcmp(data, &auxdata, datalen) != 0) - return FSCACHE_CHECKAUX_OBSOLETE; - - return FSCACHE_CHECKAUX_OKAY; -} - -/* - * Superblock object for FS-Cache - */ -const struct fscache_cookie_def cifs_fscache_super_index_def = { - .name = "CIFS.super", - .type = FSCACHE_COOKIE_TYPE_INDEX, - .check_aux = cifs_fscache_super_check_aux, -}; - -static enum -fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data, - const void *data, - uint16_t datalen, - loff_t object_size) -{ - struct cifs_fscache_inode_auxdata auxdata; - struct cifsInodeInfo *cifsi = cookie_netfs_data; - - if (datalen != sizeof(auxdata)) - return FSCACHE_CHECKAUX_OBSOLETE; - - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.eof = cifsi->server_eof; - auxdata.last_write_time_sec = cifsi->vfs_inode.i_mtime.tv_sec; - auxdata.last_change_time_sec = cifsi->vfs_inode.i_ctime.tv_sec; - auxdata.last_write_time_nsec = cifsi->vfs_inode.i_mtime.tv_nsec; - auxdata.last_change_time_nsec = cifsi->vfs_inode.i_ctime.tv_nsec; - - if (memcmp(data, &auxdata, datalen) != 0) - return FSCACHE_CHECKAUX_OBSOLETE; - - return FSCACHE_CHECKAUX_OKAY; -} - -const struct fscache_cookie_def cifs_fscache_inode_object_def = { - .name = "CIFS.uniqueid", - .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .check_aux = cifs_fscache_inode_check_aux, -}; diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index d282caf9f037..ea00e1a91250 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -416,11 +416,17 @@ skip_rdma: from_kuid(&init_user_ns, ses->cred_uid)); spin_lock(&ses->chan_lock); + if (CIFS_CHAN_NEEDS_RECONNECT(ses, 0)) + seq_puts(m, "\tPrimary channel: DISCONNECTED "); + if (ses->chan_count > 1) { seq_printf(m, "\n\n\tExtra Channels: %zu ", ses->chan_count-1); - for (j = 1; j < ses->chan_count; j++) + for (j = 1; j < ses->chan_count; j++) { cifs_dump_channel(m, j, &ses->chans[j]); + if (CIFS_CHAN_NEEDS_RECONNECT(ses, j)) + seq_puts(m, "\tDISCONNECTED "); + } } spin_unlock(&ses->chan_lock); diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 353bd0dd7026..342717bf1dc2 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -84,9 +84,9 @@ struct key_type cifs_spnego_key_type = { /* get a key struct with a SPNEGO security blob, suitable for session setup */ struct key * -cifs_get_spnego_key(struct cifs_ses *sesInfo) 
+cifs_get_spnego_key(struct cifs_ses *sesInfo, + struct TCP_Server_Info *server) { - struct TCP_Server_Info *server = cifs_ses_server(sesInfo); struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr; struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr; char *description, *dp; diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h index e6a0451877d4..7f102ffeb675 100644 --- a/fs/cifs/cifs_spnego.h +++ b/fs/cifs/cifs_spnego.h @@ -29,7 +29,8 @@ struct cifs_spnego_msg { #ifdef __KERNEL__ extern struct key_type cifs_spnego_key_type; -extern struct key *cifs_get_spnego_key(struct cifs_ses *sesInfo); +extern struct key *cifs_get_spnego_key(struct cifs_ses *sesInfo, + struct TCP_Server_Info *server); #endif /* KERNEL */ #endif /* _CIFS_SPNEGO_H */ diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c index 23a1ed2fb769..cdce1609c5c2 100644 --- a/fs/cifs/cifs_swn.c +++ b/fs/cifs/cifs_swn.c @@ -396,11 +396,11 @@ static int cifs_swn_resource_state_changed(struct cifs_swn_reg *swnreg, const ch switch (state) { case CIFS_SWN_RESOURCE_STATE_UNAVAILABLE: cifs_dbg(FYI, "%s: resource name '%s' become unavailable\n", __func__, name); - cifs_ses_mark_for_reconnect(swnreg->tcon->ses); + cifs_mark_tcp_ses_conns_for_reconnect(swnreg->tcon->ses->server, true); break; case CIFS_SWN_RESOURCE_STATE_AVAILABLE: cifs_dbg(FYI, "%s: resource name '%s' become available\n", __func__, name); - cifs_ses_mark_for_reconnect(swnreg->tcon->ses); + cifs_mark_tcp_ses_conns_for_reconnect(swnreg->tcon->ses->server, true); break; case CIFS_SWN_RESOURCE_STATE_UNKNOWN: cifs_dbg(FYI, "%s: resource name '%s' changed to unknown state\n", __func__, name); @@ -498,10 +498,7 @@ static int cifs_swn_reconnect(struct cifs_tcon *tcon, struct sockaddr_storage *a goto unlock; } - spin_lock(&GlobalMid_Lock); - if (tcon->ses->server->tcpStatus != CifsExiting) - tcon->ses->server->tcpStatus = CifsNeedReconnect; - spin_unlock(&GlobalMid_Lock); + cifs_mark_tcp_ses_conns_for_reconnect(tcon->ses->server, false); unlock: mutex_unlock(&tcon->ses->server->srv_mutex); diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index ee3aab3dd4ac..bf861fef2f0c 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -949,6 +949,9 @@ static void populate_new_aces(char *nacl_base, pnntace = (struct cifs_ace *) (nacl_base + nsize); nsize += setup_special_mode_ACE(pnntace, nmode); num_aces++; + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += setup_authusers_ACE(pnntace); + num_aces++; goto set_size; } @@ -1297,7 +1300,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, if (uid_valid(uid)) { /* chown */ uid_t id; - nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid), + nowner_sid_ptr = kzalloc(sizeof(struct cifs_sid), GFP_KERNEL); if (!nowner_sid_ptr) { rc = -ENOMEM; @@ -1326,7 +1329,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, } if (gid_valid(gid)) { /* chgrp */ gid_t id; - ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid), + ngroup_sid_ptr = kzalloc(sizeof(struct cifs_sid), GFP_KERNEL); if (!ngroup_sid_ptr) { rc = -ENOMEM; @@ -1613,7 +1616,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, nsecdesclen = secdesclen; if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */ if (mode_from_sid) - nsecdesclen += sizeof(struct cifs_ace); + nsecdesclen += 2 * sizeof(struct cifs_ace); else /* cifsacl */ nsecdesclen += 5 * sizeof(struct cifs_ace); } else { /* chown */ diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 
d118282071b3..0912d8bbbac1 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -141,9 +141,13 @@ int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server, if ((cifs_pdu == NULL) || (server == NULL)) return -EINVAL; + spin_lock(&cifs_tcp_ses_lock); if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) || - server->tcpStatus == CifsNeedNegotiate) + server->tcpStatus == CifsNeedNegotiate) { + spin_unlock(&cifs_tcp_ses_lock); return rc; + } + spin_unlock(&cifs_tcp_ses_lock); if (!server->session_estab) { memcpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index dca42aa87d30..082c21478686 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -26,6 +26,7 @@ #include <linux/random.h> #include <linux/uuid.h> #include <linux/xattr.h> +#include <uapi/linux/magic.h> #include <net/ipv6.h> #include "cifsfs.h" #include "cifspdu.h" @@ -202,7 +203,7 @@ cifs_read_super(struct super_block *sb) sb->s_time_max = ts.tv_sec; } - sb->s_magic = CIFS_MAGIC_NUMBER; + sb->s_magic = CIFS_SUPER_MAGIC; sb->s_op = &cifs_super_ops; sb->s_xattr = cifs_xattr_handlers; rc = super_setup_bdi(sb); @@ -396,6 +397,9 @@ static void cifs_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); + if (inode->i_state & I_PINNING_FSCACHE_WB) + cifs_fscache_unuse_inode_cookie(inode, true); + cifs_fscache_release_inode_cookie(inode); clear_inode(inode); } @@ -720,6 +724,12 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root) } #endif +static int cifs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + fscache_unpin_writeback(wbc, cifs_inode_cookie(inode)); + return 0; +} + static int cifs_drop_inode(struct inode *inode) { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -732,6 +742,7 @@ static int cifs_drop_inode(struct inode *inode) static const struct super_operations cifs_super_ops = { .statfs = cifs_statfs, .alloc_inode = cifs_alloc_inode, + .write_inode = cifs_write_inode, .free_inode = cifs_free_inode, .drop_inode = cifs_drop_inode, .evict_inode = cifs_evict_inode, @@ -773,7 +784,7 @@ cifs_get_root(struct smb3_fs_context *ctx, struct super_block *sb) sep = CIFS_DIR_SEP(cifs_sb); dentry = dget(sb->s_root); - p = s = full_path; + s = full_path; do { struct inode *dir = d_inode(dentry); @@ -908,6 +919,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type, out_super: deactivate_locked_super(sb); + return root; out: if (cifs_sb) { kfree(cifs_sb->prepath); @@ -1624,13 +1636,9 @@ init_cifs(void) goto out_destroy_cifsoplockd_wq; } - rc = cifs_fscache_register(); - if (rc) - goto out_destroy_deferredclose_wq; - rc = cifs_init_inodecache(); if (rc) - goto out_unreg_fscache; + goto out_destroy_deferredclose_wq; rc = cifs_init_mids(); if (rc) @@ -1692,8 +1700,6 @@ out_destroy_mids: cifs_destroy_mids(); out_destroy_inodecache: cifs_destroy_inodecache(); -out_unreg_fscache: - cifs_fscache_unregister(); out_destroy_deferredclose_wq: destroy_workqueue(deferredclose_wq); out_destroy_cifsoplockd_wq: @@ -1729,7 +1735,6 @@ exit_cifs(void) cifs_destroy_request_bufs(); cifs_destroy_mids(); cifs_destroy_inodecache(); - cifs_fscache_unregister(); destroy_workqueue(deferredclose_wq); destroy_workqueue(cifsoplockd_wq); destroy_workqueue(decrypt_wq); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 9e5d9e192ef0..15a5c5db038b 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -152,5 +152,6 @@ extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type, extern const struct 
export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.34" +#define SMB3_PRODUCT_BUILD 35 +#define CIFS_VERSION "2.35" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index be74606724c7..48b343d03430 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -24,8 +24,6 @@ #include "../smbfs_common/smb2pdu.h" #include "smb2pdu.h" -#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ - #define SMB_PATH_MAX 260 #define CIFS_PORT 445 #define RFC1001_PORT 139 @@ -113,7 +111,14 @@ enum statusEnum { CifsGood, CifsExiting, CifsNeedReconnect, - CifsNeedNegotiate + CifsNeedNegotiate, + CifsInNegotiate, + CifsNeedSessSetup, + CifsInSessSetup, + CifsNeedTcon, + CifsInTcon, + CifsNeedFilesInvalidate, + CifsInFilesInvalidate }; enum securityEnum { @@ -263,13 +268,16 @@ struct smb_version_operations { /* check if we need to negotiate */ bool (*need_neg)(struct TCP_Server_Info *); /* negotiate to the server */ - int (*negotiate)(const unsigned int, struct cifs_ses *); + int (*negotiate)(const unsigned int xid, + struct cifs_ses *ses, + struct TCP_Server_Info *server); /* set negotiated write size */ unsigned int (*negotiate_wsize)(struct cifs_tcon *tcon, struct smb3_fs_context *ctx); /* set negotiated read size */ unsigned int (*negotiate_rsize)(struct cifs_tcon *tcon, struct smb3_fs_context *ctx); /* setup smb sessionn */ int (*sess_setup)(const unsigned int, struct cifs_ses *, + struct TCP_Server_Info *server, const struct nls_table *); /* close smb session */ int (*logoff)(const unsigned int, struct cifs_ses *); @@ -414,7 +422,8 @@ struct smb_version_operations { void (*set_lease_key)(struct inode *, struct cifs_fid *); /* generate new lease key */ void (*new_lease_key)(struct cifs_fid *); - int (*generate_signingkey)(struct cifs_ses *); + int (*generate_signingkey)(struct cifs_ses *ses, + struct TCP_Server_Info *server); int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *, bool allocate_crypto); int (*set_integrity)(const unsigned int, struct cifs_tcon *tcon, @@ -582,7 +591,7 @@ struct TCP_Server_Info { char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; struct smb_version_operations *ops; struct smb_version_values *vals; - /* updates to tcpStatus protected by GlobalMid_Lock */ + /* updates to tcpStatus protected by cifs_tcp_ses_lock */ enum statusEnum tcpStatus; /* what we think the status is */ char *hostname; /* hostname portion of UNC string */ struct socket *ssocket; @@ -659,9 +668,6 @@ struct TCP_Server_Info { unsigned int total_read; /* total amount of data read in this pass */ atomic_t in_send; /* requests trying to send */ atomic_t num_waiters; /* blocked waiting to get in sendrecv */ -#ifdef CONFIG_CIFS_FSCACHE - struct fscache_cookie *fscache; /* client index cache cookie */ -#endif #ifdef CONFIG_CIFS_STATS2 atomic_t num_cmds[NUMBER_OF_SMB2_COMMANDS]; /* total requests by cmd */ atomic_t smb2slowcmd[NUMBER_OF_SMB2_COMMANDS]; /* count resps > 1 sec */ @@ -915,12 +921,13 @@ struct cifs_chan { */ struct cifs_ses { struct list_head smb_ses_list; + struct list_head rlist; /* reconnect list */ struct list_head tcon_list; struct cifs_tcon *tcon_ipc; struct mutex session_mutex; struct TCP_Server_Info *server; /* pointer to server info */ int ses_count; /* reference counter */ - enum statusEnum status; /* updates protected by GlobalMid_Lock */ + enum statusEnum status; /* updates protected by cifs_tcp_ses_lock */ unsigned overrideSecFlg; /* if non-zero override global sec flags */ char 
*serverOS; /* name of operating system underlying server */ char *serverNOS; /* name of network operating system of server */ @@ -939,17 +946,13 @@ struct cifs_ses { struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */ enum securityEnum sectype; /* what security flavor was specified? */ bool sign; /* is signing required? */ - bool need_reconnect:1; /* connection reset, uid now invalid */ bool domainAuto:1; - bool binding:1; /* are we binding the session? */ __u16 session_flags; __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; __u8 smb3encryptionkey[SMB3_ENC_DEC_KEY_SIZE]; __u8 smb3decryptionkey[SMB3_ENC_DEC_KEY_SIZE]; __u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE]; - __u8 binding_preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE]; - /* * Network interfaces available on the server this session is * connected to. @@ -969,45 +972,34 @@ struct cifs_ses { spinlock_t chan_lock; /* ========= begin: protected by chan_lock ======== */ #define CIFS_MAX_CHANNELS 16 +#define CIFS_ALL_CHANNELS_SET(ses) \ + ((1UL << (ses)->chan_count) - 1) +#define CIFS_ALL_CHANS_NEED_RECONNECT(ses) \ + ((ses)->chans_need_reconnect == CIFS_ALL_CHANNELS_SET(ses)) +#define CIFS_SET_ALL_CHANS_NEED_RECONNECT(ses) \ + ((ses)->chans_need_reconnect = CIFS_ALL_CHANNELS_SET(ses)) +#define CIFS_CHAN_NEEDS_RECONNECT(ses, index) \ + test_bit((index), &(ses)->chans_need_reconnect) + struct cifs_chan chans[CIFS_MAX_CHANNELS]; - struct cifs_chan *binding_chan; size_t chan_count; size_t chan_max; atomic_t chan_seq; /* round robin state */ + + /* + * chans_need_reconnect is a bitmap indicating which of the channels + * under this smb session needs to be reconnected. + * If not multichannel session, only one bit will be used. + * + * We will ask for sess and tcon reconnection only if all the + * channels are marked for needing reconnection. This will + * enable the sessions on top to continue to live till any + * of the channels below are active. + */ + unsigned long chans_need_reconnect; /* ========= end: protected by chan_lock ======== */ }; -/* - * When binding a new channel, we need to access the channel which isn't fully - * established yet. - */ - -static inline -struct cifs_chan *cifs_ses_binding_channel(struct cifs_ses *ses) -{ - if (ses->binding) - return ses->binding_chan; - else - return NULL; -} - -/* - * Returns the server pointer of the session. When binding a new - * channel this returns the last channel which isn't fully established - * yet. - * - * This function should be use for negprot/sess.setup codepaths. For - * the other requests see cifs_pick_channel(). 
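The chans_need_reconnect bitmap and the CIFS_ALL_CHANNELS_SET / CIFS_ALL_CHANS_NEED_RECONNECT / CIFS_CHAN_NEEDS_RECONNECT macros introduced above carry most of the multichannel reconnect policy, so a worked example may help. The sketch below is a minimal userspace model of the same bit arithmetic; demo_ses and the helper names are illustrative stand-ins, not the kernel's cifs_ses, and it assumes chan_count stays small (the patch caps it at CIFS_MAX_CHANNELS == 16, so the 1UL shifts never overflow).

/* Userspace model of the chans_need_reconnect bitmap (illustrative only). */
#include <assert.h>
#include <stdio.h>

struct demo_ses {
	unsigned long chans_need_reconnect;	/* one bit per channel */
	unsigned int chan_count;		/* assumed <= 16 */
};

/* Mirrors CIFS_ALL_CHANNELS_SET(): a mask with chan_count low bits set. */
static unsigned long all_channels_set(const struct demo_ses *s)
{
	return (1UL << s->chan_count) - 1;
}

/* Mirrors CIFS_CHAN_NEEDS_RECONNECT(): test one channel's bit. */
static int chan_needs_reconnect(const struct demo_ses *s, unsigned int index)
{
	return (s->chans_need_reconnect >> index) & 1UL;
}

/* Mirrors CIFS_ALL_CHANS_NEED_RECONNECT(): every channel is marked. */
static int all_chans_need_reconnect(const struct demo_ses *s)
{
	return s->chans_need_reconnect == all_channels_set(s);
}

int main(void)
{
	struct demo_ses ses = { .chans_need_reconnect = 0, .chan_count = 3 };

	ses.chans_need_reconnect |= 1UL << 1;		/* mark only channel 1 */
	assert(chan_needs_reconnect(&ses, 1));
	assert(!all_chans_need_reconnect(&ses));	/* session stays usable */

	ses.chans_need_reconnect = all_channels_set(&ses); /* mark them all */
	assert(all_chans_need_reconnect(&ses));		/* now sess/tcon reconnect */

	printf("mask=0x%lx\n", ses.chans_need_reconnect);
	return 0;
}

Marking a single channel leaves the session alive; only when every bit is set does the session (and its tcons) fall back to a full reconnect, which is exactly what the comment above states.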
- */ -static inline -struct TCP_Server_Info *cifs_ses_server(struct cifs_ses *ses) -{ - if (ses->binding) - return ses->binding_chan->server; - else - return ses->server; -} - static inline bool cap_unix(struct cifs_ses *ses) { @@ -1117,7 +1109,7 @@ struct cifs_tcon { __u32 max_bytes_copy; #ifdef CONFIG_CIFS_FSCACHE u64 resource_id; /* server resource id */ - struct fscache_cookie *fscache; /* cookie for share */ + struct fscache_volume *fscache; /* cookie for share */ #endif struct list_head pending_opens; /* list of incomplete opens */ struct cached_fid crfid; /* Cached root fid */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index d2ff438fd31f..68b9a436af4b 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -2560,7 +2560,7 @@ typedef struct { __le32 EaSize; /* length of the xattrs */ __u8 ShortNameLength; __u8 Reserved; - __u8 ShortName[12]; + __u8 ShortName[24]; char FileName[1]; } __attribute__((packed)) FILE_BOTH_DIRECTORY_INFO; /* level 0x104 FFrsp data */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 4f5a3e857df4..d3701295402d 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -131,7 +131,11 @@ extern int SendReceiveBlockingLock(const unsigned int xid, struct smb_hdr *in_buf , struct smb_hdr *out_buf, int *bytes_returned); -extern int cifs_reconnect(struct TCP_Server_Info *server); +void +cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, + bool mark_smb_session); +extern int cifs_reconnect(struct TCP_Server_Info *server, + bool mark_smb_session); extern int checkSMB(char *buf, unsigned int len, struct TCP_Server_Info *srvr); extern bool is_valid_oplock_break(char *, struct TCP_Server_Info *); extern bool backup_cred(struct cifs_sb_info *); @@ -164,6 +168,7 @@ extern int small_smb_init_no_tc(const int smb_cmd, const int wct, extern enum securityEnum select_sectype(struct TCP_Server_Info *server, enum securityEnum requested); extern int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, + struct TCP_Server_Info *server, const struct nls_table *nls_cp); extern struct timespec64 cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); extern u64 cifs_UnixTimeToNT(struct timespec64); @@ -293,11 +298,15 @@ extern int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const struct nls_table *nlsc); extern int cifs_negotiate_protocol(const unsigned int xid, - struct cifs_ses *ses); + struct cifs_ses *ses, + struct TCP_Server_Info *server); extern int cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, + struct TCP_Server_Info *server, struct nls_table *nls_info); extern int cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required); -extern int CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses); +extern int CIFSSMBNegotiate(const unsigned int xid, + struct cifs_ses *ses, + struct TCP_Server_Info *server); extern int CIFSTCon(const unsigned int xid, struct cifs_ses *ses, const char *tree, struct cifs_tcon *tcon, @@ -504,8 +513,10 @@ extern int cifs_verify_signature(struct smb_rqst *rqst, extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); extern void cifs_crypto_secmech_release(struct TCP_Server_Info *server); extern int calc_seckey(struct cifs_ses *); -extern int generate_smb30signingkey(struct cifs_ses *); -extern int generate_smb311signingkey(struct cifs_ses *); +extern int generate_smb30signingkey(struct cifs_ses *ses, + struct TCP_Server_Info *server); +extern int generate_smb311signingkey(struct cifs_ses *ses, + struct TCP_Server_Info 
*server); extern int CIFSSMBCopy(unsigned int xid, struct cifs_tcon *source_tcon, @@ -601,6 +612,19 @@ bool is_server_using_iface(struct TCP_Server_Info *server, bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface); void cifs_ses_mark_for_reconnect(struct cifs_ses *ses); +unsigned int +cifs_ses_get_chan_index(struct cifs_ses *ses, + struct TCP_Server_Info *server); +void +cifs_chan_set_need_reconnect(struct cifs_ses *ses, + struct TCP_Server_Info *server); +void +cifs_chan_clear_need_reconnect(struct cifs_ses *ses, + struct TCP_Server_Info *server); +bool +cifs_chan_needs_reconnect(struct cifs_ses *ses, + struct TCP_Server_Info *server); + void extract_unc_hostname(const char *unc, const char **h, size_t *len); int copy_path_name(char *dst, const char *src); int smb2_parse_query_directory(struct cifs_tcon *tcon, struct kvec *rsp_iov, @@ -626,6 +650,11 @@ static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses, int match_target_ip(struct TCP_Server_Info *server, const char *share, size_t share_len, bool *result); + +int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *dfs_link_path); #endif static inline int cifs_create_options(struct cifs_sb_info *cifs_sb, int options) diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 243d17696f06..071e2f21a7db 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -73,6 +73,16 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) struct list_head *tmp; struct list_head *tmp1; + /* only send once per connect */ + spin_lock(&cifs_tcp_ses_lock); + if (tcon->ses->status != CifsGood || + tcon->tidStatus != CifsNeedReconnect) { + spin_unlock(&cifs_tcp_ses_lock); + return; + } + tcon->tidStatus = CifsInFilesInvalidate; + spin_unlock(&cifs_tcp_ses_lock); + /* list all files open on tree connection and mark them invalid */ spin_lock(&tcon->open_file_lock); list_for_each_safe(tmp, tmp1, &tcon->openFileList) { @@ -89,6 +99,11 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) memset(tcon->crfid.fid, 0, sizeof(struct cifs_fid)); mutex_unlock(&tcon->crfid.fid_mutex); + spin_lock(&cifs_tcp_ses_lock); + if (tcon->tidStatus == CifsInFilesInvalidate) + tcon->tidStatus = CifsNeedTcon; + spin_unlock(&cifs_tcp_ses_lock); + /* * BB Add call to invalidate_inodes(sb) for all superblocks mounted * to this tcon. @@ -120,15 +135,18 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) * only tree disconnect, open, and write, (and ulogoff which does not * have tcon) are allowed as we start force umount */ + spin_lock(&cifs_tcp_ses_lock); if (tcon->tidStatus == CifsExiting) { if (smb_command != SMB_COM_WRITE_ANDX && smb_command != SMB_COM_OPEN_ANDX && smb_command != SMB_COM_TREE_DISCONNECT) { + spin_unlock(&cifs_tcp_ses_lock); cifs_dbg(FYI, "can not send cmd %d while umounting\n", smb_command); return -ENODEV; } } + spin_unlock(&cifs_tcp_ses_lock); retries = server->nr_targets; @@ -148,8 +166,12 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) } /* are we still trying to reconnect? 
*/ - if (server->tcpStatus != CifsNeedReconnect) + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus != CifsNeedReconnect) { + spin_unlock(&cifs_tcp_ses_lock); break; + } + spin_unlock(&cifs_tcp_ses_lock); if (retries && --retries) continue; @@ -166,31 +188,49 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) retries = server->nr_targets; } - if (!ses->need_reconnect && !tcon->need_reconnect) + spin_lock(&ses->chan_lock); + if (!cifs_chan_needs_reconnect(ses, server) && !tcon->need_reconnect) { + spin_unlock(&ses->chan_lock); return 0; + } + spin_unlock(&ses->chan_lock); nls_codepage = load_nls_default(); /* - * need to prevent multiple threads trying to simultaneously - * reconnect the same SMB session - */ - mutex_lock(&ses->session_mutex); - - /* * Recheck after acquire mutex. If another thread is negotiating * and the server never sends an answer the socket will be closed * and tcpStatus set to reconnect. */ + spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus == CifsNeedReconnect) { + spin_unlock(&cifs_tcp_ses_lock); rc = -EHOSTDOWN; - mutex_unlock(&ses->session_mutex); goto out; } + spin_unlock(&cifs_tcp_ses_lock); - rc = cifs_negotiate_protocol(0, ses); - if (rc == 0 && ses->need_reconnect) - rc = cifs_setup_session(0, ses, nls_codepage); + /* + * need to prevent multiple threads trying to simultaneously + * reconnect the same SMB session + */ + spin_lock(&ses->chan_lock); + if (!cifs_chan_needs_reconnect(ses, server)) { + spin_unlock(&ses->chan_lock); + + /* this means that we only need to tree connect */ + if (tcon->need_reconnect) + goto skip_sess_setup; + + rc = -EHOSTDOWN; + goto out; + } + spin_unlock(&ses->chan_lock); + + mutex_lock(&ses->session_mutex); + rc = cifs_negotiate_protocol(0, ses, server); + if (!rc) + rc = cifs_setup_session(0, ses, server, nls_codepage); /* do we need to reconnect tcon? */ if (rc || !tcon->need_reconnect) { @@ -198,6 +238,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) goto out; } +skip_sess_setup: cifs_mark_open_files_invalid(tcon); rc = cifs_tree_connect(0, tcon, nls_codepage); mutex_unlock(&ses->session_mutex); @@ -337,8 +378,13 @@ static int smb_init_no_reconnect(int smb_command, int wct, struct cifs_tcon *tcon, void **request_buf, void **response_buf) { - if (tcon->ses->need_reconnect || tcon->need_reconnect) + spin_lock(&tcon->ses->chan_lock); + if (cifs_chan_needs_reconnect(tcon->ses, tcon->ses->server) || + tcon->need_reconnect) { + spin_unlock(&tcon->ses->chan_lock); return -EHOSTDOWN; + } + spin_unlock(&tcon->ses->chan_lock); return __smb_init(smb_command, wct, tcon, request_buf, response_buf); } @@ -476,14 +522,15 @@ should_set_ext_sec_flag(enum securityEnum sectype) } int -CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) +CIFSSMBNegotiate(const unsigned int xid, + struct cifs_ses *ses, + struct TCP_Server_Info *server) { NEGOTIATE_REQ *pSMB; NEGOTIATE_RSP *pSMBr; int rc = 0; int bytes_returned; int i; - struct TCP_Server_Info *server = ses->server; u16 count; if (!server) { @@ -600,8 +647,12 @@ CIFSSMBTDis(const unsigned int xid, struct cifs_tcon *tcon) * the tcon is no longer on the list, so no need to take lock before * checking this. 
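A recurring change in the hunks above and below is that every read of server->tcpStatus now happens under cifs_tcp_ses_lock rather than GlobalMid_Lock, and the lock is dropped again before anything that can sleep or retry. A trivial userspace analogue of that sample-under-lock pattern follows, with a pthread mutex standing in for the spinlock; the names and values are made up for illustration and none of this is kernel code.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

enum demo_status { DEMO_GOOD, DEMO_NEED_RECONNECT, DEMO_EXITING };

/* Stand-ins for cifs_tcp_ses_lock and server->tcpStatus. */
static pthread_mutex_t status_lock = PTHREAD_MUTEX_INITIALIZER;
static enum demo_status tcp_status = DEMO_NEED_RECONNECT;

/* Sample the status under the lock, then drop the lock before acting on it. */
static bool still_reconnecting(void)
{
	bool ret;

	pthread_mutex_lock(&status_lock);
	ret = (tcp_status == DEMO_NEED_RECONNECT);
	pthread_mutex_unlock(&status_lock);
	return ret;
}

int main(void)
{
	if (still_reconnecting())
		printf("retry: connection still marked for reconnect\n");

	pthread_mutex_lock(&status_lock);
	tcp_status = DEMO_GOOD;			/* another thread finished negotiation */
	pthread_mutex_unlock(&status_lock);

	printf("still reconnecting? %d\n", still_reconnecting());
	return 0;
}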
*/ - if ((tcon->need_reconnect) || (tcon->ses->need_reconnect)) - return 0; + spin_lock(&tcon->ses->chan_lock); + if ((tcon->need_reconnect) || CIFS_ALL_CHANS_NEED_RECONNECT(tcon->ses)) { + spin_unlock(&tcon->ses->chan_lock); + return -EIO; + } + spin_unlock(&tcon->ses->chan_lock); rc = small_smb_init(SMB_COM_TREE_DISCONNECT, 0, tcon, (void **)&smb_buffer); @@ -696,9 +747,14 @@ CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses) return -EIO; mutex_lock(&ses->session_mutex); - if (ses->need_reconnect) + spin_lock(&ses->chan_lock); + if (CIFS_ALL_CHANS_NEED_RECONNECT(ses)) { + spin_unlock(&ses->chan_lock); goto session_already_dead; /* no need to send SMBlogoff if uid already closed due to reconnect */ + } + spin_unlock(&ses->chan_lock); + rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB); if (rc) { mutex_unlock(&ses->session_mutex); @@ -1401,7 +1457,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) if (server->ops->is_session_expired && server->ops->is_session_expired(buf)) { - cifs_reconnect(server); + cifs_reconnect(server, true); return -1; } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 1060164b984a..053cb449eb16 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -162,42 +162,68 @@ static void cifs_resolve_server(struct work_struct *work) mutex_unlock(&server->srv_mutex); } -/** +/* * Mark all sessions and tcons for reconnect. * * @server needs to be previously set to CifsNeedReconnect. + * */ -static void cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server) +void +cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, + bool mark_smb_session) { + struct TCP_Server_Info *pserver; struct cifs_ses *ses; struct cifs_tcon *tcon; - struct mid_q_entry *mid, *nmid; - struct list_head retry_list; - struct TCP_Server_Info *pserver; - - server->maxBuf = 0; - server->max_read = 0; - cifs_dbg(FYI, "Mark tcp session as need reconnect\n"); - trace_smb3_reconnect(server->CurrentMid, server->conn_id, server->hostname); /* * before reconnecting the tcp session, mark the smb session (uid) and the tid bad so they * are not used until reconnected. */ - cifs_dbg(FYI, "%s: marking sessions and tcons for reconnect\n", __func__); + cifs_dbg(FYI, "%s: marking necessary sessions and tcons for reconnect\n", __func__); /* If server is a channel, select the primary channel */ pserver = CIFS_SERVER_IS_CHAN(server) ? 
server->primary_server : server; + spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { - ses->need_reconnect = true; - list_for_each_entry(tcon, &ses->tcon_list, tcon_list) + spin_lock(&ses->chan_lock); + if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server)) + goto next_session; + + if (mark_smb_session) + CIFS_SET_ALL_CHANS_NEED_RECONNECT(ses); + else + cifs_chan_set_need_reconnect(ses, server); + + /* If all channels need reconnect, then tcon needs reconnect */ + if (!mark_smb_session && !CIFS_ALL_CHANS_NEED_RECONNECT(ses)) + goto next_session; + + ses->status = CifsNeedReconnect; + + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { tcon->need_reconnect = true; + tcon->tidStatus = CifsNeedReconnect; + } if (ses->tcon_ipc) ses->tcon_ipc->need_reconnect = true; + +next_session: + spin_unlock(&ses->chan_lock); } spin_unlock(&cifs_tcp_ses_lock); +} + +static void +cifs_abort_connection(struct TCP_Server_Info *server) +{ + struct mid_q_entry *mid, *nmid; + struct list_head retry_list; + + server->maxBuf = 0; + server->max_read = 0; /* do not want to be sending data on a socket we are freeing */ cifs_dbg(FYI, "%s: tearing down socket\n", __func__); @@ -248,16 +274,21 @@ static void cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num_targets) { - spin_lock(&GlobalMid_Lock); + spin_lock(&cifs_tcp_ses_lock); server->nr_targets = num_targets; if (server->tcpStatus == CifsExiting) { /* the demux thread will exit normally next time through the loop */ - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_tcp_ses_lock); wake_up(&server->response_q); return false; } + + cifs_dbg(FYI, "Mark tcp session as need reconnect\n"); + trace_smb3_reconnect(server->CurrentMid, server->conn_id, + server->hostname); server->tcpStatus = CifsNeedReconnect; - spin_unlock(&GlobalMid_Lock); + + spin_unlock(&cifs_tcp_ses_lock); return true; } @@ -268,15 +299,23 @@ static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num * mark all smb sessions as reconnecting for tcp session * reconnect tcp session * wake up waiters on reconnection? - (not needed currently) + * + * if mark_smb_session is passed as true, unconditionally mark + * the smb session (and tcon) for reconnect as well. This value + * doesn't really matter for non-multichannel scenario. 
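The mark_smb_session flag documented above decides whether a transport failure takes down just the failing channel or the whole session, and session/tcon reconnect is requested only once every channel is marked. Below is a simplified restatement of that policy in plain C; the types are made-up stand-ins of the same kind as the earlier bitmap sketch, not the kernel's structures, and the skip-if-already-marked shortcut in the real code is omitted.

/* Illustrative restatement of the marking policy (not kernel code). */
#include <stdbool.h>
#include <stdio.h>

struct demo_ses {
	unsigned long chans_need_reconnect;	/* one bit per channel */
	unsigned int chan_count;
	bool need_full_reconnect;	/* stands in for ses->status / tcon flags */
};

static void mark_conns_for_reconnect(struct demo_ses *ses,
				     unsigned int failed_chan,
				     bool mark_smb_session)
{
	unsigned long all = (1UL << ses->chan_count) - 1;

	if (mark_smb_session)
		ses->chans_need_reconnect = all;	/* every channel is bad */
	else
		ses->chans_need_reconnect |= 1UL << failed_chan;

	/*
	 * Session (and tcon) level reconnect is requested only once every
	 * channel is marked; a surviving channel keeps the session usable.
	 */
	ses->need_full_reconnect = (ses->chans_need_reconnect == all);
}

int main(void)
{
	struct demo_ses ses = { .chan_count = 2 };

	mark_conns_for_reconnect(&ses, 0, false);
	printf("after one channel: full reconnect? %d\n", ses.need_full_reconnect);

	mark_conns_for_reconnect(&ses, 1, false);
	printf("after both channels: full reconnect? %d\n", ses.need_full_reconnect);
	return 0;
}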
+ * */ -static int __cifs_reconnect(struct TCP_Server_Info *server) +static int __cifs_reconnect(struct TCP_Server_Info *server, + bool mark_smb_session) { int rc = 0; if (!cifs_tcp_ses_needs_reconnect(server, 1)) return 0; - cifs_mark_tcp_ses_conns_for_reconnect(server); + cifs_mark_tcp_ses_conns_for_reconnect(server, mark_smb_session); + + cifs_abort_connection(server); do { try_to_freeze(); @@ -299,17 +338,20 @@ static int __cifs_reconnect(struct TCP_Server_Info *server) } else { atomic_inc(&tcpSesReconnectCount); set_credits(server, 1); - spin_lock(&GlobalMid_Lock); + spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus != CifsExiting) server->tcpStatus = CifsNeedNegotiate; - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_tcp_ses_lock); cifs_swn_reset_server_dstaddr(server); mutex_unlock(&server->srv_mutex); + mod_delayed_work(cifsiod_wq, &server->reconnect, 0); } } while (server->tcpStatus == CifsNeedReconnect); + spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus == CifsNeedNegotiate) mod_delayed_work(cifsiod_wq, &server->echo, 0); + spin_unlock(&cifs_tcp_ses_lock); wake_up(&server->response_q); return rc; @@ -371,7 +413,9 @@ static int reconnect_target_unlocked(struct TCP_Server_Info *server, struct dfs_ return rc; } -static int reconnect_dfs_server(struct TCP_Server_Info *server) +static int +reconnect_dfs_server(struct TCP_Server_Info *server, + bool mark_smb_session) { int rc = 0; const char *refpath = server->current_fullpath + 1; @@ -395,7 +439,9 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server) if (!cifs_tcp_ses_needs_reconnect(server, num_targets)) return 0; - cifs_mark_tcp_ses_conns_for_reconnect(server); + cifs_mark_tcp_ses_conns_for_reconnect(server, mark_smb_session); + + cifs_abort_connection(server); do { try_to_freeze(); @@ -416,12 +462,13 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server) */ atomic_inc(&tcpSesReconnectCount); set_credits(server, 1); - spin_lock(&GlobalMid_Lock); + spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus != CifsExiting) server->tcpStatus = CifsNeedNegotiate; - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_tcp_ses_lock); cifs_swn_reset_server_dstaddr(server); mutex_unlock(&server->srv_mutex); + mod_delayed_work(cifsiod_wq, &server->reconnect, 0); } while (server->tcpStatus == CifsNeedReconnect); if (target_hint) @@ -430,29 +477,32 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server) dfs_cache_free_tgts(&tl); /* Need to set up echo worker again once connection has been established */ + spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus == CifsNeedNegotiate) mod_delayed_work(cifsiod_wq, &server->echo, 0); + spin_unlock(&cifs_tcp_ses_lock); + wake_up(&server->response_q); return rc; } -int cifs_reconnect(struct TCP_Server_Info *server) +int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session) { /* If tcp session is not an dfs connection, then reconnect to last target server */ spin_lock(&cifs_tcp_ses_lock); if (!server->is_dfs_conn || !server->origin_fullpath || !server->leaf_fullpath) { spin_unlock(&cifs_tcp_ses_lock); - return __cifs_reconnect(server); + return __cifs_reconnect(server, mark_smb_session); } spin_unlock(&cifs_tcp_ses_lock); - return reconnect_dfs_server(server); + return reconnect_dfs_server(server, mark_smb_session); } #else -int cifs_reconnect(struct TCP_Server_Info *server) +int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session) { - return __cifs_reconnect(server); + return __cifs_reconnect(server, mark_smb_session); } #endif @@ 
-534,15 +584,18 @@ server_unresponsive(struct TCP_Server_Info *server) * 65s kernel_recvmsg times out, and we see that we haven't gotten * a response in >60s. */ + spin_lock(&cifs_tcp_ses_lock); if ((server->tcpStatus == CifsGood || server->tcpStatus == CifsNeedNegotiate) && (!server->ops->can_echo || server->ops->can_echo(server)) && time_after(jiffies, server->lstrp + 3 * server->echo_interval)) { + spin_unlock(&cifs_tcp_ses_lock); cifs_server_dbg(VFS, "has not responded in %lu seconds. Reconnecting...\n", (3 * server->echo_interval) / HZ); - cifs_reconnect(server); + cifs_reconnect(server, false); return true; } + spin_unlock(&cifs_tcp_ses_lock); return false; } @@ -576,7 +629,7 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) /* reconnect if no credits and no requests in flight */ if (zero_credits(server)) { - cifs_reconnect(server); + cifs_reconnect(server, false); return -ECONNABORTED; } @@ -587,13 +640,18 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) else length = sock_recvmsg(server->ssocket, smb_msg, 0); - if (server->tcpStatus == CifsExiting) + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus == CifsExiting) { + spin_unlock(&cifs_tcp_ses_lock); return -ESHUTDOWN; + } if (server->tcpStatus == CifsNeedReconnect) { - cifs_reconnect(server); + spin_unlock(&cifs_tcp_ses_lock); + cifs_reconnect(server, false); return -ECONNABORTED; } + spin_unlock(&cifs_tcp_ses_lock); if (length == -ERESTARTSYS || length == -EAGAIN || @@ -610,7 +668,7 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) if (length <= 0) { cifs_dbg(FYI, "Received no data or error: %d\n", length); - cifs_reconnect(server); + cifs_reconnect(server, false); return -ECONNABORTED; } } @@ -689,11 +747,11 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type) * initialize frame). 
*/ cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT); - cifs_reconnect(server); + cifs_reconnect(server, true); break; default: cifs_server_dbg(VFS, "RFC 1002 unknown response type 0x%x\n", type); - cifs_reconnect(server); + cifs_reconnect(server, true); } return false; @@ -771,9 +829,9 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) cancel_delayed_work_sync(&server->echo); cancel_delayed_work_sync(&server->resolve); - spin_lock(&GlobalMid_Lock); + spin_lock(&cifs_tcp_ses_lock); server->tcpStatus = CifsExiting; - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_tcp_ses_lock); wake_up_all(&server->response_q); /* check if we have blocked requests that need to free */ @@ -866,7 +924,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server) - server->vals->header_preamble_size) { cifs_server_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length); - cifs_reconnect(server); + cifs_reconnect(server, true); return -ECONNABORTED; } @@ -913,7 +971,7 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid) if (server->ops->is_session_expired && server->ops->is_session_expired(buf)) { - cifs_reconnect(server); + cifs_reconnect(server, true); return -1; } @@ -1017,7 +1075,7 @@ next_pdu: server->vals->header_preamble_size) { cifs_server_dbg(VFS, "SMB response too short (%u bytes)\n", server->pdu_size); - cifs_reconnect(server); + cifs_reconnect(server, true); continue; } @@ -1069,7 +1127,7 @@ next_pdu: server->ops->is_status_io_timeout(buf)) { num_io_timeout++; if (num_io_timeout > NUM_STATUS_IO_TIMEOUT) { - cifs_reconnect(server); + cifs_reconnect(server, false); num_io_timeout = 0; continue; } @@ -1139,7 +1197,7 @@ next_pdu: } memalloc_noreclaim_restore(noreclaim_flag); - module_put_and_exit(0); + module_put_and_kthread_exit(0); } /* @@ -1390,16 +1448,12 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) else cancel_delayed_work_sync(&server->reconnect); - spin_lock(&GlobalMid_Lock); + spin_lock(&cifs_tcp_ses_lock); server->tcpStatus = CifsExiting; - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_tcp_ses_lock); cifs_crypto_secmech_release(server); - /* fscache server cookies are based on primary channel only */ - if (!CIFS_SERVER_IS_CHAN(server)) - cifs_fscache_release_client_cookie(server); - kfree(server->session_key.response); server->session_key.response = NULL; server->session_key.len = 0; @@ -1545,7 +1599,9 @@ smbd_connected: * to the struct since the kernel thread not created yet * no need to spinlock this update of tcpStatus */ + spin_lock(&cifs_tcp_ses_lock); tcp_ses->tcpStatus = CifsNeedNegotiate; + spin_unlock(&cifs_tcp_ses_lock); if ((ctx->max_credits < 20) || (ctx->max_credits > 60000)) tcp_ses->max_credits = SMB2_MAX_CREDITS_AVAILABLE; @@ -1559,14 +1615,6 @@ smbd_connected: list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list); spin_unlock(&cifs_tcp_ses_lock); - /* fscache server cookies are based on primary channel only */ - if (!CIFS_SERVER_IS_CHAN(tcp_ses)) - cifs_fscache_get_client_cookie(tcp_ses); -#ifdef CONFIG_CIFS_FSCACHE - else - tcp_ses->fscache = tcp_ses->primary_server->fscache; -#endif /* CONFIG_CIFS_FSCACHE */ - /* queue echo request delayed work */ queue_delayed_work(cifsiod_wq, &tcp_ses->echo, tcp_ses->echo_interval); @@ -1762,15 +1810,13 @@ void cifs_put_smb_ses(struct cifs_ses *ses) spin_unlock(&cifs_tcp_ses_lock); return; } - spin_unlock(&cifs_tcp_ses_lock); /* ses_count can never go negative */ 
WARN_ON(ses->ses_count < 0); - spin_lock(&GlobalMid_Lock); if (ses->status == CifsGood) ses->status = CifsExiting; - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_tcp_ses_lock); cifs_free_ipc(ses); @@ -1789,23 +1835,19 @@ void cifs_put_smb_ses(struct cifs_ses *ses) spin_lock(&ses->chan_lock); chan_count = ses->chan_count; - spin_unlock(&ses->chan_lock); /* close any extra channels */ if (chan_count > 1) { int i; for (i = 1; i < chan_count; i++) { - /* - * note: for now, we're okay accessing ses->chans - * without chan_lock. But when chans can go away, we'll - * need to introduce ref counting to make sure that chan - * is not freed from under us. - */ + spin_unlock(&ses->chan_lock); cifs_put_tcp_session(ses->chans[i].server, 0); + spin_lock(&ses->chan_lock); ses->chans[i].server = NULL; } } + spin_unlock(&ses->chan_lock); sesInfoFree(ses); cifs_put_tcp_session(server, 0); @@ -1945,6 +1987,19 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses) } } + ctx->workstation_name = kstrdup(ses->workstation_name, GFP_KERNEL); + if (!ctx->workstation_name) { + cifs_dbg(FYI, "Unable to allocate memory for workstation_name\n"); + rc = -ENOMEM; + kfree(ctx->username); + ctx->username = NULL; + kfree_sensitive(ctx->password); + ctx->password = NULL; + kfree(ctx->domainname); + ctx->domainname = NULL; + goto out_key_put; + } + out_key_put: up_read(&key->sem); key_put(key); @@ -1987,11 +2042,13 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) cifs_dbg(FYI, "Existing smb sess found (status=%d)\n", ses->status); - mutex_lock(&ses->session_mutex); - if (ses->need_reconnect) { + spin_lock(&ses->chan_lock); + if (cifs_chan_needs_reconnect(ses, server)) { + spin_unlock(&ses->chan_lock); cifs_dbg(FYI, "Session needs reconnect\n"); - rc = cifs_negotiate_protocol(xid, ses); + mutex_lock(&ses->session_mutex); + rc = cifs_negotiate_protocol(xid, ses, server); if (rc) { mutex_unlock(&ses->session_mutex); /* problem -- put our ses reference */ @@ -2000,7 +2057,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) return ERR_PTR(rc); } - rc = cifs_setup_session(xid, ses, + rc = cifs_setup_session(xid, ses, server, ctx->local_nls); if (rc) { mutex_unlock(&ses->session_mutex); @@ -2009,8 +2066,11 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) free_xid(xid); return ERR_PTR(rc); } + mutex_unlock(&ses->session_mutex); + + spin_lock(&ses->chan_lock); } - mutex_unlock(&ses->session_mutex); + spin_unlock(&ses->chan_lock); /* existing SMB ses has a server reference already */ cifs_put_tcp_session(server, 0); @@ -2060,28 +2120,35 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) ses->sectype = ctx->sectype; ses->sign = ctx->sign; - mutex_lock(&ses->session_mutex); /* add server as first channel */ spin_lock(&ses->chan_lock); ses->chans[0].server = server; ses->chan_count = 1; ses->chan_max = ctx->multichannel ? 
ctx->max_channels:1; + ses->chans_need_reconnect = 1; spin_unlock(&ses->chan_lock); - rc = cifs_negotiate_protocol(xid, ses); + mutex_lock(&ses->session_mutex); + rc = cifs_negotiate_protocol(xid, ses, server); if (!rc) - rc = cifs_setup_session(xid, ses, ctx->local_nls); + rc = cifs_setup_session(xid, ses, server, ctx->local_nls); + mutex_unlock(&ses->session_mutex); /* each channel uses a different signing key */ + spin_lock(&ses->chan_lock); memcpy(ses->chans[0].signkey, ses->smb3signingkey, sizeof(ses->smb3signingkey)); + spin_unlock(&ses->chan_lock); - mutex_unlock(&ses->session_mutex); if (rc) goto get_ses_fail; - /* success, put it on the list and add it as first channel */ + /* + * success, put it on the list and add it as first channel + * note: the session becomes active soon after this. So you'll + * need to lock before changing something in the session. + */ spin_lock(&cifs_tcp_ses_lock); list_add(&ses->smb_ses_list, &server->smb_ses_list); spin_unlock(&cifs_tcp_ses_lock); @@ -2161,6 +2228,9 @@ cifs_put_tcon(struct cifs_tcon *tcon) /* tc_count can never go negative */ WARN_ON(tcon->tc_count < 0); + list_del_init(&tcon->tcon_list); + spin_unlock(&cifs_tcp_ses_lock); + if (tcon->use_witness) { int rc; @@ -2171,9 +2241,6 @@ cifs_put_tcon(struct cifs_tcon *tcon) } } - list_del_init(&tcon->tcon_list); - spin_unlock(&cifs_tcp_ses_lock); - xid = get_xid(); if (ses->server->ops->tree_disconnect) ses->server->ops->tree_disconnect(xid, tcon); @@ -2283,17 +2350,22 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) if (ses->server->posix_ext_supported) { tcon->posix_extensions = true; pr_warn_once("SMB3.11 POSIX Extensions are experimental\n"); - } else { + } else if ((ses->server->vals->protocol_id == SMB311_PROT_ID) || + (strcmp(ses->server->vals->version_string, + SMB3ANY_VERSION_STRING) == 0) || + (strcmp(ses->server->vals->version_string, + SMBDEFAULT_VERSION_STRING) == 0)) { cifs_dbg(VFS, "Server does not support mounting with posix SMB3.11 extensions\n"); rc = -EOPNOTSUPP; goto out_fail; + } else { + cifs_dbg(VFS, "Check vers= mount option. SMB3.11 " + "disabled but required for POSIX extensions\n"); + rc = -EOPNOTSUPP; + goto out_fail; } } - /* - * BB Do we need to wrap session_mutex around this TCon call and Unix - * SetFS as we do on SessSetup and reconnect? - */ xid = get_xid(); rc = ses->server->ops->tree_connect(xid, ses, ctx->UNC, tcon, ctx->local_nls); @@ -3029,12 +3101,15 @@ static int mount_get_conns(struct mount_ctx *mnt_ctx) * for just this mount. */ reset_cifs_unix_caps(xid, tcon, cifs_sb, ctx); + spin_lock(&cifs_tcp_ses_lock); if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) && (le64_to_cpu(tcon->fsUnixInfo.Capability) & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)) { + spin_unlock(&cifs_tcp_ses_lock); rc = -EACCES; goto out; } + spin_unlock(&cifs_tcp_ses_lock); } else tcon->unix_ext = 0; /* server does not support them */ @@ -3069,7 +3144,8 @@ static int mount_get_conns(struct mount_ctx *mnt_ctx) * Inside cifs_fscache_get_super_cookie it checks * that we do not get super cookie twice. 
*/ - cifs_fscache_get_super_cookie(tcon); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) + cifs_fscache_get_super_cookie(tcon); out: mnt_ctx->server = server; @@ -3322,6 +3398,11 @@ static int is_path_remote(struct mount_ctx *mnt_ctx) rc = server->ops->is_path_accessible(xid, tcon, cifs_sb, full_path); +#ifdef CONFIG_CIFS_DFS_UPCALL + if (rc == -ENOENT && is_tcon_dfs(tcon)) + rc = cifs_dfs_query_info_nonascii_quirk(xid, tcon, cifs_sb, + full_path); +#endif if (rc != 0 && rc != -EREMOTE) { kfree(full_path); return rc; @@ -3709,8 +3790,6 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, if (rc == 0) { bool is_unicode; - tcon->tidStatus = CifsGood; - tcon->need_reconnect = false; tcon->tid = smb_buffer_response->Tid; bcc_ptr = pByteArea(smb_buffer_response); bytes_left = get_bcc(smb_buffer_response); @@ -3799,26 +3878,37 @@ cifs_umount(struct cifs_sb_info *cifs_sb) } int -cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses) +cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses, + struct TCP_Server_Info *server) { int rc = 0; - struct TCP_Server_Info *server = cifs_ses_server(ses); if (!server->ops->need_neg || !server->ops->negotiate) return -ENOSYS; /* only send once per connect */ - if (!server->ops->need_neg(server)) + spin_lock(&cifs_tcp_ses_lock); + if (!server->ops->need_neg(server) || + server->tcpStatus != CifsNeedNegotiate) { + spin_unlock(&cifs_tcp_ses_lock); return 0; + } + server->tcpStatus = CifsInNegotiate; + spin_unlock(&cifs_tcp_ses_lock); - rc = server->ops->negotiate(xid, ses); + rc = server->ops->negotiate(xid, ses, server); if (rc == 0) { - spin_lock(&GlobalMid_Lock); - if (server->tcpStatus == CifsNeedNegotiate) - server->tcpStatus = CifsGood; + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus == CifsInNegotiate) + server->tcpStatus = CifsNeedSessSetup; else rc = -EHOSTDOWN; - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_tcp_ses_lock); + } else { + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus == CifsInNegotiate) + server->tcpStatus = CifsNeedNegotiate; + spin_unlock(&cifs_tcp_ses_lock); } return rc; @@ -3826,12 +3916,26 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses) int cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, + struct TCP_Server_Info *server, struct nls_table *nls_info) { int rc = -ENOSYS; - struct TCP_Server_Info *server = cifs_ses_server(ses); + bool is_binding = false; - if (!ses->binding) { + /* only send once per connect */ + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus != CifsNeedSessSetup) { + spin_unlock(&cifs_tcp_ses_lock); + return 0; + } + server->tcpStatus = CifsInSessSetup; + spin_unlock(&cifs_tcp_ses_lock); + + spin_lock(&ses->chan_lock); + is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); + spin_unlock(&ses->chan_lock); + + if (!is_binding) { ses->capabilities = server->capabilities; if (!linuxExtEnabled) ses->capabilities &= (~server->vals->cap_unix); @@ -3849,10 +3953,26 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, server->sec_mode, server->capabilities, server->timeAdj); if (server->ops->sess_setup) - rc = server->ops->sess_setup(xid, ses, nls_info); + rc = server->ops->sess_setup(xid, ses, server, nls_info); - if (rc) + if (rc) { cifs_server_dbg(VFS, "Send error in SessSetup = %d\n", rc); + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus == CifsInSessSetup) + server->tcpStatus = CifsNeedSessSetup; + spin_unlock(&cifs_tcp_ses_lock); + } else { + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus 
== CifsInSessSetup) + server->tcpStatus = CifsGood; + /* Even if one channel is active, session is in good state */ + ses->status = CifsGood; + spin_unlock(&cifs_tcp_ses_lock); + + spin_lock(&ses->chan_lock); + cifs_chan_clear_need_reconnect(ses, server); + spin_unlock(&ses->chan_lock); + } return rc; } @@ -4296,7 +4416,7 @@ static int tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *tco */ if (rc && server->current_fullpath != server->origin_fullpath) { server->current_fullpath = server->origin_fullpath; - cifs_ses_mark_for_reconnect(tcon->ses); + cifs_reconnect(tcon->ses->server, true); } dfs_cache_free_tgts(tl); @@ -4314,9 +4434,22 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru char *tree; struct dfs_info3_param ref = {0}; + /* only send once per connect */ + spin_lock(&cifs_tcp_ses_lock); + if (tcon->ses->status != CifsGood || + (tcon->tidStatus != CifsNew && + tcon->tidStatus != CifsNeedTcon)) { + spin_unlock(&cifs_tcp_ses_lock); + return 0; + } + tcon->tidStatus = CifsInTcon; + spin_unlock(&cifs_tcp_ses_lock); + tree = kzalloc(MAX_TREE_SIZE, GFP_KERNEL); - if (!tree) - return -ENOMEM; + if (!tree) { + rc = -ENOMEM; + goto out; + } if (tcon->ipc) { scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", server->hostname); @@ -4348,13 +4481,52 @@ out: kfree(tree); cifs_put_tcp_super(sb); + if (rc) { + spin_lock(&cifs_tcp_ses_lock); + if (tcon->tidStatus == CifsInTcon) + tcon->tidStatus = CifsNeedTcon; + spin_unlock(&cifs_tcp_ses_lock); + } else { + spin_lock(&cifs_tcp_ses_lock); + if (tcon->tidStatus == CifsInTcon) + tcon->tidStatus = CifsGood; + spin_unlock(&cifs_tcp_ses_lock); + tcon->need_reconnect = false; + } + return rc; } #else int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const struct nls_table *nlsc) { + int rc; const struct smb_version_operations *ops = tcon->ses->server->ops; - return ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, nlsc); + /* only send once per connect */ + spin_lock(&cifs_tcp_ses_lock); + if (tcon->ses->status != CifsGood || + (tcon->tidStatus != CifsNew && + tcon->tidStatus != CifsNeedTcon)) { + spin_unlock(&cifs_tcp_ses_lock); + return 0; + } + tcon->tidStatus = CifsInTcon; + spin_unlock(&cifs_tcp_ses_lock); + + rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, nlsc); + if (rc) { + spin_lock(&cifs_tcp_ses_lock); + if (tcon->tidStatus == CifsInTcon) + tcon->tidStatus = CifsNeedTcon; + spin_unlock(&cifs_tcp_ses_lock); + } else { + spin_lock(&cifs_tcp_ses_lock); + if (tcon->tidStatus == CifsInTcon) + tcon->tidStatus = CifsGood; + spin_unlock(&cifs_tcp_ses_lock); + tcon->need_reconnect = false; + } + + return rc; } #endif diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index e9b0fa2a9614..831f42458bf6 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -1355,7 +1355,7 @@ static void mark_for_reconnect_if_needed(struct cifs_tcon *tcon, struct dfs_cach } cifs_dbg(FYI, "%s: no cached or matched targets. 
mark dfs share for reconnect.\n", __func__); - cifs_ses_mark_for_reconnect(tcon->ses); + cifs_mark_tcp_ses_conns_for_reconnect(tcon->ses->server, true); } /* Refresh dfs referral of tcon and mark it for reconnect if needed */ diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 6e8e7cc26ae2..ce9b22aecfba 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -22,6 +22,7 @@ #include "cifs_unicode.h" #include "fs_context.h" #include "cifs_ioctl.h" +#include "fscache.h" static void renew_parental_timestamps(struct dentry *direntry) @@ -507,8 +508,12 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, server->ops->close(xid, tcon, &fid); cifs_del_pending_open(&open); rc = -ENOMEM; + goto out; } + fscache_use_cookie(cifs_inode_cookie(file_inode(file)), + file->f_mode & FMODE_WRITE); + out: cifs_put_tlink(tlink); out_free_xid: diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 9fee3af83a73..e7af802dcfa6 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -376,8 +376,6 @@ static void cifsFileInfo_put_final(struct cifsFileInfo *cifs_file) struct cifsLockInfo *li, *tmp; struct super_block *sb = inode->i_sb; - cifs_fscache_release_inode_cookie(inode); - /* * Delete any outstanding lock records. We'll lose them when the file * is closed anyway. @@ -570,7 +568,7 @@ int cifs_open(struct inode *inode, struct file *file) spin_lock(&CIFS_I(inode)->deferred_lock); cifs_del_deferred_close(cfile); spin_unlock(&CIFS_I(inode)->deferred_lock); - goto out; + goto use_cache; } else { _cifsFileInfo_put(cfile, true, false); } @@ -632,8 +630,6 @@ int cifs_open(struct inode *inode, struct file *file) goto out; } - cifs_fscache_set_inode_cookie(inode, file); - if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) { /* * Time to set mode which we can not set earlier due to @@ -652,6 +648,15 @@ int cifs_open(struct inode *inode, struct file *file) cfile->pid); } +use_cache: + fscache_use_cookie(cifs_inode_cookie(file_inode(file)), + file->f_mode & FMODE_WRITE); + if (file->f_flags & O_DIRECT && + (!((file->f_flags & O_ACCMODE) != O_RDONLY) || + file->f_flags & O_APPEND)) + cifs_invalidate_cache(file_inode(file), + FSCACHE_INVAL_DIO_WRITE); + out: free_dentry_path(page); free_xid(xid); @@ -876,6 +881,8 @@ int cifs_close(struct inode *inode, struct file *file) struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_deferred_close *dclose; + cifs_fscache_unuse_inode_cookie(inode, file->f_mode & FMODE_WRITE); + if (file->private_data != NULL) { cfile = file->private_data; file->private_data = NULL; @@ -886,7 +893,6 @@ int cifs_close(struct inode *inode, struct file *file) dclose) { if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) { inode->i_ctime = inode->i_mtime = current_time(inode); - cifs_fscache_update_inode_cookie(inode); } spin_lock(&cinode->deferred_lock); cifs_add_deferred_close(cfile, dclose); @@ -4198,10 +4204,12 @@ static vm_fault_t cifs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct file *file = vmf->vma->vm_file; - struct inode *inode = file_inode(file); - cifs_fscache_wait_on_page_write(inode, page); +#ifdef CONFIG_CIFS_FSCACHE + if (PageFsCache(page) && + wait_on_page_fscache_killable(page) < 0) + return VM_FAULT_RETRY; +#endif lock_page(page); return VM_FAULT_LOCKED; @@ -4261,8 +4269,6 @@ cifs_readv_complete(struct work_struct *work) for (i = 0; i < rdata->nr_pages; i++) { struct page *page = rdata->pages[i]; - lru_cache_add(page); - if (rdata->result == 0 || (rdata->result == -EAGAIN && got_bytes)) { flush_dcache_page(page); @@ 
-4270,13 +4276,11 @@ cifs_readv_complete(struct work_struct *work) } else SetPageError(page); - unlock_page(page); - if (rdata->result == 0 || (rdata->result == -EAGAIN && got_bytes)) cifs_readpage_to_fscache(rdata->mapping->host, page); - else - cifs_fscache_uncache_page(rdata->mapping->host, page); + + unlock_page(page); got_bytes -= min_t(unsigned int, PAGE_SIZE, got_bytes); @@ -4334,7 +4338,6 @@ readpages_fill_pages(struct TCP_Server_Info *server, * fill them until the writes are flushed. */ zero_user(page, 0, PAGE_SIZE); - lru_cache_add(page); flush_dcache_page(page); SetPageUptodate(page); unlock_page(page); @@ -4344,7 +4347,6 @@ readpages_fill_pages(struct TCP_Server_Info *server, continue; } else { /* no need to hold page hostage */ - lru_cache_add(page); unlock_page(page); put_page(page); rdata->pages[i] = NULL; @@ -4387,92 +4389,20 @@ cifs_readpages_copy_into_pages(struct TCP_Server_Info *server, return readpages_fill_pages(server, rdata, iter, iter->count); } -static int -readpages_get_pages(struct address_space *mapping, struct list_head *page_list, - unsigned int rsize, struct list_head *tmplist, - unsigned int *nr_pages, loff_t *offset, unsigned int *bytes) +static void cifs_readahead(struct readahead_control *ractl) { - struct page *page, *tpage; - unsigned int expected_index; int rc; - gfp_t gfp = readahead_gfp_mask(mapping); - - INIT_LIST_HEAD(tmplist); - - page = lru_to_page(page_list); - - /* - * Lock the page and put it in the cache. Since no one else - * should have access to this page, we're safe to simply set - * PG_locked without checking it first. - */ - __SetPageLocked(page); - rc = add_to_page_cache_locked(page, mapping, - page->index, gfp); - - /* give up if we can't stick it in the cache */ - if (rc) { - __ClearPageLocked(page); - return rc; - } - - /* move first page to the tmplist */ - *offset = (loff_t)page->index << PAGE_SHIFT; - *bytes = PAGE_SIZE; - *nr_pages = 1; - list_move_tail(&page->lru, tmplist); - - /* now try and add more pages onto the request */ - expected_index = page->index + 1; - list_for_each_entry_safe_reverse(page, tpage, page_list, lru) { - /* discontinuity ? */ - if (page->index != expected_index) - break; - - /* would this page push the read over the rsize? */ - if (*bytes + PAGE_SIZE > rsize) - break; - - __SetPageLocked(page); - rc = add_to_page_cache_locked(page, mapping, page->index, gfp); - if (rc) { - __ClearPageLocked(page); - break; - } - list_move_tail(&page->lru, tmplist); - (*bytes) += PAGE_SIZE; - expected_index++; - (*nr_pages)++; - } - return rc; -} - -static int cifs_readpages(struct file *file, struct address_space *mapping, - struct list_head *page_list, unsigned num_pages) -{ - int rc; - int err = 0; - struct list_head tmplist; - struct cifsFileInfo *open_file = file->private_data; - struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); + struct cifsFileInfo *open_file = ractl->file->private_data; + struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(ractl->file); struct TCP_Server_Info *server; pid_t pid; - unsigned int xid; + unsigned int xid, nr_pages, last_batch_size = 0, cache_nr_pages = 0; + pgoff_t next_cached = ULONG_MAX; + bool caching = fscache_cookie_enabled(cifs_inode_cookie(ractl->mapping->host)) && + cifs_inode_cookie(ractl->mapping->host)->cache_priv; + bool check_cache = caching; xid = get_xid(); - /* - * Reads as many pages as possible from fscache. 
Returns -ENOBUFS - * immediately if the cookie is negative - * - * After this point, every page in the list might have PG_fscache set, - * so we will need to clean that up off of every page we don't use. - */ - rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list, - &num_pages); - if (rc == 0) { - free_xid(xid); - return rc; - } if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; @@ -4483,39 +4413,73 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses); cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n", - __func__, file, mapping, num_pages); + __func__, ractl->file, ractl->mapping, readahead_count(ractl)); /* - * Start with the page at end of list and move it to private - * list. Do the same with any following pages until we hit - * the rsize limit, hit an index discontinuity, or run out of - * pages. Issue the async read and then start the loop again - * until the list is empty. - * - * Note that list order is important. The page_list is in - * the order of declining indexes. When we put the pages in - * the rdata->pages, then we want them in increasing order. + * Chop the readahead request up into rsize-sized read requests. */ - while (!list_empty(page_list) && !err) { - unsigned int i, nr_pages, bytes, rsize; - loff_t offset; - struct page *page, *tpage; + while ((nr_pages = readahead_count(ractl) - last_batch_size)) { + unsigned int i, got, rsize; + struct page *page; struct cifs_readdata *rdata; struct cifs_credits credits_on_stack; struct cifs_credits *credits = &credits_on_stack; + pgoff_t index = readahead_index(ractl) + last_batch_size; + + /* + * Find out if we have anything cached in the range of + * interest, and if so, where the next chunk of cached data is. + */ + if (caching) { + if (check_cache) { + rc = cifs_fscache_query_occupancy( + ractl->mapping->host, index, nr_pages, + &next_cached, &cache_nr_pages); + if (rc < 0) + caching = false; + check_cache = false; + } + + if (index == next_cached) { + /* + * TODO: Send a whole batch of pages to be read + * by the cache. + */ + page = readahead_page(ractl); + last_batch_size = 1 << thp_order(page); + if (cifs_readpage_from_fscache(ractl->mapping->host, + page) < 0) { + /* + * TODO: Deal with cache read failure + * here, but for the moment, delegate + * that to readpage. + */ + caching = false; + } + unlock_page(page); + next_cached++; + cache_nr_pages--; + if (cache_nr_pages == 0) + check_cache = true; + continue; + } + } if (open_file->invalidHandle) { rc = cifs_reopen_file(open_file, true); - if (rc == -EAGAIN) - continue; - else if (rc) + if (rc) { + if (rc == -EAGAIN) + continue; break; + } } rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, &rsize, credits); if (rc) break; + nr_pages = min_t(size_t, rsize / PAGE_SIZE, readahead_count(ractl)); + nr_pages = min_t(size_t, nr_pages, next_cached - index); /* * Give up immediately if rsize is too small to read an entire @@ -4523,16 +4487,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, * reach this point however since we set ra_pages to 0 when the * rsize is smaller than a cache page. 
*/ - if (unlikely(rsize < PAGE_SIZE)) { - add_credits_and_wake_if(server, credits, 0); - free_xid(xid); - return 0; - } - - nr_pages = 0; - err = readpages_get_pages(mapping, page_list, rsize, &tmplist, - &nr_pages, &offset, &bytes); - if (!nr_pages) { + if (unlikely(!nr_pages)) { add_credits_and_wake_if(server, credits, 0); break; } @@ -4540,36 +4495,31 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete); if (!rdata) { /* best to give up if we're out of mem */ - list_for_each_entry_safe(page, tpage, &tmplist, lru) { - list_del(&page->lru); - lru_cache_add(page); - unlock_page(page); - put_page(page); - } - rc = -ENOMEM; add_credits_and_wake_if(server, credits, 0); break; } - rdata->cfile = cifsFileInfo_get(open_file); - rdata->server = server; - rdata->mapping = mapping; - rdata->offset = offset; - rdata->bytes = bytes; - rdata->pid = pid; - rdata->pagesz = PAGE_SIZE; - rdata->tailsz = PAGE_SIZE; + got = __readahead_batch(ractl, rdata->pages, nr_pages); + if (got != nr_pages) { + pr_warn("__readahead_batch() returned %u/%u\n", + got, nr_pages); + nr_pages = got; + } + + rdata->nr_pages = nr_pages; + rdata->bytes = readahead_batch_length(ractl); + rdata->cfile = cifsFileInfo_get(open_file); + rdata->server = server; + rdata->mapping = ractl->mapping; + rdata->offset = readahead_pos(ractl); + rdata->pid = pid; + rdata->pagesz = PAGE_SIZE; + rdata->tailsz = PAGE_SIZE; rdata->read_into_pages = cifs_readpages_read_into_pages; rdata->copy_into_pages = cifs_readpages_copy_into_pages; - rdata->credits = credits_on_stack; - - list_for_each_entry_safe(page, tpage, &tmplist, lru) { - list_del(&page->lru); - rdata->pages[rdata->nr_pages++] = page; - } + rdata->credits = credits_on_stack; rc = adjust_credits(server, &rdata->credits, rdata->bytes); - if (!rc) { if (rdata->cfile->invalidHandle) rc = -EAGAIN; @@ -4581,7 +4531,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, add_credits_and_wake_if(server, &rdata->credits, 0); for (i = 0; i < rdata->nr_pages; i++) { page = rdata->pages[i]; - lru_cache_add(page); unlock_page(page); put_page(page); } @@ -4591,15 +4540,10 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, } kref_put(&rdata->refcount, cifs_readdata_release); + last_batch_size = nr_pages; } - /* Any pages that have been shown to fscache but didn't get added to - * the pagecache must be uncached before they get returned to the - * allocator. 
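The new cifs_readahead() above chops the readahead window into rsize-sized server reads and skips over runs of pages that fscache already holds. The following toy walk-through mirrors only that batching arithmetic; all of the numbers (rsize, window size, position of the cached run) are invented for illustration and nothing below is taken from the patch.

#include <stdio.h>

#define DEMO_PAGE_SIZE 4096u

int main(void)
{
	unsigned int rsize = 65536;		/* negotiated read size (assumed) */
	unsigned long index = 100;		/* first page of the readahead window */
	unsigned long remaining = 40;		/* pages requested by readahead */
	unsigned long next_cached = 120;	/* first page fscache holds (assumed) */
	unsigned long cache_nr = 8;		/* length of that cached run (assumed) */

	while (remaining) {
		unsigned long nr;

		if (cache_nr && index == next_cached) {
			/* The real code hands this page to fscache, not the server. */
			printf("page %lu: read from local cache\n", index);
			next_cached++;
			cache_nr--;
			index++;
			remaining--;
			continue;
		}

		nr = rsize / DEMO_PAGE_SIZE;		/* rsize-sized chunks */
		if (nr > remaining)
			nr = remaining;
		if (cache_nr && index < next_cached && nr > next_cached - index)
			nr = next_cached - index;	/* stop at the cached run */

		printf("pages %lu-%lu: issue one async read to the server\n",
		       index, index + nr - 1);
		index += nr;
		remaining -= nr;
	}
	return 0;
}

With these assumed values the loop issues server reads for pages 100-115 and 116-119, pulls 120-127 from the cache one page at a time, and finishes with a single read for 128-139.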
- */ - cifs_fscache_readpages_cancel(mapping->host, page_list); free_xid(xid); - return rc; } /* @@ -4801,17 +4745,19 @@ static int cifs_release_page(struct page *page, gfp_t gfp) { if (PagePrivate(page)) return 0; - - return cifs_fscache_release_page(page, gfp); + if (PageFsCache(page)) { + if (current_is_kswapd() || !(gfp & __GFP_FS)) + return false; + wait_on_page_fscache(page); + } + fscache_note_page_release(cifs_inode_cookie(page->mapping->host)); + return true; } static void cifs_invalidate_page(struct page *page, unsigned int offset, unsigned int length) { - struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host); - - if (offset == 0 && length == PAGE_SIZE) - cifs_fscache_invalidate_page(page, &cifsi->vfs_inode); + wait_on_page_fscache(page); } static int cifs_launder_page(struct page *page) @@ -4831,7 +4777,7 @@ static int cifs_launder_page(struct page *page) if (clear_page_dirty_for_io(page)) rc = cifs_writepage_locked(page, &wbc); - cifs_fscache_invalidate_page(page, page->mapping->host); + wait_on_page_fscache(page); return rc; } @@ -4921,7 +4867,7 @@ oplock_break_done: * In the non-cached mode (mount with cache=none), we shunt off direct read and write requests * so this method should never be called. * - * Direct IO is not yet supported in the cached mode. + * Direct IO is not yet supported in the cached mode. */ static ssize_t cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter) @@ -4988,14 +4934,27 @@ static void cifs_swap_deactivate(struct file *file) /* do we need to unpin (or unlock) the file */ } +/* + * Mark a page as having been made dirty and thus needing writeback. We also + * need to pin the cache object to write back to. + */ +#ifdef CONFIG_CIFS_FSCACHE +static int cifs_set_page_dirty(struct page *page) +{ + return fscache_set_page_dirty(page, cifs_inode_cookie(page->mapping->host)); +} +#else +#define cifs_set_page_dirty __set_page_dirty_nobuffers +#endif + const struct address_space_operations cifs_addr_ops = { .readpage = cifs_readpage, - .readpages = cifs_readpages, + .readahead = cifs_readahead, .writepage = cifs_writepage, .writepages = cifs_writepages, .write_begin = cifs_write_begin, .write_end = cifs_write_end, - .set_page_dirty = __set_page_dirty_nobuffers, + .set_page_dirty = cifs_set_page_dirty, .releasepage = cifs_release_page, .direct_IO = cifs_direct_io, .invalidatepage = cifs_invalidate_page, @@ -5020,7 +4979,7 @@ const struct address_space_operations cifs_addr_ops_smallbuf = { .writepages = cifs_writepages, .write_begin = cifs_write_begin, .write_end = cifs_write_end, - .set_page_dirty = __set_page_dirty_nobuffers, + .set_page_dirty = cifs_set_page_dirty, .releasepage = cifs_release_page, .invalidatepage = cifs_invalidate_page, .launder_page = cifs_launder_page, diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index e3ed25dc6f3f..a92e9eec521f 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -37,6 +37,8 @@ #include "rfc1002pdu.h" #include "fs_context.h" +static DEFINE_MUTEX(cifs_mount_mutex); + static const match_table_t cifs_smb_version_tokens = { { Smb_1, SMB1_VERSION_STRING }, { Smb_20, SMB20_VERSION_STRING}, @@ -147,7 +149,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_u32("echo_interval", Opt_echo_interval), fsparam_u32("max_credits", Opt_max_credits), fsparam_u32("handletimeout", Opt_handletimeout), - fsparam_u32("snapshot", Opt_snapshot), + fsparam_u64("snapshot", Opt_snapshot), fsparam_u32("max_channels", Opt_max_channels), /* Mount options which take string value */ @@ -707,10 +709,14 @@ 
static int smb3_get_tree_common(struct fs_context *fc) static int smb3_get_tree(struct fs_context *fc) { int err = smb3_fs_context_validate(fc); + int ret; if (err) return err; - return smb3_get_tree_common(fc); + mutex_lock(&cifs_mount_mutex); + ret = smb3_get_tree_common(fc); + mutex_unlock(&cifs_mount_mutex); + return ret; } static void smb3_fs_context_free(struct fs_context *fc) @@ -1072,7 +1078,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->echo_interval = result.uint_32; break; case Opt_snapshot: - ctx->snapshot_time = result.uint_32; + ctx->snapshot_time = result.uint_64; break; case Opt_max_credits: if (result.uint_32 < 20 || result.uint_32 > 60000) { diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index 003c5f1f4dfb..33af72e0ac0c 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -12,332 +12,249 @@ #include "cifs_fs_sb.h" #include "cifsproto.h" -/* - * Key layout of CIFS server cache index object - */ -struct cifs_server_key { - __u64 conn_id; -} __packed; - -/* - * Get a cookie for a server object keyed by {IPaddress,port,family} tuple - */ -void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) -{ - struct cifs_server_key key; - - /* - * Check if cookie was already initialized so don't reinitialize it. - * In the future, as we integrate with newer fscache features, - * we may want to instead add a check if cookie has changed - */ - if (server->fscache) - return; - - memset(&key, 0, sizeof(key)); - key.conn_id = server->conn_id; - - server->fscache = - fscache_acquire_cookie(cifs_fscache_netfs.primary_index, - &cifs_fscache_server_index_def, - &key, sizeof(key), - NULL, 0, - server, 0, true); - cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", - __func__, server, server->fscache); -} - -void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) +static void cifs_fscache_fill_volume_coherency( + struct cifs_tcon *tcon, + struct cifs_fscache_volume_coherency_data *cd) { - cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", - __func__, server, server->fscache); - fscache_relinquish_cookie(server->fscache, NULL, false); - server->fscache = NULL; + memset(cd, 0, sizeof(*cd)); + cd->resource_id = cpu_to_le64(tcon->resource_id); + cd->vol_create_time = tcon->vol_create_time; + cd->vol_serial_number = cpu_to_le32(tcon->vol_serial_number); } -void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) +int cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) { + struct cifs_fscache_volume_coherency_data cd; struct TCP_Server_Info *server = tcon->ses->server; + struct fscache_volume *vcookie; + const struct sockaddr *sa = (struct sockaddr *)&server->dstaddr; + size_t slen, i; char *sharename; - struct cifs_fscache_super_auxdata auxdata; + char *key; + int ret = -ENOMEM; + + tcon->fscache = NULL; + switch (sa->sa_family) { + case AF_INET: + case AF_INET6: + break; + default: + cifs_dbg(VFS, "Unknown network family '%d'\n", sa->sa_family); + return -EINVAL; + } - /* - * Check if cookie was already initialized so don't reinitialize it. 
- * In the future, as we integrate with newer fscache features, - * we may want to instead add a check if cookie has changed - */ - if (tcon->fscache) - return; + memset(&key, 0, sizeof(key)); sharename = extract_sharename(tcon->treeName); if (IS_ERR(sharename)) { cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__); - tcon->fscache = NULL; - return; + return -EINVAL; + } + + slen = strlen(sharename); + for (i = 0; i < slen; i++) + if (sharename[i] == '/') + sharename[i] = ';'; + + key = kasprintf(GFP_KERNEL, "cifs,%pISpc,%s", sa, sharename); + if (!key) + goto out; + + cifs_fscache_fill_volume_coherency(tcon, &cd); + vcookie = fscache_acquire_volume(key, + NULL, /* preferred_cache */ + &cd, sizeof(cd)); + cifs_dbg(FYI, "%s: (%s/0x%p)\n", __func__, key, vcookie); + if (IS_ERR(vcookie)) { + if (vcookie != ERR_PTR(-EBUSY)) { + ret = PTR_ERR(vcookie); + goto out_2; + } + pr_err("Cache volume key already in use (%s)\n", key); + vcookie = NULL; } - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.resource_id = tcon->resource_id; - auxdata.vol_create_time = tcon->vol_create_time; - auxdata.vol_serial_number = tcon->vol_serial_number; - - tcon->fscache = - fscache_acquire_cookie(server->fscache, - &cifs_fscache_super_index_def, - sharename, strlen(sharename), - &auxdata, sizeof(auxdata), - tcon, 0, true); + tcon->fscache = vcookie; + ret = 0; +out_2: + kfree(key); +out: kfree(sharename); - cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", - __func__, server->fscache, tcon->fscache); + return ret; } void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) { - struct cifs_fscache_super_auxdata auxdata; - - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.resource_id = tcon->resource_id; - auxdata.vol_create_time = tcon->vol_create_time; - auxdata.vol_serial_number = tcon->vol_serial_number; + struct cifs_fscache_volume_coherency_data cd; cifs_dbg(FYI, "%s: (0x%p)\n", __func__, tcon->fscache); - fscache_relinquish_cookie(tcon->fscache, &auxdata, false); - tcon->fscache = NULL; -} - -static void cifs_fscache_acquire_inode_cookie(struct cifsInodeInfo *cifsi, - struct cifs_tcon *tcon) -{ - struct cifs_fscache_inode_auxdata auxdata; - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.eof = cifsi->server_eof; - auxdata.last_write_time_sec = cifsi->vfs_inode.i_mtime.tv_sec; - auxdata.last_change_time_sec = cifsi->vfs_inode.i_ctime.tv_sec; - auxdata.last_write_time_nsec = cifsi->vfs_inode.i_mtime.tv_nsec; - auxdata.last_change_time_nsec = cifsi->vfs_inode.i_ctime.tv_nsec; - - cifsi->fscache = - fscache_acquire_cookie(tcon->fscache, - &cifs_fscache_inode_object_def, - &cifsi->uniqueid, sizeof(cifsi->uniqueid), - &auxdata, sizeof(auxdata), - cifsi, cifsi->vfs_inode.i_size, true); + cifs_fscache_fill_volume_coherency(tcon, &cd); + fscache_relinquish_volume(tcon->fscache, &cd, false); + tcon->fscache = NULL; } -static void cifs_fscache_enable_inode_cookie(struct inode *inode) +void cifs_fscache_get_inode_cookie(struct inode *inode) { + struct cifs_fscache_inode_coherency_data cd; struct cifsInodeInfo *cifsi = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); - if (cifsi->fscache) - return; - - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)) - return; - - cifs_fscache_acquire_inode_cookie(cifsi, tcon); + cifs_fscache_fill_coherency(&cifsi->vfs_inode, &cd); - cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n", - __func__, tcon->fscache, cifsi->fscache); + cifsi->fscache = + fscache_acquire_cookie(tcon->fscache, 0, + &cifsi->uniqueid, 
sizeof(cifsi->uniqueid), + &cd, sizeof(cd), + i_size_read(&cifsi->vfs_inode)); } -void cifs_fscache_release_inode_cookie(struct inode *inode) +void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update) { - struct cifs_fscache_inode_auxdata auxdata; - struct cifsInodeInfo *cifsi = CIFS_I(inode); - - if (cifsi->fscache) { - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.eof = cifsi->server_eof; - auxdata.last_write_time_sec = cifsi->vfs_inode.i_mtime.tv_sec; - auxdata.last_change_time_sec = cifsi->vfs_inode.i_ctime.tv_sec; - auxdata.last_write_time_nsec = cifsi->vfs_inode.i_mtime.tv_nsec; - auxdata.last_change_time_nsec = cifsi->vfs_inode.i_ctime.tv_nsec; - - cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache); - /* fscache_relinquish_cookie does not seem to update auxdata */ - fscache_update_cookie(cifsi->fscache, &auxdata); - fscache_relinquish_cookie(cifsi->fscache, &auxdata, false); - cifsi->fscache = NULL; + if (update) { + struct cifs_fscache_inode_coherency_data cd; + loff_t i_size = i_size_read(inode); + + cifs_fscache_fill_coherency(inode, &cd); + fscache_unuse_cookie(cifs_inode_cookie(inode), &cd, &i_size); + } else { + fscache_unuse_cookie(cifs_inode_cookie(inode), NULL, NULL); } } -void cifs_fscache_update_inode_cookie(struct inode *inode) +void cifs_fscache_release_inode_cookie(struct inode *inode) { - struct cifs_fscache_inode_auxdata auxdata; struct cifsInodeInfo *cifsi = CIFS_I(inode); if (cifsi->fscache) { - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.eof = cifsi->server_eof; - auxdata.last_write_time_sec = cifsi->vfs_inode.i_mtime.tv_sec; - auxdata.last_change_time_sec = cifsi->vfs_inode.i_ctime.tv_sec; - auxdata.last_write_time_nsec = cifsi->vfs_inode.i_mtime.tv_nsec; - auxdata.last_change_time_nsec = cifsi->vfs_inode.i_ctime.tv_nsec; - cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache); - fscache_update_cookie(cifsi->fscache, &auxdata); - } -} - -void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) -{ - cifs_fscache_enable_inode_cookie(inode); -} - -void cifs_fscache_reset_inode_cookie(struct inode *inode) -{ - struct cifsInodeInfo *cifsi = CIFS_I(inode); - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); - struct fscache_cookie *old = cifsi->fscache; - - if (cifsi->fscache) { - /* retire the current fscache cache and get a new one */ - fscache_relinquish_cookie(cifsi->fscache, NULL, true); - - cifs_fscache_acquire_inode_cookie(cifsi, tcon); - cifs_dbg(FYI, "%s: new cookie 0x%p oldcookie 0x%p\n", - __func__, cifsi->fscache, old); + fscache_relinquish_cookie(cifsi->fscache, false); + cifsi->fscache = NULL; } } -int cifs_fscache_release_page(struct page *page, gfp_t gfp) +static inline void fscache_end_operation(struct netfs_cache_resources *cres) { - if (PageFsCache(page)) { - struct inode *inode = page->mapping->host; - struct cifsInodeInfo *cifsi = CIFS_I(inode); - - cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", - __func__, page, cifsi->fscache); - if (!fscache_maybe_release_page(cifsi->fscache, page, gfp)) - return 0; - } + const struct netfs_cache_ops *ops = fscache_operation_valid(cres); - return 1; -} - -static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx, - int error) -{ - cifs_dbg(FYI, "%s: (0x%p/%d)\n", __func__, page, error); - if (!error) - SetPageUptodate(page); - unlock_page(page); + if (ops) + ops->end_operation(cres); } /* - * Retrieve a page from FS-Cache + * Fallback page reading interface. 
*/ -int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) +static int fscache_fallback_read_page(struct inode *inode, struct page *page) { + struct netfs_cache_resources cres; + struct fscache_cookie *cookie = cifs_inode_cookie(inode); + struct iov_iter iter; + struct bio_vec bvec[1]; int ret; - cifs_dbg(FYI, "%s: (fsc:%p, p:%p, i:0x%p\n", - __func__, CIFS_I(inode)->fscache, page, inode); - ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, page, - cifs_readpage_from_fscache_complete, - NULL, - GFP_KERNEL); - switch (ret) { - - case 0: /* page found in fscache, read submitted */ - cifs_dbg(FYI, "%s: submitted\n", __func__); + memset(&cres, 0, sizeof(cres)); + bvec[0].bv_page = page; + bvec[0].bv_offset = 0; + bvec[0].bv_len = PAGE_SIZE; + iov_iter_bvec(&iter, READ, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + + ret = fscache_begin_read_operation(&cres, cookie); + if (ret < 0) return ret; - case -ENOBUFS: /* page won't be cached */ - case -ENODATA: /* page not in cache */ - cifs_dbg(FYI, "%s: %d\n", __func__, ret); - return 1; - default: - cifs_dbg(VFS, "unknown error ret = %d\n", ret); - } + ret = fscache_read(&cres, page_offset(page), &iter, NETFS_READ_HOLE_FAIL, + NULL, NULL); + fscache_end_operation(&cres); return ret; } /* - * Retrieve a set of pages from FS-Cache + * Fallback page writing interface. */ -int __cifs_readpages_from_fscache(struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) +static int fscache_fallback_write_page(struct inode *inode, struct page *page, + bool no_space_allocated_yet) { + struct netfs_cache_resources cres; + struct fscache_cookie *cookie = cifs_inode_cookie(inode); + struct iov_iter iter; + struct bio_vec bvec[1]; + loff_t start = page_offset(page); + size_t len = PAGE_SIZE; int ret; - cifs_dbg(FYI, "%s: (0x%p/%u/0x%p)\n", - __func__, CIFS_I(inode)->fscache, *nr_pages, inode); - ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping, - pages, nr_pages, - cifs_readpage_from_fscache_complete, - NULL, - mapping_gfp_mask(mapping)); - switch (ret) { - case 0: /* read submitted to the cache for all pages */ - cifs_dbg(FYI, "%s: submitted\n", __func__); - return ret; - - case -ENOBUFS: /* some pages are not cached and can't be */ - case -ENODATA: /* some pages are not cached */ - cifs_dbg(FYI, "%s: no page\n", __func__); - return 1; + memset(&cres, 0, sizeof(cres)); + bvec[0].bv_page = page; + bvec[0].bv_offset = 0; + bvec[0].bv_len = PAGE_SIZE; + iov_iter_bvec(&iter, WRITE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); - default: - cifs_dbg(FYI, "unknown error ret = %d\n", ret); - } + ret = fscache_begin_write_operation(&cres, cookie); + if (ret < 0) + return ret; + ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode), + no_space_allocated_yet); + if (ret == 0) + ret = fscache_write(&cres, page_offset(page), &iter, NULL, NULL); + fscache_end_operation(&cres); return ret; } -void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) +/* + * Retrieve a page from FS-Cache + */ +int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) { - struct cifsInodeInfo *cifsi = CIFS_I(inode); int ret; - WARN_ON(!cifsi->fscache); + cifs_dbg(FYI, "%s: (fsc:%p, p:%p, i:0x%p\n", + __func__, cifs_inode_cookie(inode), page, inode); - cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n", - __func__, cifsi->fscache, page, inode); - ret = fscache_write_page(cifsi->fscache, page, - cifsi->vfs_inode.i_size, GFP_KERNEL); - if (ret != 0) - fscache_uncache_page(cifsi->fscache, 
page); -} + ret = fscache_fallback_read_page(inode, page); + if (ret < 0) + return ret; -void __cifs_fscache_readpages_cancel(struct inode *inode, struct list_head *pages) -{ - cifs_dbg(FYI, "%s: (fsc: %p, i: %p)\n", - __func__, CIFS_I(inode)->fscache, inode); - fscache_readpages_cancel(CIFS_I(inode)->fscache, pages); + /* Read completed synchronously */ + SetPageUptodate(page); + return 0; } -void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode) +void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) { - struct cifsInodeInfo *cifsi = CIFS_I(inode); - struct fscache_cookie *cookie = cifsi->fscache; + cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n", + __func__, cifs_inode_cookie(inode), page, inode); - cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie); - fscache_wait_on_page_write(cookie, page); - fscache_uncache_page(cookie, page); + fscache_fallback_write_page(inode, page, true); } -void __cifs_fscache_wait_on_page_write(struct inode *inode, struct page *page) +/* + * Query the cache occupancy. + */ +int __cifs_fscache_query_occupancy(struct inode *inode, + pgoff_t first, unsigned int nr_pages, + pgoff_t *_data_first, + unsigned int *_data_nr_pages) { - struct cifsInodeInfo *cifsi = CIFS_I(inode); - struct fscache_cookie *cookie = cifsi->fscache; + struct netfs_cache_resources cres; + struct fscache_cookie *cookie = cifs_inode_cookie(inode); + loff_t start, data_start; + size_t len, data_len; + int ret; - cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie); - fscache_wait_on_page_write(cookie, page); -} + ret = fscache_begin_read_operation(&cres, cookie); + if (ret < 0) + return ret; -void __cifs_fscache_uncache_page(struct inode *inode, struct page *page) -{ - struct cifsInodeInfo *cifsi = CIFS_I(inode); - struct fscache_cookie *cookie = cifsi->fscache; + start = first * PAGE_SIZE; + len = nr_pages * PAGE_SIZE; + ret = cres.ops->query_occupancy(&cres, start, len, PAGE_SIZE, + &data_start, &data_len); + if (ret == 0) { + *_data_first = data_start / PAGE_SIZE; + *_data_nr_pages = len / PAGE_SIZE; + } - cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie); - fscache_uncache_page(cookie, page); + fscache_end_operation(&cres); + return ret; } diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h index 9baa1d0f22bd..55129908e2c1 100644 --- a/fs/cifs/fscache.h +++ b/fs/cifs/fscache.h @@ -9,173 +9,154 @@ #ifndef _CIFS_FSCACHE_H #define _CIFS_FSCACHE_H +#include <linux/swap.h> #include <linux/fscache.h> #include "cifsglob.h" -#ifdef CONFIG_CIFS_FSCACHE - /* - * Auxiliary data attached to CIFS superblock within the cache + * Coherency data attached to CIFS volume within the cache */ -struct cifs_fscache_super_auxdata { - u64 resource_id; /* unique server resource id */ +struct cifs_fscache_volume_coherency_data { + __le64 resource_id; /* unique server resource id */ __le64 vol_create_time; - u32 vol_serial_number; + __le32 vol_serial_number; } __packed; /* - * Auxiliary data attached to CIFS inode within the cache + * Coherency data attached to CIFS inode within the cache. 
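The fs/cifs/fscache.c rewrite above follows the volume/cookie lifecycle of the rewritten fscache core: one volume per mount, one data cookie per inode, with a small coherency blob attached to each. A condensed sketch of that lifecycle is shown below using the same calls the patch uses (fscache_acquire_volume, fscache_acquire_cookie, fscache_unuse_cookie, fscache_relinquish_cookie/volume); fscache_use_cookie() is assumed here as the pinning counterpart of fscache_unuse_cookie(). All example_* names and the literal volume key are invented for illustration, and error handling is trimmed.

#include <linux/err.h>
#include <linux/fscache.h>
#include <linux/types.h>

/* Invented stand-in for the cifs coherency blobs defined in this patch. */
struct example_coherency {
	__le64 mtime_sec;
};

static struct fscache_volume *example_volume;
static struct fscache_cookie *example_cookie;

static int example_cache_start(u64 file_id, loff_t i_size,
			       struct example_coherency *cd)
{
	/* One volume per mount; '/' may not appear in the key string. */
	example_volume = fscache_acquire_volume("example,server,share",
						NULL, NULL, 0);
	if (IS_ERR(example_volume))
		return PTR_ERR(example_volume);

	/*
	 * One data cookie per inode, keyed by a unique id and carrying the
	 * coherency data and current size.  May be NULL if caching is off;
	 * the fscache helpers tolerate a NULL cookie.
	 */
	example_cookie = fscache_acquire_cookie(example_volume, 0,
						&file_id, sizeof(file_id),
						cd, sizeof(*cd), i_size);

	/* Pin the cookie for I/O while the file is in use. */
	fscache_use_cookie(example_cookie, true);
	return 0;
}

static void example_cache_stop(struct example_coherency *cd, loff_t i_size)
{
	/* Unpin, writing back the current coherency data and size ... */
	fscache_unuse_cookie(example_cookie, cd, &i_size);
	/* ... then drop the cookie and the volume at teardown. */
	fscache_relinquish_cookie(example_cookie, false);
	fscache_relinquish_volume(example_volume, cd, false);
}

The coherency blob plays the role of the old auxdata: the cache compares it on the next bind to decide whether previously cached data is still valid for the object.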
*/ -struct cifs_fscache_inode_auxdata { - u64 last_write_time_sec; - u64 last_change_time_sec; - u32 last_write_time_nsec; - u32 last_change_time_nsec; - u64 eof; +struct cifs_fscache_inode_coherency_data { + __le64 last_write_time_sec; + __le64 last_change_time_sec; + __le32 last_write_time_nsec; + __le32 last_change_time_nsec; }; -/* - * cache.c - */ -extern struct fscache_netfs cifs_fscache_netfs; -extern const struct fscache_cookie_def cifs_fscache_server_index_def; -extern const struct fscache_cookie_def cifs_fscache_super_index_def; -extern const struct fscache_cookie_def cifs_fscache_inode_object_def; - -extern int cifs_fscache_register(void); -extern void cifs_fscache_unregister(void); +#ifdef CONFIG_CIFS_FSCACHE /* * fscache.c */ -extern void cifs_fscache_get_client_cookie(struct TCP_Server_Info *); -extern void cifs_fscache_release_client_cookie(struct TCP_Server_Info *); -extern void cifs_fscache_get_super_cookie(struct cifs_tcon *); +extern int cifs_fscache_get_super_cookie(struct cifs_tcon *); extern void cifs_fscache_release_super_cookie(struct cifs_tcon *); +extern void cifs_fscache_get_inode_cookie(struct inode *inode); extern void cifs_fscache_release_inode_cookie(struct inode *); -extern void cifs_fscache_update_inode_cookie(struct inode *inode); -extern void cifs_fscache_set_inode_cookie(struct inode *, struct file *); -extern void cifs_fscache_reset_inode_cookie(struct inode *); - -extern void __cifs_fscache_invalidate_page(struct page *, struct inode *); -extern void __cifs_fscache_wait_on_page_write(struct inode *inode, struct page *page); -extern void __cifs_fscache_uncache_page(struct inode *inode, struct page *page); -extern int cifs_fscache_release_page(struct page *page, gfp_t gfp); -extern int __cifs_readpage_from_fscache(struct inode *, struct page *); -extern int __cifs_readpages_from_fscache(struct inode *, - struct address_space *, - struct list_head *, - unsigned *); -extern void __cifs_fscache_readpages_cancel(struct inode *, struct list_head *); - -extern void __cifs_readpage_to_fscache(struct inode *, struct page *); - -static inline void cifs_fscache_invalidate_page(struct page *page, - struct inode *inode) +extern void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update); + +static inline +void cifs_fscache_fill_coherency(struct inode *inode, + struct cifs_fscache_inode_coherency_data *cd) { - if (PageFsCache(page)) - __cifs_fscache_invalidate_page(page, inode); + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + memset(cd, 0, sizeof(*cd)); + cd->last_write_time_sec = cpu_to_le64(cifsi->vfs_inode.i_mtime.tv_sec); + cd->last_write_time_nsec = cpu_to_le32(cifsi->vfs_inode.i_mtime.tv_nsec); + cd->last_change_time_sec = cpu_to_le64(cifsi->vfs_inode.i_ctime.tv_sec); + cd->last_change_time_nsec = cpu_to_le32(cifsi->vfs_inode.i_ctime.tv_nsec); } -static inline void cifs_fscache_wait_on_page_write(struct inode *inode, - struct page *page) + +static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode) { - if (PageFsCache(page)) - __cifs_fscache_wait_on_page_write(inode, page); + return CIFS_I(inode)->fscache; } -static inline void cifs_fscache_uncache_page(struct inode *inode, - struct page *page) +static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags) { - if (PageFsCache(page)) - __cifs_fscache_uncache_page(inode, page); + struct cifs_fscache_inode_coherency_data cd; + + cifs_fscache_fill_coherency(inode, &cd); + fscache_invalidate(cifs_inode_cookie(inode), &cd, + i_size_read(inode), flags); } -static 
inline int cifs_readpage_from_fscache(struct inode *inode, - struct page *page) -{ - if (CIFS_I(inode)->fscache) - return __cifs_readpage_from_fscache(inode, page); +extern int __cifs_fscache_query_occupancy(struct inode *inode, + pgoff_t first, unsigned int nr_pages, + pgoff_t *_data_first, + unsigned int *_data_nr_pages); - return -ENOBUFS; +static inline int cifs_fscache_query_occupancy(struct inode *inode, + pgoff_t first, unsigned int nr_pages, + pgoff_t *_data_first, + unsigned int *_data_nr_pages) +{ + if (!cifs_inode_cookie(inode)) + return -ENOBUFS; + return __cifs_fscache_query_occupancy(inode, first, nr_pages, + _data_first, _data_nr_pages); } -static inline int cifs_readpages_from_fscache(struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) +extern int __cifs_readpage_from_fscache(struct inode *pinode, struct page *ppage); +extern void __cifs_readpage_to_fscache(struct inode *pinode, struct page *ppage); + + +static inline int cifs_readpage_from_fscache(struct inode *inode, + struct page *page) { - if (CIFS_I(inode)->fscache) - return __cifs_readpages_from_fscache(inode, mapping, pages, - nr_pages); + if (cifs_inode_cookie(inode)) + return __cifs_readpage_from_fscache(inode, page); return -ENOBUFS; } static inline void cifs_readpage_to_fscache(struct inode *inode, struct page *page) { - if (PageFsCache(page)) + if (cifs_inode_cookie(inode)) __cifs_readpage_to_fscache(inode, page); } -static inline void cifs_fscache_readpages_cancel(struct inode *inode, - struct list_head *pages) +static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp) { - if (CIFS_I(inode)->fscache) - return __cifs_fscache_readpages_cancel(inode, pages); + if (PageFsCache(page)) { + if (current_is_kswapd() || !(gfp & __GFP_FS)) + return false; + wait_on_page_fscache(page); + fscache_note_page_release(cifs_inode_cookie(page->mapping->host)); + } + return true; } #else /* CONFIG_CIFS_FSCACHE */ -static inline int cifs_fscache_register(void) { return 0; } -static inline void cifs_fscache_unregister(void) {} - -static inline void -cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) {} -static inline void -cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) {} -static inline void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) {} -static inline void -cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) {} - -static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {} -static inline void cifs_fscache_update_inode_cookie(struct inode *inode) {} -static inline void cifs_fscache_set_inode_cookie(struct inode *inode, - struct file *filp) {} -static inline void cifs_fscache_reset_inode_cookie(struct inode *inode) {} -static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp) +static inline +void cifs_fscache_fill_coherency(struct inode *inode, + struct cifs_fscache_inode_coherency_data *cd) { - return 1; /* May release page */ } -static inline void cifs_fscache_invalidate_page(struct page *page, - struct inode *inode) {} -static inline void cifs_fscache_wait_on_page_write(struct inode *inode, - struct page *page) {} -static inline void cifs_fscache_uncache_page(struct inode *inode, - struct page *page) {} +static inline int cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) { return 0; } +static inline void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) {} -static inline int -cifs_readpage_from_fscache(struct inode *inode, struct page *page) +static inline void 
cifs_fscache_get_inode_cookie(struct inode *inode) {} +static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {} +static inline void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update) {} +static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode) { return NULL; } +static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags) {} + +static inline int cifs_fscache_query_occupancy(struct inode *inode, + pgoff_t first, unsigned int nr_pages, + pgoff_t *_data_first, + unsigned int *_data_nr_pages) { + *_data_first = ULONG_MAX; + *_data_nr_pages = 0; return -ENOBUFS; } -static inline int cifs_readpages_from_fscache(struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) +static inline int +cifs_readpage_from_fscache(struct inode *inode, struct page *page) { return -ENOBUFS; } -static inline void cifs_readpage_to_fscache(struct inode *inode, - struct page *page) {} +static inline +void cifs_readpage_to_fscache(struct inode *inode, struct page *page) {} -static inline void cifs_fscache_readpages_cancel(struct inode *inode, - struct list_head *pages) +static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp) { + return true; /* May release page */ } #endif /* CONFIG_CIFS_FSCACHE */ diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 279622e4eb1c..60d853c92f6a 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -83,6 +83,7 @@ static void cifs_set_ops(struct inode *inode) static void cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) { + struct cifs_fscache_inode_coherency_data cd; struct cifsInodeInfo *cifs_i = CIFS_I(inode); cifs_dbg(FYI, "%s: revalidating inode %llu\n", @@ -113,6 +114,9 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) cifs_dbg(FYI, "%s: invalidating inode %llu mapping\n", __func__, cifs_i->uniqueid); set_bit(CIFS_INO_INVALID_MAPPING, &cifs_i->flags); + /* Invalidate fscache cookie */ + cifs_fscache_fill_coherency(&cifs_i->vfs_inode, &cd); + fscache_invalidate(cifs_inode_cookie(inode), &cd, i_size_read(inode), 0); } /* @@ -952,6 +956,12 @@ cifs_get_inode_info(struct inode **inode, rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path, tmp_data, &adjust_tz, &is_reparse_point); +#ifdef CONFIG_CIFS_DFS_UPCALL + if (rc == -ENOENT && is_tcon_dfs(tcon)) + rc = cifs_dfs_query_info_nonascii_quirk(xid, tcon, + cifs_sb, + full_path); +#endif data = tmp_data; } @@ -1298,10 +1308,7 @@ retry_iget5_locked: inode->i_flags |= S_NOATIME | S_NOCMTIME; if (inode->i_state & I_NEW) { inode->i_ino = hash; -#ifdef CONFIG_CIFS_FSCACHE - /* initialize per-inode cache cookie pointer */ - CIFS_I(inode)->fscache = NULL; -#endif + cifs_fscache_get_inode_cookie(inode); unlock_new_inode(inode); } } @@ -1370,6 +1377,7 @@ iget_no_retry: iget_failed(inode); inode = ERR_PTR(rc); } + out: kfree(path); free_xid(xid); @@ -2266,7 +2274,6 @@ cifs_invalidate_mapping(struct inode *inode) __func__, inode); } - cifs_fscache_reset_inode_cookie(inode); return rc; } @@ -2771,8 +2778,10 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) goto out; if ((attrs->ia_valid & ATTR_SIZE) && - attrs->ia_size != i_size_read(inode)) + attrs->ia_size != i_size_read(inode)) { truncate_setsize(inode, attrs->ia_size); + fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size); + } setattr_copy(&init_user_ns, inode, attrs); mark_inode_dirty(inode); @@ -2967,8 +2976,10 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) 
goto cifs_setattr_exit; if ((attrs->ia_valid & ATTR_SIZE) && - attrs->ia_size != i_size_read(inode)) + attrs->ia_size != i_size_read(inode)) { truncate_setsize(inode, attrs->ia_size); + fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size); + } setattr_copy(&init_user_ns, inode, attrs); mark_inode_dirty(inode); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 5148d48d6a35..56598f7dbe00 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -1302,4 +1302,53 @@ int cifs_update_super_prepath(struct cifs_sb_info *cifs_sb, char *prefix) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; return 0; } + +/** cifs_dfs_query_info_nonascii_quirk + * Handle weird Windows SMB server behaviour. It responds with + * STATUS_OBJECT_NAME_INVALID code to SMB2 QUERY_INFO request + * for "\<server>\<dfsname>\<linkpath>" DFS reference, + * where <dfsname> contains non-ASCII unicode symbols. + * + * Check such DFS reference and emulate -ENOENT if it is actual. + */ +int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *linkpath) +{ + char *treename, *dfspath, sep; + int treenamelen, linkpathlen, rc; + + treename = tcon->treeName; + /* MS-DFSC: All paths in REQ_GET_DFS_REFERRAL and RESP_GET_DFS_REFERRAL + * messages MUST be encoded with exactly one leading backslash, not two + * leading backslashes. + */ + sep = CIFS_DIR_SEP(cifs_sb); + if (treename[0] == sep && treename[1] == sep) + treename++; + linkpathlen = strlen(linkpath); + treenamelen = strnlen(treename, MAX_TREE_SIZE + 1); + dfspath = kzalloc(treenamelen + linkpathlen + 1, GFP_KERNEL); + if (!dfspath) + return -ENOMEM; + if (treenamelen) + memcpy(dfspath, treename, treenamelen); + memcpy(dfspath + treenamelen, linkpath, linkpathlen); + rc = dfs_cache_find(xid, tcon->ses, cifs_sb->local_nls, + cifs_remap(cifs_sb), dfspath, NULL, NULL); + if (rc == 0) { + cifs_dbg(FYI, "DFS ref '%s' is found, emulate -EREMOTE\n", + dfspath); + rc = -EREMOTE; + } else if (rc == -EEXIST) { + cifs_dbg(FYI, "DFS ref '%s' is not found, emulate -ENOENT\n", + dfspath); + rc = -ENOENT; + } else { + cifs_dbg(FYI, "%s: dfs_cache_find returned %d\n", __func__, rc); + } + kfree(dfspath); + return rc; +} #endif diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index fa9fbd6a819c..ebe236b9d9f5 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -896,10 +896,7 @@ map_and_check_smb_error(struct mid_q_entry *mid, bool logErr) if (class == ERRSRV && code == ERRbaduid) { cifs_dbg(FYI, "Server returned 0x%x, reconnecting session...\n", code); - spin_lock(&GlobalMid_Lock); - if (mid->server->tcpStatus != CifsExiting) - mid->server->tcpStatus = CifsNeedReconnect; - spin_unlock(&GlobalMid_Lock); + cifs_reconnect(mid->server, false); } } diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h index fe707f45da89..298458404252 100644 --- a/fs/cifs/ntlmssp.h +++ b/fs/cifs/ntlmssp.h @@ -40,7 +40,7 @@ #define NTLMSSP_REQUEST_NON_NT_KEY 0x400000 #define NTLMSSP_NEGOTIATE_TARGET_INFO 0x800000 /* #define reserved4 0x1000000 */ -#define NTLMSSP_NEGOTIATE_VERSION 0x2000000 /* we do not set */ +#define NTLMSSP_NEGOTIATE_VERSION 0x2000000 /* we only set for SMB2+ */ /* #define reserved3 0x4000000 */ /* #define reserved2 0x8000000 */ /* #define reserved1 0x10000000 */ @@ -87,6 +87,30 @@ typedef struct _NEGOTIATE_MESSAGE { /* followed by WorkstationString */ } __attribute__((packed)) NEGOTIATE_MESSAGE, *PNEGOTIATE_MESSAGE; +#define NTLMSSP_REVISION_W2K3 0x0F + +/* See MS-NLMP section 2.2.2.10 */ +struct 
ntlmssp_version { + __u8 ProductMajorVersion; + __u8 ProductMinorVersion; + __le16 ProductBuild; /* we send the cifs.ko module version here */ + __u8 Reserved[3]; + __u8 NTLMRevisionCurrent; /* currently 0x0F */ +} __packed; + +/* see MS-NLMP section 2.2.1.1 */ +struct negotiate_message { + __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; + __le32 MessageType; /* NtLmNegotiate = 1 */ + __le32 NegotiateFlags; + SECURITY_BUFFER DomainName; /* RFC 1001 style and ASCII */ + SECURITY_BUFFER WorkstationName; /* RFC 1001 and ASCII */ + struct ntlmssp_version Version; + /* SECURITY_BUFFER */ + char DomainString[0]; + /* followed by WorkstationString */ +} __packed; + typedef struct _CHALLENGE_MESSAGE { __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; __le32 MessageType; /* NtLmChallenge = 2 */ @@ -121,7 +145,13 @@ typedef struct _AUTHENTICATE_MESSAGE { int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, struct cifs_ses *ses); int build_ntlmssp_negotiate_blob(unsigned char **pbuffer, u16 *buflen, struct cifs_ses *ses, + struct TCP_Server_Info *server, + const struct nls_table *nls_cp); +int build_ntlmssp_smb3_negotiate_blob(unsigned char **pbuffer, u16 *buflen, + struct cifs_ses *ses, + struct TCP_Server_Info *server, const struct nls_table *nls_cp); int build_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen, struct cifs_ses *ses, + struct TCP_Server_Info *server, const struct nls_table *nls_cp); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 035dc3e245dc..32f478c7a66d 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -17,6 +17,8 @@ #include "nterr.h" #include <linux/utsname.h> #include <linux/slab.h> +#include <linux/version.h> +#include "cifsfs.h" #include "cifs_spnego.h" #include "smb2proto.h" #include "fs_context.h" @@ -65,6 +67,55 @@ bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface) return false; } +/* channel helper functions. assumed that chan_lock is held by caller. 
*/ + +unsigned int +cifs_ses_get_chan_index(struct cifs_ses *ses, + struct TCP_Server_Info *server) +{ + unsigned int i; + + for (i = 0; i < ses->chan_count; i++) { + if (ses->chans[i].server == server) + return i; + } + + /* If we didn't find the channel, it is likely a bug */ + WARN_ON(1); + return 0; +} + +void +cifs_chan_set_need_reconnect(struct cifs_ses *ses, + struct TCP_Server_Info *server) +{ + unsigned int chan_index = cifs_ses_get_chan_index(ses, server); + + set_bit(chan_index, &ses->chans_need_reconnect); + cifs_dbg(FYI, "Set reconnect bitmask for chan %u; now 0x%lx\n", + chan_index, ses->chans_need_reconnect); +} + +void +cifs_chan_clear_need_reconnect(struct cifs_ses *ses, + struct TCP_Server_Info *server) +{ + unsigned int chan_index = cifs_ses_get_chan_index(ses, server); + + clear_bit(chan_index, &ses->chans_need_reconnect); + cifs_dbg(FYI, "Cleared reconnect bitmask for chan %u; now 0x%lx\n", + chan_index, ses->chans_need_reconnect); +} + +bool +cifs_chan_needs_reconnect(struct cifs_ses *ses, + struct TCP_Server_Info *server) +{ + unsigned int chan_index = cifs_ses_get_chan_index(ses, server); + + return CIFS_CHAN_NEEDS_RECONNECT(ses, chan_index); +} + /* returns number of channels added */ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) { @@ -76,21 +127,22 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) struct cifs_server_iface *ifaces = NULL; size_t iface_count; - if (ses->server->dialect < SMB30_PROT_ID) { - cifs_dbg(VFS, "multichannel is not supported on this protocol version, use 3.0 or above\n"); - return 0; - } - spin_lock(&ses->chan_lock); new_chan_count = old_chan_count = ses->chan_count; left = ses->chan_max - ses->chan_count; if (left <= 0) { + spin_unlock(&ses->chan_lock); cifs_dbg(FYI, "ses already at max_channels (%zu), nothing to open\n", ses->chan_max); + return 0; + } + + if (ses->server->dialect < SMB30_PROT_ID) { spin_unlock(&ses->chan_lock); + cifs_dbg(VFS, "multichannel is not supported on this protocol version, use 3.0 or above\n"); return 0; } @@ -261,9 +313,8 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, chan_server = cifs_get_tcp_session(&ctx, ses->server); - mutex_lock(&ses->session_mutex); spin_lock(&ses->chan_lock); - chan = ses->binding_chan = &ses->chans[ses->chan_count]; + chan = &ses->chans[ses->chan_count]; chan->server = chan_server; if (IS_ERR(chan->server)) { rc = PTR_ERR(chan->server); @@ -271,8 +322,15 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, spin_unlock(&ses->chan_lock); goto out; } + ses->chan_count++; + atomic_set(&ses->chan_seq, 0); + + /* Mark this channel as needing connect/setup */ + cifs_chan_set_need_reconnect(ses, chan->server); + spin_unlock(&ses->chan_lock); + mutex_lock(&ses->session_mutex); /* * We need to allocate the server crypto now as we will need * to sign packets before we generate the channel signing key @@ -281,37 +339,29 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, rc = smb311_crypto_shash_allocate(chan->server); if (rc) { cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__); + mutex_unlock(&ses->session_mutex); goto out; } - ses->binding = true; - rc = cifs_negotiate_protocol(xid, ses); - if (rc) - goto out; - - rc = cifs_setup_session(xid, ses, cifs_sb->local_nls); - if (rc) - goto out; + rc = cifs_negotiate_protocol(xid, ses, chan->server); + if (!rc) + rc = cifs_setup_session(xid, ses, chan->server, cifs_sb->local_nls); - /* success, put it on 
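The cifs_chan_*_need_reconnect() helpers introduced above reduce to standard bitmap operations on ses->chans_need_reconnect, one bit per channel index, queried and updated under chan_lock. A minimal model of the same idea follows; the example_* names are hypothetical and not part of the patch.

#include <linux/bitops.h>
#include <linux/types.h>

/* Hypothetical miniature of the per-channel reconnect bookkeeping. */
struct example_session {
	unsigned long	chans_need_reconnect;	/* one bit per channel index */
	unsigned int	chan_count;
};

static void example_chan_set_need_reconnect(struct example_session *ses,
					    unsigned int chan_index)
{
	set_bit(chan_index, &ses->chans_need_reconnect);
}

static bool example_chan_needs_reconnect(struct example_session *ses,
					 unsigned int chan_index)
{
	return test_bit(chan_index, &ses->chans_need_reconnect);
}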
the list - * XXX: sharing ses between 2 tcp servers is not possible, the - * way "internal" linked lists works in linux makes element - * only able to belong to one list - * - * the binding session is already established so the rest of - * the code should be able to look it up, no need to add the - * ses to the new server. - */ - - spin_lock(&ses->chan_lock); - ses->chan_count++; - atomic_set(&ses->chan_seq, 0); - spin_unlock(&ses->chan_lock); + mutex_unlock(&ses->session_mutex); out: - ses->binding = false; - ses->binding_chan = NULL; - mutex_unlock(&ses->session_mutex); + if (rc && chan->server) { + spin_lock(&ses->chan_lock); + /* we rely on all bits beyond chan_count to be clear */ + cifs_chan_clear_need_reconnect(ses, chan->server); + ses->chan_count--; + /* + * chan_count should never reach 0 as at least the primary + * channel is always allocated + */ + WARN_ON(ses->chan_count < 1); + spin_unlock(&ses->chan_lock); + } if (rc && chan->server) cifs_put_tcp_session(chan->server, 0); @@ -319,20 +369,9 @@ out: return rc; } -/* Mark all session channels for reconnect */ -void cifs_ses_mark_for_reconnect(struct cifs_ses *ses) -{ - int i; - - for (i = 0; i < ses->chan_count; i++) { - spin_lock(&GlobalMid_Lock); - if (ses->chans[i].server->tcpStatus != CifsExiting) - ses->chans[i].server->tcpStatus = CifsNeedReconnect; - spin_unlock(&GlobalMid_Lock); - } -} - -static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) +static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, + struct TCP_Server_Info *server, + SESSION_SETUP_ANDX *pSMB) { __u32 capabilities = 0; @@ -345,7 +384,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) pSMB->req.MaxBufferSize = cpu_to_le16(min_t(u32, CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4, USHRT_MAX)); - pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); + pSMB->req.MaxMpxCount = cpu_to_le16(server->maxReq); pSMB->req.VcNumber = cpu_to_le16(1); /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ @@ -356,7 +395,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS | CAP_LARGE_WRITE_X | CAP_LARGE_READ_X; - if (ses->server->sign) + if (server->sign) pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; if (ses->capabilities & CAP_UNICODE) { @@ -675,7 +714,11 @@ static int size_of_ntlmssp_blob(struct cifs_ses *ses, int base_size) else sz += sizeof(__le16); - sz += sizeof(__le16) * strnlen(ses->workstation_name, CIFS_MAX_WORKSTATION_LEN); + if (ses->workstation_name) + sz += sizeof(__le16) * strnlen(ses->workstation_name, + CIFS_MAX_WORKSTATION_LEN); + else + sz += sizeof(__le16); return sz; } @@ -719,10 +762,10 @@ static inline void cifs_security_buffer_from_str(SECURITY_BUFFER *pbuf, int build_ntlmssp_negotiate_blob(unsigned char **pbuffer, u16 *buflen, struct cifs_ses *ses, + struct TCP_Server_Info *server, const struct nls_table *nls_cp) { int rc = 0; - struct TCP_Server_Info *server = cifs_ses_server(ses); NEGOTIATE_MESSAGE *sec_blob; __u32 flags; unsigned char *tmp; @@ -773,9 +816,78 @@ setup_ntlm_neg_ret: return rc; } +/* + * Build ntlmssp blob with additional fields, such as version, + * supported by modern servers. For safety limit to SMB3 or later + * See notes in MS-NLMP Section 2.2.2.1 e.g. 
+ */ +int build_ntlmssp_smb3_negotiate_blob(unsigned char **pbuffer, + u16 *buflen, + struct cifs_ses *ses, + struct TCP_Server_Info *server, + const struct nls_table *nls_cp) +{ + int rc = 0; + struct negotiate_message *sec_blob; + __u32 flags; + unsigned char *tmp; + int len; + + len = size_of_ntlmssp_blob(ses, sizeof(struct negotiate_message)); + *pbuffer = kmalloc(len, GFP_KERNEL); + if (!*pbuffer) { + rc = -ENOMEM; + cifs_dbg(VFS, "Error %d during NTLMSSP allocation\n", rc); + *buflen = 0; + goto setup_ntlm_smb3_neg_ret; + } + sec_blob = (struct negotiate_message *)*pbuffer; + + memset(*pbuffer, 0, sizeof(struct negotiate_message)); + memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); + sec_blob->MessageType = NtLmNegotiate; + + /* BB is NTLMV2 session security format easier to use here? */ + flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | + NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | + NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC | + NTLMSSP_NEGOTIATE_ALWAYS_SIGN | NTLMSSP_NEGOTIATE_SEAL | + NTLMSSP_NEGOTIATE_SIGN | NTLMSSP_NEGOTIATE_VERSION; + if (!server->session_estab || ses->ntlmssp->sesskey_per_smbsess) + flags |= NTLMSSP_NEGOTIATE_KEY_XCH; + + sec_blob->Version.ProductMajorVersion = LINUX_VERSION_MAJOR; + sec_blob->Version.ProductMinorVersion = LINUX_VERSION_PATCHLEVEL; + sec_blob->Version.ProductBuild = cpu_to_le16(SMB3_PRODUCT_BUILD); + sec_blob->Version.NTLMRevisionCurrent = NTLMSSP_REVISION_W2K3; + + tmp = *pbuffer + sizeof(struct negotiate_message); + ses->ntlmssp->client_flags = flags; + sec_blob->NegotiateFlags = cpu_to_le32(flags); + + /* these fields should be null in negotiate phase MS-NLMP 3.1.5.1.1 */ + cifs_security_buffer_from_str(&sec_blob->DomainName, + NULL, + CIFS_MAX_DOMAINNAME_LEN, + *pbuffer, &tmp, + nls_cp); + + cifs_security_buffer_from_str(&sec_blob->WorkstationName, + NULL, + CIFS_MAX_WORKSTATION_LEN, + *pbuffer, &tmp, + nls_cp); + + *buflen = tmp - *pbuffer; +setup_ntlm_smb3_neg_ret: + return rc; +} + + int build_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen, struct cifs_ses *ses, + struct TCP_Server_Info *server, const struct nls_table *nls_cp) { int rc; @@ -912,6 +1024,7 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) struct sess_data { unsigned int xid; struct cifs_ses *ses; + struct TCP_Server_Info *server; struct nls_table *nls_cp; void (*func)(struct sess_data *); int result; @@ -978,31 +1091,27 @@ static int sess_establish_session(struct sess_data *sess_data) { struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = sess_data->server; - mutex_lock(&ses->server->srv_mutex); - if (!ses->server->session_estab) { - if (ses->server->sign) { - ses->server->session_key.response = + mutex_lock(&server->srv_mutex); + if (!server->session_estab) { + if (server->sign) { + server->session_key.response = kmemdup(ses->auth_key.response, ses->auth_key.len, GFP_KERNEL); - if (!ses->server->session_key.response) { - mutex_unlock(&ses->server->srv_mutex); + if (!server->session_key.response) { + mutex_unlock(&server->srv_mutex); return -ENOMEM; } - ses->server->session_key.len = + server->session_key.len = ses->auth_key.len; } - ses->server->sequence_number = 0x2; - ses->server->session_estab = true; + server->sequence_number = 0x2; + server->session_estab = true; } - mutex_unlock(&ses->server->srv_mutex); + mutex_unlock(&server->srv_mutex); cifs_dbg(FYI, "CIFS session established successfully\n"); - spin_lock(&GlobalMid_Lock); - ses->status = CifsGood; - ses->need_reconnect 
= false; - spin_unlock(&GlobalMid_Lock); - return 0; } @@ -1036,6 +1145,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data) SESSION_SETUP_ANDX *pSMB; char *bcc_ptr; struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = sess_data->server; __u32 capabilities; __u16 bytes_remaining; @@ -1047,7 +1157,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data) pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; bcc_ptr = sess_data->iov[2].iov_base; - capabilities = cifs_ssetup_hdr(ses, pSMB); + capabilities = cifs_ssetup_hdr(ses, server, pSMB); pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); @@ -1145,6 +1255,7 @@ sess_auth_kerberos(struct sess_data *sess_data) SESSION_SETUP_ANDX *pSMB; char *bcc_ptr; struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = sess_data->server; __u32 capabilities; __u16 bytes_remaining; struct key *spnego_key = NULL; @@ -1159,9 +1270,9 @@ sess_auth_kerberos(struct sess_data *sess_data) pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; bcc_ptr = sess_data->iov[2].iov_base; - capabilities = cifs_ssetup_hdr(ses, pSMB); + capabilities = cifs_ssetup_hdr(ses, server, pSMB); - spnego_key = cifs_get_spnego_key(ses); + spnego_key = cifs_get_spnego_key(ses, server); if (IS_ERR(spnego_key)) { rc = PTR_ERR(spnego_key); spnego_key = NULL; @@ -1285,12 +1396,13 @@ _sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data) { SESSION_SETUP_ANDX *pSMB; struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = sess_data->server; __u32 capabilities; char *bcc_ptr; pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; - capabilities = cifs_ssetup_hdr(ses, pSMB); + capabilities = cifs_ssetup_hdr(ses, server, pSMB); if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { cifs_dbg(VFS, "NTLMSSP requires Unicode support\n"); return -ENOSYS; @@ -1324,6 +1436,7 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) struct smb_hdr *smb_buf; SESSION_SETUP_ANDX *pSMB; struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = sess_data->server; __u16 bytes_remaining; char *bcc_ptr; unsigned char *ntlmsspblob = NULL; @@ -1351,10 +1464,10 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) /* Build security blob before we assemble the request */ rc = build_ntlmssp_negotiate_blob(&ntlmsspblob, - &blob_len, ses, + &blob_len, ses, server, sess_data->nls_cp); if (rc) - goto out; + goto out_free_ntlmsspblob; sess_data->iov[1].iov_len = blob_len; sess_data->iov[1].iov_base = ntlmsspblob; @@ -1362,7 +1475,7 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) rc = _sess_auth_rawntlmssp_assemble_req(sess_data); if (rc) - goto out; + goto out_free_ntlmsspblob; rc = sess_sendreceive(sess_data); @@ -1376,14 +1489,14 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) rc = 0; if (rc) - goto out; + goto out_free_ntlmsspblob; cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n"); if (smb_buf->WordCount != 4) { rc = -EIO; cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); - goto out; + goto out_free_ntlmsspblob; } ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ @@ -1397,10 +1510,13 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) cifs_dbg(VFS, "bad security blob length %d\n", blob_len); rc = -EINVAL; - goto out; + goto out_free_ntlmsspblob; } rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses); + +out_free_ntlmsspblob: + kfree(ntlmsspblob); out: sess_free_buffer(sess_data); @@ -1426,6 +1542,7 @@ sess_auth_rawntlmssp_authenticate(struct 
sess_data *sess_data) struct smb_hdr *smb_buf; SESSION_SETUP_ANDX *pSMB; struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = sess_data->server; __u16 bytes_remaining; char *bcc_ptr; unsigned char *ntlmsspblob = NULL; @@ -1442,7 +1559,8 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; smb_buf = (struct smb_hdr *)pSMB; rc = build_ntlmssp_auth_blob(&ntlmsspblob, - &blob_len, ses, sess_data->nls_cp); + &blob_len, ses, server, + sess_data->nls_cp); if (rc) goto out_free_ntlmsspblob; sess_data->iov[1].iov_len = blob_len; @@ -1513,7 +1631,7 @@ out_free_ntlmsspblob: out: sess_free_buffer(sess_data); - if (!rc) + if (!rc) rc = sess_establish_session(sess_data); /* Cleanup */ @@ -1526,11 +1644,13 @@ out: sess_data->result = rc; } -static int select_sec(struct cifs_ses *ses, struct sess_data *sess_data) +static int select_sec(struct sess_data *sess_data) { int type; + struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = sess_data->server; - type = cifs_select_sectype(ses->server, ses->sectype); + type = cifs_select_sectype(server, ses->sectype); cifs_dbg(FYI, "sess setup type %d\n", type); if (type == Unspecified) { cifs_dbg(VFS, "Unable to select appropriate authentication method!\n"); @@ -1561,7 +1681,8 @@ static int select_sec(struct cifs_ses *ses, struct sess_data *sess_data) } int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_cp) + struct TCP_Server_Info *server, + const struct nls_table *nls_cp) { int rc = 0; struct sess_data *sess_data; @@ -1575,15 +1696,16 @@ int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, if (!sess_data) return -ENOMEM; - rc = select_sec(ses, sess_data); - if (rc) - goto out; - sess_data->xid = xid; sess_data->ses = ses; + sess_data->server = server; sess_data->buf0_type = CIFS_NO_BUFFER; sess_data->nls_cp = (struct nls_table *) nls_cp; + rc = select_sec(sess_data); + if (rc) + goto out; + while (sess_data->func) sess_data->func(sess_data); diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 3b83839fc2c2..b2fb7bd11936 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -7,6 +7,7 @@ #include <linux/pagemap.h> #include <linux/vfs.h> +#include <uapi/linux/magic.h> #include "cifsglob.h" #include "cifsproto.h" #include "cifs_debug.h" @@ -163,7 +164,7 @@ cifs_get_next_mid(struct TCP_Server_Info *server) { __u64 mid = 0; __u16 last_mid, cur_mid; - bool collision; + bool collision, reconnect = false; spin_lock(&GlobalMid_Lock); @@ -215,7 +216,7 @@ cifs_get_next_mid(struct TCP_Server_Info *server) * an eventual reconnect to clean out the pending_mid_q. 
*/ if (num_mids > 32768) - server->tcpStatus = CifsNeedReconnect; + reconnect = true; if (!collision) { mid = (__u64)cur_mid; @@ -225,6 +226,11 @@ cifs_get_next_mid(struct TCP_Server_Info *server) cur_mid++; } spin_unlock(&GlobalMid_Lock); + + if (reconnect) { + cifs_mark_tcp_ses_conns_for_reconnect(server, false); + } + return mid; } @@ -414,14 +420,16 @@ cifs_need_neg(struct TCP_Server_Info *server) } static int -cifs_negotiate(const unsigned int xid, struct cifs_ses *ses) +cifs_negotiate(const unsigned int xid, + struct cifs_ses *ses, + struct TCP_Server_Info *server) { int rc; - rc = CIFSSMBNegotiate(xid, ses); + rc = CIFSSMBNegotiate(xid, ses, server); if (rc == -EAGAIN) { /* retry only once on 1st time connection */ - set_credits(ses->server, 1); - rc = CIFSSMBNegotiate(xid, ses); + set_credits(server, 1); + rc = CIFSSMBNegotiate(xid, ses, server); if (rc == -EAGAIN) rc = -EHOSTDOWN; } @@ -878,7 +886,7 @@ cifs_queryfs(const unsigned int xid, struct cifs_tcon *tcon, { int rc = -EOPNOTSUPP; - buf->f_type = CIFS_MAGIC_NUMBER; + buf->f_type = CIFS_SUPER_MAGIC; /* * We could add a second check for a QFS Unix capability bit diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index ca692b2283cd..4125fd113cfb 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -13,8 +13,6 @@ #ifndef _SMB2_GLOB_H #define _SMB2_GLOB_H -#define SMB2_MAGIC_NUMBER 0xFE534D42 - /* ***************************************************************** * Constants go here diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index cdcdef32759e..b25623e3fe3d 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -847,16 +847,17 @@ smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *serve * SMB2 header. * * @ses: server session structure + * @server: pointer to server info * @iov: array containing the SMB request we will send to the server * @nvec: number of array entries for the iov */ int -smb311_update_preauth_hash(struct cifs_ses *ses, struct kvec *iov, int nvec) +smb311_update_preauth_hash(struct cifs_ses *ses, struct TCP_Server_Info *server, + struct kvec *iov, int nvec) { int i, rc; struct sdesc *d; struct smb2_hdr *hdr; - struct TCP_Server_Info *server = cifs_ses_server(ses); hdr = (struct smb2_hdr *)iov[0].iov_base; /* neg prot are always taken */ diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index c5b1dea54ebc..af5d0830bc8a 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -13,6 +13,7 @@ #include <linux/sort.h> #include <crypto/aead.h> #include <linux/fiemap.h> +#include <uapi/linux/magic.h> #include "cifsfs.h" #include "cifsglob.h" #include "smb2pdu.h" @@ -121,9 +122,13 @@ smb2_add_credits(struct TCP_Server_Info *server, optype, scredits, add); } + spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus == CifsNeedReconnect - || server->tcpStatus == CifsExiting) + || server->tcpStatus == CifsExiting) { + spin_unlock(&cifs_tcp_ses_lock); return; + } + spin_unlock(&cifs_tcp_ses_lock); switch (rc) { case -1: @@ -208,11 +213,15 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, return rc; spin_lock(&server->req_lock); } else { + spin_unlock(&server->req_lock); + spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus == CifsExiting) { - spin_unlock(&server->req_lock); + spin_unlock(&cifs_tcp_ses_lock); return -ENOENT; } + spin_unlock(&cifs_tcp_ses_lock); + spin_lock(&server->req_lock); scredits = server->credits; /* can deadlock with reopen */ if (scredits <= 8) { @@ -384,14 +393,16 @@ smb2_need_neg(struct TCP_Server_Info *server) } static int 
-smb2_negotiate(const unsigned int xid, struct cifs_ses *ses) +smb2_negotiate(const unsigned int xid, + struct cifs_ses *ses, + struct TCP_Server_Info *server) { int rc; spin_lock(&GlobalMid_Lock); - cifs_ses_server(ses)->CurrentMid = 0; + server->CurrentMid = 0; spin_unlock(&GlobalMid_Lock); - rc = SMB2_negotiate(xid, ses); + rc = SMB2_negotiate(xid, ses, server); /* BB we probably don't need to retry with modern servers */ if (rc == -EAGAIN) rc = -EHOSTDOWN; @@ -2747,7 +2758,7 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon, goto qfs_exit; rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; - buf->f_type = SMB2_MAGIC_NUMBER; + buf->f_type = SMB2_SUPER_MAGIC; info = (struct smb2_fs_full_size_info *)( le16_to_cpu(rsp->OutputBufferOffset) + (char *)rsp); rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset), @@ -2789,7 +2800,7 @@ smb311_queryfs(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB311_posix_qfs_info(xid, tcon, fid.persistent_fid, fid.volatile_fid, buf); - buf->f_type = SMB2_MAGIC_NUMBER; + buf->f_type = SMB2_SUPER_MAGIC; SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); return rc; } @@ -4808,7 +4819,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, if (server->ops->is_session_expired && server->ops->is_session_expired(buf)) { if (!is_offloaded) - cifs_reconnect(server); + cifs_reconnect(server, true); return -1; } @@ -4981,10 +4992,12 @@ static void smb2_decrypt_offload(struct work_struct *work) mid->callback(mid); } else { + spin_lock(&cifs_tcp_ses_lock); spin_lock(&GlobalMid_Lock); if (dw->server->tcpStatus == CifsNeedReconnect) { mid->mid_state = MID_RETRY_NEEDED; spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_tcp_ses_lock); mid->callback(mid); } else { mid->mid_state = MID_REQUEST_SUBMITTED; @@ -4992,6 +5005,7 @@ static void smb2_decrypt_offload(struct work_struct *work) list_add_tail(&mid->qhead, &dw->server->pending_mid_q); spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_tcp_ses_lock); } } cifs_mid_q_entry_release(mid); @@ -5221,13 +5235,13 @@ smb3_receive_transform(struct TCP_Server_Info *server, sizeof(struct smb2_hdr)) { cifs_server_dbg(VFS, "Transform message is too small (%u)\n", pdu_length); - cifs_reconnect(server); + cifs_reconnect(server, true); return -ECONNABORTED; } if (pdu_length < orig_len + sizeof(struct smb2_transform_hdr)) { cifs_server_dbg(VFS, "Transform message is broken\n"); - cifs_reconnect(server); + cifs_reconnect(server, true); return -ECONNABORTED; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 8b3670388cda..7e7909b1ae11 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -162,6 +162,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, if (smb2_command == SMB2_TREE_CONNECT || smb2_command == SMB2_IOCTL) return 0; + spin_lock(&cifs_tcp_ses_lock); if (tcon->tidStatus == CifsExiting) { /* * only tree disconnect, open, and write, @@ -171,11 +172,13 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, if ((smb2_command != SMB2_WRITE) && (smb2_command != SMB2_CREATE) && (smb2_command != SMB2_TREE_DISCONNECT)) { + spin_unlock(&cifs_tcp_ses_lock); cifs_dbg(FYI, "can not send cmd %d while umounting\n", smb2_command); return -ENODEV; } } + spin_unlock(&cifs_tcp_ses_lock); if ((!tcon->ses) || (tcon->ses->status == CifsExiting) || (!tcon->ses->server) || !server) return -EIO; @@ -214,8 +217,12 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, } /* are we still trying to reconnect? 
*/ - if (server->tcpStatus != CifsNeedReconnect) + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus != CifsNeedReconnect) { + spin_unlock(&cifs_tcp_ses_lock); break; + } + spin_unlock(&cifs_tcp_ses_lock); if (retries && --retries) continue; @@ -232,64 +239,74 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, retries = server->nr_targets; } - if (!tcon->ses->need_reconnect && !tcon->need_reconnect) + spin_lock(&ses->chan_lock); + if (!cifs_chan_needs_reconnect(ses, server) && !tcon->need_reconnect) { + spin_unlock(&ses->chan_lock); return 0; + } + spin_unlock(&ses->chan_lock); + cifs_dbg(FYI, "sess reconnect mask: 0x%lx, tcon reconnect: %d", + tcon->ses->chans_need_reconnect, + tcon->need_reconnect); nls_codepage = load_nls_default(); /* - * need to prevent multiple threads trying to simultaneously reconnect - * the same SMB session - */ - mutex_lock(&tcon->ses->session_mutex); - - /* * Recheck after acquire mutex. If another thread is negotiating * and the server never sends an answer the socket will be closed * and tcpStatus set to reconnect. */ + spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus == CifsNeedReconnect) { + spin_unlock(&cifs_tcp_ses_lock); rc = -EHOSTDOWN; - mutex_unlock(&tcon->ses->session_mutex); goto out; } + spin_unlock(&cifs_tcp_ses_lock); /* - * If we are reconnecting an extra channel, bind + * need to prevent multiple threads trying to simultaneously + * reconnect the same SMB session */ - if (CIFS_SERVER_IS_CHAN(server)) { - ses->binding = true; - ses->binding_chan = cifs_ses_find_chan(ses, server); + spin_lock(&ses->chan_lock); + if (!cifs_chan_needs_reconnect(ses, server)) { + spin_unlock(&ses->chan_lock); + + /* this means that we only need to tree connect */ + if (tcon->need_reconnect) + goto skip_sess_setup; + + goto out; } + spin_unlock(&ses->chan_lock); - rc = cifs_negotiate_protocol(0, tcon->ses); - if (!rc && tcon->ses->need_reconnect) { - rc = cifs_setup_session(0, tcon->ses, nls_codepage); + mutex_lock(&ses->session_mutex); + rc = cifs_negotiate_protocol(0, ses, server); + if (!rc) { + rc = cifs_setup_session(0, ses, server, nls_codepage); if ((rc == -EACCES) && !tcon->retry) { + mutex_unlock(&ses->session_mutex); rc = -EHOSTDOWN; - ses->binding = false; - ses->binding_chan = NULL; - mutex_unlock(&tcon->ses->session_mutex); goto failed; } + } else { + mutex_unlock(&ses->session_mutex); + goto out; } - /* - * End of channel binding - */ - ses->binding = false; - ses->binding_chan = NULL; + mutex_unlock(&ses->session_mutex); - if (rc || !tcon->need_reconnect) { - mutex_unlock(&tcon->ses->session_mutex); +skip_sess_setup: + mutex_lock(&ses->session_mutex); + if (!tcon->need_reconnect) { + mutex_unlock(&ses->session_mutex); goto out; } - cifs_mark_open_files_invalid(tcon); if (tcon->use_persistent) tcon->need_reopen_files = true; rc = cifs_tree_connect(0, tcon, nls_codepage); - mutex_unlock(&tcon->ses->session_mutex); + mutex_unlock(&ses->session_mutex); cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); if (rc) { @@ -833,7 +850,9 @@ add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode) */ int -SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) +SMB2_negotiate(const unsigned int xid, + struct cifs_ses *ses, + struct TCP_Server_Info *server) { struct smb_rqst rqst; struct smb2_negotiate_req *req; @@ -842,7 +861,6 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) struct kvec rsp_iov; int rc = 0; int resp_buftype; - struct TCP_Server_Info *server = cifs_ses_server(ses); int blob_offset, blob_length; 
char *security_blob; int flags = CIFS_NEG_OP; @@ -1221,6 +1239,7 @@ smb2_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) struct SMB2_sess_data { unsigned int xid; struct cifs_ses *ses; + struct TCP_Server_Info *server; struct nls_table *nls_cp; void (*func)(struct SMB2_sess_data *); int result; @@ -1242,9 +1261,10 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) { int rc; struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = sess_data->server; struct smb2_sess_setup_req *req; - struct TCP_Server_Info *server = cifs_ses_server(ses); unsigned int total_len; + bool is_binding = false; rc = smb2_plain_req_init(SMB2_SESSION_SETUP, NULL, server, (void **) &req, @@ -1252,11 +1272,16 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) if (rc) return rc; - if (sess_data->ses->binding) { - req->hdr.SessionId = cpu_to_le64(sess_data->ses->Suid); + spin_lock(&ses->chan_lock); + is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); + spin_unlock(&ses->chan_lock); + + if (is_binding) { + req->hdr.SessionId = cpu_to_le64(ses->Suid); req->hdr.Flags |= SMB2_FLAGS_SIGNED; req->PreviousSessionId = 0; req->Flags = SMB2_SESSION_REQ_FLAG_BINDING; + cifs_dbg(FYI, "Binding to sess id: %llx\n", ses->Suid); } else { /* First session, not a reauthenticate */ req->hdr.SessionId = 0; @@ -1266,6 +1291,8 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) */ req->PreviousSessionId = cpu_to_le64(sess_data->previous_session); req->Flags = 0; /* MBZ */ + cifs_dbg(FYI, "Fresh session. Previous: %llx\n", + sess_data->previous_session); } /* enough to enable echos and oplocks and one max size write */ @@ -1325,7 +1352,7 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) /* BB add code to build os and lm fields */ rc = cifs_send_recv(sess_data->xid, sess_data->ses, - cifs_ses_server(sess_data->ses), + sess_data->server, &rqst, &sess_data->buf0_type, CIFS_LOG_ERROR | CIFS_SESS_OP, &rsp_iov); @@ -1340,11 +1367,11 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data) { int rc = 0; struct cifs_ses *ses = sess_data->ses; - struct TCP_Server_Info *server = cifs_ses_server(ses); + struct TCP_Server_Info *server = sess_data->server; mutex_lock(&server->srv_mutex); if (server->ops->generate_signingkey) { - rc = server->ops->generate_signingkey(ses); + rc = server->ops->generate_signingkey(ses, server); if (rc) { cifs_dbg(FYI, "SMB3 session key generation failed\n"); @@ -1359,14 +1386,6 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data) mutex_unlock(&server->srv_mutex); cifs_dbg(FYI, "SMB2/3 session established successfully\n"); - /* keep existing ses state if binding */ - if (!ses->binding) { - spin_lock(&GlobalMid_Lock); - ses->status = CifsGood; - ses->need_reconnect = false; - spin_unlock(&GlobalMid_Lock); - } - return rc; } @@ -1376,15 +1395,17 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) { int rc; struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = sess_data->server; struct cifs_spnego_msg *msg; struct key *spnego_key = NULL; struct smb2_sess_setup_rsp *rsp = NULL; + bool is_binding = false; rc = SMB2_sess_alloc_buffer(sess_data); if (rc) goto out; - spnego_key = cifs_get_spnego_key(ses); + spnego_key = cifs_get_spnego_key(ses, server); if (IS_ERR(spnego_key)) { rc = PTR_ERR(spnego_key); if (rc == -ENOKEY) @@ -1405,8 +1426,12 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) goto out_put_spnego_key; } + spin_lock(&ses->chan_lock); + is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); + 
spin_unlock(&ses->chan_lock); + /* keep session key if binding */ - if (!ses->binding) { + if (!is_binding) { ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, GFP_KERNEL); if (!ses->auth_key.response) { @@ -1427,7 +1452,7 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; /* keep session id and flags if binding */ - if (!ses->binding) { + if (!is_binding) { ses->Suid = le64_to_cpu(rsp->hdr.SessionId); ses->session_flags = le16_to_cpu(rsp->SessionFlags); } @@ -1459,10 +1484,12 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) { int rc; struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = sess_data->server; struct smb2_sess_setup_rsp *rsp = NULL; unsigned char *ntlmssp_blob = NULL; bool use_spnego = false; /* else use raw ntlmssp */ u16 blob_length = 0; + bool is_binding = false; /* * If memory allocation is successful, caller of this function @@ -1479,8 +1506,8 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) if (rc) goto out_err; - rc = build_ntlmssp_negotiate_blob(&ntlmssp_blob, - &blob_length, ses, + rc = build_ntlmssp_smb3_negotiate_blob(&ntlmssp_blob, + &blob_length, ses, server, sess_data->nls_cp); if (rc) goto out_err; @@ -1519,8 +1546,12 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n"); + spin_lock(&ses->chan_lock); + is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); + spin_unlock(&ses->chan_lock); + /* keep existing ses id and flags if binding */ - if (!ses->binding) { + if (!is_binding) { ses->Suid = le64_to_cpu(rsp->hdr.SessionId); ses->session_flags = le16_to_cpu(rsp->SessionFlags); } @@ -1545,11 +1576,13 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) { int rc; struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = sess_data->server; struct smb2_sess_setup_req *req; struct smb2_sess_setup_rsp *rsp = NULL; unsigned char *ntlmssp_blob = NULL; bool use_spnego = false; /* else use raw ntlmssp */ u16 blob_length = 0; + bool is_binding = false; rc = SMB2_sess_alloc_buffer(sess_data); if (rc) @@ -1558,8 +1591,9 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) req = (struct smb2_sess_setup_req *) sess_data->iov[0].iov_base; req->hdr.SessionId = cpu_to_le64(ses->Suid); - rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses, - sess_data->nls_cp); + rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, + ses, server, + sess_data->nls_cp); if (rc) { cifs_dbg(FYI, "build_ntlmssp_auth_blob failed %d\n", rc); goto out; @@ -1580,8 +1614,12 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; + spin_lock(&ses->chan_lock); + is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); + spin_unlock(&ses->chan_lock); + /* keep existing ses id and flags if binding */ - if (!ses->binding) { + if (!is_binding) { ses->Suid = le64_to_cpu(rsp->hdr.SessionId); ses->session_flags = le16_to_cpu(rsp->SessionFlags); } @@ -1612,11 +1650,13 @@ out: } static int -SMB2_select_sec(struct cifs_ses *ses, struct SMB2_sess_data *sess_data) +SMB2_select_sec(struct SMB2_sess_data *sess_data) { int type; + struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = sess_data->server; - type = smb2_select_sectype(cifs_ses_server(ses), ses->sectype); + type = smb2_select_sectype(server, ses->sectype); cifs_dbg(FYI, 
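Editorial note on the session-setup hunks above: the old ses->binding flag is replaced by a check of the per-channel reconnect bitmap under ses->chan_lock; a setup request counts as a channel bind only when at least one other channel is still established, i.e. when not all channels need reconnect. A small self-contained model of that derivation (structure layout and sizes are invented for the example):

    #include <stdbool.h>
    #include <stdio.h>

    struct session {
        unsigned int chan_count;
        unsigned long chans_need_reconnect;  /* bit N set => channel N needs (re)connect */
    };

    static bool chan_needs_reconnect(const struct session *s, unsigned int chan)
    {
        return s->chans_need_reconnect & (1UL << chan);
    }

    static bool all_chans_need_reconnect(const struct session *s)
    {
        unsigned long all = (1UL << s->chan_count) - 1;

        return (s->chans_need_reconnect & all) == all;
    }

    int main(void)
    {
        struct session s = { .chan_count = 3, .chans_need_reconnect = 0x4 };

        /* channel 2 is down but channels 0 and 1 are established, so a
         * session setup issued for channel 2 is a binding request */
        bool is_binding = !all_chans_need_reconnect(&s);

        printf("chan 2 needs reconnect: %d, is_binding: %d\n",
               chan_needs_reconnect(&s, 2), is_binding);
        return 0;
    }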
"sess setup type %d\n", type); if (type == Unspecified) { cifs_dbg(VFS, "Unable to select appropriate authentication method!\n"); @@ -1640,10 +1680,10 @@ SMB2_select_sec(struct cifs_ses *ses, struct SMB2_sess_data *sess_data) int SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, + struct TCP_Server_Info *server, const struct nls_table *nls_cp) { int rc = 0; - struct TCP_Server_Info *server = cifs_ses_server(ses); struct SMB2_sess_data *sess_data; cifs_dbg(FYI, "Session Setup\n"); @@ -1657,15 +1697,17 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, if (!sess_data) return -ENOMEM; - rc = SMB2_select_sec(ses, sess_data); - if (rc) - goto out; sess_data->xid = xid; sess_data->ses = ses; + sess_data->server = server; sess_data->buf0_type = CIFS_NO_BUFFER; sess_data->nls_cp = (struct nls_table *) nls_cp; sess_data->previous_session = ses->Suid; + rc = SMB2_select_sec(sess_data); + if (rc) + goto out; + /* * Initialize the session hash with the server one. */ @@ -1704,8 +1746,12 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) return -EIO; /* no need to send SMB logoff if uid already closed due to reconnect */ - if (ses->need_reconnect) + spin_lock(&ses->chan_lock); + if (CIFS_ALL_CHANS_NEED_RECONNECT(ses)) { + spin_unlock(&ses->chan_lock); goto smb2_session_already_dead; + } + spin_unlock(&ses->chan_lock); rc = smb2_plain_req_init(SMB2_LOGOFF, NULL, ses->server, (void **) &req, &total_len); @@ -1867,8 +1913,6 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, tcon->share_flags = le32_to_cpu(rsp->ShareFlags); tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */ tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess); - tcon->tidStatus = CifsGood; - tcon->need_reconnect = false; tcon->tid = le32_to_cpu(rsp->hdr.Id.SyncId.TreeId); strlcpy(tcon->treeName, tree, sizeof(tcon->treeName)); @@ -1913,8 +1957,13 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) if (!ses || !(ses->server)) return -EIO; - if ((tcon->need_reconnect) || (tcon->ses->need_reconnect)) + spin_lock(&ses->chan_lock); + if ((tcon->need_reconnect) || + (CIFS_ALL_CHANS_NEED_RECONNECT(tcon->ses))) { + spin_unlock(&ses->chan_lock); return 0; + } + spin_unlock(&ses->chan_lock); close_cached_dir_lease(&tcon->crfid); @@ -2527,8 +2576,13 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len, cp = load_nls_default(); cifs_strtoUTF16(*out_path, treename, treename_len, cp); - UniStrcat(*out_path, sep); - UniStrcat(*out_path, path); + + /* Do not append the separator if the path is empty */ + if (path[0] != cpu_to_le16(0x0000)) { + UniStrcat(*out_path, sep); + UniStrcat(*out_path, path); + } + unload_nls(cp); return 0; @@ -3722,27 +3776,35 @@ void smb2_reconnect_server(struct work_struct *work) { struct TCP_Server_Info *server = container_of(work, struct TCP_Server_Info, reconnect.work); - struct cifs_ses *ses; + struct TCP_Server_Info *pserver; + struct cifs_ses *ses, *ses2; struct cifs_tcon *tcon, *tcon2; - struct list_head tmp_list; - int tcon_exist = false; + struct list_head tmp_list, tmp_ses_list; + bool tcon_exist = false, ses_exist = false; + bool tcon_selected = false; int rc; - int resched = false; + bool resched = false; + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server) ? 
server->primary_server : server; /* Prevent simultaneous reconnects that can corrupt tcon->rlist list */ - mutex_lock(&server->reconnect_mutex); + mutex_lock(&pserver->reconnect_mutex); INIT_LIST_HEAD(&tmp_list); - cifs_dbg(FYI, "Need negotiate, reconnecting tcons\n"); + INIT_LIST_HEAD(&tmp_ses_list); + cifs_dbg(FYI, "Reconnecting tcons and channels\n"); spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + + tcon_selected = false; + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { if (tcon->need_reconnect || tcon->need_reopen_files) { tcon->tc_count++; list_add_tail(&tcon->rlist, &tmp_list); - tcon_exist = true; + tcon_selected = tcon_exist = true; } } /* @@ -3751,15 +3813,27 @@ void smb2_reconnect_server(struct work_struct *work) */ if (ses->tcon_ipc && ses->tcon_ipc->need_reconnect) { list_add_tail(&ses->tcon_ipc->rlist, &tmp_list); - tcon_exist = true; + tcon_selected = tcon_exist = true; ses->ses_count++; } + /* + * handle the case where channel needs to reconnect + * binding session, but tcon is healthy (some other channel + * is active) + */ + spin_lock(&ses->chan_lock); + if (!tcon_selected && cifs_chan_needs_reconnect(ses, server)) { + list_add_tail(&ses->rlist, &tmp_ses_list); + ses_exist = true; + ses->ses_count++; + } + spin_unlock(&ses->chan_lock); } /* * Get the reference to server struct to be sure that the last call of * cifs_put_tcon() in the loop below won't release the server pointer. */ - if (tcon_exist) + if (tcon_exist || ses_exist) server->srv_count++; spin_unlock(&cifs_tcp_ses_lock); @@ -3777,13 +3851,41 @@ void smb2_reconnect_server(struct work_struct *work) cifs_put_tcon(tcon); } - cifs_dbg(FYI, "Reconnecting tcons finished\n"); + if (!ses_exist) + goto done; + + /* allocate a dummy tcon struct used for reconnect */ + tcon = kzalloc(sizeof(struct cifs_tcon), GFP_KERNEL); + if (!tcon) { + resched = true; + list_del_init(&ses->rlist); + cifs_put_smb_ses(ses); + goto done; + } + + tcon->tidStatus = CifsGood; + tcon->retry = false; + tcon->need_reconnect = false; + + /* now reconnect sessions for necessary channels */ + list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) { + tcon->ses = ses; + rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon, server); + if (rc) + resched = true; + list_del_init(&ses->rlist); + cifs_put_smb_ses(ses); + } + kfree(tcon); + +done: + cifs_dbg(FYI, "Reconnecting tcons and channels finished\n"); if (resched) queue_delayed_work(cifsiod_wq, &server->reconnect, 2 * HZ); - mutex_unlock(&server->reconnect_mutex); + mutex_unlock(&pserver->reconnect_mutex); /* now we can safely release srv struct */ - if (tcon_exist) + if (tcon_exist || ses_exist) cifs_put_tcp_session(server, 1); } @@ -3797,13 +3899,16 @@ SMB2_echo(struct TCP_Server_Info *server) .rq_nvec = 1 }; unsigned int total_len; - cifs_dbg(FYI, "In echo request\n"); + cifs_dbg(FYI, "In echo request for conn_id %lld\n", server->conn_id); + spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus == CifsNeedNegotiate) { + spin_unlock(&cifs_tcp_ses_lock); /* No need to send echo on newly established connections */ mod_delayed_work(cifsiod_wq, &server->reconnect, 0); return rc; } + spin_unlock(&cifs_tcp_ses_lock); rc = smb2_plain_req_init(SMB2_ECHO, NULL, server, (void **)&req, &total_len); diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 096fada16ebd..4a7062fd1c26 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -123,8 +123,11 @@ extern void 
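Editorial note on the alloc_path_with_tree_prefix() change above: the separator is only appended when the path component is non-empty, so an empty path no longer yields a name with a trailing separator. A plain-char userspace sketch of the same rule (the kernel operates on UTF-16 via UniStrcat(); the strings and helper here are purely illustrative):

    #include <stdio.h>
    #include <string.h>

    static void join_path(char *out, size_t outlen, const char *tree, const char *path)
    {
        snprintf(out, outlen, "%s", tree);
        if (path[0] != '\0') {          /* only append "\\" + path when path is non-empty */
            strncat(out, "\\", outlen - strlen(out) - 1);
            strncat(out, path, outlen - strlen(out) - 1);
        }
    }

    int main(void)
    {
        char buf[128];

        join_path(buf, sizeof(buf), "\\\\server\\share", "dir\\file");
        printf("%s\n", buf);            /* \\server\share\dir\file */
        join_path(buf, sizeof(buf), "\\\\server\\share", "");
        printf("%s\n", buf);            /* \\server\share  (no trailing separator) */
        return 0;
    }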
smb2_set_related(struct smb_rqst *rqst); * SMB2 Worker functions - most of protocol specific implementation details * are contained within these calls. */ -extern int SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses); +extern int SMB2_negotiate(const unsigned int xid, + struct cifs_ses *ses, + struct TCP_Server_Info *server); extern int SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, + struct TCP_Server_Info *server, const struct nls_table *nls_cp); extern int SMB2_logoff(const unsigned int xid, struct cifs_ses *ses); extern int SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, @@ -276,6 +279,7 @@ extern void smb2_copy_fs_info_to_kstatfs( struct kstatfs *kst); extern int smb311_crypto_shash_allocate(struct TCP_Server_Info *server); extern int smb311_update_preauth_hash(struct cifs_ses *ses, + struct TCP_Server_Info *server, struct kvec *iov, int nvec); extern int smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 2bf047b390a9..2af79093b78b 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -100,13 +100,16 @@ int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) goto out; found: - if (ses->binding) { + spin_lock(&ses->chan_lock); + if (cifs_chan_needs_reconnect(ses, server) && + !CIFS_ALL_CHANS_NEED_RECONNECT(ses)) { /* * If we are in the process of binding a new channel * to an existing session, use the master connection * session key */ memcpy(key, ses->smb3signingkey, SMB3_SIGN_KEY_SIZE); + spin_unlock(&ses->chan_lock); goto out; } @@ -118,9 +121,11 @@ found: chan = ses->chans + i; if (chan->server == server) { memcpy(key, chan->signkey, SMB3_SIGN_KEY_SIZE); + spin_unlock(&ses->chan_lock); goto out; } } + spin_unlock(&ses->chan_lock); cifs_dbg(VFS, "%s: Could not find channel signing key for session 0x%llx\n", @@ -390,12 +395,18 @@ struct derivation_triplet { static int generate_smb3signingkey(struct cifs_ses *ses, + struct TCP_Server_Info *server, const struct derivation_triplet *ptriplet) { int rc; -#ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS - struct TCP_Server_Info *server = ses->server; -#endif + bool is_binding = false; + int chan_index = 0; + + spin_lock(&ses->chan_lock); + is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); + chan_index = cifs_ses_get_chan_index(ses, server); + /* TODO: introduce ref counting for channels when the can be freed */ + spin_unlock(&ses->chan_lock); /* * All channels use the same encryption/decryption keys but @@ -407,10 +418,10 @@ generate_smb3signingkey(struct cifs_ses *ses, * master connection signing key stored in the session */ - if (ses->binding) { + if (is_binding) { rc = generate_key(ses, ptriplet->signing.label, ptriplet->signing.context, - cifs_ses_binding_channel(ses)->signkey, + ses->chans[chan_index].signkey, SMB3_SIGN_KEY_SIZE); if (rc) return rc; @@ -422,8 +433,11 @@ generate_smb3signingkey(struct cifs_ses *ses, if (rc) return rc; + /* safe to access primary channel, since it will never go away */ + spin_lock(&ses->chan_lock); memcpy(ses->chans[0].signkey, ses->smb3signingkey, SMB3_SIGN_KEY_SIZE); + spin_unlock(&ses->chan_lock); rc = generate_key(ses, ptriplet->encryption.label, ptriplet->encryption.context, @@ -470,7 +484,8 @@ generate_smb3signingkey(struct cifs_ses *ses, } int -generate_smb30signingkey(struct cifs_ses *ses) +generate_smb30signingkey(struct cifs_ses *ses, + struct TCP_Server_Info *server) { struct derivation_triplet triplet; @@ -494,11 +509,12 @@ 
generate_smb30signingkey(struct cifs_ses *ses) d->context.iov_base = "ServerOut"; d->context.iov_len = 10; - return generate_smb3signingkey(ses, &triplet); + return generate_smb3signingkey(ses, server, &triplet); } int -generate_smb311signingkey(struct cifs_ses *ses) +generate_smb311signingkey(struct cifs_ses *ses, + struct TCP_Server_Info *server) { struct derivation_triplet triplet; @@ -522,7 +538,7 @@ generate_smb311signingkey(struct cifs_ses *ses) d->context.iov_base = ses->preauth_sha_hash; d->context.iov_len = 64; - return generate_smb3signingkey(ses, &triplet); + return generate_smb3signingkey(ses, server, &triplet); } int @@ -624,8 +640,12 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) if (!is_signed) return 0; - if (server->tcpStatus == CifsNeedNegotiate) + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus == CifsNeedNegotiate) { + spin_unlock(&cifs_tcp_ses_lock); return 0; + } + spin_unlock(&cifs_tcp_ses_lock); if (!is_binding && !server->session_estab) { strncpy(shdr->Signature, "BSRSPYL", 8); return 0; @@ -741,30 +761,41 @@ static int smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server, struct smb2_hdr *shdr, struct mid_q_entry **mid) { - if (server->tcpStatus == CifsExiting) + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus == CifsExiting) { + spin_unlock(&cifs_tcp_ses_lock); return -ENOENT; + } if (server->tcpStatus == CifsNeedReconnect) { + spin_unlock(&cifs_tcp_ses_lock); cifs_dbg(FYI, "tcp session dead - return to caller to retry\n"); return -EAGAIN; } if (server->tcpStatus == CifsNeedNegotiate && - shdr->Command != SMB2_NEGOTIATE) + shdr->Command != SMB2_NEGOTIATE) { + spin_unlock(&cifs_tcp_ses_lock); return -EAGAIN; + } if (ses->status == CifsNew) { if ((shdr->Command != SMB2_SESSION_SETUP) && - (shdr->Command != SMB2_NEGOTIATE)) + (shdr->Command != SMB2_NEGOTIATE)) { + spin_unlock(&cifs_tcp_ses_lock); return -EAGAIN; + } /* else ok - we are setting up session */ } if (ses->status == CifsExiting) { - if (shdr->Command != SMB2_LOGOFF) + if (shdr->Command != SMB2_LOGOFF) { + spin_unlock(&cifs_tcp_ses_lock); return -EAGAIN; + } /* else ok - we are shutting down the session */ } + spin_unlock(&cifs_tcp_ses_lock); *mid = smb2_mid_entry_alloc(shdr, server); if (*mid == NULL) @@ -837,9 +868,13 @@ smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) (struct smb2_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; + spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus == CifsNeedNegotiate && - shdr->Command != SMB2_NEGOTIATE) + shdr->Command != SMB2_NEGOTIATE) { + spin_unlock(&cifs_tcp_ses_lock); return ERR_PTR(-EAGAIN); + } + spin_unlock(&cifs_tcp_ses_lock); smb2_seq_num_into_buf(server, shdr); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 61ea3d3f95b4..a4c3e027cca2 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -430,9 +430,7 @@ unmask: * be taken as the remainder of this one. 
We need to kill the * socket so the server throws away the partial SMB */ - spin_lock(&GlobalMid_Lock); - server->tcpStatus = CifsNeedReconnect; - spin_unlock(&GlobalMid_Lock); + cifs_mark_tcp_ses_conns_for_reconnect(server, false); trace_smb3_partial_send_reconnect(server->CurrentMid, server->conn_id, server->hostname); } @@ -578,10 +576,14 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, return -ERESTARTSYS; spin_lock(&server->req_lock); } else { + spin_unlock(&server->req_lock); + + spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus == CifsExiting) { - spin_unlock(&server->req_lock); + spin_unlock(&cifs_tcp_ses_lock); return -ENOENT; } + spin_unlock(&cifs_tcp_ses_lock); /* * For normal commands, reserve the last MAX_COMPOUND @@ -596,6 +598,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, * for servers that are slow to hand out credits on * new sessions. */ + spin_lock(&server->req_lock); if (!optype && num_credits == 1 && server->in_flight > 2 * MAX_COMPOUND && *credits <= MAX_COMPOUND) { @@ -723,28 +726,25 @@ cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, struct mid_q_entry **ppmidQ) { - if (ses->server->tcpStatus == CifsExiting) { - return -ENOENT; - } - - if (ses->server->tcpStatus == CifsNeedReconnect) { - cifs_dbg(FYI, "tcp session dead - return to caller to retry\n"); - return -EAGAIN; - } - + spin_lock(&cifs_tcp_ses_lock); if (ses->status == CifsNew) { if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) && - (in_buf->Command != SMB_COM_NEGOTIATE)) + (in_buf->Command != SMB_COM_NEGOTIATE)) { + spin_unlock(&cifs_tcp_ses_lock); return -EAGAIN; + } /* else ok - we are setting up session */ } if (ses->status == CifsExiting) { /* check if SMB session is bad because we are setting it up */ - if (in_buf->Command != SMB_COM_LOGOFF_ANDX) + if (in_buf->Command != SMB_COM_LOGOFF_ANDX) { + spin_unlock(&cifs_tcp_ses_lock); return -EAGAIN; + } /* else ok - we are shutting down session */ } + spin_unlock(&cifs_tcp_ses_lock); *ppmidQ = AllocMidQEntry(in_buf, ses->server); if (*ppmidQ == NULL) @@ -1044,19 +1044,14 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses) if (!ses) return NULL; + /* round robin */ + index = (uint)atomic_inc_return(&ses->chan_seq); + spin_lock(&ses->chan_lock); - if (!ses->binding) { - /* round robin */ - if (ses->chan_count > 1) { - index = (uint)atomic_inc_return(&ses->chan_seq); - index %= ses->chan_count; - } - spin_unlock(&ses->chan_lock); - return ses->chans[index].server; - } else { - spin_unlock(&ses->chan_lock); - return cifs_ses_server(ses); - } + index %= ses->chan_count; + spin_unlock(&ses->chan_lock); + + return ses->chans[index].server; } int @@ -1084,8 +1079,12 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, return -EIO; } - if (server->tcpStatus == CifsExiting) + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus == CifsExiting) { + spin_unlock(&cifs_tcp_ses_lock); return -ENOENT; + } + spin_unlock(&cifs_tcp_ses_lock); /* * Wait for all the requests to become available. @@ -1188,12 +1187,17 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, /* * Compounding is never used during session establish. 
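Editorial note on the cifs_pick_channel() hunk above: the function is reduced to an unconditional round-robin: bump the atomic sequence counter, then reduce it modulo the channel count while holding chan_lock. A runnable, self-contained model of the selection arithmetic (structure names are invented for the example):

    #include <stdatomic.h>
    #include <stdio.h>

    struct channel { int id; };

    struct session {
        atomic_uint chan_seq;           /* free-running request counter */
        unsigned int chan_count;
        struct channel chans[4];
    };

    static struct channel *pick_channel(struct session *ses)
    {
        /* atomic_inc_return() equivalent: old value + 1 */
        unsigned int index = atomic_fetch_add(&ses->chan_seq, 1) + 1;

        /* in the kernel, chan_count is read under ses->chan_lock */
        index %= ses->chan_count;
        return &ses->chans[index];
    }

    int main(void)
    {
        struct session ses = { .chan_count = 3,
                               .chans = { {0}, {1}, {2}, {3} } };

        for (int i = 0; i < 6; i++)
            printf("%d ", pick_channel(&ses)->id);
        printf("\n");                   /* 1 2 0 1 2 0 */
        return 0;
    }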
*/ + spin_lock(&cifs_tcp_ses_lock); if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) { + spin_unlock(&cifs_tcp_ses_lock); + mutex_lock(&server->srv_mutex); - smb311_update_preauth_hash(ses, rqst[0].rq_iov, - rqst[0].rq_nvec); + smb311_update_preauth_hash(ses, server, rqst[0].rq_iov, rqst[0].rq_nvec); mutex_unlock(&server->srv_mutex); + + spin_lock(&cifs_tcp_ses_lock); } + spin_unlock(&cifs_tcp_ses_lock); for (i = 0; i < num_rqst; i++) { rc = wait_for_response(server, midQ[i]); @@ -1256,15 +1260,19 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, /* * Compounding is never used during session establish. */ + spin_lock(&cifs_tcp_ses_lock); if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) { struct kvec iov = { .iov_base = resp_iov[0].iov_base, .iov_len = resp_iov[0].iov_len }; + spin_unlock(&cifs_tcp_ses_lock); mutex_lock(&server->srv_mutex); - smb311_update_preauth_hash(ses, &iov, 1); + smb311_update_preauth_hash(ses, server, &iov, 1); mutex_unlock(&server->srv_mutex); + spin_lock(&cifs_tcp_ses_lock); } + spin_unlock(&cifs_tcp_ses_lock); out: /* @@ -1353,8 +1361,12 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, return -EIO; } - if (server->tcpStatus == CifsExiting) + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus == CifsExiting) { + spin_unlock(&cifs_tcp_ses_lock); return -ENOENT; + } + spin_unlock(&cifs_tcp_ses_lock); /* Ensure that we do not send more than 50 overlapping requests to the same server. We may make this configurable later or @@ -1494,8 +1506,12 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, return -EIO; } - if (server->tcpStatus == CifsExiting) + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus == CifsExiting) { + spin_unlock(&cifs_tcp_ses_lock); return -ENOENT; + } + spin_unlock(&cifs_tcp_ses_lock); /* Ensure that we do not send more than 50 overlapping requests to the same server. We may make this configurable later or @@ -1553,10 +1569,12 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, (server->tcpStatus != CifsNew))); /* Were we interrupted by a signal ? */ + spin_lock(&cifs_tcp_ses_lock); if ((rc == -ERESTARTSYS) && (midQ->mid_state == MID_REQUEST_SUBMITTED) && ((server->tcpStatus == CifsGood) || (server->tcpStatus == CifsNew))) { + spin_unlock(&cifs_tcp_ses_lock); if (in_buf->Command == SMB_COM_TRANSACTION2) { /* POSIX lock. We send a NT_CANCEL SMB to cause the @@ -1595,7 +1613,9 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, /* We got the response - restart system call. 
*/ rstart = 1; + spin_lock(&cifs_tcp_ses_lock); } + spin_unlock(&cifs_tcp_ses_lock); rc = cifs_sync_mid_result(midQ, server); if (rc != 0) diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 7d8b72d67c80..9d486fbbfbbd 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -175,11 +175,13 @@ static int cifs_xattr_set(const struct xattr_handler *handler, switch (handler->flags) { case XATTR_CIFS_NTSD_FULL: aclflags = (CIFS_ACL_OWNER | + CIFS_ACL_GROUP | CIFS_ACL_DACL | CIFS_ACL_SACL); break; case XATTR_CIFS_NTSD: aclflags = (CIFS_ACL_OWNER | + CIFS_ACL_GROUP | CIFS_ACL_DACL); break; case XATTR_CIFS_ACL: diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 1466b5d01cbb..d3cd2a94d1e8 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1780,8 +1780,8 @@ void configfs_unregister_group(struct config_group *group) configfs_detach_group(&group->cg_item); d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); + d_drop(dentry); fsnotify_rmdir(d_inode(parent), dentry); - d_delete(dentry); inode_unlock(d_inode(parent)); dput(dentry); @@ -1922,10 +1922,10 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) configfs_detach_group(&group->cg_item); d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); - fsnotify_rmdir(d_inode(root), dentry); inode_unlock(d_inode(dentry)); - d_delete(dentry); + d_drop(dentry); + fsnotify_rmdir(d_inode(root), dentry); inode_unlock(d_inode(root)); diff --git a/fs/coredump.c b/fs/coredump.c index a6b3c196cdef..1c060c0a2d72 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -41,6 +41,7 @@ #include <linux/fs.h> #include <linux/path.h> #include <linux/timekeeping.h> +#include <linux/sysctl.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -52,9 +53,9 @@ #include <trace/events/sched.h> -int core_uses_pid; -unsigned int core_pipe_limit; -char core_pattern[CORENAME_MAX_SIZE] = "core"; +static int core_uses_pid; +static unsigned int core_pipe_limit; +static char core_pattern[CORENAME_MAX_SIZE] = "core"; static int core_name_size = CORENAME_MAX_SIZE; struct core_name { @@ -62,8 +63,6 @@ struct core_name { int used, size; }; -/* The maximal length of core_pattern is also specified in sysctl.c */ - static int expand_corename(struct core_name *cn, int size) { char *corename = krealloc(cn->corename, size, GFP_KERNEL); @@ -347,13 +346,13 @@ out: return ispipe; } -static int zap_process(struct task_struct *start, int exit_code, int flags) +static int zap_process(struct task_struct *start, int exit_code) { struct task_struct *t; int nr = 0; /* ignore all signals except SIGKILL, see prepare_signal() */ - start->signal->flags = SIGNAL_GROUP_COREDUMP | flags; + start->signal->flags = SIGNAL_GROUP_EXIT; start->signal->group_exit_code = exit_code; start->signal->group_stop_count = 0; @@ -372,13 +371,13 @@ static int zap_process(struct task_struct *start, int exit_code, int flags) static int zap_threads(struct task_struct *tsk, struct core_state *core_state, int exit_code) { + struct signal_struct *signal = tsk->signal; int nr = -EAGAIN; spin_lock_irq(&tsk->sighand->siglock); - if (!signal_group_exit(tsk->signal)) { - tsk->signal->core_state = core_state; - tsk->signal->group_exit_task = tsk; - nr = zap_process(tsk, exit_code, 0); + if (!(signal->flags & SIGNAL_GROUP_EXIT) && !signal->group_exec_task) { + signal->core_state = core_state; + nr = zap_process(tsk, exit_code); clear_tsk_thread_flag(tsk, TIF_SIGPENDING); tsk->flags |= PF_DUMPCORE; atomic_set(&core_state->nr_threads, nr); @@ -426,8 +425,6 @@ static void coredump_finish(bool core_dumped) 
spin_lock_irq(¤t->sighand->siglock); if (core_dumped && !__fatal_signal_pending(current)) current->signal->group_exit_code |= 0x80; - current->signal->group_exit_task = NULL; - current->signal->flags = SIGNAL_GROUP_EXIT; next = current->signal->core_state->dumper.next; current->signal->core_state = NULL; spin_unlock_irq(¤t->sighand->siglock); @@ -895,6 +892,63 @@ int dump_align(struct coredump_params *cprm, int align) } EXPORT_SYMBOL(dump_align); +#ifdef CONFIG_SYSCTL + +void validate_coredump_safety(void) +{ + if (suid_dumpable == SUID_DUMP_ROOT && + core_pattern[0] != '/' && core_pattern[0] != '|') { + pr_warn( +"Unsafe core_pattern used with fs.suid_dumpable=2.\n" +"Pipe handler or fully qualified core dump path required.\n" +"Set kernel.core_pattern before fs.suid_dumpable.\n" + ); + } +} + +static int proc_dostring_coredump(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int error = proc_dostring(table, write, buffer, lenp, ppos); + + if (!error) + validate_coredump_safety(); + return error; +} + +static struct ctl_table coredump_sysctls[] = { + { + .procname = "core_uses_pid", + .data = &core_uses_pid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "core_pattern", + .data = core_pattern, + .maxlen = CORENAME_MAX_SIZE, + .mode = 0644, + .proc_handler = proc_dostring_coredump, + }, + { + .procname = "core_pipe_limit", + .data = &core_pipe_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static int __init init_fs_coredump_sysctls(void) +{ + register_sysctl_init("kernel", coredump_sysctls); + return 0; +} +fs_initcall(init_fs_coredump_sysctls); +#endif /* CONFIG_SYSCTL */ + /* * The purpose of always_dump_vma() is to make sure that special kernel mappings * that are useful for post-mortem analysis are included in every core dump. 
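Editorial note on the coredump sysctl move above: with the knobs now registered from fs/coredump.c, validate_coredump_safety() still warns when fs.suid_dumpable is 2 while kernel.core_pattern is neither an absolute path nor a pipe. The program below, runnable on Linux, reads the two existing procfs knobs and applies the same test from userspace; it is only a reader-side illustration, not part of the patch:

    #include <stdio.h>
    #include <string.h>

    static int read_line(const char *path, char *buf, size_t len)
    {
        FILE *f = fopen(path, "r");

        if (!f || !fgets(buf, len, f)) {
            if (f)
                fclose(f);
            return -1;
        }
        fclose(f);
        buf[strcspn(buf, "\n")] = '\0';
        return 0;
    }

    int main(void)
    {
        char pattern[256], dumpable[16];

        if (read_line("/proc/sys/kernel/core_pattern", pattern, sizeof(pattern)) ||
            read_line("/proc/sys/fs/suid_dumpable", dumpable, sizeof(dumpable)))
            return 1;

        if (!strcmp(dumpable, "2") && pattern[0] != '/' && pattern[0] != '|')
            printf("unsafe: suid_dumpable=2 with relative core_pattern \"%s\"\n", pattern);
        else
            printf("core_pattern \"%s\", suid_dumpable %s\n", pattern, dumpable);
        return 0;
    }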
@@ -709,26 +709,26 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, return __dax_invalidate_entry(mapping, index, false); } -static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_dev, - sector_t sector, struct page *to, unsigned long vaddr) +static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos) { + return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset); +} + +static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter) +{ + pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos); void *vto, *kaddr; - pgoff_t pgoff; long rc; int id; - rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff); - if (rc) - return rc; - id = dax_read_lock(); - rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL); + rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, &kaddr, NULL); if (rc < 0) { dax_read_unlock(id); return rc; } - vto = kmap_atomic(to); - copy_user_page(vto, (void __force *)kaddr, vaddr, to); + vto = kmap_atomic(vmf->cow_page); + copy_user_page(vto, kaddr, vmf->address, vmf->cow_page); kunmap_atomic(vto); dax_read_unlock(id); return 0; @@ -1005,22 +1005,13 @@ int dax_writeback_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); -static sector_t dax_iomap_sector(const struct iomap *iomap, loff_t pos) -{ - return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9; -} - static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size, pfn_t *pfnp) { - const sector_t sector = dax_iomap_sector(iomap, pos); - pgoff_t pgoff; + pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); int id, rc; long length; - rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff); - if (rc) - return rc; id = dax_read_lock(); length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size), NULL, pfnp); @@ -1126,42 +1117,87 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, } #endif /* CONFIG_FS_DAX_PMD */ -s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap) +static int dax_memzero(struct dax_device *dax_dev, pgoff_t pgoff, + unsigned int offset, size_t size) { - sector_t sector = iomap_sector(iomap, pos & PAGE_MASK); - pgoff_t pgoff; - long rc, id; void *kaddr; - bool page_aligned = false; - unsigned offset = offset_in_page(pos); - unsigned size = min_t(u64, PAGE_SIZE - offset, length); + long ret; - if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) && - (size == PAGE_SIZE)) - page_aligned = true; + ret = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL); + if (ret > 0) { + memset(kaddr + offset, 0, size); + dax_flush(dax_dev, kaddr + offset, size); + } + return ret; +} - rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff); - if (rc) - return rc; +static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero) +{ + const struct iomap *iomap = &iter->iomap; + const struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t pos = iter->pos; + u64 length = iomap_length(iter); + s64 written = 0; + + /* already zeroed? we're done. 
*/ + if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) + return length; + + do { + unsigned offset = offset_in_page(pos); + unsigned size = min_t(u64, PAGE_SIZE - offset, length); + pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); + long rc; + int id; + + id = dax_read_lock(); + if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE) + rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1); + else + rc = dax_memzero(iomap->dax_dev, pgoff, offset, size); + dax_read_unlock(id); - id = dax_read_lock(); + if (rc < 0) + return rc; + pos += size; + length -= size; + written += size; + if (did_zero) + *did_zero = true; + } while (length > 0); - if (page_aligned) - rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1); - else - rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL); - if (rc < 0) { - dax_read_unlock(id); - return rc; - } + return written; +} - if (!page_aligned) { - memset(kaddr + offset, 0, size); - dax_flush(iomap->dax_dev, kaddr + offset, size); - } - dax_read_unlock(id); - return size; +int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, + const struct iomap_ops *ops) +{ + struct iomap_iter iter = { + .inode = inode, + .pos = pos, + .len = len, + .flags = IOMAP_DAX | IOMAP_ZERO, + }; + int ret; + + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = dax_zero_iter(&iter, did_zero); + return ret; } +EXPORT_SYMBOL_GPL(dax_zero_range); + +int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, + const struct iomap_ops *ops) +{ + unsigned int blocksize = i_blocksize(inode); + unsigned int off = pos & (blocksize - 1); + + /* Block boundary? Nothing to do */ + if (!off) + return 0; + return dax_zero_range(inode, pos, blocksize - off, did_zero, ops); +} +EXPORT_SYMBOL_GPL(dax_truncate_page); static loff_t dax_iomap_iter(const struct iomap_iter *iomi, struct iov_iter *iter) @@ -1169,7 +1205,6 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, const struct iomap *iomap = &iomi->iomap; loff_t length = iomap_length(iomi); loff_t pos = iomi->pos; - struct block_device *bdev = iomap->bdev; struct dax_device *dax_dev = iomap->dax_dev; loff_t end = pos + length, done = 0; ssize_t ret = 0; @@ -1203,9 +1238,8 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, while (pos < end) { unsigned offset = pos & (PAGE_SIZE - 1); const size_t size = ALIGN(length + offset, PAGE_SIZE); - const sector_t sector = dax_iomap_sector(iomap, pos); + pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); ssize_t map_len; - pgoff_t pgoff; void *kaddr; if (fatal_signal_pending(current)) { @@ -1213,10 +1247,6 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, break; } - ret = bdev_dax_pgoff(bdev, sector, size, &pgoff); - if (ret) - break; - map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, NULL); if (map_len < 0) { @@ -1230,11 +1260,6 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, if (map_len > end - pos) map_len = end - pos; - /* - * The userspace address for the memory copy has already been - * validated via access_ok() in either vfs_read() or - * vfs_write(), depending on which operation we are doing. 
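Editorial note on dax_zero_iter() above: it walks the range one page at a time, and the offset within the page decides whether the chunk goes through dax_zero_page_range() (a whole aligned page) or dax_memzero() (a partial page). The standalone program below reproduces only the chunking arithmetic, assuming a 4 KiB page size for the example:

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096ULL

    int main(void)
    {
        uint64_t pos = 5000, length = 10000;    /* arbitrary non-page-aligned range */

        while (length > 0) {
            uint64_t offset = pos & (PAGE_SIZE - 1);    /* offset_in_page(pos) */
            uint64_t size = PAGE_SIZE - offset;

            if (size > length)
                size = length;

            printf("zero %llu bytes at %llu (%s)\n",
                   (unsigned long long)size, (unsigned long long)pos,
                   (offset == 0 && size == PAGE_SIZE) ?
                   "whole page, dax_zero_page_range" :
                   "partial page, memset + flush");

            pos += size;
            length -= size;
        }
        return 0;
    }

For the 5000..15000 range this prints one partial leading chunk, one whole page, and one partial trailing chunk, mirroring how the iterator accumulates the written count.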
- */ if (iov_iter_rw(iter) == WRITE) xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr, map_len, iter); @@ -1274,6 +1299,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, .inode = iocb->ki_filp->f_mapping->host, .pos = iocb->ki_pos, .len = iov_iter_count(iter), + .flags = IOMAP_DAX, }; loff_t done = 0; int ret; @@ -1332,19 +1358,16 @@ static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn) static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf, const struct iomap_iter *iter) { - sector_t sector = dax_iomap_sector(&iter->iomap, iter->pos); - unsigned long vaddr = vmf->address; vm_fault_t ret; int error = 0; switch (iter->iomap.type) { case IOMAP_HOLE: case IOMAP_UNWRITTEN: - clear_user_highpage(vmf->cow_page, vaddr); + clear_user_highpage(vmf->cow_page, vmf->address); break; case IOMAP_MAPPED: - error = copy_cow_page_dax(iter->iomap.bdev, iter->iomap.dax_dev, - sector, vmf->cow_page, vaddr); + error = copy_cow_page_dax(vmf, iter); break; default: WARN_ON_ONCE(1); @@ -1430,7 +1453,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, .inode = mapping->host, .pos = (loff_t)vmf->pgoff << PAGE_SHIFT, .len = PAGE_SIZE, - .flags = IOMAP_FAULT, + .flags = IOMAP_DAX | IOMAP_FAULT, }; vm_fault_t ret = 0; void *entry; @@ -1539,7 +1562,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, struct iomap_iter iter = { .inode = mapping->host, .len = PMD_SIZE, - .flags = IOMAP_FAULT, + .flags = IOMAP_DAX | IOMAP_FAULT, }; vm_fault_t ret = VM_FAULT_FALLBACK; pgoff_t max_pgoff; diff --git a/fs/dcache.c b/fs/dcache.c index cf871a81f4fd..c84269c6e8bf 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -115,10 +115,13 @@ static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent, return in_lookup_hashtable + hash_32(hash, IN_LOOKUP_SHIFT); } - -/* Statistics gathering. */ -struct dentry_stat_t dentry_stat = { - .age_limit = 45, +struct dentry_stat_t { + long nr_dentry; + long nr_unused; + long age_limit; /* age in seconds */ + long want_pages; /* pages requested by system */ + long nr_negative; /* # of unused negative dentries */ + long dummy; /* Reserved for future use */ }; static DEFINE_PER_CPU(long, nr_dentry); @@ -126,6 +129,10 @@ static DEFINE_PER_CPU(long, nr_dentry_unused); static DEFINE_PER_CPU(long, nr_dentry_negative); #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) +/* Statistics gathering. */ +static struct dentry_stat_t dentry_stat = { + .age_limit = 45, +}; /* * Here we resort to our own counters instead of using generic per-cpu counters @@ -167,14 +174,32 @@ static long get_nr_dentry_negative(void) return sum < 0 ? 
0 : sum; } -int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos) +static int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); dentry_stat.nr_unused = get_nr_dentry_unused(); dentry_stat.nr_negative = get_nr_dentry_negative(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } + +static struct ctl_table fs_dcache_sysctls[] = { + { + .procname = "dentry-state", + .data = &dentry_stat, + .maxlen = 6*sizeof(long), + .mode = 0444, + .proc_handler = proc_nr_dentry, + }, + { } +}; + +static int __init init_fs_dcache_sysctls(void) +{ + register_sysctl_init("fs", fs_dcache_sysctls); + return 0; +} +fs_initcall(init_fs_dcache_sysctls); #endif /* diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 7d162b0efbf0..950c63fa4d0b 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -147,7 +147,7 @@ static int debugfs_locked_down(struct inode *inode, struct file *filp, const struct file_operations *real_fops) { - if ((inode->i_mode & 07777) == 0444 && + if ((inode->i_mode & 07777 & ~0444) == 0 && !(filp->f_mode & FMODE_WRITE) && !real_fops->unlocked_ioctl && !real_fops->compat_ioctl && diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 42e5a766d33c..4f25015aa534 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -621,8 +621,8 @@ void devpts_pty_kill(struct dentry *dentry) dentry->d_fsdata = NULL; drop_nlink(dentry->d_inode); - fsnotify_unlink(d_inode(dentry->d_parent), dentry); d_drop(dentry); + fsnotify_unlink(d_inode(dentry->d_parent), dentry); dput(dentry); /* d_alloc_name() in devpts_pty_new() */ } diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 283c7b94edda..bfac462dd3e8 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -9,6 +9,8 @@ ******************************************************************************* ******************************************************************************/ +#include <trace/events/dlm.h> + #include "dlm_internal.h" #include "lock.h" #include "user.h" @@ -254,10 +256,12 @@ void dlm_callback_work(struct work_struct *work) continue; } else if (callbacks[i].flags & DLM_CB_BAST) { bastfn(lkb->lkb_astparam, callbacks[i].mode); + trace_dlm_bast(ls, lkb, callbacks[i].mode); } else if (callbacks[i].flags & DLM_CB_CAST) { lkb->lkb_lksb->sb_status = callbacks[i].sb_status; lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags; castfn(lkb->lkb_astparam); + trace_dlm_ast(ls, lkb, lkb->lkb_lksb); } } @@ -295,7 +299,8 @@ void dlm_callback_suspend(struct dlm_ls *ls) void dlm_callback_resume(struct dlm_ls *ls) { struct dlm_lkb *lkb, *safe; - int count = 0; + int count = 0, sum = 0; + bool empty; clear_bit(LSFL_CB_DELAY, &ls->ls_flags); @@ -311,14 +316,17 @@ more: if (count == MAX_CB_QUEUE) break; } + empty = list_empty(&ls->ls_cb_delay); mutex_unlock(&ls->ls_cb_mutex); - if (count) - log_rinfo(ls, "dlm_callback_resume %d", count); - if (count == MAX_CB_QUEUE) { + sum += count; + if (!empty) { count = 0; cond_resched(); goto more; } + + if (sum) + log_rinfo(ls, "%s %d", __func__, sum); } diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 47e9d57e4cae..8fb04ebbafb5 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -635,6 +635,35 @@ static int table_open2(struct inode *inode, struct file *file) return 0; } +static ssize_t table_write2(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct seq_file *seq = file->private_data; + int n, len, lkb_nodeid, lkb_status, error; + char 
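Editorial note on the debugfs_locked_down() hunk above: an exact "== 0444" comparison is replaced by a mask test, so any mode that grants nothing beyond read (0400, 0440, ...) is treated as read-only rather than only 0444 itself. The short program below prints how the old and new tests differ on a few sample modes:

    #include <stdio.h>

    int main(void)
    {
        unsigned int modes[] = { 0444, 0400, 0440, 0644, 0755 };

        for (unsigned int i = 0; i < sizeof(modes) / sizeof(modes[0]); i++) {
            unsigned int mode = modes[i];
            int old_check = (mode & 07777) == 0444;          /* exact match only */
            int new_check = (mode & 07777 & ~0444U) == 0;    /* no bits beyond read */

            printf("%04o: old=%d new=%d\n", mode, old_check, new_check);
        }
        return 0;
    }

0400 and 0440 pass the new test but not the old one; 0644 and 0755 fail both, as they grant write or execute permission.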
name[DLM_RESNAME_MAXLEN + 1] = {}; + struct dlm_ls *ls = seq->private; + unsigned int lkb_flags; + char buf[256] = {}; + uint32_t lkb_id; + + if (copy_from_user(buf, user_buf, + min_t(size_t, sizeof(buf) - 1, count))) + return -EFAULT; + + n = sscanf(buf, "%x %" __stringify(DLM_RESNAME_MAXLEN) "s %x %d %d", + &lkb_id, name, &lkb_flags, &lkb_nodeid, &lkb_status); + if (n != 5) + return -EINVAL; + + len = strnlen(name, DLM_RESNAME_MAXLEN); + error = dlm_debug_add_lkb(ls, lkb_id, name, len, lkb_flags, + lkb_nodeid, lkb_status); + if (error) + return error; + + return count; +} + static int table_open3(struct inode *inode, struct file *file) { struct seq_file *seq; @@ -675,6 +704,7 @@ static const struct file_operations format2_fops = { .owner = THIS_MODULE, .open = table_open2, .read = seq_read, + .write = table_write2, .llseek = seq_lseek, .release = seq_release }; @@ -724,10 +754,35 @@ static ssize_t waiters_read(struct file *file, char __user *userbuf, return rv; } +static ssize_t waiters_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct dlm_ls *ls = file->private_data; + int mstype, to_nodeid; + char buf[128] = {}; + uint32_t lkb_id; + int n, error; + + if (copy_from_user(buf, user_buf, + min_t(size_t, sizeof(buf) - 1, count))) + return -EFAULT; + + n = sscanf(buf, "%x %d %d", &lkb_id, &mstype, &to_nodeid); + if (n != 3) + return -EINVAL; + + error = dlm_debug_add_lkb_to_waiters(ls, lkb_id, mstype, to_nodeid); + if (error) + return error; + + return count; +} + static const struct file_operations waiters_fops = { .owner = THIS_MODULE, .open = simple_open, .read = waiters_read, + .write = waiters_write, .llseek = default_llseek, }; @@ -768,6 +823,42 @@ static int dlm_version_show(struct seq_file *file, void *offset) } DEFINE_SHOW_ATTRIBUTE(dlm_version); +static ssize_t dlm_rawmsg_write(struct file *fp, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + void *buf; + int ret; + + if (count > PAGE_SIZE || count < sizeof(struct dlm_header)) + return -EINVAL; + + buf = kmalloc(PAGE_SIZE, GFP_NOFS); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, user_buf, count)) { + ret = -EFAULT; + goto out; + } + + ret = dlm_midcomms_rawmsg_send(fp->private_data, buf, count); + if (ret) + goto out; + + kfree(buf); + return count; + +out: + kfree(buf); + return ret; +} + +static const struct file_operations dlm_rawmsg_fops = { + .open = simple_open, + .write = dlm_rawmsg_write, + .llseek = no_llseek, +}; + void *dlm_create_debug_comms_file(int nodeid, void *data) { struct dentry *d_node; @@ -782,6 +873,7 @@ void *dlm_create_debug_comms_file(int nodeid, void *data) debugfs_create_file("send_queue_count", 0444, d_node, data, &dlm_send_queue_cnt_fops); debugfs_create_file("version", 0444, d_node, data, &dlm_version_fops); + debugfs_create_file("rawmsg", 0200, d_node, data, &dlm_rawmsg_fops); return d_node; } @@ -809,7 +901,7 @@ void dlm_create_debug_file(struct dlm_ls *ls) snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_locks", ls->ls_name); ls->ls_debug_locks_dentry = debugfs_create_file(name, - S_IFREG | S_IRUGO, + 0644, dlm_root, ls, &format2_fops); @@ -840,7 +932,7 @@ void dlm_create_debug_file(struct dlm_ls *ls) snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_waiters", ls->ls_name); ls->ls_debug_waiters_dentry = debugfs_create_file(name, - S_IFREG | S_IRUGO, + 0644, dlm_root, ls, &waiters_fops); diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index 45ebbe602bbf..b6692f81ec83 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c @@ -84,8 +84,7 @@ int 
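Editorial note on the new dlm debugfs write handlers above (table_write2(), waiters_write()): they all parse input the same way, copying at most sizeof(buf) - 1 bytes into a zero-filled fixed buffer and rejecting the write unless sscanf() matches every field. A userspace analogue of that parsing, with illustrative field names:

    #include <stdio.h>
    #include <string.h>

    static int parse_waiter(const char *user_buf, size_t count)
    {
        char buf[128] = {0};                    /* always NUL-terminated */
        unsigned int lkb_id;
        int mstype, to_nodeid, n;

        memcpy(buf, user_buf, count < sizeof(buf) - 1 ? count : sizeof(buf) - 1);

        n = sscanf(buf, "%x %d %d", &lkb_id, &mstype, &to_nodeid);
        if (n != 3)
            return -1;                          /* -EINVAL in the kernel code */

        printf("lkb 0x%x mstype %d nodeid %d\n", lkb_id, mstype, to_nodeid);
        return 0;
    }

    int main(void)
    {
        const char *good = "1a2b 3 4\n";
        const char *bad = "1a2b 3\n";

        printf("good: %d\n", parse_waiter(good, strlen(good)));
        printf("bad:  %d\n", parse_waiter(bad, strlen(bad)));
        return 0;
    }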
dlm_recover_directory(struct dlm_ls *ls) for (;;) { int left; - error = dlm_recovery_stopped(ls); - if (error) { + if (dlm_recovery_stopped(ls)) { error = -EINTR; goto out_free; } diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 5f57538b5d45..74a9590a4dd5 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -41,12 +41,6 @@ #include <linux/dlm.h> #include "config.h" -/* Size of the temp buffer midcomms allocates on the stack. - We try to make this large enough so most messages fit. - FIXME: should sctp make this unnecessary? */ - -#define DLM_INBUF_LEN 148 - struct dlm_ls; struct dlm_lkb; struct dlm_rsb; @@ -554,8 +548,9 @@ struct dlm_ls { uint32_t ls_generation; uint32_t ls_exflags; int ls_lvblen; - int ls_count; /* refcount of processes in + atomic_t ls_count; /* refcount of processes in the dlm using this ls */ + wait_queue_head_t ls_count_wait; int ls_create_count; /* create/release refcount */ unsigned long ls_flags; /* LSFL_ */ unsigned long ls_scan_time; @@ -581,6 +576,7 @@ struct dlm_ls { struct list_head ls_new_rsb; /* new rsb structs */ spinlock_t ls_remove_spin; + wait_queue_head_t ls_remove_wait; char ls_remove_name[DLM_RESNAME_MAXLEN+1]; char *ls_remove_names[DLM_REMOVE_NAMES_MAX]; int ls_remove_len; @@ -632,6 +628,8 @@ struct dlm_ls { struct rw_semaphore ls_in_recovery; /* block local requests */ struct rw_semaphore ls_recv_active; /* block dlm_recv */ struct list_head ls_requestqueue;/* queue remote requests */ + atomic_t ls_requestqueue_cnt; + wait_queue_head_t ls_requestqueue_wait; struct mutex ls_requestqueue_mutex; struct dlm_rcom *ls_recover_buf; int ls_recover_nodeid; /* for debugging */ diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index c502c065d007..bdb51d209ba2 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -53,6 +53,8 @@ R: do_xxxx() L: receive_xxxx_reply() <- R: send_xxxx_reply() */ +#include <trace/events/dlm.h> + #include <linux/types.h> #include <linux/rbtree.h> #include <linux/slab.h> @@ -1178,7 +1180,8 @@ static void detach_lkb(struct dlm_lkb *lkb) } } -static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) +static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret, + int start, int end) { struct dlm_lkb *lkb; int rv; @@ -1199,7 +1202,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) idr_preload(GFP_NOFS); spin_lock(&ls->ls_lkbidr_spin); - rv = idr_alloc(&ls->ls_lkbidr, lkb, 1, 0, GFP_NOWAIT); + rv = idr_alloc(&ls->ls_lkbidr, lkb, start, end, GFP_NOWAIT); if (rv >= 0) lkb->lkb_id = rv; spin_unlock(&ls->ls_lkbidr_spin); @@ -1215,6 +1218,11 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) return 0; } +static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) +{ + return _create_lkb(ls, lkb_ret, 1, 0); +} + static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret) { struct dlm_lkb *lkb; @@ -1618,21 +1626,24 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms) } /* If there's an rsb for the same resource being removed, ensure - that the remove message is sent before the new lookup message. - It should be rare to need a delay here, but if not, then it may - be worthwhile to add a proper wait mechanism rather than a delay. */ + * that the remove message is sent before the new lookup message. 
+ */ + +#define DLM_WAIT_PENDING_COND(ls, r) \ + (ls->ls_remove_len && \ + !rsb_cmp(r, ls->ls_remove_name, \ + ls->ls_remove_len)) static void wait_pending_remove(struct dlm_rsb *r) { struct dlm_ls *ls = r->res_ls; restart: spin_lock(&ls->ls_remove_spin); - if (ls->ls_remove_len && - !rsb_cmp(r, ls->ls_remove_name, ls->ls_remove_len)) { + if (DLM_WAIT_PENDING_COND(ls, r)) { log_debug(ls, "delay lookup for remove dir %d %s", - r->res_dir_nodeid, r->res_name); + r->res_dir_nodeid, r->res_name); spin_unlock(&ls->ls_remove_spin); - msleep(1); + wait_event(ls->ls_remove_wait, !DLM_WAIT_PENDING_COND(ls, r)); goto restart; } spin_unlock(&ls->ls_remove_spin); @@ -1784,6 +1795,7 @@ static void shrink_bucket(struct dlm_ls *ls, int b) memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN); spin_unlock(&ls->ls_remove_spin); spin_unlock(&ls->ls_rsbtbl[b].lock); + wake_up(&ls->ls_remove_wait); send_remove(r); @@ -3437,6 +3449,8 @@ int dlm_lock(dlm_lockspace_t *lockspace, if (error) goto out; + trace_dlm_lock_start(ls, lkb, mode, flags); + error = set_lock_args(mode, lksb, flags, namelen, 0, ast, astarg, bast, &args); if (error) @@ -3450,6 +3464,8 @@ int dlm_lock(dlm_lockspace_t *lockspace, if (error == -EINPROGRESS) error = 0; out_put: + trace_dlm_lock_end(ls, lkb, mode, flags, error); + if (convert || error) __put_lkb(ls, lkb); if (error == -EAGAIN || error == -EDEADLK) @@ -3481,6 +3497,8 @@ int dlm_unlock(dlm_lockspace_t *lockspace, if (error) goto out; + trace_dlm_unlock_start(ls, lkb, flags); + error = set_unlock_args(flags, astarg, &args); if (error) goto out_put; @@ -3495,6 +3513,8 @@ int dlm_unlock(dlm_lockspace_t *lockspace, if (error == -EBUSY && (flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK))) error = 0; out_put: + trace_dlm_unlock_end(ls, lkb, flags, error); + dlm_put_lkb(lkb); out: dlm_unlock_recovery(ls); @@ -3973,6 +3993,14 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) int from = ms->m_header.h_nodeid; int error = 0; + /* currently mixing of user/kernel locks are not supported */ + if (ms->m_flags & DLM_IFL_USER && ~lkb->lkb_flags & DLM_IFL_USER) { + log_error(lkb->lkb_resource->res_ls, + "got user dlm message for a kernel lock"); + error = -EINVAL; + goto out; + } + switch (ms->m_type) { case DLM_MSG_CONVERT: case DLM_MSG_UNLOCK: @@ -4001,6 +4029,7 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) error = -EINVAL; } +out: if (error) log_error(lkb->lkb_resource->res_ls, "ignore invalid message %d from %d %x %x %x %d", @@ -4050,6 +4079,7 @@ static void send_repeat_remove(struct dlm_ls *ls, char *ms_name, int len) memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN); spin_unlock(&ls->ls_remove_spin); spin_unlock(&ls->ls_rsbtbl[b].lock); + wake_up(&ls->ls_remove_wait); rv = _create_message(ls, sizeof(struct dlm_message) + len, dir_nodeid, DLM_MSG_REMOVE, &ms, &mh); @@ -6301,3 +6331,64 @@ int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc, return error; } +/* debug functionality */ +int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len, + int lkb_nodeid, unsigned int lkb_flags, int lkb_status) +{ + struct dlm_lksb *lksb; + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error; + + /* we currently can't set a valid user lock */ + if (lkb_flags & DLM_IFL_USER) + return -EOPNOTSUPP; + + lksb = kzalloc(sizeof(*lksb), GFP_NOFS); + if (!lksb) + return -ENOMEM; + + error = _create_lkb(ls, &lkb, lkb_id, lkb_id + 1); + if (error) { + kfree(lksb); + return error; + } + + lkb->lkb_flags = lkb_flags; + 
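Editorial note on the wait_pending_remove() change above: instead of polling with msleep(1), the lookup side now sleeps on ls_remove_wait until the remove side clears the pending name and calls wake_up(). A runnable pthread condition-variable sketch of the same handoff (names are stand-ins, not dlm code; compile with -pthread):

    #include <pthread.h>
    #include <string.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t remove_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t remove_wait = PTHREAD_COND_INITIALIZER;
    static char remove_name[64];
    static int remove_len;                          /* non-zero while a remove is pending */

    /* lookup side: sleep until no remove for this name is in flight */
    static void wait_pending_remove(const char *name)
    {
        pthread_mutex_lock(&remove_lock);
        while (remove_len && strcmp(remove_name, name) == 0)
            pthread_cond_wait(&remove_wait, &remove_lock);
        pthread_mutex_unlock(&remove_lock);
    }

    /* remove side: once the remove message is out, clear the name and wake waiters */
    static void *remover(void *arg)
    {
        (void)arg;
        sleep(1);                                   /* pretend send_remove() takes a while */
        pthread_mutex_lock(&remove_lock);
        remove_len = 0;
        pthread_mutex_unlock(&remove_lock);
        pthread_cond_broadcast(&remove_wait);       /* wake_up(&ls->ls_remove_wait) */
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        strcpy(remove_name, "resource-1");
        remove_len = (int)strlen(remove_name);

        pthread_create(&t, NULL, remover, NULL);
        wait_pending_remove("resource-1");          /* blocks until remover() wakes us */
        printf("lookup may proceed\n");
        pthread_join(t, NULL);
        return 0;
    }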
lkb->lkb_nodeid = lkb_nodeid; + lkb->lkb_lksb = lksb; + /* user specific pointer, just don't have it NULL for kernel locks */ + if (~lkb_flags & DLM_IFL_USER) + lkb->lkb_astparam = (void *)0xDEADBEEF; + + error = find_rsb(ls, name, len, 0, R_REQUEST, &r); + if (error) { + kfree(lksb); + __put_lkb(ls, lkb); + return error; + } + + lock_rsb(r); + attach_lkb(r, lkb); + add_lkb(r, lkb, lkb_status); + unlock_rsb(r); + put_rsb(r); + + return 0; +} + +int dlm_debug_add_lkb_to_waiters(struct dlm_ls *ls, uint32_t lkb_id, + int mstype, int to_nodeid) +{ + struct dlm_lkb *lkb; + int error; + + error = find_lkb(ls, lkb_id, &lkb); + if (error) + return error; + + error = add_to_waiters(lkb, mstype, to_nodeid); + dlm_put_lkb(lkb); + return error; +} + diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 456c6ec3ef6f..252a5898f908 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -58,6 +58,10 @@ int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc, int nodeid, int pid); int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid); void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc); +int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len, + int lkb_nodeid, unsigned int lkb_flags, int lkb_status); +int dlm_debug_add_lkb_to_waiters(struct dlm_ls *ls, uint32_t lkb_id, + int mstype, int to_nodeid); static inline int is_master(struct dlm_rsb *r) { diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 10eddfa6c3d7..0d3833a124a3 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -216,8 +216,7 @@ static int do_uevent(struct dlm_ls *ls, int in) return ls->ls_uevent_result; } -static int dlm_uevent(struct kset *kset, struct kobject *kobj, - struct kobj_uevent_env *env) +static int dlm_uevent(struct kobject *kobj, struct kobj_uevent_env *env) { struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj); @@ -314,7 +313,7 @@ struct dlm_ls *dlm_find_lockspace_global(uint32_t id) list_for_each_entry(ls, &lslist, ls_list) { if (ls->ls_global_id == id) { - ls->ls_count++; + atomic_inc(&ls->ls_count); goto out; } } @@ -331,7 +330,7 @@ struct dlm_ls *dlm_find_lockspace_local(dlm_lockspace_t *lockspace) spin_lock(&lslist_lock); list_for_each_entry(ls, &lslist, ls_list) { if (ls->ls_local_handle == lockspace) { - ls->ls_count++; + atomic_inc(&ls->ls_count); goto out; } } @@ -348,7 +347,7 @@ struct dlm_ls *dlm_find_lockspace_device(int minor) spin_lock(&lslist_lock); list_for_each_entry(ls, &lslist, ls_list) { if (ls->ls_device.minor == minor) { - ls->ls_count++; + atomic_inc(&ls->ls_count); goto out; } } @@ -360,24 +359,24 @@ struct dlm_ls *dlm_find_lockspace_device(int minor) void dlm_put_lockspace(struct dlm_ls *ls) { - spin_lock(&lslist_lock); - ls->ls_count--; - spin_unlock(&lslist_lock); + if (atomic_dec_and_test(&ls->ls_count)) + wake_up(&ls->ls_count_wait); } static void remove_lockspace(struct dlm_ls *ls) { - for (;;) { - spin_lock(&lslist_lock); - if (ls->ls_count == 0) { - WARN_ON(ls->ls_create_count != 0); - list_del(&ls->ls_list); - spin_unlock(&lslist_lock); - return; - } +retry: + wait_event(ls->ls_count_wait, atomic_read(&ls->ls_count) == 0); + + spin_lock(&lslist_lock); + if (atomic_read(&ls->ls_count) != 0) { spin_unlock(&lslist_lock); - ssleep(1); + goto retry; } + + WARN_ON(ls->ls_create_count != 0); + list_del(&ls->ls_list); + spin_unlock(&lslist_lock); } static int threads_start(void) @@ -481,7 +480,8 @@ static int new_lockspace(const char *name, const char *cluster, memcpy(ls->ls_name, name, namelen); ls->ls_namelen = 
namelen; ls->ls_lvblen = lvblen; - ls->ls_count = 0; + atomic_set(&ls->ls_count, 0); + init_waitqueue_head(&ls->ls_count_wait); ls->ls_flags = 0; ls->ls_scan_time = jiffies; @@ -511,6 +511,7 @@ static int new_lockspace(const char *name, const char *cluster, } spin_lock_init(&ls->ls_remove_spin); + init_waitqueue_head(&ls->ls_remove_wait); for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) { ls->ls_remove_names[i] = kzalloc(DLM_RESNAME_MAXLEN+1, @@ -564,6 +565,8 @@ static int new_lockspace(const char *name, const char *cluster, init_rwsem(&ls->ls_in_recovery); init_rwsem(&ls->ls_recv_active); INIT_LIST_HEAD(&ls->ls_requestqueue); + atomic_set(&ls->ls_requestqueue_cnt, 0); + init_waitqueue_head(&ls->ls_requestqueue_wait); mutex_init(&ls->ls_requestqueue_mutex); mutex_init(&ls->ls_clear_proc_locks); @@ -868,7 +871,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) * until this returns. * * Force has 4 possible values: - * 0 - don't destroy locksapce if it has any LKBs + * 0 - don't destroy lockspace if it has any LKBs * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs * 2 - destroy lockspace regardless of LKBs * 3 - destroy lockspace as part of a forced shutdown diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 8f715c620e1f..e284d696c1fd 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -53,9 +53,12 @@ #include <net/sctp/sctp.h> #include <net/ipv6.h> +#include <trace/events/dlm.h> + #include "dlm_internal.h" #include "lowcomms.h" #include "midcomms.h" +#include "memory.h" #include "config.h" #define NEEDED_RMEM (4*1024*1024) @@ -84,7 +87,6 @@ struct connection { struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; atomic_t writequeue_cnt; - struct mutex wq_alloc; int retries; #define MAX_CONNECT_RETRIES 3 struct hlist_node list; @@ -189,6 +191,24 @@ static const struct dlm_proto_ops *dlm_proto_ops; static void process_recv_sockets(struct work_struct *work); static void process_send_sockets(struct work_struct *work); +static void writequeue_entry_ctor(void *data) +{ + struct writequeue_entry *entry = data; + + INIT_LIST_HEAD(&entry->msgs); +} + +struct kmem_cache *dlm_lowcomms_writequeue_cache_create(void) +{ + return kmem_cache_create("dlm_writequeue", sizeof(struct writequeue_entry), + 0, 0, writequeue_entry_ctor); +} + +struct kmem_cache *dlm_lowcomms_msg_cache_create(void) +{ + return kmem_cache_create("dlm_msg", sizeof(struct dlm_msg), 0, 0, NULL); +} + /* need to held writequeue_lock */ static struct writequeue_entry *con_next_wq(struct connection *con) { @@ -199,7 +219,10 @@ static struct writequeue_entry *con_next_wq(struct connection *con) e = list_first_entry(&con->writequeue, struct writequeue_entry, list); - if (e->len == 0) + /* if len is zero nothing is to send, if there are users filling + * buffers we wait until the users are done so we can send more. + */ + if (e->users || e->len == 0) return NULL; return e; @@ -265,8 +288,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) return NULL; } - mutex_init(&con->wq_alloc); - spin_lock(&connections_lock); /* Because multiple workqueues/threads calls this function it can * race on multiple cpu's. 
Instead of locking hot path __find_con() @@ -486,11 +507,9 @@ static void lowcomms_data_ready(struct sock *sk) { struct connection *con; - read_lock_bh(&sk->sk_callback_lock); con = sock2con(sk); if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags)) queue_work(recv_workqueue, &con->rwork); - read_unlock_bh(&sk->sk_callback_lock); } static void lowcomms_listen_data_ready(struct sock *sk) @@ -505,15 +524,14 @@ static void lowcomms_write_space(struct sock *sk) { struct connection *con; - read_lock_bh(&sk->sk_callback_lock); con = sock2con(sk); if (!con) - goto out; + return; if (!test_and_set_bit(CF_CONNECTED, &con->flags)) { log_print("successful connected to node %d", con->nodeid); queue_work(send_workqueue, &con->swork); - goto out; + return; } clear_bit(SOCK_NOSPACE, &con->sock->flags); @@ -524,8 +542,6 @@ static void lowcomms_write_space(struct sock *sk) } queue_work(send_workqueue, &con->swork); -out: - read_unlock_bh(&sk->sk_callback_lock); } static inline void lowcomms_connect_sock(struct connection *con) @@ -592,42 +608,41 @@ int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark) static void lowcomms_error_report(struct sock *sk) { struct connection *con; - struct sockaddr_storage saddr; void (*orig_report)(struct sock *) = NULL; + struct inet_sock *inet; - read_lock_bh(&sk->sk_callback_lock); con = sock2con(sk); if (con == NULL) goto out; orig_report = listen_sock.sk_error_report; - if (kernel_getpeername(sk->sk_socket, (struct sockaddr *)&saddr) < 0) { - printk_ratelimited(KERN_ERR "dlm: node %d: socket error " - "sending to node %d, port %d, " - "sk_err=%d/%d\n", dlm_our_nodeid(), - con->nodeid, dlm_config.ci_tcp_port, - sk->sk_err, sk->sk_err_soft); - } else if (saddr.ss_family == AF_INET) { - struct sockaddr_in *sin4 = (struct sockaddr_in *)&saddr; + inet = inet_sk(sk); + switch (sk->sk_family) { + case AF_INET: printk_ratelimited(KERN_ERR "dlm: node %d: socket error " - "sending to node %d at %pI4, port %d, " + "sending to node %d at %pI4, dport %d, " "sk_err=%d/%d\n", dlm_our_nodeid(), - con->nodeid, &sin4->sin_addr.s_addr, - dlm_config.ci_tcp_port, sk->sk_err, + con->nodeid, &inet->inet_daddr, + ntohs(inet->inet_dport), sk->sk_err, sk->sk_err_soft); - } else { - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&saddr; - + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: printk_ratelimited(KERN_ERR "dlm: node %d: socket error " - "sending to node %d at %u.%u.%u.%u, " - "port %d, sk_err=%d/%d\n", dlm_our_nodeid(), - con->nodeid, sin6->sin6_addr.s6_addr32[0], - sin6->sin6_addr.s6_addr32[1], - sin6->sin6_addr.s6_addr32[2], - sin6->sin6_addr.s6_addr32[3], - dlm_config.ci_tcp_port, sk->sk_err, + "sending to node %d at %pI6c, " + "dport %d, sk_err=%d/%d\n", dlm_our_nodeid(), + con->nodeid, &sk->sk_v6_daddr, + ntohs(inet->inet_dport), sk->sk_err, sk->sk_err_soft); + break; +#endif + default: + printk_ratelimited(KERN_ERR "dlm: node %d: socket error " + "invalid socket family %d set, " + "sk_err=%d/%d\n", dlm_our_nodeid(), + sk->sk_family, sk->sk_err, sk->sk_err_soft); + goto out; } /* below sendcon only handling */ @@ -646,7 +661,6 @@ static void lowcomms_error_report(struct sock *sk) queue_work(send_workqueue, &con->swork); out: - read_unlock_bh(&sk->sk_callback_lock); if (orig_report) orig_report(sk); } @@ -666,20 +680,20 @@ static void restore_callbacks(struct socket *sock) { struct sock *sk = sock->sk; - write_lock_bh(&sk->sk_callback_lock); + lock_sock(sk); sk->sk_user_data = NULL; sk->sk_data_ready = listen_sock.sk_data_ready; sk->sk_state_change = 
listen_sock.sk_state_change; sk->sk_write_space = listen_sock.sk_write_space; sk->sk_error_report = listen_sock.sk_error_report; - write_unlock_bh(&sk->sk_callback_lock); + release_sock(sk); } static void add_listen_sock(struct socket *sock, struct listen_connection *con) { struct sock *sk = sock->sk; - write_lock_bh(&sk->sk_callback_lock); + lock_sock(sk); save_listen_callbacks(sock); con->sock = sock; @@ -687,7 +701,7 @@ static void add_listen_sock(struct socket *sock, struct listen_connection *con) sk->sk_allocation = GFP_NOFS; /* Install a data_ready callback */ sk->sk_data_ready = lowcomms_listen_data_ready; - write_unlock_bh(&sk->sk_callback_lock); + release_sock(sk); } /* Make a socket active */ @@ -695,7 +709,7 @@ static void add_sock(struct socket *sock, struct connection *con) { struct sock *sk = sock->sk; - write_lock_bh(&sk->sk_callback_lock); + lock_sock(sk); con->sock = sock; sk->sk_user_data = con; @@ -705,7 +719,7 @@ static void add_sock(struct socket *sock, struct connection *con) sk->sk_state_change = lowcomms_state_change; sk->sk_allocation = GFP_NOFS; sk->sk_error_report = lowcomms_error_report; - write_unlock_bh(&sk->sk_callback_lock); + release_sock(sk); } /* Add the port number to an IPv6 or 4 sockaddr and return the address @@ -733,7 +747,7 @@ static void dlm_page_release(struct kref *kref) ref); __free_page(e->page); - kfree(e); + dlm_free_writequeue(e); } static void dlm_msg_release(struct kref *kref) @@ -741,7 +755,7 @@ static void dlm_msg_release(struct kref *kref) struct dlm_msg *msg = container_of(kref, struct dlm_msg, ref); kref_put(&msg->entry->ref, dlm_page_release); - kfree(msg); + dlm_free_msg(msg); } static void free_entry(struct writequeue_entry *e) @@ -925,6 +939,7 @@ static int receive_from_sock(struct connection *con) msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags); + trace_dlm_recv(con->nodeid, ret); if (ret == -EAGAIN) break; else if (ret <= 0) @@ -1013,10 +1028,28 @@ static int accept_from_sock(struct listen_connection *con) /* Get the new node's NODEID */ make_sockaddr(&peeraddr, 0, &len); if (addr_to_nodeid(&peeraddr, &nodeid, &mark)) { - unsigned char *b=(unsigned char *)&peeraddr; - log_print("connect from non cluster node"); - print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, - b, sizeof(struct sockaddr_storage)); + switch (peeraddr.ss_family) { + case AF_INET: { + struct sockaddr_in *sin = (struct sockaddr_in *)&peeraddr; + + log_print("connect from non cluster IPv4 node %pI4", + &sin->sin_addr); + break; + } +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&peeraddr; + + log_print("connect from non cluster IPv6 node %pI6c", + &sin6->sin6_addr); + break; + } +#endif + default: + log_print("invalid family from non cluster node"); + break; + } + sock_release(newsock); return -1; } @@ -1177,33 +1210,33 @@ static void deinit_local(void) kfree(dlm_local_addr[i]); } -static struct writequeue_entry *new_writequeue_entry(struct connection *con, - gfp_t allocation) +static struct writequeue_entry *new_writequeue_entry(struct connection *con) { struct writequeue_entry *entry; - entry = kzalloc(sizeof(*entry), allocation); + entry = dlm_allocate_writequeue(); if (!entry) return NULL; - entry->page = alloc_page(allocation | __GFP_ZERO); + entry->page = alloc_page(GFP_ATOMIC | __GFP_ZERO); if (!entry->page) { - kfree(entry); + dlm_free_writequeue(entry); return NULL; } + entry->offset = 0; + entry->len = 0; + entry->end = 0; + 
entry->dirty = false; entry->con = con; entry->users = 1; kref_init(&entry->ref); - INIT_LIST_HEAD(&entry->msgs); - return entry; } static struct writequeue_entry *new_wq_entry(struct connection *con, int len, - gfp_t allocation, char **ppc, - void (*cb)(struct dlm_mhandle *mh), - struct dlm_mhandle *mh) + char **ppc, void (*cb)(void *data), + void *data) { struct writequeue_entry *e; @@ -1215,74 +1248,54 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len, *ppc = page_address(e->page) + e->end; if (cb) - cb(mh); + cb(data); e->end += len; e->users++; - spin_unlock(&con->writequeue_lock); - - return e; + goto out; } } - spin_unlock(&con->writequeue_lock); - e = new_writequeue_entry(con, allocation); + e = new_writequeue_entry(con); if (!e) - return NULL; + goto out; kref_get(&e->ref); *ppc = page_address(e->page); e->end += len; atomic_inc(&con->writequeue_cnt); - - spin_lock(&con->writequeue_lock); if (cb) - cb(mh); + cb(data); list_add_tail(&e->list, &con->writequeue); - spin_unlock(&con->writequeue_lock); +out: + spin_unlock(&con->writequeue_lock); return e; }; static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len, gfp_t allocation, char **ppc, - void (*cb)(struct dlm_mhandle *mh), - struct dlm_mhandle *mh) + void (*cb)(void *data), + void *data) { struct writequeue_entry *e; struct dlm_msg *msg; - bool sleepable; - msg = kzalloc(sizeof(*msg), allocation); + msg = dlm_allocate_msg(allocation); if (!msg) return NULL; - /* this mutex is being used as a wait to avoid multiple "fast" - * new writequeue page list entry allocs in new_wq_entry in - * normal operation which is sleepable context. Without it - * we could end in multiple writequeue entries with one - * dlm message because multiple callers were waiting at - * the writequeue_lock in new_wq_entry(). 
- */ - sleepable = gfpflags_normal_context(allocation); - if (sleepable) - mutex_lock(&con->wq_alloc); - kref_init(&msg->ref); - e = new_wq_entry(con, len, allocation, ppc, cb, mh); + e = new_wq_entry(con, len, ppc, cb, data); if (!e) { - if (sleepable) - mutex_unlock(&con->wq_alloc); - - kfree(msg); + dlm_free_msg(msg); return NULL; } - if (sleepable) - mutex_unlock(&con->wq_alloc); - + msg->retransmit = false; + msg->orig_msg = NULL; msg->ppc = *ppc; msg->len = len; msg->entry = e; @@ -1291,8 +1304,8 @@ static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len, } struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, - char **ppc, void (*cb)(struct dlm_mhandle *mh), - struct dlm_mhandle *mh) + char **ppc, void (*cb)(void *data), + void *data) { struct connection *con; struct dlm_msg *msg; @@ -1313,7 +1326,7 @@ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, return NULL; } - msg = dlm_lowcomms_new_msg_con(con, len, allocation, ppc, cb, mh); + msg = dlm_lowcomms_new_msg_con(con, len, allocation, ppc, cb, data); if (!msg) { srcu_read_unlock(&connections_srcu, idx); return NULL; @@ -1403,7 +1416,6 @@ static void send_to_sock(struct connection *con) if (!e) break; - e = list_first_entry(&con->writequeue, struct writequeue_entry, list); len = e->len; offset = e->offset; BUG_ON(len == 0 && e->users == 0); @@ -1411,6 +1423,7 @@ static void send_to_sock(struct connection *con) ret = kernel_sendpage(con->sock, e->page, offset, len, msg_flags); + trace_dlm_send(con->nodeid, ret); if (ret == -EAGAIN || ret == 0) { if (ret == -EAGAIN && test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) && @@ -1680,9 +1693,9 @@ static void _stop_conn(struct connection *con, bool and_other) set_bit(CF_READ_PENDING, &con->flags); set_bit(CF_WRITE_PENDING, &con->flags); if (con->sock && con->sock->sk) { - write_lock_bh(&con->sock->sk->sk_callback_lock); + lock_sock(con->sock->sk); con->sock->sk->sk_user_data = NULL; - write_unlock_bh(&con->sock->sk->sk_callback_lock); + release_sock(con->sock->sk); } if (con->othercon && and_other) _stop_conn(con->othercon, false); @@ -1775,7 +1788,7 @@ static int dlm_listen_for_all(void) result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, SOCK_STREAM, dlm_proto_ops->proto, &sock); if (result < 0) { - log_print("Can't create comms socket, check SCTP is loaded"); + log_print("Can't create comms socket: %d", result); goto out; } diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index 4ccae07cf005..29369feea991 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -38,8 +38,8 @@ void dlm_lowcomms_stop(void); void dlm_lowcomms_exit(void); int dlm_lowcomms_close(int nodeid); struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, - char **ppc, void (*cb)(struct dlm_mhandle *mh), - struct dlm_mhandle *mh); + char **ppc, void (*cb)(void *data), + void *data); void dlm_lowcomms_commit_msg(struct dlm_msg *msg); void dlm_lowcomms_put_msg(struct dlm_msg *msg); int dlm_lowcomms_resend_msg(struct dlm_msg *msg); @@ -47,6 +47,8 @@ int dlm_lowcomms_connect_node(int nodeid); int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark); int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len); void dlm_midcomms_receive_done(int nodeid); +struct kmem_cache *dlm_lowcomms_writequeue_cache_create(void); +struct kmem_cache *dlm_lowcomms_msg_cache_create(void); #endif /* __LOWCOMMS_DOT_H__ */ diff --git a/fs/dlm/main.c b/fs/dlm/main.c index afc66a1346d3..1c5be4b70ac1 100644 --- 
a/fs/dlm/main.c +++ b/fs/dlm/main.c @@ -19,6 +19,9 @@ #include "config.h" #include "lowcomms.h" +#define CREATE_TRACE_POINTS +#include <trace/events/dlm.h> + static int __init init_dlm(void) { int error; diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 731d489aa323..61f906e705db 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -442,8 +442,7 @@ static int ping_members(struct dlm_ls *ls) int error = 0; list_for_each_entry(memb, &ls->ls_nodes, list) { - error = dlm_recovery_stopped(ls); - if (error) { + if (dlm_recovery_stopped(ls)) { error = -EINTR; break; } diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index 5918f4d39586..ce35c3c19aeb 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -10,32 +10,61 @@ ******************************************************************************/ #include "dlm_internal.h" +#include "midcomms.h" +#include "lowcomms.h" #include "config.h" #include "memory.h" +static struct kmem_cache *writequeue_cache; +static struct kmem_cache *mhandle_cache; +static struct kmem_cache *msg_cache; static struct kmem_cache *lkb_cache; static struct kmem_cache *rsb_cache; int __init dlm_memory_init(void) { + writequeue_cache = dlm_lowcomms_writequeue_cache_create(); + if (!writequeue_cache) + goto out; + + mhandle_cache = dlm_midcomms_cache_create(); + if (!mhandle_cache) + goto mhandle; + lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb), __alignof__(struct dlm_lkb), 0, NULL); if (!lkb_cache) - return -ENOMEM; + goto lkb; + + msg_cache = dlm_lowcomms_msg_cache_create(); + if (!msg_cache) + goto msg; rsb_cache = kmem_cache_create("dlm_rsb", sizeof(struct dlm_rsb), __alignof__(struct dlm_rsb), 0, NULL); - if (!rsb_cache) { - kmem_cache_destroy(lkb_cache); - return -ENOMEM; - } + if (!rsb_cache) + goto rsb; return 0; + +rsb: + kmem_cache_destroy(msg_cache); +msg: + kmem_cache_destroy(lkb_cache); +lkb: + kmem_cache_destroy(mhandle_cache); +mhandle: + kmem_cache_destroy(writequeue_cache); +out: + return -ENOMEM; } void dlm_memory_exit(void) { + kmem_cache_destroy(writequeue_cache); + kmem_cache_destroy(mhandle_cache); + kmem_cache_destroy(msg_cache); kmem_cache_destroy(lkb_cache); kmem_cache_destroy(rsb_cache); } @@ -89,3 +118,32 @@ void dlm_free_lkb(struct dlm_lkb *lkb) kmem_cache_free(lkb_cache, lkb); } +struct dlm_mhandle *dlm_allocate_mhandle(void) +{ + return kmem_cache_alloc(mhandle_cache, GFP_NOFS); +} + +void dlm_free_mhandle(struct dlm_mhandle *mhandle) +{ + kmem_cache_free(mhandle_cache, mhandle); +} + +struct writequeue_entry *dlm_allocate_writequeue(void) +{ + return kmem_cache_alloc(writequeue_cache, GFP_ATOMIC); +} + +void dlm_free_writequeue(struct writequeue_entry *writequeue) +{ + kmem_cache_free(writequeue_cache, writequeue); +} + +struct dlm_msg *dlm_allocate_msg(gfp_t allocation) +{ + return kmem_cache_alloc(msg_cache, allocation); +} + +void dlm_free_msg(struct dlm_msg *msg) +{ + kmem_cache_free(msg_cache, msg); +} diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h index 4f218ea4b187..7bd3f1a391ca 100644 --- a/fs/dlm/memory.h +++ b/fs/dlm/memory.h @@ -20,6 +20,12 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls); void dlm_free_lkb(struct dlm_lkb *l); char *dlm_allocate_lvb(struct dlm_ls *ls); void dlm_free_lvb(char *l); +struct dlm_mhandle *dlm_allocate_mhandle(void); +void dlm_free_mhandle(struct dlm_mhandle *mhandle); +struct writequeue_entry *dlm_allocate_writequeue(void); +void dlm_free_writequeue(struct writequeue_entry *writequeue); +struct dlm_msg *dlm_allocate_msg(gfp_t allocation); +void dlm_free_msg(struct dlm_msg *msg); 
#endif /* __MEMORY_DOT_H__ */ diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 7ae39ec8d9b0..3635e42b0669 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -137,6 +137,7 @@ #include "dlm_internal.h" #include "lowcomms.h" #include "config.h" +#include "memory.h" #include "lock.h" #include "util.h" #include "midcomms.h" @@ -220,6 +221,12 @@ DEFINE_STATIC_SRCU(nodes_srcu); */ static DEFINE_MUTEX(close_lock); +struct kmem_cache *dlm_midcomms_cache_create(void) +{ + return kmem_cache_create("dlm_mhandle", sizeof(struct dlm_mhandle), + 0, 0, NULL); +} + static inline const char *dlm_state_str(int state) { switch (state) { @@ -279,7 +286,7 @@ static void dlm_mhandle_release(struct rcu_head *rcu) struct dlm_mhandle *mh = container_of(rcu, struct dlm_mhandle, rcu); dlm_lowcomms_put_msg(mh->msg); - kfree(mh); + dlm_free_mhandle(mh); } static void dlm_mhandle_delete(struct midcomms_node *node, @@ -909,11 +916,11 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) if (msglen > len) break; - switch (le32_to_cpu(hd->h_version)) { - case DLM_VERSION_3_1: + switch (hd->h_version) { + case cpu_to_le32(DLM_VERSION_3_1): dlm_midcomms_receive_buffer_3_1((union dlm_packet *)ptr, nodeid); break; - case DLM_VERSION_3_2: + case cpu_to_le32(DLM_VERSION_3_2): dlm_midcomms_receive_buffer_3_2((union dlm_packet *)ptr, nodeid); break; default: @@ -969,7 +976,7 @@ void dlm_midcomms_receive_done(int nodeid) spin_unlock(&node->state_lock); /* do nothing FIN has it's own ack send */ break; - }; + } srcu_read_unlock(&nodes_srcu, idx); } @@ -1020,8 +1027,10 @@ static void dlm_fill_opts_header(struct dlm_opts *opts, uint16_t inner_len, header_out(&opts->o_header); } -static void midcomms_new_msg_cb(struct dlm_mhandle *mh) +static void midcomms_new_msg_cb(void *data) { + struct dlm_mhandle *mh = data; + atomic_inc(&mh->node->send_queue_cnt); spin_lock(&mh->node->send_queue_lock); @@ -1071,10 +1080,12 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, /* this is a bug, however we going on and hope it will be resolved */ WARN_ON(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags)); - mh = kzalloc(sizeof(*mh), GFP_NOFS); + mh = dlm_allocate_mhandle(); if (!mh) goto err; + mh->committed = false; + mh->ack_rcv = NULL; mh->idx = idx; mh->node = node; @@ -1083,7 +1094,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, msg = dlm_lowcomms_new_msg(nodeid, len, allocation, ppc, NULL, NULL); if (!msg) { - kfree(mh); + dlm_free_mhandle(mh); goto err; } @@ -1092,13 +1103,13 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, msg = dlm_midcomms_get_msg_3_2(mh, nodeid, len, allocation, ppc); if (!msg) { - kfree(mh); + dlm_free_mhandle(mh); goto err; } break; default: - kfree(mh); + dlm_free_mhandle(mh); WARN_ON(1); goto err; } @@ -1134,7 +1145,7 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh) dlm_lowcomms_commit_msg(mh->msg); dlm_lowcomms_put_msg(mh->msg); /* mh is not part of rcu list in this case */ - kfree(mh); + dlm_free_mhandle(mh); break; case DLM_VERSION_3_2: dlm_midcomms_commit_msg_3_2(mh); @@ -1231,7 +1242,7 @@ void dlm_midcomms_add_member(int nodeid) } node->users++; - pr_debug("users inc count %d\n", node->users); + pr_debug("node %d users inc count %d\n", nodeid, node->users); spin_unlock(&node->state_lock); srcu_read_unlock(&nodes_srcu, idx); @@ -1254,7 +1265,7 @@ void dlm_midcomms_remove_member(int nodeid) spin_lock(&node->state_lock); node->users--; - pr_debug("users dec count %d\n", node->users); + pr_debug("node %d users dec 
count %d\n", nodeid, node->users); /* hitting users count to zero means the * other side is running dlm_midcomms_stop() @@ -1425,3 +1436,51 @@ int dlm_midcomms_close(int nodeid) return ret; } + +/* debug functionality to send raw dlm msg from user space */ +struct dlm_rawmsg_data { + struct midcomms_node *node; + void *buf; +}; + +static void midcomms_new_rawmsg_cb(void *data) +{ + struct dlm_rawmsg_data *rd = data; + struct dlm_header *h = rd->buf; + + switch (h->h_version) { + case cpu_to_le32(DLM_VERSION_3_1): + break; + default: + switch (h->h_cmd) { + case DLM_OPTS: + if (!h->u.h_seq) + h->u.h_seq = rd->node->seq_send++; + break; + default: + break; + } + break; + } +} + +int dlm_midcomms_rawmsg_send(struct midcomms_node *node, void *buf, + int buflen) +{ + struct dlm_rawmsg_data rd; + struct dlm_msg *msg; + char *msgbuf; + + rd.node = node; + rd.buf = buf; + + msg = dlm_lowcomms_new_msg(node->nodeid, buflen, GFP_NOFS, + &msgbuf, midcomms_new_rawmsg_cb, &rd); + if (!msg) + return -ENOMEM; + + memcpy(msgbuf, buf, buflen); + dlm_lowcomms_commit_msg(msg); + return 0; +} + diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index 579abc6929be..82bcd9661922 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -28,6 +28,9 @@ const char *dlm_midcomms_state(struct midcomms_node *node); unsigned long dlm_midcomms_flags(struct midcomms_node *node); int dlm_midcomms_send_queue_cnt(struct midcomms_node *node); uint32_t dlm_midcomms_version(struct midcomms_node *node); +int dlm_midcomms_rawmsg_send(struct midcomms_node *node, void *buf, + int buflen); +struct kmem_cache *dlm_midcomms_cache_create(void); #endif /* __MIDCOMMS_DOT_H__ */ diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 6cba86470278..5821b777a1a7 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -601,7 +601,7 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) spin_lock(&ls->ls_recover_lock); status = ls->ls_recover_status; - stop = test_bit(LSFL_RECOVER_STOP, &ls->ls_flags); + stop = dlm_recovery_stopped(ls); seq = ls->ls_recover_seq; spin_unlock(&ls->ls_recover_lock); diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 97d052cea5a9..a55dfce705dd 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -124,8 +124,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_recover_waiters_pre(ls); - error = dlm_recovery_stopped(ls); - if (error) { + if (dlm_recovery_stopped(ls)) { error = -EINTR; goto fail; } diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c index e89e0ff8bfa3..ccb5307c21e9 100644 --- a/fs/dlm/requestqueue.c +++ b/fs/dlm/requestqueue.c @@ -44,6 +44,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms) e->nodeid = nodeid; memcpy(&e->request, ms, ms->m_header.h_length); + atomic_inc(&ls->ls_requestqueue_cnt); mutex_lock(&ls->ls_requestqueue_mutex); list_add_tail(&e->list, &ls->ls_requestqueue); mutex_unlock(&ls->ls_requestqueue_mutex); @@ -89,6 +90,8 @@ int dlm_process_requestqueue(struct dlm_ls *ls) mutex_lock(&ls->ls_requestqueue_mutex); list_del(&e->list); + if (atomic_dec_and_test(&ls->ls_requestqueue_cnt)) + wake_up(&ls->ls_requestqueue_wait); kfree(e); if (dlm_locking_stopped(ls)) { @@ -115,14 +118,8 @@ int dlm_process_requestqueue(struct dlm_ls *ls) void dlm_wait_requestqueue(struct dlm_ls *ls) { - for (;;) { - mutex_lock(&ls->ls_requestqueue_mutex); - if (list_empty(&ls->ls_requestqueue)) - break; - mutex_unlock(&ls->ls_requestqueue_mutex); - schedule(); - } - mutex_unlock(&ls->ls_requestqueue_mutex); + 
wait_event(ls->ls_requestqueue_wait, + atomic_read(&ls->ls_requestqueue_cnt) == 0); } static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid) @@ -130,7 +127,7 @@ static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid) uint32_t type = ms->m_type; /* the ls is being cleaned up and freed by release_lockspace */ - if (!ls->ls_count) + if (!atomic_read(&ls->ls_count)) return 1; if (dlm_is_removed(ls, nodeid)) @@ -161,6 +158,8 @@ void dlm_purge_requestqueue(struct dlm_ls *ls) if (purge_request(ls, ms, e->nodeid)) { list_del(&e->list); + if (atomic_dec_and_test(&ls->ls_requestqueue_cnt)) + wake_up(&ls->ls_requestqueue_wait); kfree(e); } } diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index d66bbd2df191..2dd23a82e0de 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -537,7 +537,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out_free; } - if (mnt_user_ns(path.mnt) != &init_user_ns) { + if (is_idmapped_mnt(path.mnt)) { rc = -EINVAL; printk(KERN_ERR "Mounting on idmapped mounts currently disallowed\n"); goto out_free; diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index 756fe2d65272..8a3317e38e5a 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_EROFS_FS) += erofs.o -erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o +erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o sysfs.o erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 579406504919..19e6c56a9f47 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -12,7 +12,7 @@ struct z_erofs_decompress_req { struct super_block *sb; struct page **in, **out; - unsigned short pageofs_out; + unsigned short pageofs_in, pageofs_out; unsigned int inputsize, outputsize; /* indicate the algorithm will be used for decompression */ @@ -87,6 +87,8 @@ static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, return page->mapping == MNGD_MAPPING(sbi); } +int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, + unsigned int padbufsize); int z_erofs_decompress(struct z_erofs_decompress_req *rq, struct page **pagepool); diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 0e35ef3f9f3d..226a57c57ee6 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -9,37 +9,71 @@ #include <linux/dax.h> #include <trace/events/erofs.h> -struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr) +void erofs_unmap_metabuf(struct erofs_buf *buf) +{ + if (buf->kmap_type == EROFS_KMAP) + kunmap(buf->page); + else if (buf->kmap_type == EROFS_KMAP_ATOMIC) + kunmap_atomic(buf->base); + buf->base = NULL; + buf->kmap_type = EROFS_NO_KMAP; +} + +void erofs_put_metabuf(struct erofs_buf *buf) +{ + if (!buf->page) + return; + erofs_unmap_metabuf(buf); + put_page(buf->page); + buf->page = NULL; +} + +void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, + erofs_blk_t blkaddr, enum erofs_kmap_type type) { struct address_space *const mapping = sb->s_bdev->bd_inode->i_mapping; - struct page *page; - - page = read_cache_page_gfp(mapping, blkaddr, - mapping_gfp_constraint(mapping, ~__GFP_FS)); - /* should already be PageUptodate */ - if (!IS_ERR(page)) - lock_page(page); - return page; + erofs_off_t offset = blknr_to_addr(blkaddr); + 
pgoff_t index = offset >> PAGE_SHIFT; + struct page *page = buf->page; + + if (!page || page->index != index) { + erofs_put_metabuf(buf); + page = read_cache_page_gfp(mapping, index, + mapping_gfp_constraint(mapping, ~__GFP_FS)); + if (IS_ERR(page)) + return page; + /* should already be PageUptodate, no need to lock page */ + buf->page = page; + } + if (buf->kmap_type == EROFS_NO_KMAP) { + if (type == EROFS_KMAP) + buf->base = kmap(page); + else if (type == EROFS_KMAP_ATOMIC) + buf->base = kmap_atomic(page); + buf->kmap_type = type; + } else if (buf->kmap_type != type) { + DBG_BUGON(1); + return ERR_PTR(-EFAULT); + } + if (type == EROFS_NO_KMAP) + return NULL; + return buf->base + (offset & ~PAGE_MASK); } static int erofs_map_blocks_flatmode(struct inode *inode, struct erofs_map_blocks *map, int flags) { - int err = 0; erofs_blk_t nblocks, lastblk; u64 offset = map->m_la; struct erofs_inode *vi = EROFS_I(inode); bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE); - trace_erofs_map_blocks_flatmode_enter(inode, map, flags); - - nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE); + nblocks = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); lastblk = nblocks - tailendpacking; /* there is no hole in flatmode */ map->m_flags = EROFS_MAP_MAPPED; - if (offset < blknr_to_addr(lastblk)) { map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la; map->m_plen = blknr_to_addr(lastblk) - offset; @@ -51,30 +85,23 @@ static int erofs_map_blocks_flatmode(struct inode *inode, vi->xattr_isize + erofs_blkoff(map->m_la); map->m_plen = inode->i_size - offset; - /* inline data should be located in one meta block */ - if (erofs_blkoff(map->m_pa) + map->m_plen > PAGE_SIZE) { + /* inline data should be located in the same meta block */ + if (erofs_blkoff(map->m_pa) + map->m_plen > EROFS_BLKSIZ) { erofs_err(inode->i_sb, "inline data cross block boundary @ nid %llu", vi->nid); DBG_BUGON(1); - err = -EFSCORRUPTED; - goto err_out; + return -EFSCORRUPTED; } - map->m_flags |= EROFS_MAP_META; } else { erofs_err(inode->i_sb, "internal error @ nid: %llu (size %llu), m_la 0x%llx", vi->nid, inode->i_size, map->m_la); DBG_BUGON(1); - err = -EIO; - goto err_out; + return -EIO; } - - map->m_llen = map->m_plen; -err_out: - trace_erofs_map_blocks_flatmode_exit(inode, map, flags, 0); - return err; + return 0; } static int erofs_map_blocks(struct inode *inode, @@ -83,12 +110,14 @@ static int erofs_map_blocks(struct inode *inode, struct super_block *sb = inode->i_sb; struct erofs_inode *vi = EROFS_I(inode); struct erofs_inode_chunk_index *idx; - struct page *page; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; u64 chunknr; unsigned int unit; erofs_off_t pos; + void *kaddr; int err = 0; + trace_erofs_map_blocks_enter(inode, map, flags); map->m_deviceid = 0; if (map->m_la >= inode->i_size) { /* leave out-of-bound access unmapped */ @@ -97,8 +126,10 @@ static int erofs_map_blocks(struct inode *inode, goto out; } - if (vi->datalayout != EROFS_INODE_CHUNK_BASED) - return erofs_map_blocks_flatmode(inode, map, flags); + if (vi->datalayout != EROFS_INODE_CHUNK_BASED) { + err = erofs_map_blocks_flatmode(inode, map, flags); + goto out; + } if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) unit = sizeof(*idx); /* chunk index */ @@ -109,17 +140,18 @@ static int erofs_map_blocks(struct inode *inode, pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + vi->xattr_isize, unit) + unit * chunknr; - page = erofs_get_meta_page(inode->i_sb, erofs_blknr(pos)); - if (IS_ERR(page)) - return PTR_ERR(page); - + kaddr = erofs_read_metabuf(&buf, 
sb, erofs_blknr(pos), EROFS_KMAP); + if (IS_ERR(kaddr)) { + err = PTR_ERR(kaddr); + goto out; + } map->m_la = chunknr << vi->chunkbits; map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits, roundup(inode->i_size - map->m_la, EROFS_BLKSIZ)); /* handle block map */ if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) { - __le32 *blkaddr = page_address(page) + erofs_blkoff(pos); + __le32 *blkaddr = kaddr + erofs_blkoff(pos); if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) { map->m_flags = 0; @@ -130,7 +162,7 @@ static int erofs_map_blocks(struct inode *inode, goto out_unlock; } /* parse chunk indexes */ - idx = page_address(page) + erofs_blkoff(pos); + idx = kaddr + erofs_blkoff(pos); switch (le32_to_cpu(idx->blkaddr)) { case EROFS_NULL_ADDR: map->m_flags = 0; @@ -143,10 +175,11 @@ static int erofs_map_blocks(struct inode *inode, break; } out_unlock: - unlock_page(page); - put_page(page); + erofs_put_metabuf(&buf); out: - map->m_llen = map->m_plen; + if (!err) + map->m_llen = map->m_plen; + trace_erofs_map_blocks_exit(inode, map, flags, 0); return err; } @@ -159,6 +192,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) /* primary device by default */ map->m_bdev = sb->s_bdev; map->m_daxdev = EROFS_SB(sb)->dax_dev; + map->m_dax_part_off = EROFS_SB(sb)->dax_part_off; if (map->m_deviceid) { down_read(&devs->rwsem); @@ -169,6 +203,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) } map->m_bdev = dif->bdev; map->m_daxdev = dif->dax_dev; + map->m_dax_part_off = dif->dax_part_off; up_read(&devs->rwsem); } else if (devs->extra_devices) { down_read(&devs->rwsem); @@ -185,6 +220,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) map->m_pa -= startoff; map->m_bdev = dif->bdev; map->m_daxdev = dif->dax_dev; + map->m_dax_part_off = dif->dax_part_off; break; } } @@ -215,9 +251,11 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (ret) return ret; - iomap->bdev = mdev.m_bdev; - iomap->dax_dev = mdev.m_daxdev; iomap->offset = map.m_la; + if (flags & IOMAP_DAX) + iomap->dax_dev = mdev.m_daxdev; + else + iomap->bdev = mdev.m_bdev; iomap->length = map.m_llen; iomap->flags = 0; iomap->private = NULL; @@ -231,19 +269,21 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, } if (map.m_flags & EROFS_MAP_META) { - struct page *ipage; + void *ptr; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; iomap->type = IOMAP_INLINE; - ipage = erofs_get_meta_page(inode->i_sb, - erofs_blknr(mdev.m_pa)); - if (IS_ERR(ipage)) - return PTR_ERR(ipage); - iomap->inline_data = page_address(ipage) + - erofs_blkoff(mdev.m_pa); - iomap->private = ipage; + ptr = erofs_read_metabuf(&buf, inode->i_sb, + erofs_blknr(mdev.m_pa), EROFS_KMAP); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + iomap->inline_data = ptr + erofs_blkoff(mdev.m_pa); + iomap->private = buf.base; } else { iomap->type = IOMAP_MAPPED; iomap->addr = mdev.m_pa; + if (flags & IOMAP_DAX) + iomap->addr += mdev.m_dax_part_off; } return 0; } @@ -251,12 +291,17 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length, ssize_t written, unsigned int flags, struct iomap *iomap) { - struct page *ipage = iomap->private; + void *ptr = iomap->private; + + if (ptr) { + struct erofs_buf buf = { + .page = kmap_to_page(ptr), + .base = ptr, + .kmap_type = EROFS_KMAP, + }; - if (ipage) { DBG_BUGON(iomap->type != IOMAP_INLINE); - unlock_page(ipage); - 
put_page(ipage); + erofs_put_metabuf(&buf); } else { DBG_BUGON(iomap->type == IOMAP_INLINE); } diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index bf37fc76b182..3efa686c7644 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -16,6 +16,14 @@ #define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32) #endif +struct z_erofs_lz4_decompress_ctx { + struct z_erofs_decompress_req *rq; + /* # of encoded, decoded pages */ + unsigned int inpages, outpages; + /* decoded block total length (used for in-place decompression) */ + unsigned int oend; +}; + int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int size) @@ -56,11 +64,10 @@ int z_erofs_load_lz4_config(struct super_block *sb, * Fill all gaps with bounce pages if it's a sparse page list. Also check if * all physical pages are consecutive, which can be seen for moderate CR. */ -static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq, +static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, struct page **pagepool) { - const unsigned int nr = - PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + struct z_erofs_decompress_req *rq = ctx->rq; struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL }; unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES, BITS_PER_LONG)] = { 0 }; @@ -70,7 +77,7 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq, unsigned int i, j, top; top = 0; - for (i = j = 0; i < nr; ++i, ++j) { + for (i = j = 0; i < ctx->outpages; ++i, ++j) { struct page *const page = rq->out[i]; struct page *victim; @@ -112,41 +119,36 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq, return kaddr ? 
1 : 0; } -static void *z_erofs_lz4_handle_inplace_io(struct z_erofs_decompress_req *rq, +static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, void *inpage, unsigned int *inputmargin, int *maptype, - bool support_0padding) + bool may_inplace) { - unsigned int nrpages_in, nrpages_out; - unsigned int ofull, oend, inputsize, total, i, j; + struct z_erofs_decompress_req *rq = ctx->rq; + unsigned int omargin, total, i, j; struct page **in; void *src, *tmp; - inputsize = rq->inputsize; - nrpages_in = PAGE_ALIGN(inputsize) >> PAGE_SHIFT; - oend = rq->pageofs_out + rq->outputsize; - ofull = PAGE_ALIGN(oend); - nrpages_out = ofull >> PAGE_SHIFT; - if (rq->inplace_io) { - if (rq->partial_decoding || !support_0padding || - ofull - oend < LZ4_DECOMPRESS_INPLACE_MARGIN(inputsize)) + omargin = PAGE_ALIGN(ctx->oend) - ctx->oend; + if (rq->partial_decoding || !may_inplace || + omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize)) goto docopy; - for (i = 0; i < nrpages_in; ++i) { + for (i = 0; i < ctx->inpages; ++i) { DBG_BUGON(rq->in[i] == NULL); - for (j = 0; j < nrpages_out - nrpages_in + i; ++j) + for (j = 0; j < ctx->outpages - ctx->inpages + i; ++j) if (rq->out[j] == rq->in[i]) goto docopy; } } - if (nrpages_in <= 1) { + if (ctx->inpages <= 1) { *maptype = 0; return inpage; } kunmap_atomic(inpage); might_sleep(); - src = erofs_vm_map_ram(rq->in, nrpages_in); + src = erofs_vm_map_ram(rq->in, ctx->inpages); if (!src) return ERR_PTR(-ENOMEM); *maptype = 1; @@ -155,7 +157,7 @@ static void *z_erofs_lz4_handle_inplace_io(struct z_erofs_decompress_req *rq, docopy: /* Or copy compressed data which can be overlapped to per-CPU buffer */ in = rq->in; - src = erofs_get_pcpubuf(nrpages_in); + src = erofs_get_pcpubuf(ctx->inpages); if (!src) { DBG_BUGON(1); kunmap_atomic(inpage); @@ -182,36 +184,53 @@ docopy: return src; } -static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, +/* + * Get the exact inputsize with zero_padding feature. + * - For LZ4, it should work if zero_padding feature is on (5.3+); + * - For MicroLZMA, it'd be enabled all the time. 
+ */ +int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, + unsigned int padbufsize) +{ + const char *padend; + + padend = memchr_inv(padbuf, 0, padbufsize); + if (!padend) + return -EFSCORRUPTED; + rq->inputsize -= padend - padbuf; + rq->pageofs_in += padend - padbuf; + return 0; +} + +static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, u8 *out) { + struct z_erofs_decompress_req *rq = ctx->rq; + bool support_0padding = false, may_inplace = false; unsigned int inputmargin; u8 *headpage, *src; - bool support_0padding; int ret, maptype; DBG_BUGON(*rq->in == NULL); headpage = kmap_atomic(*rq->in); - inputmargin = 0; - support_0padding = false; - /* decompression inplace is only safe when 0padding is enabled */ - if (erofs_sb_has_lz4_0padding(EROFS_SB(rq->sb))) { + /* LZ4 decompression inplace is only safe if zero_padding is enabled */ + if (erofs_sb_has_zero_padding(EROFS_SB(rq->sb))) { support_0padding = true; - - while (!headpage[inputmargin & ~PAGE_MASK]) - if (!(++inputmargin & ~PAGE_MASK)) - break; - - if (inputmargin >= rq->inputsize) { + ret = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in, + min_t(unsigned int, rq->inputsize, + EROFS_BLKSIZ - rq->pageofs_in)); + if (ret) { kunmap_atomic(headpage); - return -EIO; + return ret; } + may_inplace = !((rq->pageofs_in + rq->inputsize) & + (EROFS_BLKSIZ - 1)); } - rq->inputsize -= inputmargin; - src = z_erofs_lz4_handle_inplace_io(rq, headpage, &inputmargin, - &maptype, support_0padding); + inputmargin = rq->pageofs_in; + src = z_erofs_lz4_handle_overlap(ctx, headpage, &inputmargin, + &maptype, may_inplace); if (IS_ERR(src)) return PTR_ERR(src); @@ -240,9 +259,9 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, } if (maptype == 0) { - kunmap_atomic(src); + kunmap_atomic(headpage); } else if (maptype == 1) { - vm_unmap_ram(src, PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT); + vm_unmap_ram(src, ctx->inpages); } else if (maptype == 2) { erofs_put_pcpubuf(src); } else { @@ -255,14 +274,18 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, struct page **pagepool) { - const unsigned int nrpages_out = - PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + struct z_erofs_lz4_decompress_ctx ctx; unsigned int dst_maptype; void *dst; int ret; + ctx.rq = rq; + ctx.oend = rq->pageofs_out + rq->outputsize; + ctx.outpages = PAGE_ALIGN(ctx.oend) >> PAGE_SHIFT; + ctx.inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; + /* one optimized fast path only for non bigpcluster cases yet */ - if (rq->inputsize <= PAGE_SIZE && nrpages_out == 1 && !rq->inplace_io) { + if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) { DBG_BUGON(!*rq->out); dst = kmap_atomic(*rq->out); dst_maptype = 0; @@ -270,27 +293,25 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, } /* general decoding path which can be used for all cases */ - ret = z_erofs_lz4_prepare_dstpages(rq, pagepool); - if (ret < 0) + ret = z_erofs_lz4_prepare_dstpages(&ctx, pagepool); + if (ret < 0) { return ret; - if (ret) { + } else if (ret > 0) { dst = page_address(*rq->out); dst_maptype = 1; - goto dstmap_out; + } else { + dst = erofs_vm_map_ram(rq->out, ctx.outpages); + if (!dst) + return -ENOMEM; + dst_maptype = 2; } - dst = erofs_vm_map_ram(rq->out, nrpages_out); - if (!dst) - return -ENOMEM; - dst_maptype = 2; - dstmap_out: - ret = z_erofs_lz4_decompress_mem(rq, dst + rq->pageofs_out); - + ret = 
z_erofs_lz4_decompress_mem(&ctx, dst + rq->pageofs_out); if (!dst_maptype) kunmap_atomic(dst); else if (dst_maptype == 2) - vm_unmap_ram(dst, nrpages_out); + vm_unmap_ram(dst, ctx.outpages); return ret; } @@ -299,7 +320,8 @@ static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq, { const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; - const unsigned int righthalf = PAGE_SIZE - rq->pageofs_out; + const unsigned int righthalf = min_t(unsigned int, rq->outputsize, + PAGE_SIZE - rq->pageofs_out); unsigned char *src, *dst; if (nrpages_out > 2) { @@ -312,7 +334,7 @@ static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq, return 0; } - src = kmap_atomic(*rq->in); + src = kmap_atomic(*rq->in) + rq->pageofs_in; if (rq->out[0]) { dst = kmap_atomic(rq->out[0]); memcpy(dst + rq->pageofs_out, src, righthalf); diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 50045510a1f4..05a3063cf2bc 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -156,7 +156,7 @@ int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; const unsigned int nrpages_in = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; - unsigned int inputmargin, inlen, outlen, pageofs; + unsigned int inlen, outlen, pageofs; struct z_erofs_lzma *strm; u8 *kin; bool bounced = false; @@ -164,16 +164,13 @@ int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, /* 1. get the exact LZMA compressed size */ kin = kmap(*rq->in); - inputmargin = 0; - while (!kin[inputmargin & ~PAGE_MASK]) - if (!(++inputmargin & ~PAGE_MASK)) - break; - - if (inputmargin >= PAGE_SIZE) { + err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in, + min_t(unsigned int, rq->inputsize, + EROFS_BLKSIZ - rq->pageofs_in)); + if (err) { kunmap(*rq->in); - return -EFSCORRUPTED; + return err; } - rq->inputsize -= inputmargin; /* 2. get an available lzma context */ again: @@ -193,9 +190,9 @@ again: xz_dec_microlzma_reset(strm->state, inlen, outlen, !rq->partial_decoding); pageofs = rq->pageofs_out; - strm->buf.in = kin + inputmargin; + strm->buf.in = kin + rq->pageofs_in; strm->buf.in_pos = 0; - strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - inputmargin); + strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - rq->pageofs_in); inlen -= strm->buf.in_size; strm->buf.out = NULL; strm->buf.out_pos = 0; diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 083997a034e5..3ea62c6fb00a 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -17,19 +17,21 @@ * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should * be incompatible with this kernel version. 
*/ -#define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING 0x00000001 +#define EROFS_FEATURE_INCOMPAT_ZERO_PADDING 0x00000001 #define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002 #define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002 #define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004 #define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008 #define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 0x00000008 +#define EROFS_FEATURE_INCOMPAT_ZTAILPACKING 0x00000010 #define EROFS_ALL_FEATURE_INCOMPAT \ - (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \ + (EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \ EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \ EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \ - EROFS_FEATURE_INCOMPAT_COMPR_HEAD2) + EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \ + EROFS_FEATURE_INCOMPAT_ZTAILPACKING) #define EROFS_SB_EXTSLOT_SIZE 16 @@ -209,7 +211,7 @@ struct erofs_xattr_ibody_header { __le32 h_reserved; __u8 h_shared_count; __u8 h_reserved2[7]; - __le32 h_shared_xattrs[0]; /* shared xattr id array */ + __le32 h_shared_xattrs[]; /* shared xattr id array */ }; /* Name indexes */ @@ -226,7 +228,7 @@ struct erofs_xattr_entry { __u8 e_name_index; /* attribute name index */ __le16 e_value_size; /* size of attribute value */ /* followed by e_name and e_value */ - char e_name[0]; /* attribute name */ + char e_name[]; /* attribute name */ }; static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount) @@ -292,13 +294,17 @@ struct z_erofs_lzma_cfgs { * (4B) + 2B + (4B) if compacted 2B is on. * bit 1 : HEAD1 big pcluster (0 - off; 1 - on) * bit 2 : HEAD2 big pcluster (0 - off; 1 - on) + * bit 3 : tailpacking inline pcluster (0 - off; 1 - on) */ #define Z_EROFS_ADVISE_COMPACTED_2B 0x0001 #define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002 #define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004 +#define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008 struct z_erofs_map_header { - __le32 h_reserved1; + __le16 h_reserved1; + /* indicates the encoded size of tailpacking data */ + __le16 h_idata_size; __le16 h_advise; /* * bit 0-3 : algorithm type of head 1 (logical cluster type 01); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 2345f1de438e..ff62f84f47d3 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -13,8 +13,8 @@ * the inode payload page if it's an extended inode) in order to fill * inline data if possible. 
*/ -static struct page *erofs_read_inode(struct inode *inode, - unsigned int *ofs) +static void *erofs_read_inode(struct erofs_buf *buf, + struct inode *inode, unsigned int *ofs) { struct super_block *sb = inode->i_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); @@ -22,7 +22,7 @@ static struct page *erofs_read_inode(struct inode *inode, const erofs_off_t inode_loc = iloc(sbi, vi->nid); erofs_blk_t blkaddr, nblks = 0; - struct page *page; + void *kaddr; struct erofs_inode_compact *dic; struct erofs_inode_extended *die, *copied = NULL; unsigned int ifmt; @@ -34,14 +34,14 @@ static struct page *erofs_read_inode(struct inode *inode, erofs_dbg("%s, reading inode nid %llu at %u of blkaddr %u", __func__, vi->nid, *ofs, blkaddr); - page = erofs_get_meta_page(sb, blkaddr); - if (IS_ERR(page)) { + kaddr = erofs_read_metabuf(buf, sb, blkaddr, EROFS_KMAP); + if (IS_ERR(kaddr)) { erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld", - vi->nid, PTR_ERR(page)); - return page; + vi->nid, PTR_ERR(kaddr)); + return kaddr; } - dic = page_address(page) + *ofs; + dic = kaddr + *ofs; ifmt = le16_to_cpu(dic->i_format); if (ifmt & ~EROFS_I_ALL) { @@ -62,12 +62,12 @@ static struct page *erofs_read_inode(struct inode *inode, switch (erofs_inode_version(ifmt)) { case EROFS_INODE_LAYOUT_EXTENDED: vi->inode_isize = sizeof(struct erofs_inode_extended); - /* check if the inode acrosses page boundary */ - if (*ofs + vi->inode_isize <= PAGE_SIZE) { + /* check if the extended inode acrosses block boundary */ + if (*ofs + vi->inode_isize <= EROFS_BLKSIZ) { *ofs += vi->inode_isize; die = (struct erofs_inode_extended *)dic; } else { - const unsigned int gotten = PAGE_SIZE - *ofs; + const unsigned int gotten = EROFS_BLKSIZ - *ofs; copied = kmalloc(vi->inode_isize, GFP_NOFS); if (!copied) { @@ -75,18 +75,16 @@ static struct page *erofs_read_inode(struct inode *inode, goto err_out; } memcpy(copied, dic, gotten); - unlock_page(page); - put_page(page); - - page = erofs_get_meta_page(sb, blkaddr + 1); - if (IS_ERR(page)) { - erofs_err(sb, "failed to get inode payload page (nid: %llu), err %ld", - vi->nid, PTR_ERR(page)); + kaddr = erofs_read_metabuf(buf, sb, blkaddr + 1, + EROFS_KMAP); + if (IS_ERR(kaddr)) { + erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld", + vi->nid, PTR_ERR(kaddr)); kfree(copied); - return page; + return kaddr; } *ofs = vi->inode_isize - gotten; - memcpy((u8 *)copied + gotten, page_address(page), *ofs); + memcpy((u8 *)copied + gotten, kaddr, *ofs); die = copied; } vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount); @@ -200,7 +198,7 @@ static struct page *erofs_read_inode(struct inode *inode, inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9; else inode->i_blocks = nblks << LOG_SECTORS_PER_BLOCK; - return page; + return kaddr; bogusimode: erofs_err(inode->i_sb, "bogus i_mode (%o) @ nid %llu", @@ -209,12 +207,11 @@ bogusimode: err_out: DBG_BUGON(1); kfree(copied); - unlock_page(page); - put_page(page); + erofs_put_metabuf(buf); return ERR_PTR(err); } -static int erofs_fill_symlink(struct inode *inode, void *data, +static int erofs_fill_symlink(struct inode *inode, void *kaddr, unsigned int m_pofs) { struct erofs_inode *vi = EROFS_I(inode); @@ -222,7 +219,7 @@ static int erofs_fill_symlink(struct inode *inode, void *data, /* if it cannot be handled with fast symlink scheme */ if (vi->datalayout != EROFS_INODE_FLAT_INLINE || - inode->i_size >= PAGE_SIZE) { + inode->i_size >= EROFS_BLKSIZ) { inode->i_op = &erofs_symlink_iops; return 0; } @@ -232,8 +229,8 @@ static 
int erofs_fill_symlink(struct inode *inode, void *data, return -ENOMEM; m_pofs += vi->xattr_isize; - /* inline symlink data shouldn't cross page boundary as well */ - if (m_pofs + inode->i_size > PAGE_SIZE) { + /* inline symlink data shouldn't cross block boundary */ + if (m_pofs + inode->i_size > EROFS_BLKSIZ) { kfree(lnk); erofs_err(inode->i_sb, "inline data cross block boundary @ nid %llu", @@ -241,8 +238,7 @@ static int erofs_fill_symlink(struct inode *inode, void *data, DBG_BUGON(1); return -EFSCORRUPTED; } - - memcpy(lnk, data + m_pofs, inode->i_size); + memcpy(lnk, kaddr + m_pofs, inode->i_size); lnk[inode->i_size] = '\0'; inode->i_link = lnk; @@ -253,16 +249,17 @@ static int erofs_fill_symlink(struct inode *inode, void *data, static int erofs_fill_inode(struct inode *inode, int isdir) { struct erofs_inode *vi = EROFS_I(inode); - struct page *page; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + void *kaddr; unsigned int ofs; int err = 0; trace_erofs_fill_inode(inode, isdir); /* read inode base data from disk */ - page = erofs_read_inode(inode, &ofs); - if (IS_ERR(page)) - return PTR_ERR(page); + kaddr = erofs_read_inode(&buf, inode, &ofs); + if (IS_ERR(kaddr)) + return PTR_ERR(kaddr); /* setup the new inode */ switch (inode->i_mode & S_IFMT) { @@ -278,7 +275,7 @@ static int erofs_fill_inode(struct inode *inode, int isdir) inode->i_fop = &erofs_dir_fops; break; case S_IFLNK: - err = erofs_fill_symlink(inode, page_address(page), ofs); + err = erofs_fill_symlink(inode, kaddr, ofs); if (err) goto out_unlock; inode_nohighmem(inode); @@ -302,8 +299,7 @@ static int erofs_fill_inode(struct inode *inode, int isdir) inode->i_mapping->a_ops = &erofs_raw_access_aops; out_unlock: - unlock_page(page); - put_page(page); + erofs_put_metabuf(&buf); return err; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 3265688af7f9..b8272fb95fd6 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -51,17 +51,24 @@ struct erofs_device_info { char *path; struct block_device *bdev; struct dax_device *dax_dev; + u64 dax_part_off; u32 blocks; u32 mapped_blkaddr; }; +enum { + EROFS_SYNC_DECOMPRESS_AUTO, + EROFS_SYNC_DECOMPRESS_FORCE_ON, + EROFS_SYNC_DECOMPRESS_FORCE_OFF +}; + struct erofs_mount_opts { #ifdef CONFIG_EROFS_FS_ZIP /* current strategy of how to use managed cache */ unsigned char cache_strategy; - /* strategy of sync decompression (false - auto, true - force on) */ - bool readahead_sync_decompress; + /* strategy of sync decompression (0 - auto, 1 - force on, 2 - force off) */ + unsigned int sync_decompress; /* threshold for decompression synchronously */ unsigned int max_sync_decompress_pages; @@ -109,6 +116,7 @@ struct erofs_sb_info { #endif /* CONFIG_EROFS_FS_ZIP */ struct erofs_dev_context *devs; struct dax_device *dax_dev; + u64 dax_part_off; u64 total_blocks; u32 primarydevice_blocks; @@ -134,6 +142,10 @@ struct erofs_sb_info { u8 volume_name[16]; /* volume name */ u32 feature_compat; u32 feature_incompat; + + /* sysfs support */ + struct kobject s_kobj; /* /sys/fs/erofs/<devname> */ + struct completion s_kobj_unregister; }; #define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info) @@ -241,6 +253,19 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) #error erofs cannot be used in this platform #endif +enum erofs_kmap_type { + EROFS_NO_KMAP, /* don't map the buffer */ + EROFS_KMAP, /* use kmap() to map the buffer */ + EROFS_KMAP_ATOMIC, /* use kmap_atomic() to map the buffer */ +}; + +struct erofs_buf { + struct page *page; + void *base; 
+ enum erofs_kmap_type kmap_type; +}; +#define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL }) + #define ROOT_NID(sb) ((sb)->root_nid) #define erofs_blknr(addr) ((addr) / EROFS_BLKSIZ) @@ -258,10 +283,13 @@ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \ return sbi->feature_##compat & EROFS_FEATURE_##feature; \ } -EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING) +EROFS_FEATURE_FUNCS(zero_padding, incompat, INCOMPAT_ZERO_PADDING) EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS) EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER) +EROFS_FEATURE_FUNCS(chunked_file, incompat, INCOMPAT_CHUNKED_FILE) EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE) +EROFS_FEATURE_FUNCS(compr_head2, incompat, INCOMPAT_COMPR_HEAD2) +EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) /* atomic flag definitions */ @@ -296,6 +324,9 @@ struct erofs_inode { unsigned short z_advise; unsigned char z_algorithmtype[2]; unsigned char z_logical_clusterbits; + unsigned long z_tailextent_headlcn; + unsigned int z_idataoff; + unsigned short z_idata_size; }; #endif /* CONFIG_EROFS_FS_ZIP */ }; @@ -390,14 +421,14 @@ enum { #define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) struct erofs_map_blocks { + struct erofs_buf buf; + erofs_off_t m_pa, m_la; u64 m_plen, m_llen; unsigned short m_deviceid; char m_algorithmformat; unsigned int m_flags; - - struct page *mpage; }; /* Flags used by erofs_map_blocks_flatmode() */ @@ -409,6 +440,8 @@ struct erofs_map_blocks { #define EROFS_GET_BLOCKS_FIEMAP 0x0002 /* Used to map the whole extent if non-negligible data is requested for LZMA */ #define EROFS_GET_BLOCKS_READMORE 0x0004 +/* Used to map tail extent for tailpacking inline pcluster */ +#define EROFS_GET_BLOCKS_FINDTAIL 0x0008 enum { Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, @@ -436,6 +469,7 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_dev { struct block_device *m_bdev; struct dax_device *m_daxdev; + u64 m_dax_part_off; erofs_off_t m_pa; unsigned int m_deviceid; @@ -443,7 +477,10 @@ struct erofs_map_dev { /* data.c */ extern const struct file_operations erofs_file_fops; -struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr); +void erofs_unmap_metabuf(struct erofs_buf *buf); +void erofs_put_metabuf(struct erofs_buf *buf); +void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, + erofs_blk_t blkaddr, enum erofs_kmap_type type); int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); @@ -498,6 +535,12 @@ int erofs_pcpubuf_growsize(unsigned int nrpages); void erofs_pcpubuf_init(void); void erofs_pcpubuf_exit(void); +/* sysfs.c */ +int erofs_register_sysfs(struct super_block *sb); +void erofs_unregister_sysfs(struct super_block *sb); +int __init erofs_init_sysfs(void); +void erofs_exit_sysfs(void); + /* utils.c / zdata.c */ struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp); static inline void erofs_pagepool_add(struct page **pagepool, diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 6a969b1e0ee6..915eefe0d7e2 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. 
* https://www.huawei.com/ + * Copyright (C) 2021, Alibaba Cloud */ #include <linux/module.h> #include <linux/buffer_head.h> @@ -124,80 +125,50 @@ static bool check_layout_compatibility(struct super_block *sb, #ifdef CONFIG_EROFS_FS_ZIP /* read variable-sized metadata, offset will be aligned by 4-byte */ -static void *erofs_read_metadata(struct super_block *sb, struct page **pagep, +static void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, erofs_off_t *offset, int *lengthp) { - struct page *page = *pagep; u8 *buffer, *ptr; int len, i, cnt; - erofs_blk_t blk; *offset = round_up(*offset, 4); - blk = erofs_blknr(*offset); + ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*offset), EROFS_KMAP); + if (IS_ERR(ptr)) + return ptr; - if (!page || page->index != blk) { - if (page) { - unlock_page(page); - put_page(page); - } - page = erofs_get_meta_page(sb, blk); - if (IS_ERR(page)) - goto err_nullpage; - } - - ptr = kmap(page); len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(*offset)]); if (!len) len = U16_MAX + 1; buffer = kmalloc(len, GFP_KERNEL); - if (!buffer) { - buffer = ERR_PTR(-ENOMEM); - goto out; - } + if (!buffer) + return ERR_PTR(-ENOMEM); *offset += sizeof(__le16); *lengthp = len; for (i = 0; i < len; i += cnt) { cnt = min(EROFS_BLKSIZ - (int)erofs_blkoff(*offset), len - i); - blk = erofs_blknr(*offset); - - if (!page || page->index != blk) { - if (page) { - kunmap(page); - unlock_page(page); - put_page(page); - } - page = erofs_get_meta_page(sb, blk); - if (IS_ERR(page)) { - kfree(buffer); - goto err_nullpage; - } - ptr = kmap(page); + ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*offset), + EROFS_KMAP); + if (IS_ERR(ptr)) { + kfree(buffer); + return ptr; } memcpy(buffer + i, ptr + erofs_blkoff(*offset), cnt); *offset += cnt; } -out: - kunmap(page); - *pagep = page; return buffer; -err_nullpage: - *pagep = NULL; - return page; } static int erofs_load_compr_cfgs(struct super_block *sb, struct erofs_super_block *dsb) { - struct erofs_sb_info *sbi; - struct page *page; + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; unsigned int algs, alg; erofs_off_t offset; - int size, ret; + int size, ret = 0; - sbi = EROFS_SB(sb); sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs); - if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) { erofs_err(sb, "try to load compressed fs with unsupported algorithms %x", sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS); @@ -205,20 +176,17 @@ static int erofs_load_compr_cfgs(struct super_block *sb, } offset = EROFS_SUPER_OFFSET + sbi->sb_size; - page = NULL; alg = 0; - ret = 0; - for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) { void *data; if (!(algs & 1)) continue; - data = erofs_read_metadata(sb, &page, &offset, &size); + data = erofs_read_metadata(sb, &buf, &offset, &size); if (IS_ERR(data)) { ret = PTR_ERR(data); - goto err; + break; } switch (alg) { @@ -234,13 +202,9 @@ static int erofs_load_compr_cfgs(struct super_block *sb, } kfree(data); if (ret) - goto err; - } -err: - if (page) { - unlock_page(page); - put_page(page); + break; } + erofs_put_metabuf(&buf); return ret; } #else @@ -261,7 +225,7 @@ static int erofs_init_devices(struct super_block *sb, struct erofs_sb_info *sbi = EROFS_SB(sb); unsigned int ondisk_extradevs; erofs_off_t pos; - struct page *page = NULL; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_device_info *dif; struct erofs_deviceslot *dis; void *ptr; @@ -285,22 +249,13 @@ static int erofs_init_devices(struct 
super_block *sb, pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE; down_read(&sbi->devs->rwsem); idr_for_each_entry(&sbi->devs->tree, dif, id) { - erofs_blk_t blk = erofs_blknr(pos); struct block_device *bdev; - if (!page || page->index != blk) { - if (page) { - kunmap(page); - unlock_page(page); - put_page(page); - } - - page = erofs_get_meta_page(sb, blk); - if (IS_ERR(page)) { - up_read(&sbi->devs->rwsem); - return PTR_ERR(page); - } - ptr = kmap(page); + ptr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), + EROFS_KMAP); + if (IS_ERR(ptr)) { + err = PTR_ERR(ptr); + break; } dis = ptr + erofs_blkoff(pos); @@ -309,22 +264,17 @@ static int erofs_init_devices(struct super_block *sb, sb->s_type); if (IS_ERR(bdev)) { err = PTR_ERR(bdev); - goto err_out; + break; } dif->bdev = bdev; - dif->dax_dev = fs_dax_get_by_bdev(bdev); + dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off); dif->blocks = le32_to_cpu(dis->blocks); dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr); sbi->total_blocks += dif->blocks; pos += EROFS_DEVT_SLOT_SIZE; } -err_out: up_read(&sbi->devs->rwsem); - if (page) { - kunmap(page); - unlock_page(page); - put_page(page); - } + erofs_put_metabuf(&buf); return err; } @@ -411,6 +361,9 @@ static int erofs_read_superblock(struct super_block *sb) /* handle multiple devices */ ret = erofs_init_devices(sb, dsb); + + if (erofs_sb_has_ztailpacking(sbi)) + erofs_info(sb, "EXPERIMENTAL compressed inline data feature in use. Use at your own risk!"); out: kunmap(page); put_page(page); @@ -423,7 +376,7 @@ static void erofs_default_options(struct erofs_fs_context *ctx) #ifdef CONFIG_EROFS_FS_ZIP ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND; ctx->opt.max_sync_decompress_pages = 3; - ctx->opt.readahead_sync_decompress = false; + ctx->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_AUTO; #endif #ifdef CONFIG_EROFS_FS_XATTR set_opt(&ctx->opt, XATTR_USER); @@ -644,7 +597,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_fs_info = sbi; sbi->opt = ctx->opt; - sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev); + sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->dax_part_off); sbi->devs = ctx->devs; ctx->devs = NULL; @@ -652,10 +605,13 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; - if (test_opt(&sbi->opt, DAX_ALWAYS) && - !dax_supported(sbi->dax_dev, sb->s_bdev, EROFS_BLKSIZ, 0, bdev_nr_sectors(sb->s_bdev))) { - errorfc(fc, "DAX unsupported by block device. Turning off DAX."); - clear_opt(&sbi->opt, DAX_ALWAYS); + if (test_opt(&sbi->opt, DAX_ALWAYS)) { + BUILD_BUG_ON(EROFS_BLKSIZ != PAGE_SIZE); + + if (!sbi->dax_dev) { + errorfc(fc, "DAX unsupported by block device. 
Turning off DAX."); + clear_opt(&sbi->opt, DAX_ALWAYS); + } } sb->s_flags |= SB_RDONLY | SB_NOATIME; sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -695,6 +651,10 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; + err = erofs_register_sysfs(sb); + if (err) + return err; + erofs_info(sb, "mounted with root inode @ nid %llu.", ROOT_NID(sbi)); return 0; } @@ -808,6 +768,7 @@ static void erofs_put_super(struct super_block *sb) DBG_BUGON(!sbi); + erofs_unregister_sysfs(sb); erofs_shrinker_unregister(sb); #ifdef CONFIG_EROFS_FS_ZIP iput(sbi->managed_cache); @@ -852,6 +813,10 @@ static int __init erofs_module_init(void) if (err) goto zip_err; + err = erofs_init_sysfs(); + if (err) + goto sysfs_err; + err = register_filesystem(&erofs_fs_type); if (err) goto fs_err; @@ -859,6 +824,8 @@ static int __init erofs_module_init(void) return 0; fs_err: + erofs_exit_sysfs(); +sysfs_err: z_erofs_exit_zip_subsystem(); zip_err: z_erofs_lzma_exit(); @@ -877,6 +844,7 @@ static void __exit erofs_module_exit(void) /* Ensure all RCU free inodes / pclusters are safe to be destroyed. */ rcu_barrier(); + erofs_exit_sysfs(); z_erofs_exit_zip_subsystem(); z_erofs_lzma_exit(); erofs_exit_shrinker(); diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c new file mode 100644 index 000000000000..dac252bc9228 --- /dev/null +++ b/fs/erofs/sysfs.c @@ -0,0 +1,256 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C), 2008-2021, OPPO Mobile Comm Corp., Ltd. + * https://www.oppo.com/ + */ +#include <linux/sysfs.h> +#include <linux/kobject.h> + +#include "internal.h" + +enum { + attr_feature, + attr_pointer_ui, + attr_pointer_bool, +}; + +enum { + struct_erofs_sb_info, + struct_erofs_mount_opts, +}; + +struct erofs_attr { + struct attribute attr; + short attr_id; + int struct_type, offset; +}; + +#define EROFS_ATTR(_name, _mode, _id) \ +static struct erofs_attr erofs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr_id = attr_##_id, \ +} +#define EROFS_ATTR_FUNC(_name, _mode) EROFS_ATTR(_name, _mode, _name) +#define EROFS_ATTR_FEATURE(_name) EROFS_ATTR(_name, 0444, feature) + +#define EROFS_ATTR_OFFSET(_name, _mode, _id, _struct) \ +static struct erofs_attr erofs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr_id = attr_##_id, \ + .struct_type = struct_##_struct, \ + .offset = offsetof(struct _struct, _name),\ +} + +#define EROFS_ATTR_RW(_name, _id, _struct) \ + EROFS_ATTR_OFFSET(_name, 0644, _id, _struct) + +#define EROFS_RO_ATTR(_name, _id, _struct) \ + EROFS_ATTR_OFFSET(_name, 0444, _id, _struct) + +#define EROFS_ATTR_RW_UI(_name, _struct) \ + EROFS_ATTR_RW(_name, pointer_ui, _struct) + +#define EROFS_ATTR_RW_BOOL(_name, _struct) \ + EROFS_ATTR_RW(_name, pointer_bool, _struct) + +#define ATTR_LIST(name) (&erofs_attr_##name.attr) + +#ifdef CONFIG_EROFS_FS_ZIP +EROFS_ATTR_RW_UI(sync_decompress, erofs_mount_opts); +#endif + +static struct attribute *erofs_attrs[] = { +#ifdef CONFIG_EROFS_FS_ZIP + ATTR_LIST(sync_decompress), +#endif + NULL, +}; +ATTRIBUTE_GROUPS(erofs); + +/* Features this copy of erofs supports */ +EROFS_ATTR_FEATURE(zero_padding); +EROFS_ATTR_FEATURE(compr_cfgs); +EROFS_ATTR_FEATURE(big_pcluster); +EROFS_ATTR_FEATURE(chunked_file); +EROFS_ATTR_FEATURE(device_table); +EROFS_ATTR_FEATURE(compr_head2); +EROFS_ATTR_FEATURE(sb_chksum); +EROFS_ATTR_FEATURE(ztailpacking); + +static struct attribute *erofs_feat_attrs[] = { + ATTR_LIST(zero_padding), + ATTR_LIST(compr_cfgs), + 
ATTR_LIST(big_pcluster), + ATTR_LIST(chunked_file), + ATTR_LIST(device_table), + ATTR_LIST(compr_head2), + ATTR_LIST(sb_chksum), + ATTR_LIST(ztailpacking), + NULL, +}; +ATTRIBUTE_GROUPS(erofs_feat); + +static unsigned char *__struct_ptr(struct erofs_sb_info *sbi, + int struct_type, int offset) +{ + if (struct_type == struct_erofs_sb_info) + return (unsigned char *)sbi + offset; + if (struct_type == struct_erofs_mount_opts) + return (unsigned char *)&sbi->opt + offset; + return NULL; +} + +static ssize_t erofs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info, + s_kobj); + struct erofs_attr *a = container_of(attr, struct erofs_attr, attr); + unsigned char *ptr = __struct_ptr(sbi, a->struct_type, a->offset); + + switch (a->attr_id) { + case attr_feature: + return sysfs_emit(buf, "supported\n"); + case attr_pointer_ui: + if (!ptr) + return 0; + return sysfs_emit(buf, "%u\n", *(unsigned int *)ptr); + case attr_pointer_bool: + if (!ptr) + return 0; + return sysfs_emit(buf, "%d\n", *(bool *)ptr); + } + return 0; +} + +static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info, + s_kobj); + struct erofs_attr *a = container_of(attr, struct erofs_attr, attr); + unsigned char *ptr = __struct_ptr(sbi, a->struct_type, a->offset); + unsigned long t; + int ret; + + switch (a->attr_id) { + case attr_pointer_ui: + if (!ptr) + return 0; + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if (t != (unsigned int)t) + return -ERANGE; +#ifdef CONFIG_EROFS_FS_ZIP + if (!strcmp(a->attr.name, "sync_decompress") && + (t > EROFS_SYNC_DECOMPRESS_FORCE_OFF)) + return -EINVAL; +#endif + *(unsigned int *)ptr = t; + return len; + case attr_pointer_bool: + if (!ptr) + return 0; + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if (t != 0 && t != 1) + return -EINVAL; + *(bool *)ptr = !!t; + return len; + } + return 0; +} + +static void erofs_sb_release(struct kobject *kobj) +{ + struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +static const struct sysfs_ops erofs_attr_ops = { + .show = erofs_attr_show, + .store = erofs_attr_store, +}; + +static struct kobj_type erofs_sb_ktype = { + .default_groups = erofs_groups, + .sysfs_ops = &erofs_attr_ops, + .release = erofs_sb_release, +}; + +static struct kobj_type erofs_ktype = { + .sysfs_ops = &erofs_attr_ops, +}; + +static struct kset erofs_root = { + .kobj = {.ktype = &erofs_ktype}, +}; + +static struct kobj_type erofs_feat_ktype = { + .default_groups = erofs_feat_groups, + .sysfs_ops = &erofs_attr_ops, +}; + +static struct kobject erofs_feat = { + .kset = &erofs_root, +}; + +int erofs_register_sysfs(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + int err; + + sbi->s_kobj.kset = &erofs_root; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, + "%s", sb->s_id); + if (err) + goto put_sb_kobj; + return 0; + +put_sb_kobj: + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + return err; +} + +void erofs_unregister_sysfs(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + + kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); +} + +int __init erofs_init_sysfs(void) +{ + int ret; + + 
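	/*
	 * Editor's annotation (not part of the patch): the "erofs" kset is
	 * registered under fs_kobj first, creating /sys/fs/erofs, and only then
	 * is the "features" kobject added with its ->kset pointing at that
	 * kset; the feat_err/root_err labels below unwind in reverse order.
	 */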
kobject_set_name(&erofs_root.kobj, "erofs"); + erofs_root.kobj.parent = fs_kobj; + ret = kset_register(&erofs_root); + if (ret) + goto root_err; + + ret = kobject_init_and_add(&erofs_feat, &erofs_feat_ktype, + NULL, "features"); + if (ret) + goto feat_err; + return ret; + +feat_err: + kobject_put(&erofs_feat); + kset_unregister(&erofs_root); +root_err: + return ret; +} + +void erofs_exit_sysfs(void) +{ + kobject_put(&erofs_feat); + kset_unregister(&erofs_root); +} diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 01c581e93c5f..8106bcb5a38d 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -2,39 +2,20 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ + * Copyright (C) 2021-2022, Alibaba Cloud */ #include <linux/security.h> #include "xattr.h" struct xattr_iter { struct super_block *sb; - struct page *page; + struct erofs_buf buf; void *kaddr; erofs_blk_t blkaddr; unsigned int ofs; }; -static inline void xattr_iter_end(struct xattr_iter *it, bool atomic) -{ - /* the only user of kunmap() is 'init_inode_xattrs' */ - if (!atomic) - kunmap(it->page); - else - kunmap_atomic(it->kaddr); - - unlock_page(it->page); - put_page(it->page); -} - -static inline void xattr_iter_end_final(struct xattr_iter *it) -{ - if (!it->page) - return; - - xattr_iter_end(it, true); -} - static int init_inode_xattrs(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); @@ -43,7 +24,6 @@ static int init_inode_xattrs(struct inode *inode) struct erofs_xattr_ibody_header *ih; struct super_block *sb; struct erofs_sb_info *sbi; - bool atomic_map; int ret = 0; /* the most case is that xattrs of this inode are initialized. */ @@ -91,26 +71,23 @@ static int init_inode_xattrs(struct inode *inode) sb = inode->i_sb; sbi = EROFS_SB(sb); + it.buf = __EROFS_BUF_INITIALIZER; it.blkaddr = erofs_blknr(iloc(sbi, vi->nid) + vi->inode_isize); it.ofs = erofs_blkoff(iloc(sbi, vi->nid) + vi->inode_isize); - it.page = erofs_get_meta_page(sb, it.blkaddr); - if (IS_ERR(it.page)) { - ret = PTR_ERR(it.page); + /* read in shared xattr array (non-atomic, see kmalloc below) */ + it.kaddr = erofs_read_metabuf(&it.buf, sb, it.blkaddr, EROFS_KMAP); + if (IS_ERR(it.kaddr)) { + ret = PTR_ERR(it.kaddr); goto out_unlock; } - /* read in shared xattr array (non-atomic, see kmalloc below) */ - it.kaddr = kmap(it.page); - atomic_map = false; - ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs); - vi->xattr_shared_count = ih->h_shared_count; vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count, sizeof(uint), GFP_KERNEL); if (!vi->xattr_shared_xattrs) { - xattr_iter_end(&it, atomic_map); + erofs_put_metabuf(&it.buf); ret = -ENOMEM; goto out_unlock; } @@ -122,25 +99,22 @@ static int init_inode_xattrs(struct inode *inode) if (it.ofs >= EROFS_BLKSIZ) { /* cannot be unaligned */ DBG_BUGON(it.ofs != EROFS_BLKSIZ); - xattr_iter_end(&it, atomic_map); - it.page = erofs_get_meta_page(sb, ++it.blkaddr); - if (IS_ERR(it.page)) { + it.kaddr = erofs_read_metabuf(&it.buf, sb, ++it.blkaddr, + EROFS_KMAP); + if (IS_ERR(it.kaddr)) { kfree(vi->xattr_shared_xattrs); vi->xattr_shared_xattrs = NULL; - ret = PTR_ERR(it.page); + ret = PTR_ERR(it.kaddr); goto out_unlock; } - - it.kaddr = kmap_atomic(it.page); - atomic_map = true; it.ofs = 0; } vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *)(it.kaddr + it.ofs)); it.ofs += sizeof(__le32); } - xattr_iter_end(&it, atomic_map); + erofs_put_metabuf(&it.buf); /* paired with smp_mb() at the beginning of the function. 
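	 * (Editor's note, hedged: the pairing suggests the barrier publishes the
	 * shared xattr array initialized above before the inode is marked as
	 * having initialized xattrs, so readers that observe the mark also
	 * observe the array.)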
*/ smp_mb(); @@ -172,19 +146,11 @@ static inline int xattr_iter_fixup(struct xattr_iter *it) if (it->ofs < EROFS_BLKSIZ) return 0; - xattr_iter_end(it, true); - it->blkaddr += erofs_blknr(it->ofs); - - it->page = erofs_get_meta_page(it->sb, it->blkaddr); - if (IS_ERR(it->page)) { - int err = PTR_ERR(it->page); - - it->page = NULL; - return err; - } - - it->kaddr = kmap_atomic(it->page); + it->kaddr = erofs_read_metabuf(&it->buf, it->sb, it->blkaddr, + EROFS_KMAP_ATOMIC); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); it->ofs = erofs_blkoff(it->ofs); return 0; } @@ -207,11 +173,10 @@ static int inline_xattr_iter_begin(struct xattr_iter *it, it->blkaddr = erofs_blknr(iloc(sbi, vi->nid) + inline_xattr_ofs); it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs); - it->page = erofs_get_meta_page(inode->i_sb, it->blkaddr); - if (IS_ERR(it->page)) - return PTR_ERR(it->page); - - it->kaddr = kmap_atomic(it->page); + it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr, + EROFS_KMAP_ATOMIC); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); return vi->xattr_isize - xattr_header_sz; } @@ -272,7 +237,7 @@ static int xattr_foreach(struct xattr_iter *it, it->ofs = 0; } - slice = min_t(unsigned int, PAGE_SIZE - it->ofs, + slice = min_t(unsigned int, EROFS_BLKSIZ - it->ofs, entry.e_name_len - processed); /* handle name */ @@ -307,7 +272,7 @@ static int xattr_foreach(struct xattr_iter *it, it->ofs = 0; } - slice = min_t(unsigned int, PAGE_SIZE - it->ofs, + slice = min_t(unsigned int, EROFS_BLKSIZ - it->ofs, value_sz - processed); op->value(it, processed, it->kaddr + it->ofs, slice); it->ofs += slice; @@ -386,8 +351,6 @@ static int inline_getxattr(struct inode *inode, struct getxattr_iter *it) if (ret != -ENOATTR) break; } - xattr_iter_end_final(&it->it); - return ret ? ret : it->buffer_size; } @@ -404,26 +367,16 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]); it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); - - if (!i || blkaddr != it->it.blkaddr) { - if (i) - xattr_iter_end(&it->it, true); - - it->it.page = erofs_get_meta_page(sb, blkaddr); - if (IS_ERR(it->it.page)) - return PTR_ERR(it->it.page); - - it->it.kaddr = kmap_atomic(it->it.page); - it->it.blkaddr = blkaddr; - } + it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr, + EROFS_KMAP_ATOMIC); + if (IS_ERR(it->it.kaddr)) + return PTR_ERR(it->it.kaddr); + it->it.blkaddr = blkaddr; ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL); if (ret != -ENOATTR) break; } - if (vi->xattr_shared_count) - xattr_iter_end_final(&it->it); - return ret ? ret : it->buffer_size; } @@ -452,10 +405,11 @@ int erofs_getxattr(struct inode *inode, int index, return ret; it.index = index; - it.name.len = strlen(name); if (it.name.len > EROFS_NAME_LEN) return -ERANGE; + + it.it.buf = __EROFS_BUF_INITIALIZER; it.name.name = name; it.buffer = buffer; @@ -465,6 +419,7 @@ int erofs_getxattr(struct inode *inode, int index, ret = inline_getxattr(inode, &it); if (ret == -ENOATTR) ret = shared_getxattr(inode, &it); + erofs_put_metabuf(&it.it.buf); return ret; } @@ -607,7 +562,6 @@ static int inline_listxattr(struct listxattr_iter *it) if (ret) break; } - xattr_iter_end_final(&it->it); return ret ? 
ret : it->buffer_ofs; } @@ -625,25 +579,16 @@ static int shared_listxattr(struct listxattr_iter *it) xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]); it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); - if (!i || blkaddr != it->it.blkaddr) { - if (i) - xattr_iter_end(&it->it, true); - - it->it.page = erofs_get_meta_page(sb, blkaddr); - if (IS_ERR(it->it.page)) - return PTR_ERR(it->it.page); - - it->it.kaddr = kmap_atomic(it->it.page); - it->it.blkaddr = blkaddr; - } + it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr, + EROFS_KMAP_ATOMIC); + if (IS_ERR(it->it.kaddr)) + return PTR_ERR(it->it.kaddr); + it->it.blkaddr = blkaddr; ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL); if (ret) break; } - if (vi->xattr_shared_count) - xattr_iter_end_final(&it->it); - return ret ? ret : it->buffer_ofs; } @@ -659,6 +604,7 @@ ssize_t erofs_listxattr(struct dentry *dentry, if (ret) return ret; + it.it.buf = __EROFS_BUF_INITIALIZER; it.dentry = dentry; it.buffer = buffer; it.buffer_size = buffer_size; @@ -667,9 +613,10 @@ ssize_t erofs_listxattr(struct dentry *dentry, it.it.sb = dentry->d_sb; ret = inline_listxattr(&it); - if (ret < 0 && ret != -ENOATTR) - return ret; - return shared_listxattr(&it); + if (ret >= 0 || ret == -ENOATTR) + ret = shared_listxattr(&it); + erofs_put_metabuf(&it.it.buf); + return ret; } #ifdef CONFIG_EROFS_FS_POSIX_ACL diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h index 94090c74b3f7..332462c59f11 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -86,4 +86,3 @@ struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu); #endif #endif - diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 9a249bfc2770..423bc1a61da5 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -82,12 +82,13 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages) static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) { + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); int i; for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; - if (pcl->pclusterpages > pcs->maxpages) + if (pclusterpages > pcs->maxpages) continue; kmem_cache_free(pcs->slab, pcl); @@ -298,6 +299,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, container_of(grp, struct z_erofs_pcluster, obj); int i; + DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); /* * refcount of workgroup is now freezed as 1, * therefore no need to worry about available decompression users. @@ -331,6 +333,7 @@ int erofs_try_to_free_cached_page(struct page *page) if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) { unsigned int i; + DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); for (i = 0; i < pcl->pclusterpages; ++i) { if (pcl->compressed_pages[i] == page) { WRITE_ONCE(pcl->compressed_pages[i], NULL); @@ -458,6 +461,7 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, struct inode *inode, struct erofs_map_blocks *map) { + bool ztailpacking = map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl; struct z_erofs_collection *cl; struct erofs_workgroup *grp; @@ -469,12 +473,12 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, } /* no available pcluster, let's allocate one */ - pcl = z_erofs_alloc_pcluster(map->m_plen >> PAGE_SHIFT); + pcl = z_erofs_alloc_pcluster(ztailpacking ? 
1 : + map->m_plen >> PAGE_SHIFT); if (IS_ERR(pcl)) return PTR_ERR(pcl); atomic_set(&pcl->obj.refcount, 1); - pcl->obj.index = map->m_pa >> PAGE_SHIFT; pcl->algorithmformat = map->m_algorithmformat; pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) | (map->m_flags & EROFS_MAP_FULL_MAPPED ? @@ -494,16 +498,25 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, mutex_init(&cl->lock); DBG_BUGON(!mutex_trylock(&cl->lock)); - grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj); - if (IS_ERR(grp)) { - err = PTR_ERR(grp); - goto err_out; - } + if (ztailpacking) { + pcl->obj.index = 0; /* which indicates ztailpacking */ + pcl->pageofs_in = erofs_blkoff(map->m_pa); + pcl->tailpacking_size = map->m_plen; + } else { + pcl->obj.index = map->m_pa >> PAGE_SHIFT; - if (grp != &pcl->obj) { - clt->pcl = container_of(grp, struct z_erofs_pcluster, obj); - err = -EEXIST; - goto err_out; + grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj); + if (IS_ERR(grp)) { + err = PTR_ERR(grp); + goto err_out; + } + + if (grp != &pcl->obj) { + clt->pcl = container_of(grp, + struct z_erofs_pcluster, obj); + err = -EEXIST; + goto err_out; + } } /* used to check tail merging loop due to corrupted images */ if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) @@ -532,17 +545,20 @@ static int z_erofs_collector_begin(struct z_erofs_collector *clt, DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_NIL); DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); - if (!PAGE_ALIGNED(map->m_pa)) { - DBG_BUGON(1); - return -EINVAL; + if (map->m_flags & EROFS_MAP_META) { + if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + goto tailpacking; } grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT); if (grp) { clt->pcl = container_of(grp, struct z_erofs_pcluster, obj); } else { +tailpacking: ret = z_erofs_register_collection(clt, inode, map); - if (!ret) goto out; if (ret != -EEXIST) @@ -558,9 +574,9 @@ static int z_erofs_collector_begin(struct z_erofs_collector *clt, out: z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS, clt->cl->pagevec, clt->cl->vcnt); - /* since file-backed online pages are traversed in reverse order */ - clt->icpage_ptr = clt->pcl->compressed_pages + clt->pcl->pclusterpages; + clt->icpage_ptr = clt->pcl->compressed_pages + + z_erofs_pclusterpages(clt->pcl); return 0; } @@ -681,14 +697,31 @@ restart_now: if (err) goto err_out; - /* preload all compressed pages (maybe downgrade role if necessary) */ - if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, map->m_la)) - cache_strategy = TRYALLOC; - else - cache_strategy = DONTALLOC; + if (z_erofs_is_inline_pcluster(clt->pcl)) { + void *mp; + + mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb, + erofs_blknr(map->m_pa), EROFS_NO_KMAP); + if (IS_ERR(mp)) { + err = PTR_ERR(mp); + erofs_err(inode->i_sb, + "failed to get inline page, err %d", err); + goto err_out; + } + get_page(fe->map.buf.page); + WRITE_ONCE(clt->pcl->compressed_pages[0], fe->map.buf.page); + clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; + } else { + /* preload all compressed pages (can change mode if needed) */ + if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, + map->m_la)) + cache_strategy = TRYALLOC; + else + cache_strategy = DONTALLOC; - preload_compressed_pages(clt, MNGD_MAPPING(sbi), - cache_strategy, pagepool); + preload_compressed_pages(clt, MNGD_MAPPING(sbi), + cache_strategy, pagepool); + } hitted: /* @@ -762,32 +795,19 @@ err_out: goto out; } -static void 
z_erofs_decompressqueue_work(struct work_struct *work); -static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, - bool sync, int bios) +static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi, + unsigned int readahead_pages) { - struct erofs_sb_info *const sbi = EROFS_SB(io->sb); - - /* wake up the caller thread for sync decompression */ - if (sync) { - unsigned long flags; + /* auto: enable for readpage, disable for readahead */ + if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) && + !readahead_pages) + return true; - spin_lock_irqsave(&io->u.wait.lock, flags); - if (!atomic_add_return(bios, &io->pending_bios)) - wake_up_locked(&io->u.wait); - spin_unlock_irqrestore(&io->u.wait.lock, flags); - return; - } + if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_FORCE_ON) && + (readahead_pages <= sbi->opt.max_sync_decompress_pages)) + return true; - if (atomic_add_return(bios, &io->pending_bios)) - return; - /* Use workqueue and sync decompression for atomic contexts only */ - if (in_atomic() || irqs_disabled()) { - queue_work(z_erofs_workqueue, &io->u.work); - sbi->opt.readahead_sync_decompress = true; - return; - } - z_erofs_decompressqueue_work(&io->u.work); + return false; } static bool z_erofs_page_is_invalidated(struct page *page) @@ -795,38 +815,12 @@ static bool z_erofs_page_is_invalidated(struct page *page) return !page->mapping && !z_erofs_is_shortlived_page(page); } -static void z_erofs_decompressqueue_endio(struct bio *bio) -{ - tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private); - struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t); - blk_status_t err = bio->bi_status; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - - DBG_BUGON(PageUptodate(page)); - DBG_BUGON(z_erofs_page_is_invalidated(page)); - - if (err) - SetPageError(page); - - if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { - if (!err) - SetPageUptodate(page); - unlock_page(page); - } - } - z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1); - bio_put(bio); -} - static int z_erofs_decompress_pcluster(struct super_block *sb, struct z_erofs_pcluster *pcl, struct page **pagepool) { struct erofs_sb_info *const sbi = EROFS_SB(sb); + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); struct z_erofs_pagevec_ctor ctor; unsigned int i, inputsize, outputsize, llen, nr_pages; struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES]; @@ -908,15 +902,20 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, overlapped = false; compressed_pages = pcl->compressed_pages; - for (i = 0; i < pcl->pclusterpages; ++i) { + for (i = 0; i < pclusterpages; ++i) { unsigned int pagenr; page = compressed_pages[i]; - /* all compressed pages ought to be valid */ DBG_BUGON(!page); - DBG_BUGON(z_erofs_page_is_invalidated(page)); + if (z_erofs_is_inline_pcluster(pcl)) { + if (!PageUptodate(page)) + err = -EIO; + continue; + } + + DBG_BUGON(z_erofs_page_is_invalidated(page)); if (!z_erofs_is_shortlived_page(page)) { if (erofs_page_is_managed(sbi, page)) { if (!PageUptodate(page)) @@ -961,11 +960,16 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, partial = true; } - inputsize = pcl->pclusterpages * PAGE_SIZE; + if (z_erofs_is_inline_pcluster(pcl)) + inputsize = pcl->tailpacking_size; + else + inputsize = pclusterpages * PAGE_SIZE; + err = z_erofs_decompress(&(struct z_erofs_decompress_req) { .sb = sb, .in = compressed_pages, .out = pages, + .pageofs_in = 
pcl->pageofs_in, .pageofs_out = cl->pageofs, .inputsize = inputsize, .outputsize = outputsize, @@ -975,17 +979,22 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, }, pagepool); out: - /* must handle all compressed pages before ending pages */ - for (i = 0; i < pcl->pclusterpages; ++i) { - page = compressed_pages[i]; - - if (erofs_page_is_managed(sbi, page)) - continue; + /* must handle all compressed pages before actual file pages */ + if (z_erofs_is_inline_pcluster(pcl)) { + page = compressed_pages[0]; + WRITE_ONCE(compressed_pages[0], NULL); + put_page(page); + } else { + for (i = 0; i < pclusterpages; ++i) { + page = compressed_pages[i]; - /* recycle all individual short-lived pages */ - (void)z_erofs_put_shortlivedpage(pagepool, page); + if (erofs_page_is_managed(sbi, page)) + continue; - WRITE_ONCE(compressed_pages[i], NULL); + /* recycle all individual short-lived pages */ + (void)z_erofs_put_shortlivedpage(pagepool, page); + WRITE_ONCE(compressed_pages[i], NULL); + } } for (i = 0; i < nr_pages; ++i) { @@ -1057,6 +1066,35 @@ static void z_erofs_decompressqueue_work(struct work_struct *work) kvfree(bgq); } +static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, + bool sync, int bios) +{ + struct erofs_sb_info *const sbi = EROFS_SB(io->sb); + + /* wake up the caller thread for sync decompression */ + if (sync) { + unsigned long flags; + + spin_lock_irqsave(&io->u.wait.lock, flags); + if (!atomic_add_return(bios, &io->pending_bios)) + wake_up_locked(&io->u.wait); + spin_unlock_irqrestore(&io->u.wait.lock, flags); + return; + } + + if (atomic_add_return(bios, &io->pending_bios)) + return; + /* Use workqueue and sync decompression for atomic contexts only */ + if (in_atomic() || irqs_disabled()) { + queue_work(z_erofs_workqueue, &io->u.work); + /* enable sync decompression for readahead */ + if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) + sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON; + return; + } + z_erofs_decompressqueue_work(&io->u.work); +} + static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, unsigned int nr, struct page **pagepool, @@ -1234,6 +1272,33 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, qtail[JQ_BYPASS] = &pcl->next; } +static void z_erofs_decompressqueue_endio(struct bio *bio) +{ + tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private); + struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t); + blk_status_t err = bio->bi_status; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + struct page *page = bvec->bv_page; + + DBG_BUGON(PageUptodate(page)); + DBG_BUGON(z_erofs_page_is_invalidated(page)); + + if (err) + SetPageError(page); + + if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { + if (!err) + SetPageUptodate(page); + unlock_page(page); + } + } + z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1); + bio_put(bio); +} + static void z_erofs_submit_queue(struct super_block *sb, struct z_erofs_decompress_frontend *f, struct page **pagepool, @@ -1271,6 +1336,14 @@ static void z_erofs_submit_queue(struct super_block *sb, pcl = container_of(owned_head, struct z_erofs_pcluster, next); + /* close the main owned chain at first */ + owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, + Z_EROFS_PCLUSTER_TAIL_CLOSED); + if (z_erofs_is_inline_pcluster(pcl)) { + move_to_bypass_jobqueue(pcl, qtail, owned_head); + continue; + } + /* no device id here, thus it will always succeed */ mdev = (struct 
erofs_map_dev) { .m_pa = blknr_to_addr(pcl->obj.index), @@ -1280,10 +1353,6 @@ static void z_erofs_submit_queue(struct super_block *sb, cur = erofs_blknr(mdev.m_pa); end = cur + pcl->pclusterpages; - /* close the main owned chain at first */ - owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, - Z_EROFS_PCLUSTER_TAIL_CLOSED); - do { struct page *page; @@ -1435,6 +1504,7 @@ skip: static int z_erofs_readpage(struct file *file, struct page *page) { struct inode *const inode = page->mapping->host; + struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); struct page *pagepool = NULL; int err; @@ -1450,14 +1520,13 @@ static int z_erofs_readpage(struct file *file, struct page *page) (void)z_erofs_collector_end(&f.clt); /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(inode->i_sb, &f, &pagepool, true); + z_erofs_runqueue(inode->i_sb, &f, &pagepool, + z_erofs_get_sync_decompress_policy(sbi, 0)); if (err) erofs_err(inode->i_sb, "failed to read, err [%d]", err); - if (f.map.mpage) - put_page(f.map.mpage); - + erofs_put_metabuf(&f.map.buf); erofs_release_pages(&pagepool); return err; } @@ -1501,10 +1570,8 @@ static void z_erofs_readahead(struct readahead_control *rac) (void)z_erofs_collector_end(&f.clt); z_erofs_runqueue(inode->i_sb, &f, &pagepool, - sbi->opt.readahead_sync_decompress && - nr_pages <= sbi->opt.max_sync_decompress_pages); - if (f.map.mpage) - put_page(f.map.mpage); + z_erofs_get_sync_decompress_policy(sbi, nr_pages)); + erofs_put_metabuf(&f.map.buf); erofs_release_pages(&pagepool); } diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 4a69515dea75..e043216b545f 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -62,8 +62,16 @@ struct z_erofs_pcluster { /* A: lower limit of decompressed length and if full length or not */ unsigned int length; - /* I: physical cluster size in pages */ - unsigned short pclusterpages; + /* I: page offset of inline compressed data */ + unsigned short pageofs_in; + + union { + /* I: physical cluster size in pages */ + unsigned short pclusterpages; + + /* I: tailpacking inline compressed size */ + unsigned short tailpacking_size; + }; /* I: compression algorithm format */ unsigned char algorithmformat; @@ -94,6 +102,18 @@ struct z_erofs_decompressqueue { } u; }; +static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) +{ + return !pcl->obj.index; +} + +static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) +{ + if (z_erofs_is_inline_pcluster(pcl)) + return 1; + return pcl->pclusterpages; +} + #define Z_EROFS_ONLINEPAGE_COUNT_BITS 2 #define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1) #define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 660489a7fb64..361b1d6e4bf9 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -7,12 +7,17 @@ #include <asm/unaligned.h> #include <trace/events/erofs.h> +static int z_erofs_do_map_blocks(struct inode *inode, + struct erofs_map_blocks *map, + int flags); + int z_erofs_fill_inode(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); if (!erofs_sb_has_big_pcluster(sbi) && + !erofs_sb_has_ztailpacking(sbi) && vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) { vi->z_advise = 0; vi->z_algorithmtype[0] = 0; @@ -30,7 +35,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) struct super_block *const 
sb = inode->i_sb; int err, headnr; erofs_off_t pos; - struct page *page; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; void *kaddr; struct z_erofs_map_header *h; @@ -51,18 +56,18 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) goto out_unlock; DBG_BUGON(!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && + !erofs_sb_has_ztailpacking(EROFS_SB(sb)) && vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY); pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + vi->xattr_isize, 8); - page = erofs_get_meta_page(sb, erofs_blknr(pos)); - if (IS_ERR(page)) { - err = PTR_ERR(page); + kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), + EROFS_KMAP_ATOMIC); + if (IS_ERR(kaddr)) { + err = PTR_ERR(kaddr); goto out_unlock; } - kaddr = kmap_atomic(page); - h = kaddr + erofs_blkoff(pos); vi->z_advise = le16_to_cpu(h->h_advise); vi->z_algorithmtype[0] = h->h_algorithmtype & 15; @@ -94,13 +99,33 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) err = -EFSCORRUPTED; goto unmap_done; } +unmap_done: + erofs_put_metabuf(&buf); + if (err) + goto out_unlock; + + if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) { + struct erofs_map_blocks map = { + .buf = __EROFS_BUF_INITIALIZER + }; + + vi->z_idata_size = le16_to_cpu(h->h_idata_size); + err = z_erofs_do_map_blocks(inode, &map, + EROFS_GET_BLOCKS_FINDTAIL); + erofs_put_metabuf(&map.buf); + + if (!map.m_plen || + erofs_blkoff(map.m_pa) + map.m_plen > EROFS_BLKSIZ) { + erofs_err(sb, "invalid tail-packing pclustersize %llu", + map.m_plen); + err = -EFSCORRUPTED; + } + if (err < 0) + goto out_unlock; + } /* paired with smp_mb() at the beginning of the function */ smp_mb(); set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); -unmap_done: - kunmap_atomic(kaddr); - unlock_page(page); - put_page(page); out_unlock: clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags); return err; @@ -117,37 +142,18 @@ struct z_erofs_maprecorder { u16 clusterofs; u16 delta[2]; erofs_blk_t pblk, compressedlcs; + erofs_off_t nextpackoff; }; static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m, erofs_blk_t eblk) { struct super_block *const sb = m->inode->i_sb; - struct erofs_map_blocks *const map = m->map; - struct page *mpage = map->mpage; - - if (mpage) { - if (mpage->index == eblk) { - if (!m->kaddr) - m->kaddr = kmap_atomic(mpage); - return 0; - } - - if (m->kaddr) { - kunmap_atomic(m->kaddr); - m->kaddr = NULL; - } - put_page(mpage); - } - mpage = erofs_get_meta_page(sb, eblk); - if (IS_ERR(mpage)) { - map->mpage = NULL; - return PTR_ERR(mpage); - } - m->kaddr = kmap_atomic(mpage); - unlock_page(mpage); - map->mpage = mpage; + m->kaddr = erofs_read_metabuf(&m->map->buf, sb, eblk, + EROFS_KMAP_ATOMIC); + if (IS_ERR(m->kaddr)) + return PTR_ERR(m->kaddr); return 0; } @@ -169,6 +175,7 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, if (err) return err; + m->nextpackoff = pos + sizeof(struct z_erofs_vle_decompressed_index); m->lcn = lcn; di = m->kaddr + erofs_blkoff(pos); @@ -243,12 +250,12 @@ static int get_compacted_la_distance(unsigned int lclusterbits, static int unpack_compacted_index(struct z_erofs_maprecorder *m, unsigned int amortizedshift, - unsigned int eofs, bool lookahead) + erofs_off_t pos, bool lookahead) { struct erofs_inode *const vi = EROFS_I(m->inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; const unsigned int lomask = (1 << lclusterbits) - 1; - unsigned int vcnt, base, lo, encodebits, nblk; + unsigned int vcnt, base, lo, encodebits, nblk, eofs; int i; u8 *in, type; bool big_pcluster; @@ -260,8 
+267,12 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, else return -EOPNOTSUPP; + /* it doesn't equal to round_up(..) */ + m->nextpackoff = round_down(pos, vcnt << amortizedshift) + + (vcnt << amortizedshift); big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; + eofs = erofs_blkoff(pos); base = round_down(eofs, vcnt << amortizedshift); in = m->kaddr + base; @@ -399,8 +410,7 @@ out: err = z_erofs_reload_indexes(m, erofs_blknr(pos)); if (err) return err; - return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos), - lookahead); + return unpack_compacted_index(m, amortizedshift, pos, lookahead); } static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m, @@ -583,11 +593,12 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) return 0; } -int z_erofs_map_blocks_iter(struct inode *inode, - struct erofs_map_blocks *map, - int flags) +static int z_erofs_do_map_blocks(struct inode *inode, + struct erofs_map_blocks *map, + int flags) { struct erofs_inode *const vi = EROFS_I(inode); + bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER; struct z_erofs_maprecorder m = { .inode = inode, .map = map, @@ -597,22 +608,8 @@ int z_erofs_map_blocks_iter(struct inode *inode, unsigned long initial_lcn; unsigned long long ofs, end; - trace_z_erofs_map_blocks_iter_enter(inode, map, flags); - - /* when trying to read beyond EOF, leave it unmapped */ - if (map->m_la >= inode->i_size) { - map->m_llen = map->m_la + 1 - inode->i_size; - map->m_la = inode->i_size; - map->m_flags = 0; - goto out; - } - - err = z_erofs_fill_inode_lazy(inode); - if (err) - goto out; - lclusterbits = vi->z_logical_clusterbits; - ofs = map->m_la; + ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? inode->i_size - 1 : map->m_la; initial_lcn = ofs >> lclusterbits; endoff = ofs & ((1 << lclusterbits) - 1); @@ -620,6 +617,9 @@ int z_erofs_map_blocks_iter(struct inode *inode, if (err) goto unmap_out; + if (ztailpacking && (flags & EROFS_GET_BLOCKS_FINDTAIL)) + vi->z_idataoff = m.nextpackoff; + map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED; end = (m.lcn + 1ULL) << lclusterbits; @@ -630,6 +630,13 @@ int z_erofs_map_blocks_iter(struct inode *inode, if (endoff >= m.clusterofs) { m.headtype = m.type; map->m_la = (m.lcn << lclusterbits) | m.clusterofs; + /* + * For ztailpacking files, in order to inline data more + * effectively, special EOF lclusters are now supported + * which can have three parts at most. 
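	 * (Editor's note, hedged: since such an EOF lcluster may extend past the
	 * file end, the logical end is clamped to i_size right below for
	 * ztailpacking inodes; the tail extent itself is located with
	 * EROFS_GET_BLOCKS_FINDTAIL and its head lcluster kept in
	 * z_tailextent_headlcn.)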
+ */ + if (ztailpacking && end > inode->i_size) + end = inode->i_size; break; } /* m.lcn should be >= 1 if endoff < m.clusterofs */ @@ -659,11 +666,19 @@ int z_erofs_map_blocks_iter(struct inode *inode, } map->m_llen = end - map->m_la; - map->m_pa = blknr_to_addr(m.pblk); - err = z_erofs_get_extent_compressedlen(&m, initial_lcn); - if (err) - goto out; + if (flags & EROFS_GET_BLOCKS_FINDTAIL) + vi->z_tailextent_headlcn = m.lcn; + if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) { + map->m_flags |= EROFS_MAP_META; + map->m_pa = vi->z_idataoff; + map->m_plen = vi->z_idata_size; + } else { + map->m_pa = blknr_to_addr(m.pblk); + err = z_erofs_get_extent_compressedlen(&m, initial_lcn); + if (err) + goto out; + } if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; @@ -681,14 +696,38 @@ int z_erofs_map_blocks_iter(struct inode *inode, map->m_flags |= EROFS_MAP_FULL_MAPPED; } unmap_out: - if (m.kaddr) - kunmap_atomic(m.kaddr); + erofs_unmap_metabuf(&m.map->buf); out: erofs_dbg("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o", __func__, map->m_la, map->m_pa, map->m_llen, map->m_plen, map->m_flags); + return err; +} + +int z_erofs_map_blocks_iter(struct inode *inode, + struct erofs_map_blocks *map, + int flags) +{ + int err = 0; + + trace_z_erofs_map_blocks_iter_enter(inode, map, flags); + + /* when trying to read beyond EOF, leave it unmapped */ + if (map->m_la >= inode->i_size) { + map->m_llen = map->m_la + 1 - inode->i_size; + map->m_la = inode->i_size; + map->m_flags = 0; + goto out; + } + + err = z_erofs_fill_inode_lazy(inode); + if (err) + goto out; + + err = z_erofs_do_map_blocks(inode, map, flags); +out: trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err); /* aggressively BUG_ON iff CONFIG_EROFS_FS_DEBUG is on */ @@ -704,8 +743,7 @@ static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset, struct erofs_map_blocks map = { .m_la = offset }; ret = z_erofs_map_blocks_iter(inode, &map, EROFS_GET_BLOCKS_FIEMAP); - if (map.mpage) - put_page(map.mpage); + erofs_put_metabuf(&map.buf); if (ret < 0) return ret; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 06f4c5ae1451..e2daa940ebce 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -307,7 +307,7 @@ static void unlist_file(struct epitems_head *head) static long long_zero; static long long_max = LONG_MAX; -struct ctl_table epoll_table[] = { +static struct ctl_table epoll_table[] = { { .procname = "max_user_watches", .data = &max_user_watches, @@ -319,6 +319,13 @@ struct ctl_table epoll_table[] = { }, { } }; + +static void __init epoll_sysctls_init(void) +{ + register_sysctl("fs/epoll", epoll_table); +} +#else +#define epoll_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ static const struct file_operations eventpoll_fops; @@ -2378,6 +2385,7 @@ static int __init eventpoll_init(void) /* Allocates slab cache used to allocate "struct eppoll_entry" */ pwq_cache = kmem_cache_create("eventpoll_pwq", sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); + epoll_sysctls_init(); ephead_cache = kmem_cache_create("ep_head", sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); diff --git a/fs/exec.c b/fs/exec.c index 537d92c41105..79f2c9483302 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -65,6 +65,7 @@ #include <linux/vmalloc.h> #include <linux/io_uring.h> #include <linux/syscall_user_dispatch.h> +#include <linux/coredump.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -1045,7 +1046,7 @@ static int de_thread(struct 
task_struct *tsk) * Kill all other threads in the thread group. */ spin_lock_irq(lock); - if (signal_group_exit(sig)) { + if ((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task) { /* * Another group action in progress, just * return so that the signal is processed. @@ -1054,7 +1055,7 @@ static int de_thread(struct task_struct *tsk) return -EAGAIN; } - sig->group_exit_task = tsk; + sig->group_exec_task = tsk; sig->notify_count = zap_other_threads(tsk); if (!thread_group_leader(tsk)) sig->notify_count--; @@ -1082,7 +1083,7 @@ static int de_thread(struct task_struct *tsk) write_lock_irq(&tasklist_lock); /* * Do this under tasklist_lock to ensure that - * exit_notify() can't miss ->group_exit_task + * exit_notify() can't miss ->group_exec_task */ sig->notify_count = -1; if (likely(leader->exit_state)) @@ -1149,7 +1150,7 @@ static int de_thread(struct task_struct *tsk) release_task(leader); } - sig->group_exit_task = NULL; + sig->group_exec_task = NULL; sig->notify_count = 0; no_thread_group: @@ -1162,7 +1163,7 @@ no_thread_group: killed: /* protects against exit_notify() and __exit_signal() */ read_lock(&tasklist_lock); - sig->group_exit_task = NULL; + sig->group_exec_task = NULL; sig->notify_count = 0; read_unlock(&tasklist_lock); return -EAGAIN; @@ -1207,7 +1208,8 @@ static int unshare_sighand(struct task_struct *me) char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) { task_lock(tsk); - strncpy(buf, tsk->comm, buf_size); + /* Always NUL terminated and zero-padded */ + strscpy_pad(buf, tsk->comm, buf_size); task_unlock(tsk); return buf; } @@ -1222,7 +1224,7 @@ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) { task_lock(tsk); trace_task_rename(tsk, buf); - strlcpy(tsk->comm, buf, sizeof(tsk->comm)); + strscpy_pad(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); perf_event_comm(tsk, exec); } @@ -1307,6 +1309,8 @@ int begin_new_exec(struct linux_binprm * bprm) */ force_uaccess_begin(); + if (me->flags & PF_KTHREAD) + free_kthread_struct(me); me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); @@ -2096,3 +2100,37 @@ COMPAT_SYSCALL_DEFINE5(execveat, int, fd, argv, envp, flags); } #endif + +#ifdef CONFIG_SYSCTL + +static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (!error) + validate_coredump_safety(); + return error; +} + +static struct ctl_table fs_exec_sysctls[] = { + { + .procname = "suid_dumpable", + .data = &suid_dumpable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_coredump, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { } +}; + +static int __init init_fs_exec_sysctls(void) +{ + register_sysctl_init("fs", fs_exec_sysctls); + return 0; +} + +fs_initcall(init_fs_exec_sysctls); +#endif /* CONFIG_SYSCTL */ diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c index cc5cffc4a769..03f142307174 100644 --- a/fs/exfat/balloc.c +++ b/fs/exfat/balloc.c @@ -105,7 +105,7 @@ int exfat_load_bitmap(struct super_block *sb) struct exfat_dentry *ep; struct buffer_head *bh; - ep = exfat_get_dentry(sb, &clu, i, &bh, NULL); + ep = exfat_get_dentry(sb, &clu, i, &bh); if (!ep) return -EIO; diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index cb1c0d8c1714..a27b55ec060a 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -64,7 +64,6 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct 
exfat_dir_ent { int i, dentries_per_clu, dentries_per_clu_bits = 0, num_ext; unsigned int type, clu_offset, max_dentries; - sector_t sector; struct exfat_chain dir, clu; struct exfat_uni_name uni_name; struct exfat_dentry *ep; @@ -115,7 +114,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent i = dentry & (dentries_per_clu - 1); for ( ; i < dentries_per_clu; i++, dentry++) { - ep = exfat_get_dentry(sb, &clu, i, &bh, &sector); + ep = exfat_get_dentry(sb, &clu, i, &bh); if (!ep) return -EIO; @@ -156,7 +155,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent dir_entry->namebuf.lfnbuf_len); brelse(bh); - ep = exfat_get_dentry(sb, &clu, i + 1, &bh, NULL); + ep = exfat_get_dentry(sb, &clu, i + 1, &bh); if (!ep) return -EIO; dir_entry->size = @@ -445,7 +444,6 @@ int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir, struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct timespec64 ts = current_time(inode); - sector_t sector; struct exfat_dentry *ep; struct buffer_head *bh; @@ -453,7 +451,7 @@ int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir, * We cannot use exfat_get_dentry_set here because file ep is not * initialized yet. */ - ep = exfat_get_dentry(sb, p_dir, entry, &bh, &sector); + ep = exfat_get_dentry(sb, p_dir, entry, &bh); if (!ep) return -EIO; @@ -477,7 +475,7 @@ int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir, exfat_update_bh(bh, IS_DIRSYNC(inode)); brelse(bh); - ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh, &sector); + ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh); if (!ep) return -EIO; @@ -496,12 +494,11 @@ int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir, struct super_block *sb = inode->i_sb; int ret = 0; int i, num_entries; - sector_t sector; u16 chksum; struct exfat_dentry *ep, *fep; struct buffer_head *fbh, *bh; - fep = exfat_get_dentry(sb, p_dir, entry, &fbh, &sector); + fep = exfat_get_dentry(sb, p_dir, entry, &fbh); if (!fep) return -EIO; @@ -509,7 +506,7 @@ int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir, chksum = exfat_calc_chksum16(fep, DENTRY_SIZE, 0, CS_DIR_ENTRY); for (i = 1; i < num_entries; i++) { - ep = exfat_get_dentry(sb, p_dir, entry + i, &bh, NULL); + ep = exfat_get_dentry(sb, p_dir, entry + i, &bh); if (!ep) { ret = -EIO; goto release_fbh; @@ -531,13 +528,12 @@ int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir, { struct super_block *sb = inode->i_sb; int i; - sector_t sector; unsigned short *uniname = p_uniname->name; struct exfat_dentry *ep; struct buffer_head *bh; int sync = IS_DIRSYNC(inode); - ep = exfat_get_dentry(sb, p_dir, entry, &bh, &sector); + ep = exfat_get_dentry(sb, p_dir, entry, &bh); if (!ep) return -EIO; @@ -545,7 +541,7 @@ int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir, exfat_update_bh(bh, sync); brelse(bh); - ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh, &sector); + ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh); if (!ep) return -EIO; @@ -555,7 +551,7 @@ int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir, brelse(bh); for (i = EXFAT_FIRST_CLUSTER; i < num_entries; i++) { - ep = exfat_get_dentry(sb, p_dir, entry + i, &bh, &sector); + ep = exfat_get_dentry(sb, p_dir, entry + i, &bh); if (!ep) return -EIO; @@ -574,12 +570,11 @@ int exfat_remove_entries(struct inode *inode, struct exfat_chain *p_dir, { struct super_block *sb = inode->i_sb; int i; - sector_t sector; struct exfat_dentry 
*ep; struct buffer_head *bh; for (i = order; i < num_entries; i++) { - ep = exfat_get_dentry(sb, p_dir, entry + i, &bh, &sector); + ep = exfat_get_dentry(sb, p_dir, entry + i, &bh); if (!ep) return -EIO; @@ -656,8 +651,8 @@ static int exfat_walk_fat_chain(struct super_block *sb, return 0; } -int exfat_find_location(struct super_block *sb, struct exfat_chain *p_dir, - int entry, sector_t *sector, int *offset) +static int exfat_find_location(struct super_block *sb, struct exfat_chain *p_dir, + int entry, sector_t *sector, int *offset) { int ret; unsigned int off, clu = 0; @@ -717,8 +712,7 @@ static int exfat_dir_readahead(struct super_block *sb, sector_t sec) } struct exfat_dentry *exfat_get_dentry(struct super_block *sb, - struct exfat_chain *p_dir, int entry, struct buffer_head **bh, - sector_t *sector) + struct exfat_chain *p_dir, int entry, struct buffer_head **bh) { unsigned int dentries_per_page = EXFAT_B_TO_DEN(PAGE_SIZE); int off; @@ -740,8 +734,6 @@ struct exfat_dentry *exfat_get_dentry(struct super_block *sb, if (!*bh) return NULL; - if (sector) - *sector = sec; return (struct exfat_dentry *)((*bh)->b_data + off); } @@ -892,7 +884,7 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, es->bh[es->num_bh++] = bh; } - /* validiate cached dentries */ + /* validate cached dentries */ for (i = 1; i < num_entries; i++) { ep = exfat_get_dentry_cached(es, i); if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode)) @@ -960,7 +952,7 @@ rewind: if (rewind && dentry == end_eidx) goto not_found; - ep = exfat_get_dentry(sb, &clu, i, &bh, NULL); + ep = exfat_get_dentry(sb, &clu, i, &bh); if (!ep) return -EIO; @@ -1145,7 +1137,7 @@ int exfat_count_ext_entries(struct super_block *sb, struct exfat_chain *p_dir, struct buffer_head *bh; for (i = 0, entry++; i < ep->dentry.file.num_ext; i++, entry++) { - ext_ep = exfat_get_dentry(sb, p_dir, entry, &bh, NULL); + ext_ep = exfat_get_dentry(sb, p_dir, entry, &bh); if (!ext_ep) return -EIO; @@ -1175,7 +1167,7 @@ int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir) while (clu.dir != EXFAT_EOF_CLUSTER) { for (i = 0; i < dentries_per_clu; i++) { - ep = exfat_get_dentry(sb, &clu, i, &bh, NULL); + ep = exfat_get_dentry(sb, &clu, i, &bh); if (!ep) return -EIO; entry_type = exfat_get_entry_type(ep); diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 1d6da61157c9..619e5b4bed10 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -10,7 +10,6 @@ #include <linux/ratelimit.h> #include <linux/nls.h> -#define EXFAT_SUPER_MAGIC 0x2011BAB0UL #define EXFAT_ROOT_INO 1 #define EXFAT_CLUSTERS_UNTRACKED (~0u) @@ -459,11 +458,8 @@ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname, int num_entries, unsigned int type, struct exfat_hint *hint_opt); int exfat_alloc_new_dir(struct inode *inode, struct exfat_chain *clu); -int exfat_find_location(struct super_block *sb, struct exfat_chain *p_dir, - int entry, sector_t *sector, int *offset); struct exfat_dentry *exfat_get_dentry(struct super_block *sb, - struct exfat_chain *p_dir, int entry, struct buffer_head **bh, - sector_t *sector); + struct exfat_chain *p_dir, int entry, struct buffer_head **bh); struct exfat_dentry *exfat_get_dentry_cached(struct exfat_entry_set_cache *es, int num); struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index e949e563443c..a3464e56a7e1 100644 --- a/fs/exfat/fatent.c +++ 
b/fs/exfat/fatent.c @@ -84,9 +84,7 @@ int exfat_ent_set(struct super_block *sb, unsigned int loc, static inline bool is_valid_cluster(struct exfat_sb_info *sbi, unsigned int clus) { - if (clus < EXFAT_FIRST_CLUSTER || sbi->num_clusters <= clus) - return false; - return true; + return clus >= EXFAT_FIRST_CLUSTER && clus < sbi->num_clusters; } int exfat_ent_get(struct super_block *sb, unsigned int loc, diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 6af0191b648f..d890fd34bb2d 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -110,8 +110,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size) exfat_set_volume_dirty(sb); num_clusters_new = EXFAT_B_TO_CLU_ROUND_UP(i_size_read(inode), sbi); - num_clusters_phys = - EXFAT_B_TO_CLU_ROUND_UP(EXFAT_I(inode)->i_size_ondisk, sbi); + num_clusters_phys = EXFAT_B_TO_CLU_ROUND_UP(ei->i_size_ondisk, sbi); exfat_chain_set(&clu, ei->start_clu, num_clusters_phys, ei->flags); @@ -228,12 +227,13 @@ void exfat_truncate(struct inode *inode, loff_t size) { struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_inode_info *ei = EXFAT_I(inode); unsigned int blocksize = i_blocksize(inode); loff_t aligned_size; int err; mutex_lock(&sbi->s_lock); - if (EXFAT_I(inode)->start_clu == 0) { + if (ei->start_clu == 0) { /* * Empty start_clu != ~0 (not allocated) */ @@ -251,8 +251,8 @@ void exfat_truncate(struct inode *inode, loff_t size) else mark_inode_dirty(inode); - inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) & - ~(sbi->cluster_size - 1)) >> inode->i_blkbits; + inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> + inode->i_blkbits; write_size: aligned_size = i_size_read(inode); if (aligned_size & (blocksize - 1)) { @@ -260,11 +260,11 @@ write_size: aligned_size++; } - if (EXFAT_I(inode)->i_size_ondisk > i_size_read(inode)) - EXFAT_I(inode)->i_size_ondisk = aligned_size; + if (ei->i_size_ondisk > i_size_read(inode)) + ei->i_size_ondisk = aligned_size; - if (EXFAT_I(inode)->i_size_aligned > i_size_read(inode)) - EXFAT_I(inode)->i_size_aligned = aligned_size; + if (ei->i_size_aligned > i_size_read(inode)) + ei->i_size_aligned = aligned_size; mutex_unlock(&sbi->s_lock); } diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 1c7aa1ea4724..df805bd05508 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -31,7 +31,7 @@ static int __exfat_write_inode(struct inode *inode, int sync) return 0; /* - * If the indode is already unlinked, there is no need for updating it. + * If the inode is already unlinked, there is no need for updating it. 
*/ if (ei->dir.dir == DIR_DELETED) return 0; @@ -114,10 +114,9 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset, unsigned int local_clu_offset = clu_offset; unsigned int num_to_be_allocated = 0, num_clusters = 0; - if (EXFAT_I(inode)->i_size_ondisk > 0) + if (ei->i_size_ondisk > 0) num_clusters = - EXFAT_B_TO_CLU_ROUND_UP(EXFAT_I(inode)->i_size_ondisk, - sbi); + EXFAT_B_TO_CLU_ROUND_UP(ei->i_size_ondisk, sbi); if (clu_offset >= num_clusters) num_to_be_allocated = clu_offset - num_clusters + 1; @@ -416,10 +415,10 @@ static int exfat_write_end(struct file *file, struct address_space *mapping, err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); - if (EXFAT_I(inode)->i_size_aligned < i_size_read(inode)) { + if (ei->i_size_aligned < i_size_read(inode)) { exfat_fs_error(inode->i_sb, "invalid size(size(%llu) > aligned(%llu)\n", - i_size_read(inode), EXFAT_I(inode)->i_size_aligned); + i_size_read(inode), ei->i_size_aligned); return -EIO; } @@ -603,8 +602,8 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) exfat_save_attr(inode, info->attr); - inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) & - ~((loff_t)sbi->cluster_size - 1)) >> inode->i_blkbits; + inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> + inode->i_blkbits; inode->i_mtime = info->mtime; inode->i_ctime = info->mtime; ei->i_crtime = info->crtime; diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c index d34e6193258d..d5bd8e6d9741 100644 --- a/fs/exfat/misc.c +++ b/fs/exfat/misc.c @@ -10,6 +10,7 @@ #include <linux/fs.h> #include <linux/slab.h> #include <linux/buffer_head.h> +#include <linux/blk_types.h> #include "exfat_raw.h" #include "exfat_fs.h" @@ -180,7 +181,7 @@ int exfat_update_bhs(struct buffer_head **bhs, int nr_bhs, int sync) set_buffer_uptodate(bhs[i]); mark_buffer_dirty(bhs[i]); if (sync) - write_dirty_buffer(bhs[i], 0); + write_dirty_buffer(bhs[i], REQ_SYNC); } for (i = 0; i < nr_bhs && sync; i++) { diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 24b41103d1cc..af4eb39cc0c3 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -229,7 +229,7 @@ static int exfat_search_empty_slot(struct super_block *sb, i = dentry & (dentries_per_clu - 1); for (; i < dentries_per_clu; i++, dentry++) { - ep = exfat_get_dentry(sb, &clu, i, &bh, NULL); + ep = exfat_get_dentry(sb, &clu, i, &bh); if (!ep) return -EIO; type = exfat_get_entry_type(ep); @@ -306,7 +306,6 @@ static int exfat_find_empty_entry(struct inode *inode, { int dentry; unsigned int ret, last_clu; - sector_t sector; loff_t size = 0; struct exfat_chain clu; struct exfat_dentry *ep = NULL; @@ -379,7 +378,7 @@ static int exfat_find_empty_entry(struct inode *inode, struct buffer_head *bh; ep = exfat_get_dentry(sb, - &(ei->dir), ei->entry + 1, &bh, &sector); + &(ei->dir), ei->entry + 1, &bh); if (!ep) return -EIO; @@ -395,9 +394,9 @@ static int exfat_find_empty_entry(struct inode *inode, /* directory inode should be updated in here */ i_size_write(inode, size); - EXFAT_I(inode)->i_size_ondisk += sbi->cluster_size; - EXFAT_I(inode)->i_size_aligned += sbi->cluster_size; - EXFAT_I(inode)->flags = p_dir->flags; + ei->i_size_ondisk += sbi->cluster_size; + ei->i_size_aligned += sbi->cluster_size; + ei->flags = p_dir->flags; inode->i_blocks += 1 << sbi->sect_per_clus_bits; } @@ -779,7 +778,6 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry) struct inode *inode = dentry->d_inode; struct exfat_inode_info *ei = EXFAT_I(inode); struct buffer_head *bh; - sector_t 
sector; int num_entries, entry, err = 0; mutex_lock(&EXFAT_SB(sb)->s_lock); @@ -791,7 +789,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry) goto unlock; } - ep = exfat_get_dentry(sb, &cdir, entry, &bh, &sector); + ep = exfat_get_dentry(sb, &cdir, entry, &bh); if (!ep) { err = -EIO; goto unlock; @@ -895,7 +893,7 @@ static int exfat_check_dir_empty(struct super_block *sb, while (clu.dir != EXFAT_EOF_CLUSTER) { for (i = 0; i < dentries_per_clu; i++) { - ep = exfat_get_dentry(sb, &clu, i, &bh, NULL); + ep = exfat_get_dentry(sb, &clu, i, &bh); if (!ep) return -EIO; type = exfat_get_entry_type(ep); @@ -932,7 +930,6 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry) struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); struct buffer_head *bh; - sector_t sector; int num_entries, entry, err; mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock); @@ -957,7 +954,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry) goto unlock; } - ep = exfat_get_dentry(sb, &cdir, entry, &bh, &sector); + ep = exfat_get_dentry(sb, &cdir, entry, &bh); if (!ep) { err = -EIO; goto unlock; @@ -1005,13 +1002,12 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir, struct exfat_inode_info *ei) { int ret, num_old_entries, num_new_entries; - sector_t sector_old, sector_new; struct exfat_dentry *epold, *epnew; struct super_block *sb = inode->i_sb; struct buffer_head *new_bh, *old_bh; int sync = IS_DIRSYNC(inode); - epold = exfat_get_dentry(sb, p_dir, oldentry, &old_bh, &sector_old); + epold = exfat_get_dentry(sb, p_dir, oldentry, &old_bh); if (!epold) return -EIO; @@ -1032,8 +1028,7 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir, if (newentry < 0) return newentry; /* -EIO or -ENOSPC */ - epnew = exfat_get_dentry(sb, p_dir, newentry, &new_bh, - &sector_new); + epnew = exfat_get_dentry(sb, p_dir, newentry, &new_bh); if (!epnew) return -EIO; @@ -1046,12 +1041,10 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir, brelse(old_bh); brelse(new_bh); - epold = exfat_get_dentry(sb, p_dir, oldentry + 1, &old_bh, - &sector_old); + epold = exfat_get_dentry(sb, p_dir, oldentry + 1, &old_bh); if (!epold) return -EIO; - epnew = exfat_get_dentry(sb, p_dir, newentry + 1, &new_bh, - &sector_new); + epnew = exfat_get_dentry(sb, p_dir, newentry + 1, &new_bh); if (!epnew) { brelse(old_bh); return -EIO; @@ -1093,12 +1086,11 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir, struct exfat_uni_name *p_uniname, struct exfat_inode_info *ei) { int ret, newentry, num_new_entries, num_old_entries; - sector_t sector_mov, sector_new; struct exfat_dentry *epmov, *epnew; struct super_block *sb = inode->i_sb; struct buffer_head *mov_bh, *new_bh; - epmov = exfat_get_dentry(sb, p_olddir, oldentry, &mov_bh, &sector_mov); + epmov = exfat_get_dentry(sb, p_olddir, oldentry, &mov_bh); if (!epmov) return -EIO; @@ -1116,7 +1108,7 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir, if (newentry < 0) return newentry; /* -EIO or -ENOSPC */ - epnew = exfat_get_dentry(sb, p_newdir, newentry, &new_bh, &sector_new); + epnew = exfat_get_dentry(sb, p_newdir, newentry, &new_bh); if (!epnew) return -EIO; @@ -1129,12 +1121,10 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir, brelse(mov_bh); brelse(new_bh); - epmov = exfat_get_dentry(sb, p_olddir, oldentry + 1, &mov_bh, - &sector_mov); + epmov = exfat_get_dentry(sb, p_olddir, oldentry + 1, &mov_bh); if (!epmov) return 
-EIO; - epnew = exfat_get_dentry(sb, p_newdir, newentry + 1, &new_bh, - &sector_new); + epnew = exfat_get_dentry(sb, p_newdir, newentry + 1, &new_bh); if (!epnew) { brelse(mov_bh); return -EIO; @@ -1216,7 +1206,7 @@ static int __exfat_rename(struct inode *old_parent_inode, exfat_chain_dup(&olddir, &ei->dir); dentry = ei->entry; - ep = exfat_get_dentry(sb, &olddir, dentry, &old_bh, NULL); + ep = exfat_get_dentry(sb, &olddir, dentry, &old_bh); if (!ep) { ret = -EIO; goto out; @@ -1237,7 +1227,7 @@ static int __exfat_rename(struct inode *old_parent_inode, p_dir = &(new_ei->dir); new_entry = new_ei->entry; - ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh, NULL); + ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh); if (!ep) goto out; @@ -1277,7 +1267,7 @@ static int __exfat_rename(struct inode *old_parent_inode, if (!ret && new_inode) { /* delete entries of new_dir */ - ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh, NULL); + ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh); if (!ep) { ret = -EIO; goto del_out; diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c index 314d5407a1be..ef115e673406 100644 --- a/fs/exfat/nls.c +++ b/fs/exfat/nls.c @@ -761,7 +761,7 @@ int exfat_create_upcase_table(struct super_block *sb) while (clu.dir != EXFAT_EOF_CLUSTER) { for (i = 0; i < sbi->dentries_per_clu; i++) { - ep = exfat_get_dentry(sb, &clu, i, &bh, NULL); + ep = exfat_get_dentry(sb, &clu, i, &bh); if (!ep) return -EIO; diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 5539ffc20d16..8c9fb7dcec16 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -17,6 +17,7 @@ #include <linux/iversion.h> #include <linux/nls.h> #include <linux/buffer_head.h> +#include <linux/magic.h> #include "exfat_raw.h" #include "exfat_fs.h" @@ -364,11 +365,11 @@ static int exfat_read_root(struct inode *inode) inode->i_op = &exfat_dir_inode_operations; inode->i_fop = &exfat_dir_operations; - inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) - & ~(sbi->cluster_size - 1)) >> inode->i_blkbits; - EXFAT_I(inode)->i_pos = ((loff_t)sbi->root_dir << 32) | 0xffffffff; - EXFAT_I(inode)->i_size_aligned = i_size_read(inode); - EXFAT_I(inode)->i_size_ondisk = i_size_read(inode); + inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> + inode->i_blkbits; + ei->i_pos = ((loff_t)sbi->root_dir << 32) | 0xffffffff; + ei->i_size_aligned = i_size_read(inode); + ei->i_size_ondisk = i_size_read(inode); exfat_save_attr(inode, ATTR_SUBDIR); inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 3be9dd6412b7..d4f306aa5ace 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -118,6 +118,7 @@ struct ext2_sb_info { spinlock_t s_lock; struct mb_cache *s_ea_block_cache; struct dax_device *s_daxdev; + u64 s_dax_part_off; }; static inline spinlock_t * diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 333fa62661d5..602578b72d8c 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -36,6 +36,7 @@ #include <linux/iomap.h> #include <linux/namei.h> #include <linux/uio.h> +#include <linux/dax.h> #include "ext2.h" #include "acl.h" #include "xattr.h" @@ -816,9 +817,11 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, return ret; iomap->flags = 0; - iomap->bdev = inode->i_sb->s_bdev; iomap->offset = (u64)first_block << blkbits; - iomap->dax_dev = sbi->s_daxdev; + if (flags & IOMAP_DAX) + iomap->dax_dev = sbi->s_daxdev; + else + iomap->bdev = inode->i_sb->s_bdev; if (ret == 0) { iomap->type = IOMAP_HOLE; @@ -827,6 +830,8 @@ static int 
ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, } else { iomap->type = IOMAP_MAPPED; iomap->addr = (u64)bno << blkbits; + if (flags & IOMAP_DAX) + iomap->addr += sbi->s_dax_part_off; iomap->length = (u64)ret << blkbits; iomap->flags |= IOMAP_F_MERGED; } @@ -1297,9 +1302,9 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) inode_dio_wait(inode); if (IS_DAX(inode)) { - error = iomap_zero_range(inode, newsize, - PAGE_ALIGN(newsize) - newsize, NULL, - &ext2_iomap_ops); + error = dax_zero_range(inode, newsize, + PAGE_ALIGN(newsize) - newsize, NULL, + &ext2_iomap_ops); } else if (test_opt(inode->i_sb, NOBH)) error = nobh_truncate_page(inode->i_mapping, newsize, ext2_get_block); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index d8d580b609ba..94f1fbd7d3ac 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -802,7 +802,6 @@ static unsigned long descriptor_loc(struct super_block *sb, static int ext2_fill_super(struct super_block *sb, void *data, int silent) { - struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev); struct buffer_head * bh; struct ext2_sb_info * sbi; struct ext2_super_block * es; @@ -822,17 +821,17 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) - goto failed; + return -ENOMEM; sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); if (!sbi->s_blockgroup_lock) { kfree(sbi); - goto failed; + return -ENOMEM; } sb->s_fs_info = sbi; sbi->s_sb_block = sb_block; - sbi->s_daxdev = dax_dev; + sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off); spin_lock_init(&sbi->s_lock); ret = -EINVAL; @@ -946,11 +945,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); if (test_opt(sb, DAX)) { - if (!dax_supported(dax_dev, sb->s_bdev, blocksize, 0, - bdev_nr_sectors(sb->s_bdev))) { + if (!sbi->s_daxdev) { ext2_msg(sb, KERN_ERR, "DAX unsupported by block device. Turning off DAX."); clear_opt(sbi->s_mount_opt, DAX); + } else if (blocksize != PAGE_SIZE) { + ext2_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); + clear_opt(sbi->s_mount_opt, DAX); } } @@ -1199,11 +1200,10 @@ failed_mount_group_desc: failed_mount: brelse(bh); failed_sbi: + fs_put_dax(sbi->s_daxdev); sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); kfree(sbi); -failed: - fs_put_dax(dax_dev); return ret; } diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 0613dfcbfd4a..57e82e25f8e2 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -139,7 +139,7 @@ fail: /* * Inode operation get_posix_acl(). * - * inode->i_mutex: don't care + * inode->i_rwsem: don't care */ struct posix_acl * ext4_get_acl(struct inode *inode, int type, bool rcu) @@ -183,7 +183,7 @@ ext4_get_acl(struct inode *inode, int type, bool rcu) /* * Set the access or default ACL of an inode. 
* - * inode->i_mutex: down unless called from ext4_new_inode + * inode->i_rwsem: down unless called from ext4_new_inode */ static int __ext4_set_acl(handle_t *handle, struct inode *inode, int type, @@ -246,7 +246,6 @@ retry: handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) return PTR_ERR(handle); - ext4_fc_start_update(inode); if ((type == ACL_TYPE_ACCESS) && acl) { error = posix_acl_update_mode(mnt_userns, inode, &mode, &acl); @@ -264,7 +263,6 @@ retry: } out_stop: ext4_journal_stop(handle); - ext4_fc_stop_update(inode); if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return error; @@ -273,8 +271,8 @@ out_stop: /* * Initialize the ACLs of a new inode. Called from ext4_new_inode. * - * dir->i_mutex: down - * inode->i_mutex: up (access to inode is still exclusive) + * dir->i_rwsem: down + * inode->i_rwsem: up (access to inode is still exclusive) */ int ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 74b172a4adda..a6bb86f52b9a 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -303,7 +303,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) goto done; brelse(bh); bh = NULL; - offset = 0; } done: err = 0; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 404dd50856e5..bcd3b9bf8069 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1028,7 +1028,7 @@ struct ext4_inode_info { /* * Extended attributes can be read independently of the main file - * data. Taking i_mutex even when reading would cause contention + * data. Taking i_rwsem even when reading would cause contention * between readers of EAs and writers of regular file data, so * instead we synchronize on xattr_sem when reading or changing * EAs. @@ -1298,6 +1298,8 @@ extern void ext4_set_bits(void *bm, int cur, int len); /* Metadata checksum algorithm codes */ #define EXT4_CRC32C_CHKSUM 1 +#define EXT4_LABEL_MAX 16 + /* * Structure of the super block */ @@ -1347,7 +1349,7 @@ struct ext4_super_block { /*60*/ __le32 s_feature_incompat; /* incompatible feature set */ __le32 s_feature_ro_compat; /* readonly-compatible feature set */ /*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ -/*78*/ char s_volume_name[16]; /* volume name */ +/*78*/ char s_volume_name[EXT4_LABEL_MAX]; /* volume name */ /*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */ /*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ /* @@ -1661,7 +1663,7 @@ struct ext4_sb_info { struct task_struct *s_mmp_tsk; /* record the last minlen when FITRIM is called. */ - atomic_t s_last_trim_minblks; + unsigned long s_last_trim_minblks; /* Reference to checksum algorithm driver via cryptoapi */ struct crypto_shash *s_chksum_driver; @@ -1697,6 +1699,7 @@ struct ext4_sb_info { */ struct percpu_rw_semaphore s_writepages_rwsem; struct dax_device *s_daxdev; + u64 s_dax_part_off; #ifdef CONFIG_EXT4_DEBUG unsigned long s_simulate_fail; #endif @@ -1725,9 +1728,9 @@ struct ext4_sb_info { */ struct work_struct s_error_work; - /* Ext4 fast commit stuff */ + /* Ext4 fast commit sub transaction ID */ atomic_t s_fc_subtid; - atomic_t s_fc_ineligible_updates; + /* * After commit starts, the main queue gets locked, and the further * updates get added in the staging queue. 
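The main/staging queue split referenced in the comment above can be pictured with a short, self-contained C sketch. This is a minimal userspace illustration under assumed names (fc_entry, fc_track and fc_commit_done are hypothetical stand-ins, not the ext4 structures or API): while a commit is in flight, newly tracked work is parked on a staging list and spliced back onto the main list once the commit finishes, which is the behaviour the s_fc_q[FC_Q_MAIN]/s_fc_q[FC_Q_STAGING] pair implements in this patch.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative only: two singly linked queues, not the kernel lists. */
struct fc_entry { int ino; struct fc_entry *next; };

static struct fc_entry *main_q, *staging_q;
static bool commit_in_progress;

static void fc_track(struct fc_entry *e)
{
	/* While a commit runs, park new entries on the staging queue. */
	struct fc_entry **q = commit_in_progress ? &staging_q : &main_q;

	e->next = *q;
	*q = e;
}

static void fc_commit_done(void)
{
	/* Splice staged entries back so the next commit picks them up. */
	struct fc_entry *tail = staging_q;

	while (tail && tail->next)
		tail = tail->next;
	if (tail) {
		tail->next = main_q;
		main_q = staging_q;
		staging_q = NULL;
	}
	commit_in_progress = false;
}

int main(void)
{
	struct fc_entry a = { .ino = 12 }, b = { .ino = 13 };

	commit_in_progress = true;
	fc_track(&a);        /* lands on staging while the commit runs */
	fc_commit_done();    /* spliced back onto the main queue */
	fc_track(&b);        /* no commit running: goes straight to main */

	for (struct fc_entry *e = main_q; e; e = e->next)
		printf("inode %d queued for fast commit\n", e->ino);
	return 0;
}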
@@ -1747,7 +1750,7 @@ struct ext4_sb_info { spinlock_t s_fc_lock; struct buffer_head *s_fc_bh; struct ext4_fc_stats s_fc_stats; - u64 s_fc_avg_commit_time; + tid_t s_fc_ineligible_tid; #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; #endif @@ -1793,10 +1796,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) enum { EXT4_MF_MNTDIR_SAMPLED, EXT4_MF_FS_ABORTED, /* Fatal error detected */ - EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */ - EXT4_MF_FC_COMMITTING /* File system underoing a fast - * commit. - */ + EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */ }; static inline void ext4_set_mount_flag(struct super_block *sb, int bit) @@ -2399,8 +2399,7 @@ ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) { - if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) - BUG(); + BUG_ON((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)); #if (PAGE_SIZE >= 65536) if (len < 65536) return cpu_to_le16(len); @@ -2484,7 +2483,7 @@ struct ext4_filename { #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_str crypto_buf; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) struct fscrypt_str cf_name; #endif }; @@ -2720,7 +2719,7 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb, struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) extern int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct ext4_filename *fname); @@ -2753,7 +2752,7 @@ static inline int ext4_fname_setup_filename(struct inode *dir, ext4_fname_from_fscrypt_name(fname, &name); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, iname, fname); #endif return err; @@ -2772,7 +2771,7 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir, ext4_fname_from_fscrypt_name(fname, &name); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); #endif return err; @@ -2789,7 +2788,7 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname) fname->usr_fname = NULL; fname->disk_name.name = NULL; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) kfree(fname->cf_name.name); fname->cf_name.name = NULL; #endif @@ -2805,7 +2804,7 @@ static inline int ext4_fname_setup_filename(struct inode *dir, fname->disk_name.name = (unsigned char *) iname->name; fname->disk_name.len = iname->len; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, iname, fname); #endif @@ -2821,7 +2820,7 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir, static inline void ext4_fname_free_filename(struct ext4_filename *fname) { -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) kfree(fname->cf_name.name); fname->cf_name.name = NULL; #endif @@ -2925,9 +2924,7 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode, struct dentry *dentry); void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); void ext4_fc_track_inode(handle_t *handle, struct inode *inode); -void ext4_fc_mark_ineligible(struct super_block *sb, int reason); -void ext4_fc_start_ineligible(struct super_block *sb, int reason); -void ext4_fc_stop_ineligible(struct super_block *sb); +void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle); void ext4_fc_start_update(struct inode *inode); void 
ext4_fc_stop_update(struct inode *inode); void ext4_fc_del(struct inode *inode); @@ -2935,6 +2932,10 @@ bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block); void ext4_fc_replay_cleanup(struct super_block *sb); int ext4_fc_commit(journal_t *journal, tid_t commit_tid); int __init ext4_fc_init_dentry_cache(void); +void ext4_fc_destroy_dentry_cache(void); +int ext4_fc_record_regions(struct super_block *sb, int ino, + ext4_lblk_t lblk, ext4_fsblk_t pblk, + int len, int replay); /* mballoc.c */ extern const struct seq_operations ext4_mb_seq_groups_ops; @@ -3096,6 +3097,9 @@ extern int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t n_blocks_count); extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); +extern unsigned int ext4_list_backups(struct super_block *sb, + unsigned int *three, unsigned int *five, + unsigned int *seven); /* super.c */ extern struct buffer_head *ext4_sb_bread(struct super_block *sb, @@ -3110,6 +3114,8 @@ extern int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait); extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block); extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); +extern __le32 ext4_superblock_csum(struct super_block *sb, + struct ext4_super_block *es); extern void ext4_superblock_csum_set(struct super_block *sb); extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); @@ -3402,7 +3408,7 @@ do { \ #define EXT4_FREECLUSTERS_WATERMARK 0 #endif -/* Update i_disksize. Requires i_mutex to avoid races with truncate */ +/* Update i_disksize. Requires i_rwsem to avoid races with truncate */ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) { WARN_ON_ONCE(S_ISREG(inode->i_mode) && @@ -3413,7 +3419,7 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) up_write(&EXT4_I(inode)->i_data_sem); } -/* Update i_size, i_disksize. Requires i_mutex to avoid races with truncate */ +/* Update i_size, i_disksize. Requires i_rwsem to avoid races with truncate */ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) { int changed = 0; diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 6def7339056d..3477a16d08ae 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -162,6 +162,8 @@ int __ext4_journal_ensure_credits(handle_t *handle, int check_cred, { if (!ext4_handle_valid(handle)) return 0; + if (is_handle_aborted(handle)) + return -EROFS; if (jbd2_handle_buffer_credits(handle) >= check_cred && handle->h_revoke_credits >= revoke_cred) return 0; diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 0e4fa644df01..db2ae4a2b38d 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -491,7 +491,7 @@ static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks) /* * This function controls whether or not we should try to go down the * dioread_nolock code paths, which makes it safe to avoid taking - * i_mutex for direct I/O reads. This only works for extent-based + * i_rwsem for direct I/O reads. 
This only works for extent-based * files, and it doesn't work if data journaling is enabled, since the * dioread_nolock code uses b_private to pass information back to the * I/O completion handler, and this conflicts with the jbd's use of diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0ecf819bf189..c0f3f83e0c1b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -27,8 +27,8 @@ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fiemap.h> -#include <linux/backing-dev.h> #include <linux/iomap.h> +#include <linux/sched/mm.h> #include "ext4_jbd2.h" #include "ext4_extents.h" #include "xattr.h" @@ -97,7 +97,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this * moment, get_block can be called only for blocks inside i_size since * page cache has been already dropped and writes are blocked by - * i_mutex. So we can safely drop the i_data_sem here. + * i_rwsem. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); ext4_discard_preallocations(inode, 0); @@ -1496,8 +1496,7 @@ static int ext4_ext_search_left(struct inode *inode, EXT4_ERROR_INODE(inode, "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", ix != NULL ? le32_to_cpu(ix->ei_block) : 0, - EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? - le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0, + le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block), depth); return -EFSCORRUPTED; } @@ -2025,7 +2024,6 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, + ext4_ext_get_actual_len(newext)); if (unwritten) ext4_ext_mark_unwritten(ex); - eh = path[depth].p_hdr; nearex = ex; goto merge; } @@ -2054,7 +2052,6 @@ prepend: + ext4_ext_get_actual_len(newext)); if (unwritten) ext4_ext_mark_unwritten(ex); - eh = path[depth].p_hdr; nearex = ex; goto merge; } @@ -4407,8 +4404,7 @@ retry: err = ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); if (err == -ENOMEM) { - cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(GFP_ATOMIC); goto retry; } if (err) @@ -4416,8 +4412,7 @@ retry: retry_remove_space: err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); if (err == -ENOMEM) { - cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(GFP_ATOMIC); goto retry_remove_space; } return err; @@ -4577,7 +4572,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; - /* Wait all existing dio workers, newcomers will block on i_mutex */ + /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); /* Preallocate the range including the unaligned edges */ @@ -4647,8 +4642,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) goto out_handle; - ext4_fc_track_range(handle, inode, offset >> inode->i_sb->s_blocksize_bits, - (offset + len - 1) >> inode->i_sb->s_blocksize_bits); /* Zero out partial block at the edges of the range */ ret = ext4_zero_partial_blocks(handle, inode, offset, len); if (ret >= 0) @@ -4697,8 +4690,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; - ext4_fc_start_update(inode); - if (mode & FALLOC_FL_PUNCH_HOLE) { ret = ext4_punch_hole(inode, offset, len); goto exit; @@ -4747,7 +4738,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) goto out; } 
- /* Wait all existing dio workers, newcomers will block on i_mutex */ + /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); @@ -4762,7 +4753,6 @@ out: inode_unlock(inode); trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); exit: - ext4_fc_stop_update(inode); return ret; } @@ -5344,7 +5334,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ret = PTR_ERR(handle); goto out_mmap; } - ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); @@ -5383,7 +5373,6 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) out_stop: ext4_journal_stop(handle); - ext4_fc_stop_ineligible(sb); out_mmap: filemap_invalidate_unlock(mapping); out_mutex: @@ -5485,7 +5474,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) ret = PTR_ERR(handle); goto out_mmap; } - ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; @@ -5560,7 +5549,6 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) out_stop: ext4_journal_stop(handle); - ext4_fc_stop_ineligible(sb); out_mmap: filemap_invalidate_unlock(mapping); out_mutex: @@ -5583,7 +5571,7 @@ out_mutex: * stuff such as page-cache locking consistency, bh mapping consistency or * extent's data copying must be performed by caller. * Locking: - * i_mutex is held for both inodes + * i_rwsem is held for both inodes * i_data_sem is locked for write for both inodes * Assumptions: * All pages from requested range are locked for both inodes @@ -6103,11 +6091,15 @@ int ext4_ext_clear_bb(struct inode *inode) ext4_mb_mark_bb(inode->i_sb, path[j].p_block, 1, 0); + ext4_fc_record_regions(inode->i_sb, inode->i_ino, + 0, path[j].p_block, 1, 1); } ext4_ext_drop_refs(path); kfree(path); } ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); + ext4_fc_record_regions(inode->i_sb, inode->i_ino, + map.m_lblk, map.m_pblk, map.m_len, 1); } cur = cur + map.m_len; } diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 0f32b445582a..7964ee34e322 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -65,21 +65,11 @@ * * Fast Commit Ineligibility * ------------------------- - * Not all operations are supported by fast commits today (e.g extended - * attributes). Fast commit ineligibility is marked by calling one of the - * two following functions: - * - * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall - * back to full commit. This is useful in case of transient errors. * - * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all - * the fast commits happening between ext4_fc_start_ineligible() and - * ext4_fc_stop_ineligible() and one fast commit after the call to - * ext4_fc_stop_ineligible() to fall back to full commits. It is important to - * make one more fast commit to fall back to full commit after stop call so - * that it guaranteed that the fast commit ineligible operation contained - * within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is - * followed by at least 1 full commit. + * Not all operations are supported by fast commits today (e.g extended + * attributes). 
Fast commit ineligibility is marked by calling + * ext4_fc_mark_ineligible(): This makes next fast commit operation to fall back + * to full commit. * * Atomicity of commits * -------------------- @@ -166,15 +156,13 @@ * fast commit recovery even if that area is invalidated by later full * commits. * - * 1) Make fast commit atomic updates more fine grained. Today, a fast commit - * eligible update must be protected within ext4_fc_start_update() and - * ext4_fc_stop_update(). These routines are called at much higher - * routines. This can be made more fine grained by combining with - * ext4_journal_start(). - * - * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() + * 1) Fast commit's commit path locks the entire file system during fast + * commit. This has significant performance penalty. Instead of that, we + * should use ext4_fc_start/stop_update functions to start inode level + * updates from ext4_journal_start/stop. Once we do that we can drop file + * system locking during commit path. * - * 3) Handle more ineligible cases. + * 2) Handle more ineligible cases. */ #include <trace/events/ext4.h> @@ -312,61 +300,37 @@ restart: } /* - * Mark file system as fast commit ineligible. This means that next commit - * operation would result in a full jbd2 commit. + * Mark file system as fast commit ineligible, and record latest + * ineligible transaction tid. This means until the recorded + * transaction, commit operation would result in a full jbd2 commit. */ -void ext4_fc_mark_ineligible(struct super_block *sb, int reason) +void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle) { struct ext4_sb_info *sbi = EXT4_SB(sb); + tid_t tid; if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) return; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + if (handle && !IS_ERR(handle)) + tid = handle->h_transaction->t_tid; + else { + read_lock(&sbi->s_journal->j_state_lock); + tid = sbi->s_journal->j_running_transaction ? + sbi->s_journal->j_running_transaction->t_tid : 0; + read_unlock(&sbi->s_journal->j_state_lock); + } + spin_lock(&sbi->s_fc_lock); + if (sbi->s_fc_ineligible_tid < tid) + sbi->s_fc_ineligible_tid = tid; + spin_unlock(&sbi->s_fc_lock); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } /* - * Start a fast commit ineligible update. Any commits that happen while - * such an operation is in progress fall back to full commits. - */ -void ext4_fc_start_ineligible(struct super_block *sb, int reason) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || - (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) - return; - - WARN_ON(reason >= EXT4_FC_REASON_MAX); - sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; - atomic_inc(&sbi->s_fc_ineligible_updates); -} - -/* - * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here - * to ensure that after stopping the ineligible update, at least one full - * commit takes place. 
- */ -void ext4_fc_stop_ineligible(struct super_block *sb) -{ - if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || - (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) - return; - - ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); - atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates); -} - -static inline int ext4_fc_is_ineligible(struct super_block *sb) -{ - return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) || - atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates)); -} - -/* * Generic fast commit tracking function. If this is the first time this we are * called after a full commit, we initialize fast commit fields and then call * __fc_track_fn() with update = 0. If we have already been called after a full @@ -391,7 +355,7 @@ static int ext4_fc_track_template( (sbi->s_mount_state & EXT4_FC_REPLAY)) return -EOPNOTSUPP; - if (ext4_fc_is_ineligible(inode->i_sb)) + if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return -EINVAL; tid = handle->h_transaction->t_tid; @@ -411,7 +375,8 @@ static int ext4_fc_track_template( spin_lock(&sbi->s_fc_lock); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, - (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ? + (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || + sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? &sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); spin_unlock(&sbi->s_fc_lock); @@ -437,7 +402,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) mutex_unlock(&ei->i_fc_lock); node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -450,7 +415,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) if (!node->fcd_name.name) { kmem_cache_free(ext4_fc_dentry_cachep, node); ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_NOMEM); + EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -464,7 +429,8 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) node->fcd_name.len = dentry->d_name.len; spin_lock(&sbi->s_fc_lock); - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) + if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || + sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_STAGING]); else @@ -552,7 +518,7 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode) if (ext4_should_journal_data(inode)) { ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_INODE_JOURNAL_DATA); + EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); return; } @@ -796,7 +762,6 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc); dst += sizeof(fcd); ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc); - dst += dlen; return true; } @@ -930,7 +895,6 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal) int ret = 0; spin_lock(&sbi->s_fc_lock); - ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING); list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); while (atomic_read(&ei->i_fc_updates)) { @@ -1123,6 +1087,32 @@ out: return ret; } +static void ext4_fc_update_stats(struct super_block *sb, int status, + u64 commit_time, int nblks) +{ + struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats; + 
+ jbd_debug(1, "Fast commit ended with status = %d", status); + if (status == EXT4_FC_STATUS_OK) { + stats->fc_num_commits++; + stats->fc_numblks += nblks; + if (likely(stats->s_fc_avg_commit_time)) + stats->s_fc_avg_commit_time = + (commit_time + + stats->s_fc_avg_commit_time * 3) / 4; + else + stats->s_fc_avg_commit_time = commit_time; + } else if (status == EXT4_FC_STATUS_FAILED || + status == EXT4_FC_STATUS_INELIGIBLE) { + if (status == EXT4_FC_STATUS_FAILED) + stats->fc_failed_commits++; + stats->fc_ineligible_commits++; + } else { + stats->fc_skipped_commits++; + } + trace_ext4_fc_commit_stop(sb, nblks, status); +} + /* * The main commit entry point. Performs a fast commit for transaction * commit_tid if needed. If it's not possible to perform a fast commit @@ -1135,18 +1125,15 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) struct ext4_sb_info *sbi = EXT4_SB(sb); int nblks = 0, ret, bsize = journal->j_blocksize; int subtid = atomic_read(&sbi->s_fc_subtid); - int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0; + int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; ktime_t start_time, commit_time; trace_ext4_fc_commit_start(sb); start_time = ktime_get(); - if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || - (ext4_fc_is_ineligible(sb))) { - reason = EXT4_FC_REASON_INELIGIBLE; - goto out; - } + if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) + return jbd2_complete_transaction(journal, commit_tid); restart_fc: ret = jbd2_fc_begin_commit(journal, commit_tid); @@ -1155,74 +1142,59 @@ restart_fc: if (atomic_read(&sbi->s_fc_subtid) <= subtid && commit_tid > journal->j_commit_sequence) goto restart_fc; - reason = EXT4_FC_REASON_ALREADY_COMMITTED; - goto out; + ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0); + return 0; } else if (ret) { - sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; - reason = EXT4_FC_REASON_FC_START_FAILED; - goto out; + /* + * Commit couldn't start. Just update stats and perform a + * full commit. + */ + ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0); + return jbd2_complete_transaction(journal, commit_tid); + } + + /* + * After establishing journal barrier via jbd2_fc_begin_commit(), check + * if we are fast commit ineligible. + */ + if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) { + status = EXT4_FC_STATUS_INELIGIBLE; + goto fallback; } fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; ret = ext4_fc_perform_commit(journal); if (ret < 0) { - sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; - reason = EXT4_FC_REASON_FC_FAILED; - goto out; + status = EXT4_FC_STATUS_FAILED; + goto fallback; } nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before; ret = jbd2_fc_wait_bufs(journal, nblks); if (ret < 0) { - sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; - reason = EXT4_FC_REASON_FC_FAILED; - goto out; + status = EXT4_FC_STATUS_FAILED; + goto fallback; } atomic_inc(&sbi->s_fc_subtid); - jbd2_fc_end_commit(journal); -out: - /* Has any ineligible update happened since we started? */ - if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) { - sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; - reason = EXT4_FC_REASON_INELIGIBLE; - } - - spin_lock(&sbi->s_fc_lock); - if (reason != EXT4_FC_REASON_OK && - reason != EXT4_FC_REASON_ALREADY_COMMITTED) { - sbi->s_fc_stats.fc_ineligible_commits++; - } else { - sbi->s_fc_stats.fc_num_commits++; - sbi->s_fc_stats.fc_numblks += nblks; - } - spin_unlock(&sbi->s_fc_lock); - nblks = (reason == EXT4_FC_REASON_OK) ? 
nblks : 0; - trace_ext4_fc_commit_stop(sb, nblks, reason); - commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + ret = jbd2_fc_end_commit(journal); /* - * weight the commit time higher than the average time so we don't - * react too strongly to vast changes in the commit time + * weight the commit time higher than the average time so we + * don't react too strongly to vast changes in the commit time */ - if (likely(sbi->s_fc_avg_commit_time)) - sbi->s_fc_avg_commit_time = (commit_time + - sbi->s_fc_avg_commit_time * 3) / 4; - else - sbi->s_fc_avg_commit_time = commit_time; - jbd_debug(1, - "Fast commit ended with blks = %d, reason = %d, subtid - %d", - nblks, reason, subtid); - if (reason == EXT4_FC_REASON_FC_FAILED) - return jbd2_fc_end_commit_fallback(journal); - if (reason == EXT4_FC_REASON_FC_START_FAILED || - reason == EXT4_FC_REASON_INELIGIBLE) - return jbd2_complete_transaction(journal, commit_tid); - return 0; + commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + ext4_fc_update_stats(sb, status, commit_time, nblks); + return ret; + +fallback: + ret = jbd2_fc_end_commit_fallback(journal); + ext4_fc_update_stats(sb, status, 0, 0); + return ret; } /* * Fast commit cleanup routine. This is called after every fast commit and * full commit. full is true if we are called after a full commit. */ -static void ext4_fc_cleanup(journal_t *journal, int full) +static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1240,7 +1212,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full) list_del_init(&iter->i_fc_list); ext4_clear_inode_state(&iter->vfs_inode, EXT4_STATE_FC_COMMITTING); - ext4_fc_reset_inode(&iter->vfs_inode); + if (iter->i_sync_tid <= tid) + ext4_fc_reset_inode(&iter->vfs_inode); /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ smp_mb(); #if (BITS_PER_LONG < 64) @@ -1269,8 +1242,10 @@ static void ext4_fc_cleanup(journal_t *journal, int full) list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], &sbi->s_fc_q[FC_Q_MAIN]); - ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); - ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + if (tid >= sbi->s_fc_ineligible_tid) { + sbi->s_fc_ineligible_tid = 0; + ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + } if (full) sbi->s_fc_bytes = 0; @@ -1435,14 +1410,15 @@ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) if (state->fc_modified_inodes[i] == ino) return 0; if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { - state->fc_modified_inodes_size += - EXT4_FC_REPLAY_REALLOC_INCREMENT; state->fc_modified_inodes = krealloc( - state->fc_modified_inodes, sizeof(int) * - state->fc_modified_inodes_size, - GFP_KERNEL); + state->fc_modified_inodes, + sizeof(int) * (state->fc_modified_inodes_size + + EXT4_FC_REPLAY_REALLOC_INCREMENT), + GFP_KERNEL); if (!state->fc_modified_inodes) return -ENOMEM; + state->fc_modified_inodes_size += + EXT4_FC_REPLAY_REALLOC_INCREMENT; } state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; return 0; @@ -1474,7 +1450,9 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl, } inode = NULL; - ext4_fc_record_modified_inode(sb, ino); + ret = ext4_fc_record_modified_inode(sb, ino); + if (ret) + goto out; raw_fc_inode = (struct ext4_inode *) (val + offsetof(struct ext4_fc_inode, fc_raw_inode)); @@ -1606,16 +1584,23 @@ out: } /* - * Record physical disk regions which are in use as per fast commit area. 
Our - * simple replay phase allocator excludes these regions from allocation. + * Record physical disk regions which are in use as per fast commit area, + * and used by inodes during replay phase. Our simple replay phase + * allocator excludes these regions from allocation. */ -static int ext4_fc_record_regions(struct super_block *sb, int ino, - ext4_lblk_t lblk, ext4_fsblk_t pblk, int len) +int ext4_fc_record_regions(struct super_block *sb, int ino, + ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay) { struct ext4_fc_replay_state *state; struct ext4_fc_alloc_region *region; state = &EXT4_SB(sb)->s_fc_replay_state; + /* + * during replay phase, the fc_regions_valid may not same as + * fc_regions_used, update it when do new additions. + */ + if (replay && state->fc_regions_used != state->fc_regions_valid) + state->fc_regions_used = state->fc_regions_valid; if (state->fc_regions_used == state->fc_regions_size) { state->fc_regions_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; @@ -1633,6 +1618,9 @@ static int ext4_fc_record_regions(struct super_block *sb, int ino, region->pblk = pblk; region->len = len; + if (replay) + state->fc_regions_valid++; + return 0; } @@ -1664,6 +1652,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb, } ret = ext4_fc_record_modified_inode(sb, inode->i_ino); + if (ret) + goto out; start = le32_to_cpu(ex->ee_block); start_pblk = ext4_ext_pblock(ex); @@ -1681,18 +1671,14 @@ static int ext4_fc_replay_add_range(struct super_block *sb, map.m_pblk = 0; ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) { - iput(inode); - return 0; - } + if (ret < 0) + goto out; if (ret == 0) { /* Range is not mapped */ path = ext4_find_extent(inode, cur, NULL, 0); - if (IS_ERR(path)) { - iput(inode); - return 0; - } + if (IS_ERR(path)) + goto out; memset(&newex, 0, sizeof(newex)); newex.ee_block = cpu_to_le32(cur); ext4_ext_store_pblock( @@ -1706,10 +1692,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb, up_write((&EXT4_I(inode)->i_data_sem)); ext4_ext_drop_refs(path); kfree(path); - if (ret) { - iput(inode); - return 0; - } + if (ret) + goto out; goto next; } @@ -1722,10 +1706,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb, ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, ext4_ext_is_unwritten(ex), start_pblk + cur - start); - if (ret) { - iput(inode); - return 0; - } + if (ret) + goto out; /* * Mark the old blocks as free since they aren't used * anymore. We maintain an array of all the modified @@ -1745,10 +1727,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb, ext4_ext_is_unwritten(ex), map.m_pblk); ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, ext4_ext_is_unwritten(ex), map.m_pblk); - if (ret) { - iput(inode); - return 0; - } + if (ret) + goto out; /* * We may have split the extent tree while toggling the state. * Try to shrink the extent tree now. 
@@ -1760,6 +1740,7 @@ next: } ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> sb->s_blocksize_bits); +out: iput(inode); return 0; } @@ -1789,6 +1770,8 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, } ret = ext4_fc_record_modified_inode(sb, inode->i_ino); + if (ret) + goto out; jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n", inode->i_ino, le32_to_cpu(lrange.fc_lblk), @@ -1798,10 +1781,8 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, map.m_len = remaining; ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) { - iput(inode); - return 0; - } + if (ret < 0) + goto out; if (ret > 0) { remaining -= ret; cur += ret; @@ -1812,16 +1793,18 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, } } - ret = ext4_punch_hole(inode, - le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits, - le32_to_cpu(lrange.fc_len) << sb->s_blocksize_bits); + down_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk), + le32_to_cpu(lrange.fc_lblk) + + le32_to_cpu(lrange.fc_len) - 1); + up_write(&EXT4_I(inode)->i_data_sem); if (ret) - jbd_debug(1, "ext4_punch_hole returned %d", ret); + goto out; ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> sb->s_blocksize_bits); ext4_mark_inode_dirty(NULL, inode); +out: iput(inode); - return 0; } @@ -1977,7 +1960,7 @@ static int ext4_fc_replay_scan(journal_t *journal, ret = ext4_fc_record_regions(sb, le32_to_cpu(ext.fc_ino), le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), - ext4_ext_get_actual_len(ex)); + ext4_ext_get_actual_len(ex), 0); if (ret < 0) break; ret = JBD2_FC_REPLAY_CONTINUE; @@ -2173,7 +2156,7 @@ int ext4_fc_info_show(struct seq_file *seq, void *v) "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n", stats->fc_num_commits, stats->fc_ineligible_commits, stats->fc_numblks, - div_u64(sbi->s_fc_avg_commit_time, 1000)); + div_u64(stats->s_fc_avg_commit_time, 1000)); seq_puts(seq, "Ineligible reasons:\n"); for (i = 0; i < EXT4_FC_REASON_MAX; i++) seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i], @@ -2192,3 +2175,8 @@ int __init ext4_fc_init_dentry_cache(void) return 0; } + +void ext4_fc_destroy_dentry_cache(void) +{ + kmem_cache_destroy(ext4_fc_dentry_cachep); +} diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 937c381b4c85..083ad1cb705a 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -71,21 +71,19 @@ struct ext4_fc_tail { }; /* - * Fast commit reason codes + * Fast commit status codes + */ +enum { + EXT4_FC_STATUS_OK = 0, + EXT4_FC_STATUS_INELIGIBLE, + EXT4_FC_STATUS_SKIPPED, + EXT4_FC_STATUS_FAILED, +}; + +/* + * Fast commit ineligiblity reasons: */ enum { - /* - * Commit status codes: - */ - EXT4_FC_REASON_OK = 0, - EXT4_FC_REASON_INELIGIBLE, - EXT4_FC_REASON_ALREADY_COMMITTED, - EXT4_FC_REASON_FC_START_FAILED, - EXT4_FC_REASON_FC_FAILED, - - /* - * Fast commit ineligiblity reasons: - */ EXT4_FC_REASON_XATTR = 0, EXT4_FC_REASON_CROSS_RENAME, EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, @@ -117,7 +115,10 @@ struct ext4_fc_stats { unsigned int fc_ineligible_reason_count[EXT4_FC_REASON_MAX]; unsigned long fc_num_commits; unsigned long fc_ineligible_commits; + unsigned long fc_failed_commits; + unsigned long fc_skipped_commits; unsigned long fc_numblks; + u64 s_fc_avg_commit_time; }; #define EXT4_FC_REPLAY_REALLOC_INCREMENT 4 diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 4c5f41052351..8cc11715518a 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -259,7 
+259,6 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, if (iocb->ki_flags & IOCB_NOWAIT) return -EOPNOTSUPP; - ext4_fc_start_update(inode); inode_lock(inode); ret = ext4_write_checks(iocb, from); if (ret <= 0) @@ -271,7 +270,6 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, out: inode_unlock(inode); - ext4_fc_stop_update(inode); if (likely(ret > 0)) { iocb->ki_pos += ret; ret = generic_write_sync(iocb, ret); @@ -552,9 +550,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; } - ext4_fc_start_update(inode); ret = ext4_orphan_add(handle, inode); - ext4_fc_stop_update(inode); if (ret) { ext4_journal_stop(handle); goto out; diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index f34f4176c1e7..147b5241dd94 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -290,7 +290,7 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len, int ext4fs_dirhash(const struct inode *dir, const char *name, int len, struct dx_hash_info *hinfo) { -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) const struct unicode_map *um = dir->i_sb->s_encoding; int r, dlen; unsigned char *buff; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 89efa78ed4b2..07a8c75b65ed 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -696,7 +696,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode, * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this * moment, get_block can be called only for blocks inside i_size since * page cache has been already dropped and writes are blocked by - * i_mutex. So we can safely drop the i_data_sem here. + * i_rwsem. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); ext4_discard_preallocations(inode, 0); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 39a1ab129fdc..e42941803605 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -7,7 +7,7 @@ #include <linux/iomap.h> #include <linux/fiemap.h> #include <linux/iversion.h> -#include <linux/backing-dev.h> +#include <linux/sched/mm.h> #include "ext4_jbd2.h" #include "ext4.h" @@ -911,7 +911,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, struct page **pagep, void **fsdata) { - int ret, inline_size; + int ret; handle_t *handle; struct page *page; struct ext4_iloc iloc; @@ -928,14 +928,9 @@ retry_journal: goto out; } - inline_size = ext4_get_max_inline_size(inode); - - ret = -ENOSPC; - if (inline_size >= pos + len) { - ret = ext4_prepare_inline_data(handle, inode, pos + len); - if (ret && ret != -ENOSPC) - goto out_journal; - } + ret = ext4_prepare_inline_data(handle, inode, pos + len); + if (ret && ret != -ENOSPC) + goto out_journal; /* * We cannot recurse into the filesystem as the transaction @@ -1133,7 +1128,15 @@ static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc, void *buf, int inline_size) { - ext4_create_inline_data(handle, inode, inline_size); + int ret; + + ret = ext4_create_inline_data(handle, inode, inline_size); + if (ret) { + ext4_msg(inode->i_sb, KERN_EMERG, + "error restoring inline_data for inode -- potential data loss! 
(inode %lu, error %d)", + inode->i_ino, ret); + return; + } ext4_write_inline_data(inode, iloc, buf, 0, inline_size); ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); } @@ -1929,8 +1932,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline) retry: err = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); if (err == -ENOMEM) { - cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(GFP_ATOMIC); goto retry; } if (err) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bfd3545f1e5d..01c9e4f743ba 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -41,6 +41,7 @@ #include <linux/bitops.h> #include <linux/iomap.h> #include <linux/iversion.h> +#include <linux/dax.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -337,7 +338,7 @@ stop_handle: return; no_delete: if (!list_empty(&EXT4_I(inode)->i_fc_list)) - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ } @@ -741,10 +742,11 @@ out_sem: if (ret) return ret; } - ext4_fc_track_range(handle, inode, map->m_lblk, - map->m_lblk + map->m_len - 1); } - + if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN || + map->m_flags & EXT4_MAP_MAPPED)) + ext4_fc_track_range(handle, inode, map->m_lblk, + map->m_lblk + map->m_len - 1); if (retval < 0) ext_debug(inode, "failed with err %d\n", retval); return retval; @@ -1222,7 +1224,7 @@ retry_journal: /* * __block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need - * i_size_read because we hold i_mutex. + * i_size_read because we hold i_rwsem. * * Add inode to orphan list in case we crash before * truncate finishes @@ -1844,30 +1846,16 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return 0; } -static int bget_one(handle_t *handle, struct inode *inode, - struct buffer_head *bh) -{ - get_bh(bh); - return 0; -} - -static int bput_one(handle_t *handle, struct inode *inode, - struct buffer_head *bh) -{ - put_bh(bh); - return 0; -} - static int __ext4_journalled_writepage(struct page *page, unsigned int len) { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; - struct buffer_head *page_bufs = NULL; handle_t *handle = NULL; int ret = 0, err = 0; int inline_data = ext4_has_inline_data(inode); struct buffer_head *inode_bh = NULL; + loff_t size; ClearPageChecked(page); @@ -1877,14 +1865,6 @@ static int __ext4_journalled_writepage(struct page *page, inode_bh = ext4_journalled_write_inline_data(inode, len, page); if (inode_bh == NULL) goto out; - } else { - page_bufs = page_buffers(page); - if (!page_bufs) { - BUG(); - goto out; - } - ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, - NULL, bget_one); } /* * We need to release the page lock before we start the @@ -1905,7 +1885,8 @@ static int __ext4_journalled_writepage(struct page *page, lock_page(page); put_page(page); - if (page->mapping != mapping) { + size = i_size_read(inode); + if (page->mapping != mapping || page_offset(page) > size) { /* The page got truncated from under us */ ext4_journal_stop(handle); ret = 0; @@ -1915,6 +1896,13 @@ static int __ext4_journalled_writepage(struct page *page, if (inline_data) { ret = ext4_mark_inode_dirty(handle, inode); } else { + struct buffer_head *page_bufs = page_buffers(page); + + if (page->index == size >> PAGE_SHIFT) + len = size & ~PAGE_MASK; + else + len = PAGE_SIZE; + ret = ext4_walk_page_buffers(handle, inode, page_bufs, 
0, len, NULL, do_journal_get_write_access); @@ -1935,9 +1923,6 @@ static int __ext4_journalled_writepage(struct page *page, out: unlock_page(page); out_no_pagelock: - if (!inline_data && page_bufs) - ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, - NULL, bput_one); brelse(inode_bh); return ret; } @@ -2257,7 +2242,6 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, mpd->map.m_len = 0; mpd->map.m_flags = 0; io_end_vec->size += io_end_size; - io_end_size = 0; err = mpage_process_page_bufs(mpd, head, bh, lblk); if (err > 0) @@ -2282,7 +2266,6 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, } while (lblk++, (bh = bh->b_this_page) != head); io_end_vec->size += io_end_size; - io_end_size = 0; *map_bh = false; out: *m_lblk = lblk; @@ -3271,7 +3254,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, struct ext4_map_blocks *map, loff_t offset, - loff_t length) + loff_t length, unsigned int flags) { u8 blkbits = inode->i_blkbits; @@ -3288,8 +3271,10 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, if (map->m_flags & EXT4_MAP_NEW) iomap->flags |= IOMAP_F_NEW; - iomap->bdev = inode->i_sb->s_bdev; - iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; + if (flags & IOMAP_DAX) + iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; + else + iomap->bdev = inode->i_sb->s_bdev; iomap->offset = (u64) map->m_lblk << blkbits; iomap->length = (u64) map->m_len << blkbits; @@ -3309,9 +3294,13 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, if (map->m_flags & EXT4_MAP_UNWRITTEN) { iomap->type = IOMAP_UNWRITTEN; iomap->addr = (u64) map->m_pblk << blkbits; + if (flags & IOMAP_DAX) + iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; } else if (map->m_flags & EXT4_MAP_MAPPED) { iomap->type = IOMAP_MAPPED; iomap->addr = (u64) map->m_pblk << blkbits; + if (flags & IOMAP_DAX) + iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; } else { iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; @@ -3348,8 +3337,8 @@ retry: * DAX and direct I/O are the only two operations that are currently * supported with IOMAP_WRITE. 
*/ - WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT)); - if (IS_DAX(inode)) + WARN_ON(!(flags & (IOMAP_DAX | IOMAP_DIRECT))); + if (flags & IOMAP_DAX) m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; /* * We use i_size instead of i_disksize here because delalloc writeback @@ -3420,7 +3409,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (ret < 0) return ret; out: - ext4_set_iomap(inode, iomap, &map, offset, length); + ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; } @@ -3540,7 +3529,7 @@ static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, delalloc = ext4_iomap_is_delalloc(inode, &map); set_iomap: - ext4_set_iomap(inode, iomap, &map, offset, length); + ext4_set_iomap(inode, iomap, &map, offset, length, flags); if (delalloc && iomap->type == IOMAP_HOLE) iomap->type = IOMAP_DELALLOC; @@ -3780,8 +3769,8 @@ static int ext4_block_zero_page_range(handle_t *handle, length = max; if (IS_DAX(inode)) { - return iomap_zero_range(inode, from, length, NULL, - &ext4_iomap_ops); + return dax_zero_range(inode, from, length, NULL, + &ext4_iomap_ops); } return __ext4_block_zero_page_range(handle, mapping, from, length); } @@ -3990,7 +3979,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) } - /* Wait all existing dio workers, newcomers will block on i_mutex */ + /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); /* @@ -4140,7 +4129,7 @@ int ext4_truncate(struct inode *inode) /* * There is a possibility that we're either freeing the inode * or it's a completely new inode. In those cases we might not - * have i_mutex locked because it's not necessary. + * have i_rwsem locked because it's not necessary. */ if (!(inode->i_state & (I_NEW|I_FREEING))) WARN_ON(!inode_is_locked(inode)); @@ -4523,7 +4512,7 @@ has_buffer: static int __ext4_get_inode_loc_noinmem(struct inode *inode, struct ext4_iloc *iloc) { - ext4_fsblk_t err_blk; + ext4_fsblk_t err_blk = 0; int ret; ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc, @@ -4538,7 +4527,7 @@ static int __ext4_get_inode_loc_noinmem(struct inode *inode, int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) { - ext4_fsblk_t err_blk; + ext4_fsblk_t err_blk = 0; int ret; ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc, @@ -5282,7 +5271,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) * transaction are already on disk (truncate waits for pages under * writeback). * - * Called with inode->i_mutex down. + * Called with inode->i_rwsem down. 
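The ext4_set_iomap() change above picks the target device from the iomap flags: with IOMAP_DAX the mapping is expressed against the DAX device and shifted by the partition offset (s_dax_part_off), otherwise it targets the block device. A toy userspace model of that branch (types, the flag value and the offset are invented for illustration):

#include <stdint.h>
#include <stdio.h>

#define MAP_DAX 0x1	/* stand-in for IOMAP_DAX */

struct toy_iomap {
	const char *device;	/* "bdev" or "dax_dev" */
	uint64_t addr;		/* byte address on that device */
};

static void set_iomap(struct toy_iomap *m, uint64_t pblk, unsigned int blkbits,
		      unsigned int flags, uint64_t dax_part_off)
{
	m->addr = pblk << blkbits;
	if (flags & MAP_DAX) {
		m->device = "dax_dev";
		m->addr += dax_part_off;	/* DAX addresses are whole-device relative */
	} else {
		m->device = "bdev";
	}
}

int main(void)
{
	struct toy_iomap m;

	set_iomap(&m, 100, 12, 0, 1 << 20);
	printf("%s @ %llu\n", m.device, (unsigned long long)m.addr);
	set_iomap(&m, 100, 12, MAP_DAX, 1 << 20);
	printf("%s @ %llu\n", m.device, (unsigned long long)m.addr);
	return 0;
}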
*/ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *attr) @@ -5320,7 +5309,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (error) return error; } - ext4_fc_start_update(inode); + if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { handle_t *handle; @@ -5344,7 +5333,6 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (error) { ext4_journal_stop(handle); - ext4_fc_stop_update(inode); return error; } /* Update corresponding info in inode so that everything is in @@ -5356,7 +5344,6 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, error = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); if (unlikely(error)) { - ext4_fc_stop_update(inode); return error; } } @@ -5370,12 +5357,10 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (attr->ia_size > sbi->s_bitmap_maxbytes) { - ext4_fc_stop_update(inode); return -EFBIG; } } if (!S_ISREG(inode->i_mode)) { - ext4_fc_stop_update(inode); return -EINVAL; } @@ -5427,8 +5412,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, ext4_fc_track_range(handle, inode, (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> inode->i_sb->s_blocksize_bits, - (oldsize > 0 ? oldsize - 1 : 0) >> - inode->i_sb->s_blocksize_bits); + EXT_MAX_BLOCKS - 1); else ext4_fc_track_range( handle, inode, @@ -5499,7 +5483,6 @@ err_out: ext4_std_error(inode->i_sb, error); if (!error) error = rc; - ext4_fc_stop_update(inode); return error; } @@ -6000,7 +5983,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return PTR_ERR(handle); ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_JOURNAL_FLAG_CHANGE); + EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle); err = ext4_mark_inode_dirty(handle, inode); ext4_handle_sync(handle); ext4_journal_stop(handle); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 606dee9e08a3..a8022c2c6a58 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -27,6 +27,248 @@ #include "fsmap.h" #include <trace/events/ext4.h> +typedef void ext4_update_sb_callback(struct ext4_super_block *es, + const void *arg); + +/* + * Superblock modification callback function for changing file system + * label + */ +static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg) +{ + /* Sanity check, this should never happen */ + BUILD_BUG_ON(sizeof(es->s_volume_name) < EXT4_LABEL_MAX); + + memcpy(es->s_volume_name, (char *)arg, EXT4_LABEL_MAX); +} + +static +int ext4_update_primary_sb(struct super_block *sb, handle_t *handle, + ext4_update_sb_callback func, + const void *arg) +{ + int err = 0; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *bh = sbi->s_sbh; + struct ext4_super_block *es = sbi->s_es; + + trace_ext4_update_sb(sb, bh->b_blocknr, 1); + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, + bh, + EXT4_JTR_NONE); + if (err) + goto out_err; + + lock_buffer(bh); + func(es, arg); + ext4_superblock_csum_set(sb); + unlock_buffer(bh); + + if (buffer_write_io_error(bh) || !buffer_uptodate(bh)) { + ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to " + "superblock detected"); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out_err; + err = sync_dirty_buffer(bh); +out_err: + 
ext4_std_error(sb, err); + return err; +} + +/* + * Update one backup superblock in the group 'grp' using the callback + * function 'func' and argument 'arg'. If the handle is NULL the + * modification is not journalled. + * + * Returns: 0 when no modification was done (no superblock in the group) + * 1 when the modification was successful + * <0 on error + */ +static int ext4_update_backup_sb(struct super_block *sb, + handle_t *handle, ext4_group_t grp, + ext4_update_sb_callback func, const void *arg) +{ + int err = 0; + ext4_fsblk_t sb_block; + struct buffer_head *bh; + unsigned long offset = 0; + struct ext4_super_block *es; + + if (!ext4_bg_has_super(sb, grp)) + return 0; + + /* + * For the group 0 there is always 1k padding, so we have + * either adjust offset, or sb_block depending on blocksize + */ + if (grp == 0) { + sb_block = 1 * EXT4_MIN_BLOCK_SIZE; + offset = do_div(sb_block, sb->s_blocksize); + } else { + sb_block = ext4_group_first_block_no(sb, grp); + offset = 0; + } + + trace_ext4_update_sb(sb, sb_block, handle ? 1 : 0); + + bh = ext4_sb_bread(sb, sb_block, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + + if (handle) { + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, + bh, + EXT4_JTR_NONE); + if (err) + goto out_bh; + } + + es = (struct ext4_super_block *) (bh->b_data + offset); + lock_buffer(bh); + if (ext4_has_metadata_csum(sb) && + es->s_checksum != ext4_superblock_csum(sb, es)) { + ext4_msg(sb, KERN_ERR, "Invalid checksum for backup " + "superblock %llu\n", sb_block); + unlock_buffer(bh); + err = -EFSBADCRC; + goto out_bh; + } + func(es, arg); + if (ext4_has_metadata_csum(sb)) + es->s_checksum = ext4_superblock_csum(sb, es); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + if (err) + goto out_bh; + + if (handle) { + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out_bh; + } else { + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + } + err = sync_dirty_buffer(bh); + +out_bh: + brelse(bh); + ext4_std_error(sb, err); + return (err) ? err : 1; +} + +/* + * Update primary and backup superblocks using the provided function + * func and argument arg. + * + * Only the primary superblock and at most two backup superblock + * modifications are journalled; the rest is modified without journal. + * This is safe because e2fsck will re-write them if there is a problem, + * and we're very unlikely to ever need more than two backups. + */ +static +int ext4_update_superblocks_fn(struct super_block *sb, + ext4_update_sb_callback func, + const void *arg) +{ + handle_t *handle; + ext4_group_t ngroups; + unsigned int three = 1; + unsigned int five = 5; + unsigned int seven = 7; + int err = 0, ret, i; + ext4_group_t grp, primary_grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* + * We can't update superblocks while the online resize is running + */ + if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING, + &sbi->s_ext4_flags)) { + ext4_msg(sb, KERN_ERR, "Can't modify superblock while" + "performing online resize"); + return -EBUSY; + } + + /* + * We're only going to update primary superblock and two + * backup superblocks in this transaction. 
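The backup-superblock helper above special-cases group 0 because the superblock always begins 1024 bytes into the device: with a 1k block size that is block 1 at offset 0, with larger block sizes it is block 0 at offset 1024. A tiny standalone check of that calculation (mirroring the do_div() above; EXT4_MIN_BLOCK_SIZE is 1024):

#include <stdio.h>

int main(void)
{
	unsigned int blocksizes[] = { 1024, 2048, 4096 };

	for (int i = 0; i < 3; i++) {
		unsigned long long sb_bytes = 1024;	/* EXT4_MIN_BLOCK_SIZE */
		unsigned long long sb_block = sb_bytes / blocksizes[i];
		unsigned long offset = sb_bytes % blocksizes[i];

		printf("blocksize %u: block %llu, offset %lu\n",
		       blocksizes[i], sb_block, offset);
	}
	return 0;
}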
+ */ + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 3); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out; + } + + /* Update primary superblock */ + err = ext4_update_primary_sb(sb, handle, func, arg); + if (err) { + ext4_msg(sb, KERN_ERR, "Failed to update primary " + "superblock"); + goto out_journal; + } + + primary_grp = ext4_get_group_number(sb, sbi->s_sbh->b_blocknr); + ngroups = ext4_get_groups_count(sb); + + /* + * Update backup superblocks. We have to start from group 0 + * because it might not be where the primary superblock is + * if the fs is mounted with -o sb=<backup_sb_block> + */ + i = 0; + grp = 0; + while (grp < ngroups) { + /* Skip primary superblock */ + if (grp == primary_grp) + goto next_grp; + + ret = ext4_update_backup_sb(sb, handle, grp, func, arg); + if (ret < 0) { + /* Ignore bad checksum; try to update next sb */ + if (ret == -EFSBADCRC) + goto next_grp; + err = ret; + goto out_journal; + } + + i += ret; + if (handle && i > 1) { + /* + * We're only journalling primary superblock and + * two backup superblocks; the rest is not + * journalled. + */ + err = ext4_journal_stop(handle); + if (err) + goto out; + handle = NULL; + } +next_grp: + grp = ext4_list_backups(sb, &three, &five, &seven); + } + +out_journal: + if (handle) { + ret = ext4_journal_stop(handle); + if (ret && !err) + err = ret; + } +out: + clear_bit_unlock(EXT4_FLAGS_RESIZING, &sbi->s_ext4_flags); + smp_mb__after_atomic(); + return err ? err : 0; +} + /** * Swap memory between @a and @b for @len bytes. * @@ -169,7 +411,7 @@ static long swap_inode_boot_loader(struct super_block *sb, err = -EINVAL; goto err_out; } - ext4_fc_start_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT, handle); /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(inode, inode_bl); @@ -252,7 +494,6 @@ revert: err_out1: ext4_journal_stop(handle); - ext4_fc_stop_ineligible(sb); ext4_double_up_write_data_sem(inode, inode_bl); err_out: @@ -743,7 +984,6 @@ int ext4_fileattr_set(struct user_namespace *mnt_userns, u32 flags = fa->flags; int err = -EOPNOTSUPP; - ext4_fc_start_update(inode); if (flags & ~EXT4_FL_USER_VISIBLE) goto out; @@ -764,7 +1004,6 @@ int ext4_fileattr_set(struct user_namespace *mnt_userns, goto out; err = ext4_ioctl_setproject(inode, fa->fsx_projid); out: - ext4_fc_stop_update(inode); return err; } @@ -850,6 +1089,64 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg) return err; } +static int ext4_ioctl_setlabel(struct file *filp, const char __user *user_label) +{ + size_t len; + int ret = 0; + char new_label[EXT4_LABEL_MAX + 1]; + struct super_block *sb = file_inode(filp)->i_sb; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * Copy the maximum length allowed for ext4 label with one more to + * find the required terminating null byte in order to test the + * label length. The on disk label doesn't need to be null terminated. 
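The new ext4 handlers for FS_IOC_GETFSLABEL / FS_IOC_SETFSLABEL added in this ioctl.c change can be exercised with the generic label ioctls from <linux/fs.h>. A minimal usage sketch; the mount point is hypothetical and setting the label requires CAP_SYS_ADMIN:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>	/* FS_IOC_{GET,SET}FSLABEL, FSLABEL_MAX */

int main(void)
{
	char label[FSLABEL_MAX] = { 0 };
	int fd = open("/mnt/ext4", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, FS_IOC_GETFSLABEL, label) == 0)
		printf("current label: \"%s\"\n", label);

	strncpy(label, "scratch0", sizeof(label) - 1);
	if (ioctl(fd, FS_IOC_SETFSLABEL, label) != 0)
		perror("FS_IOC_SETFSLABEL");

	close(fd);
	return 0;
}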
+ */ + if (copy_from_user(new_label, user_label, EXT4_LABEL_MAX + 1)) + return -EFAULT; + + len = strnlen(new_label, EXT4_LABEL_MAX + 1); + if (len > EXT4_LABEL_MAX) + return -EINVAL; + + /* + * Clear the buffer after the new label + */ + memset(new_label + len, 0, EXT4_LABEL_MAX - len); + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = ext4_update_superblocks_fn(sb, ext4_sb_setlabel, new_label); + + mnt_drop_write_file(filp); + return ret; +} + +static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label) +{ + char label[EXT4_LABEL_MAX + 1]; + + /* + * EXT4_LABEL_MAX must always be smaller than FSLABEL_MAX because + * FSLABEL_MAX must include terminating null byte, while s_volume_name + * does not have to. + */ + BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX); + + memset(label, 0, sizeof(label)); + lock_buffer(sbi->s_sbh); + strncpy(label, sbi->s_es->s_volume_name, EXT4_LABEL_MAX); + unlock_buffer(sbi->s_sbh); + + if (copy_to_user(user_label, label, sizeof(label))) + return -EFAULT; + return 0; +} + static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1076,7 +1373,7 @@ mext_out: err = ext4_resize_fs(sb, n_blocks_count); if (EXT4_SB(sb)->s_journal) { - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, NULL); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); @@ -1117,8 +1414,6 @@ resizefs_out: sizeof(range))) return -EFAULT; - range.minlen = max((unsigned int)range.minlen, - q->limits.discard_granularity); ret = ext4_trim_fs(sb, &range); if (ret < 0) return ret; @@ -1266,6 +1561,13 @@ resizefs_out: case EXT4_IOC_CHECKPOINT: return ext4_ioctl_checkpoint(filp, arg); + case FS_IOC_GETFSLABEL: + return ext4_ioctl_getlabel(EXT4_SB(sb), (void __user *)arg); + + case FS_IOC_SETFSLABEL: + return ext4_ioctl_setlabel(filp, + (const void __user *)arg); + default: return -ENOTTY; } @@ -1273,13 +1575,7 @@ resizefs_out: long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - long ret; - - ext4_fc_start_update(file_inode(filp)); - ret = __ext4_ioctl(filp, cmd, arg); - ext4_fc_stop_update(file_inode(filp)); - - return ret; + return __ext4_ioctl(filp, cmd, arg); } #ifdef CONFIG_COMPAT @@ -1347,6 +1643,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_GETSTATE: case EXT4_IOC_GET_ES_CACHE: case EXT4_IOC_CHECKPOINT: + case FS_IOC_GETFSLABEL: + case FS_IOC_SETFSLABEL: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 215b7068f548..67ac95c4cd9b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2834,7 +2834,7 @@ out: static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group; if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) @@ -2845,7 +2845,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group; ++*pos; @@ -2857,7 +2857,7 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) static int 
ext4_mb_seq_groups_show(struct seq_file *seq, void *v) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group = (ext4_group_t) ((unsigned long) v); int i; int err, buddy_loaded = 0; @@ -2985,7 +2985,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) __acquires(&EXT4_SB(sb)->s_mb_rb_lock) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; read_lock(&EXT4_SB(sb)->s_mb_rb_lock); @@ -2998,7 +2998,7 @@ __acquires(&EXT4_SB(sb)->s_mb_rb_lock) static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; ++*pos; @@ -3010,7 +3010,7 @@ static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, lof static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned long position = ((unsigned long) v); struct ext4_group_info *grp; @@ -3058,7 +3058,7 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) __releases(&EXT4_SB(sb)->s_mb_rb_lock) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); read_unlock(&EXT4_SB(sb)->s_mb_rb_lock); } @@ -4814,7 +4814,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, */ static noinline_for_stack int ext4_mb_discard_group_preallocations(struct super_block *sb, - ext4_group_t group, int needed) + ext4_group_t group, int *busy) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct buffer_head *bitmap_bh = NULL; @@ -4822,8 +4822,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, struct list_head list; struct ext4_buddy e4b; int err; - int busy = 0; - int free, free_total = 0; + int free = 0; mb_debug(sb, "discard preallocation for group %u\n", group); if (list_empty(&grp->bb_prealloc_list)) @@ -4846,19 +4845,14 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, goto out_dbg; } - if (needed == 0) - needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; - INIT_LIST_HEAD(&list); -repeat: - free = 0; ext4_lock_group(sb, group); list_for_each_entry_safe(pa, tmp, &grp->bb_prealloc_list, pa_group_list) { spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { spin_unlock(&pa->pa_lock); - busy = 1; + *busy = 1; continue; } if (pa->pa_deleted) { @@ -4898,22 +4892,13 @@ repeat: call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } - free_total += free; - - /* if we still need more blocks and some PAs were used, try again */ - if (free_total < needed && busy) { - ext4_unlock_group(sb, group); - cond_resched(); - busy = 0; - goto repeat; - } ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); out_dbg: mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n", - free_total, group, grp->bb_free); - return free_total; + free, group, grp->bb_free); + return free; } /* @@ -5455,13 +5440,24 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) { ext4_group_t i, ngroups = 
ext4_get_groups_count(sb); int ret; - int freed = 0; + int freed = 0, busy = 0; + int retry = 0; trace_ext4_mb_discard_preallocations(sb, needed); + + if (needed == 0) + needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; + repeat: for (i = 0; i < ngroups && needed > 0; i++) { - ret = ext4_mb_discard_group_preallocations(sb, i, needed); + ret = ext4_mb_discard_group_preallocations(sb, i, &busy); freed += ret; needed -= ret; + cond_resched(); + } + + if (needed > 0 && busy && ++retry < 3) { + busy = 0; + goto repeat; } return freed; @@ -5757,7 +5753,8 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, struct super_block *sb = ar->inode->i_sb; ext4_group_t group; ext4_grpblk_t blkoff; - int i = sb->s_blocksize; + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_grpblk_t i = 0; ext4_fsblk_t goal, block; struct ext4_super_block *es = EXT4_SB(sb)->s_es; @@ -5779,19 +5776,26 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, ext4_get_group_no_and_offset(sb, max(ext4_group_first_block_no(sb, group), goal), NULL, &blkoff); - i = mb_find_next_zero_bit(bitmap_bh->b_data, sb->s_blocksize, + while (1) { + i = mb_find_next_zero_bit(bitmap_bh->b_data, max, blkoff); + if (i >= max) + break; + if (ext4_fc_replay_check_excluded(sb, + ext4_group_first_block_no(sb, group) + i)) { + blkoff = i + 1; + } else + break; + } brelse(bitmap_bh); - if (i >= sb->s_blocksize) - continue; - if (ext4_fc_replay_check_excluded(sb, - ext4_group_first_block_no(sb, group) + i)) - continue; - break; + if (i < max) + break; } - if (group >= ext4_get_groups_count(sb) && i >= sb->s_blocksize) + if (group >= ext4_get_groups_count(sb) || i >= max) { + *errp = -ENOSPC; return 0; + } block = ext4_group_first_block_no(sb, group) + i; ext4_mb_mark_bb(sb, block, 1, 1); @@ -6373,7 +6377,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_lock_group(sb, group); if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || - minblocks < atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) { + minblocks < EXT4_SB(sb)->s_last_trim_minblks) { ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); if (ret >= 0) EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); @@ -6404,6 +6408,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, */ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) { + struct request_queue *q = bdev_get_queue(sb->s_bdev); struct ext4_group_info *grp; ext4_group_t group, first_group, last_group; ext4_grpblk_t cnt = 0, first_cluster, last_cluster; @@ -6422,6 +6427,13 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) start >= max_blks || range->len < sb->s_blocksize) return -EINVAL; + /* No point to try to trim less than discard granularity */ + if (range->minlen < q->limits.discard_granularity) { + minlen = EXT4_NUM_B2C(EXT4_SB(sb), + q->limits.discard_granularity >> sb->s_blocksize_bits); + if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) + goto out; + } if (end >= max_blks) end = max_blks - 1; if (end <= first_data_blk) @@ -6474,7 +6486,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) } if (!ret) - atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); + EXT4_SB(sb)->s_last_trim_minblks = minlen; out: range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 7e0b4f81c6c0..7a5353a8cfd7 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -437,12 +437,12 @@ int ext4_ext_migrate(struct inode *inode) percpu_down_write(&sbi->s_writepages_rwsem); /* - * Worst case 
we can touch the allocation bitmaps, a bgd - * block, and a block to link in the orphan list. We do need - * need to worry about credits for modifying the quota inode. + * Worst case we can touch the allocation bitmaps and a block + * group descriptor block. We do need need to worry about + * credits for modifying the quota inode. */ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, - 4 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); + 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { retval = PTR_ERR(handle); @@ -459,6 +459,13 @@ int ext4_ext_migrate(struct inode *inode) ext4_journal_stop(handle); goto out_unlock; } + /* + * Use the correct seed for checksum (i.e. the seed from 'inode'). This + * is so that the metadata blocks will have the correct checksum after + * the migration. + */ + ei = EXT4_I(inode); + EXT4_I(tmp_inode)->i_csum_seed = ei->i_csum_seed; i_size_write(tmp_inode, i_size_read(inode)); /* * Set the i_nlink to zero so it will be deleted later @@ -467,7 +474,6 @@ int ext4_ext_migrate(struct inode *inode) clear_nlink(tmp_inode); ext4_ext_tree_init(handle, tmp_inode); - ext4_orphan_add(handle, tmp_inode); ext4_journal_stop(handle); /* @@ -479,7 +485,7 @@ int ext4_ext_migrate(struct inode *inode) * when we add extents we extent the journal */ /* - * Even though we take i_mutex we can still cause block + * Even though we take i_rwsem we can still cause block * allocation via mmap write to holes. If we have allocated * new blocks we fail migrate. New block allocation will * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated @@ -492,17 +498,10 @@ int ext4_ext_migrate(struct inode *inode) handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) { - /* - * It is impossible to update on-disk structures without - * a handle, so just rollback in-core changes and live other - * work to orphan_list_cleanup() - */ - ext4_orphan_del(NULL, tmp_inode); retval = PTR_ERR(handle); goto out_tmp_inode; } - ei = EXT4_I(inode); i_data = ei->i_data; memset(&lb, 0, sizeof(lb)); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 64a579734f93..95aa212f0863 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -632,7 +632,6 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, /* Check hole before the start pos */ if (cur_blk + cur_len - 1 < o_start) { if (next_blk == EXT_MAX_BLOCKS) { - o_start = o_end; ret = -ENODATA; goto out; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 52c9bd154122..8cf0a924a49b 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1317,7 +1317,7 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) dx_set_count(entries, count + 1); } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* * Test whether a case-insensitive directory entry matches the filename * being searched for. 
If quick is set, assume the name being looked up @@ -1428,7 +1428,7 @@ static bool ext4_match(struct inode *parent, f.crypto_buf = fname->crypto_buf; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent) && (!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) { if (fname->cf_name.name) { @@ -1800,7 +1800,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi } } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (!inode && IS_CASEFOLDED(dir)) { /* Eventually we want to call d_add_ci(dentry, NULL) * for negative dentries in the encoding case as @@ -2308,7 +2308,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) && sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name)) return -EINVAL; @@ -3126,7 +3126,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) ext4_fc_track_unlink(handle, dentry); retval = ext4_mark_inode_dirty(handle, dir); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the @@ -3231,7 +3231,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) retval = __ext4_unlink(handle, dir, &dentry->d_name, d_inode(dentry)); if (!retval) ext4_fc_track_unlink(handle, dentry); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the @@ -3889,7 +3889,7 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, * dirents in directories. */ ext4_fc_mark_ineligible(old.inode->i_sb, - EXT4_FC_REASON_RENAME_DIR); + EXT4_FC_REASON_RENAME_DIR, handle); } else { if (new.inode) ext4_fc_track_unlink(handle, new.dentry); @@ -4049,7 +4049,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(retval)) goto end_rename; ext4_fc_mark_ineligible(new.inode->i_sb, - EXT4_FC_REASON_CROSS_RENAME); + EXT4_FC_REASON_CROSS_RENAME, handle); if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); if (retval) diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index 53adc8f570a3..7de0612eb42d 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -93,7 +93,7 @@ static int ext4_orphan_file_add(handle_t *handle, struct inode *inode) * At filesystem recovery time, we walk this list deleting unlinked * inodes and truncating linked inodes in ext4_orphan_cleanup(). * - * Orphan list manipulation functions must be called under i_mutex unless + * Orphan list manipulation functions must be called under i_rwsem unless * we are just creating the inode or deleting it. */ int ext4_orphan_add(handle_t *handle, struct inode *inode) @@ -119,7 +119,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) /* * Orphan handling is only valid for files with data blocks * being truncated, or files being unlinked. 
Note that we either - * hold i_mutex, or the inode can not be referenced from outside, + * hold i_rwsem, or the inode can not be referenced from outside, * so i_nlink should not be bumped due to race */ ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 9cb261714991..1d370364230e 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -24,7 +24,7 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/mm.h> -#include <linux/backing-dev.h> +#include <linux/sched/mm.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -523,12 +523,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io, ret = PTR_ERR(bounce_page); if (ret == -ENOMEM && (io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) { - gfp_flags = GFP_NOFS; + gfp_t new_gfp_flags = GFP_NOFS; if (io->io_bio) ext4_io_submit(io); else - gfp_flags |= __GFP_NOFAIL; - congestion_wait(BLK_RW_ASYNC, HZ/50); + new_gfp_flags |= __GFP_NOFAIL; + memalloc_retry_wait(gfp_flags); + gfp_flags = new_gfp_flags; goto retry_encrypt; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 3db923403505..4cd62f1d848c 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -43,7 +43,6 @@ #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/pagevec.h> -#include <linux/cleancache.h> #include "ext4.h" @@ -350,11 +349,6 @@ int ext4_mpage_readpages(struct inode *inode, } else if (fully_mapped) { SetPageMappedToDisk(page); } - if (fully_mapped && blocks_per_page == 1 && - !PageUptodate(page) && cleancache_get_page(page) == 0) { - SetPageUptodate(page); - goto confused; - } /* * This page will go to BIO. Do we need to send this diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b63cb88ccdae..ee8f02f406cb 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -717,12 +717,23 @@ out: * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ... * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ... 
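The comment above describes which block groups carry backup superblocks on a sparse_super filesystem: group 1 plus powers of 3, 5 and 7. A small standalone version of that walk (same idea as ext4_list_backups(), ignoring the sparse_super2 and non-sparse cases handled in the hunk below) prints 1 3 5 7 9 25 27 49 81:

#include <stdio.h>

static unsigned int next_backup(unsigned int *three, unsigned int *five,
				unsigned int *seven)
{
	unsigned int *min = three;
	unsigned int mult = 3, ret;

	if (*five < *min) {
		min = five;
		mult = 5;
	}
	if (*seven < *min) {
		min = seven;
		mult = 7;
	}
	ret = *min;
	*min *= mult;
	return ret;
}

int main(void)
{
	unsigned int three = 1, five = 5, seven = 7;

	for (int i = 0; i < 9; i++)
		printf("%u ", next_backup(&three, &five, &seven));
	printf("\n");	/* 1 3 5 7 9 25 27 49 81 */
	return 0;
}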
*/ -static unsigned ext4_list_backups(struct super_block *sb, unsigned *three, - unsigned *five, unsigned *seven) +unsigned int ext4_list_backups(struct super_block *sb, unsigned int *three, + unsigned int *five, unsigned int *seven) { - unsigned *min = three; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + unsigned int *min = three; int mult = 3; - unsigned ret; + unsigned int ret; + + if (ext4_has_feature_sparse_super2(sb)) { + do { + if (*min > 2) + return UINT_MAX; + ret = le32_to_cpu(es->s_backup_bgs[*min - 1]); + *min += 1; + } while (!ret); + return ret; + } if (!ext4_has_feature_sparse_super(sb)) { ret = *min; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4e33b5eca694..c5021ca0a28a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -39,7 +39,6 @@ #include <linux/log2.h> #include <linux/crc16.h> #include <linux/dax.h> -#include <linux/cleancache.h> #include <linux/uaccess.h> #include <linux/iversion.h> #include <linux/unicode.h> @@ -47,6 +46,8 @@ #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/fsnotify.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include "ext4.h" #include "ext4_extents.h" /* Needed for trace points definition */ @@ -73,12 +74,9 @@ static int ext4_mark_recovery_complete(struct super_block *sb, static int ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es); static int ext4_sync_fs(struct super_block *sb, int wait); -static int ext4_remount(struct super_block *sb, int *flags, char *data); static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); static int ext4_unfreeze(struct super_block *sb); static int ext4_freeze(struct super_block *sb); -static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data); static inline int ext2_feature_set_ok(struct super_block *sb); static inline int ext3_feature_set_ok(struct super_block *sb); static void ext4_destroy_lazyinit_thread(void); @@ -86,6 +84,16 @@ static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); static struct inode *ext4_get_journal_inode(struct super_block *sb, unsigned int journal_inum); +static int ext4_validate_options(struct fs_context *fc); +static int ext4_check_opt_consistency(struct fs_context *fc, + struct super_block *sb); +static int ext4_apply_options(struct fs_context *fc, struct super_block *sb); +static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param); +static int ext4_get_tree(struct fs_context *fc); +static int ext4_reconfigure(struct fs_context *fc); +static void ext4_fc_free(struct fs_context *fc); +static int ext4_init_fs_context(struct fs_context *fc); +static const struct fs_parameter_spec ext4_param_specs[]; /* * Lock ordering @@ -113,13 +121,22 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, * transaction start -> page lock(s) -> i_data_sem (rw) */ +static const struct fs_context_operations ext4_context_ops = { + .parse_param = ext4_parse_param, + .get_tree = ext4_get_tree, + .reconfigure = ext4_reconfigure, + .free = ext4_fc_free, +}; + + #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static struct file_system_type ext2_fs_type = { - .owner = THIS_MODULE, - .name = "ext2", - .mount = ext4_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .owner = THIS_MODULE, + .name = "ext2", + .init_fs_context = ext4_init_fs_context, + .parameters = ext4_param_specs, + .kill_sb = 
kill_block_super, + .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext2"); MODULE_ALIAS("ext2"); @@ -130,11 +147,12 @@ MODULE_ALIAS("ext2"); static struct file_system_type ext3_fs_type = { - .owner = THIS_MODULE, - .name = "ext3", - .mount = ext4_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .owner = THIS_MODULE, + .name = "ext3", + .init_fs_context = ext4_init_fs_context, + .parameters = ext4_param_specs, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext3"); MODULE_ALIAS("ext3"); @@ -260,8 +278,8 @@ static int ext4_verify_csum_type(struct super_block *sb, return es->s_checksum_type == EXT4_CRC32C_CHKSUM; } -static __le32 ext4_superblock_csum(struct super_block *sb, - struct ext4_super_block *es) +__le32 ext4_superblock_csum(struct super_block *sb, + struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); int offset = offsetof(struct ext4_super_block, s_checksum); @@ -912,14 +930,20 @@ void __ext4_msg(struct super_block *sb, struct va_format vaf; va_list args; - atomic_inc(&EXT4_SB(sb)->s_msg_count); - if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs")) - return; + if (sb) { + atomic_inc(&EXT4_SB(sb)->s_msg_count); + if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), + "EXT4-fs")) + return; + } va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); + if (sb) + printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); + else + printk("%sEXT4-fs: %pV\n", prefix, &vaf); va_end(args); } @@ -1277,7 +1301,7 @@ static void ext4_put_super(struct super_block *sb) kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev); fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif kfree(sbi); @@ -1647,7 +1671,6 @@ static const struct super_operations ext4_sops = { .freeze_fs = ext4_freeze, .unfreeze_fs = ext4_unfreeze, .statfs = ext4_statfs, - .remount_fs = ext4_remount, .show_options = ext4_show_options, #ifdef CONFIG_QUOTA .quota_read = ext4_quota_read, @@ -1665,7 +1688,7 @@ static const struct export_operations ext4_export_ops = { enum { Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, - Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, + Opt_resgid, Opt_resuid, Opt_sb, Opt_nouid32, Opt_debug, Opt_removed, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, @@ -1674,152 +1697,169 @@ enum { Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption, Opt_inlinecrypt, - Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, + Opt_usrjquota, Opt_grpjquota, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, - Opt_nowarn_on_error, Opt_mblk_io_submit, - Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize, + Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_debug_want_extra_isize, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, 
Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan, + Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type, #ifdef CONFIG_EXT4_DEBUG Opt_fc_debug_max_replay, Opt_fc_debug_force #endif }; -static const match_table_t tokens = { - {Opt_bsd_df, "bsddf"}, - {Opt_minix_df, "minixdf"}, - {Opt_grpid, "grpid"}, - {Opt_grpid, "bsdgroups"}, - {Opt_nogrpid, "nogrpid"}, - {Opt_nogrpid, "sysvgroups"}, - {Opt_resgid, "resgid=%u"}, - {Opt_resuid, "resuid=%u"}, - {Opt_sb, "sb=%u"}, - {Opt_err_cont, "errors=continue"}, - {Opt_err_panic, "errors=panic"}, - {Opt_err_ro, "errors=remount-ro"}, - {Opt_nouid32, "nouid32"}, - {Opt_debug, "debug"}, - {Opt_removed, "oldalloc"}, - {Opt_removed, "orlov"}, - {Opt_user_xattr, "user_xattr"}, - {Opt_nouser_xattr, "nouser_xattr"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_noload, "norecovery"}, - {Opt_noload, "noload"}, - {Opt_removed, "nobh"}, - {Opt_removed, "bh"}, - {Opt_commit, "commit=%u"}, - {Opt_min_batch_time, "min_batch_time=%u"}, - {Opt_max_batch_time, "max_batch_time=%u"}, - {Opt_journal_dev, "journal_dev=%u"}, - {Opt_journal_path, "journal_path=%s"}, - {Opt_journal_checksum, "journal_checksum"}, - {Opt_nojournal_checksum, "nojournal_checksum"}, - {Opt_journal_async_commit, "journal_async_commit"}, - {Opt_abort, "abort"}, - {Opt_data_journal, "data=journal"}, - {Opt_data_ordered, "data=ordered"}, - {Opt_data_writeback, "data=writeback"}, - {Opt_data_err_abort, "data_err=abort"}, - {Opt_data_err_ignore, "data_err=ignore"}, - {Opt_offusrjquota, "usrjquota="}, - {Opt_usrjquota, "usrjquota=%s"}, - {Opt_offgrpjquota, "grpjquota="}, - {Opt_grpjquota, "grpjquota=%s"}, - {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, - {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, - {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, - {Opt_grpquota, "grpquota"}, - {Opt_noquota, "noquota"}, - {Opt_quota, "quota"}, - {Opt_usrquota, "usrquota"}, - {Opt_prjquota, "prjquota"}, - {Opt_barrier, "barrier=%u"}, - {Opt_barrier, "barrier"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_i_version, "i_version"}, - {Opt_dax, "dax"}, - {Opt_dax_always, "dax=always"}, - {Opt_dax_inode, "dax=inode"}, - {Opt_dax_never, "dax=never"}, - {Opt_stripe, "stripe=%u"}, - {Opt_delalloc, "delalloc"}, - {Opt_warn_on_error, "warn_on_error"}, - {Opt_nowarn_on_error, "nowarn_on_error"}, - {Opt_lazytime, "lazytime"}, - {Opt_nolazytime, "nolazytime"}, - {Opt_debug_want_extra_isize, "debug_want_extra_isize=%u"}, - {Opt_nodelalloc, "nodelalloc"}, - {Opt_removed, "mblk_io_submit"}, - {Opt_removed, "nomblk_io_submit"}, - {Opt_block_validity, "block_validity"}, - {Opt_noblock_validity, "noblock_validity"}, - {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, - {Opt_journal_ioprio, "journal_ioprio=%u"}, - {Opt_auto_da_alloc, "auto_da_alloc=%u"}, - {Opt_auto_da_alloc, "auto_da_alloc"}, - {Opt_noauto_da_alloc, "noauto_da_alloc"}, - {Opt_dioread_nolock, "dioread_nolock"}, - {Opt_dioread_lock, "nodioread_nolock"}, - {Opt_dioread_lock, "dioread_lock"}, - {Opt_discard, "discard"}, - {Opt_nodiscard, "nodiscard"}, - {Opt_init_itable, "init_itable=%u"}, - {Opt_init_itable, "init_itable"}, - {Opt_noinit_itable, "noinit_itable"}, -#ifdef CONFIG_EXT4_DEBUG - {Opt_fc_debug_force, "fc_debug_force"}, - {Opt_fc_debug_max_replay, "fc_debug_max_replay=%u"}, -#endif - {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, - {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, - {Opt_test_dummy_encryption, "test_dummy_encryption"}, - {Opt_inlinecrypt, "inlinecrypt"}, - {Opt_nombcache, "nombcache"}, - {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ - 
{Opt_removed, "prefetch_block_bitmaps"}, - {Opt_no_prefetch_block_bitmaps, "no_prefetch_block_bitmaps"}, - {Opt_mb_optimize_scan, "mb_optimize_scan=%d"}, - {Opt_removed, "check=none"}, /* mount option from ext2/3 */ - {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ - {Opt_removed, "reservation"}, /* mount option from ext2/3 */ - {Opt_removed, "noreservation"}, /* mount option from ext2/3 */ - {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */ - {Opt_err, NULL}, +static const struct constant_table ext4_param_errors[] = { + {"continue", EXT4_MOUNT_ERRORS_CONT}, + {"panic", EXT4_MOUNT_ERRORS_PANIC}, + {"remount-ro", EXT4_MOUNT_ERRORS_RO}, + {} }; -static ext4_fsblk_t get_sb_block(void **data) -{ - ext4_fsblk_t sb_block; - char *options = (char *) *data; +static const struct constant_table ext4_param_data[] = { + {"journal", EXT4_MOUNT_JOURNAL_DATA}, + {"ordered", EXT4_MOUNT_ORDERED_DATA}, + {"writeback", EXT4_MOUNT_WRITEBACK_DATA}, + {} +}; - if (!options || strncmp(options, "sb=", 3) != 0) - return 1; /* Default location */ +static const struct constant_table ext4_param_data_err[] = { + {"abort", Opt_data_err_abort}, + {"ignore", Opt_data_err_ignore}, + {} +}; - options += 3; - /* TODO: use simple_strtoll with >32bit ext4 */ - sb_block = simple_strtoul(options, &options, 0); - if (*options && *options != ',') { - printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n", - (char *) *data); - return 1; - } - if (*options == ',') - options++; - *data = (void *) options; +static const struct constant_table ext4_param_jqfmt[] = { + {"vfsold", QFMT_VFS_OLD}, + {"vfsv0", QFMT_VFS_V0}, + {"vfsv1", QFMT_VFS_V1}, + {} +}; - return sb_block; -} +static const struct constant_table ext4_param_dax[] = { + {"always", Opt_dax_always}, + {"inode", Opt_dax_inode}, + {"never", Opt_dax_never}, + {} +}; + +/* String parameter that allows empty argument */ +#define fsparam_string_empty(NAME, OPT) \ + __fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL) + +/* + * Mount option specification + * We don't use fsparam_flag_no because of the way we set the + * options and the way we show them in _ext4_show_options(). To + * keep the changes to a minimum, let's keep the negative options + * separate for now. 
+ */ +static const struct fs_parameter_spec ext4_param_specs[] = { + fsparam_flag ("bsddf", Opt_bsd_df), + fsparam_flag ("minixdf", Opt_minix_df), + fsparam_flag ("grpid", Opt_grpid), + fsparam_flag ("bsdgroups", Opt_grpid), + fsparam_flag ("nogrpid", Opt_nogrpid), + fsparam_flag ("sysvgroups", Opt_nogrpid), + fsparam_u32 ("resgid", Opt_resgid), + fsparam_u32 ("resuid", Opt_resuid), + fsparam_u32 ("sb", Opt_sb), + fsparam_enum ("errors", Opt_errors, ext4_param_errors), + fsparam_flag ("nouid32", Opt_nouid32), + fsparam_flag ("debug", Opt_debug), + fsparam_flag ("oldalloc", Opt_removed), + fsparam_flag ("orlov", Opt_removed), + fsparam_flag ("user_xattr", Opt_user_xattr), + fsparam_flag ("nouser_xattr", Opt_nouser_xattr), + fsparam_flag ("acl", Opt_acl), + fsparam_flag ("noacl", Opt_noacl), + fsparam_flag ("norecovery", Opt_noload), + fsparam_flag ("noload", Opt_noload), + fsparam_flag ("bh", Opt_removed), + fsparam_flag ("nobh", Opt_removed), + fsparam_u32 ("commit", Opt_commit), + fsparam_u32 ("min_batch_time", Opt_min_batch_time), + fsparam_u32 ("max_batch_time", Opt_max_batch_time), + fsparam_u32 ("journal_dev", Opt_journal_dev), + fsparam_bdev ("journal_path", Opt_journal_path), + fsparam_flag ("journal_checksum", Opt_journal_checksum), + fsparam_flag ("nojournal_checksum", Opt_nojournal_checksum), + fsparam_flag ("journal_async_commit",Opt_journal_async_commit), + fsparam_flag ("abort", Opt_abort), + fsparam_enum ("data", Opt_data, ext4_param_data), + fsparam_enum ("data_err", Opt_data_err, + ext4_param_data_err), + fsparam_string_empty + ("usrjquota", Opt_usrjquota), + fsparam_string_empty + ("grpjquota", Opt_grpjquota), + fsparam_enum ("jqfmt", Opt_jqfmt, ext4_param_jqfmt), + fsparam_flag ("grpquota", Opt_grpquota), + fsparam_flag ("quota", Opt_quota), + fsparam_flag ("noquota", Opt_noquota), + fsparam_flag ("usrquota", Opt_usrquota), + fsparam_flag ("prjquota", Opt_prjquota), + fsparam_flag ("barrier", Opt_barrier), + fsparam_u32 ("barrier", Opt_barrier), + fsparam_flag ("nobarrier", Opt_nobarrier), + fsparam_flag ("i_version", Opt_i_version), + fsparam_flag ("dax", Opt_dax), + fsparam_enum ("dax", Opt_dax_type, ext4_param_dax), + fsparam_u32 ("stripe", Opt_stripe), + fsparam_flag ("delalloc", Opt_delalloc), + fsparam_flag ("nodelalloc", Opt_nodelalloc), + fsparam_flag ("warn_on_error", Opt_warn_on_error), + fsparam_flag ("nowarn_on_error", Opt_nowarn_on_error), + fsparam_u32 ("debug_want_extra_isize", + Opt_debug_want_extra_isize), + fsparam_flag ("mblk_io_submit", Opt_removed), + fsparam_flag ("nomblk_io_submit", Opt_removed), + fsparam_flag ("block_validity", Opt_block_validity), + fsparam_flag ("noblock_validity", Opt_noblock_validity), + fsparam_u32 ("inode_readahead_blks", + Opt_inode_readahead_blks), + fsparam_u32 ("journal_ioprio", Opt_journal_ioprio), + fsparam_u32 ("auto_da_alloc", Opt_auto_da_alloc), + fsparam_flag ("auto_da_alloc", Opt_auto_da_alloc), + fsparam_flag ("noauto_da_alloc", Opt_noauto_da_alloc), + fsparam_flag ("dioread_nolock", Opt_dioread_nolock), + fsparam_flag ("nodioread_nolock", Opt_dioread_lock), + fsparam_flag ("dioread_lock", Opt_dioread_lock), + fsparam_flag ("discard", Opt_discard), + fsparam_flag ("nodiscard", Opt_nodiscard), + fsparam_u32 ("init_itable", Opt_init_itable), + fsparam_flag ("init_itable", Opt_init_itable), + fsparam_flag ("noinit_itable", Opt_noinit_itable), +#ifdef CONFIG_EXT4_DEBUG + fsparam_flag ("fc_debug_force", Opt_fc_debug_force), + fsparam_u32 ("fc_debug_max_replay", Opt_fc_debug_max_replay), +#endif + fsparam_u32 
("max_dir_size_kb", Opt_max_dir_size_kb), + fsparam_flag ("test_dummy_encryption", + Opt_test_dummy_encryption), + fsparam_string ("test_dummy_encryption", + Opt_test_dummy_encryption), + fsparam_flag ("inlinecrypt", Opt_inlinecrypt), + fsparam_flag ("nombcache", Opt_nombcache), + fsparam_flag ("no_mbcache", Opt_nombcache), /* for backward compatibility */ + fsparam_flag ("prefetch_block_bitmaps", + Opt_removed), + fsparam_flag ("no_prefetch_block_bitmaps", + Opt_no_prefetch_block_bitmaps), + fsparam_s32 ("mb_optimize_scan", Opt_mb_optimize_scan), + fsparam_string ("check", Opt_removed), /* mount option from ext2/3 */ + fsparam_flag ("nocheck", Opt_removed), /* mount option from ext2/3 */ + fsparam_flag ("reservation", Opt_removed), /* mount option from ext2/3 */ + fsparam_flag ("noreservation", Opt_removed), /* mount option from ext2/3 */ + fsparam_u32 ("journal", Opt_removed), /* mount option from ext2/3 */ + {} +}; #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) #define DEFAULT_MB_OPTIMIZE_SCAN (-1) @@ -1828,90 +1868,22 @@ static const char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n" "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; -#ifdef CONFIG_QUOTA -static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - char *qname, *old_qname = get_qf_name(sb, sbi, qtype); - int ret = -1; - - if (sb_any_quota_loaded(sb) && !old_qname) { - ext4_msg(sb, KERN_ERR, - "Cannot change journaled " - "quota options when quota turned on"); - return -1; - } - if (ext4_has_feature_quota(sb)) { - ext4_msg(sb, KERN_INFO, "Journaled quota options " - "ignored when QUOTA feature is enabled"); - return 1; - } - qname = match_strdup(args); - if (!qname) { - ext4_msg(sb, KERN_ERR, - "Not enough memory for storing quotafile name"); - return -1; - } - if (old_qname) { - if (strcmp(old_qname, qname) == 0) - ret = 1; - else - ext4_msg(sb, KERN_ERR, - "%s quota file already specified", - QTYPE2NAME(qtype)); - goto errout; - } - if (strchr(qname, '/')) { - ext4_msg(sb, KERN_ERR, - "quotafile must be on filesystem root"); - goto errout; - } - rcu_assign_pointer(sbi->s_qf_names[qtype], qname); - set_opt(sb, QUOTA); - return 1; -errout: - kfree(qname); - return ret; -} - -static int clear_qf_name(struct super_block *sb, int qtype) -{ - - struct ext4_sb_info *sbi = EXT4_SB(sb); - char *old_qname = get_qf_name(sb, sbi, qtype); - - if (sb_any_quota_loaded(sb) && old_qname) { - ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options" - " when quota turned on"); - return -1; - } - rcu_assign_pointer(sbi->s_qf_names[qtype], NULL); - synchronize_rcu(); - kfree(old_qname); - return 1; -} -#endif - #define MOPT_SET 0x0001 #define MOPT_CLEAR 0x0002 #define MOPT_NOSUPPORT 0x0004 #define MOPT_EXPLICIT 0x0008 -#define MOPT_CLEAR_ERR 0x0010 -#define MOPT_GTE0 0x0020 #ifdef CONFIG_QUOTA #define MOPT_Q 0 -#define MOPT_QFMT 0x0040 +#define MOPT_QFMT 0x0010 #else #define MOPT_Q MOPT_NOSUPPORT #define MOPT_QFMT MOPT_NOSUPPORT #endif -#define MOPT_DATAJ 0x0080 -#define MOPT_NO_EXT2 0x0100 -#define MOPT_NO_EXT3 0x0200 +#define MOPT_NO_EXT2 0x0020 +#define MOPT_NO_EXT3 0x0040 #define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) -#define MOPT_STRING 0x0400 -#define MOPT_SKIP 0x0800 -#define MOPT_2 0x1000 +#define MOPT_SKIP 0x0080 +#define MOPT_2 0x0100 static const struct mount_opts { int token; @@ -1944,40 +1916,17 @@ static const struct mount_opts { EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_EXT4_ONLY | MOPT_SET | 
MOPT_EXPLICIT}, {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET}, - {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR}, - {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, - {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, - {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, - MOPT_NO_EXT2}, - {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, - MOPT_NO_EXT2}, + {Opt_data_err, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_NO_EXT2}, {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR}, {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR}, - {Opt_commit, 0, MOPT_GTE0}, - {Opt_max_batch_time, 0, MOPT_GTE0}, - {Opt_min_batch_time, 0, MOPT_GTE0}, - {Opt_inode_readahead_blks, 0, MOPT_GTE0}, - {Opt_init_itable, 0, MOPT_GTE0}, - {Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET | MOPT_SKIP}, - {Opt_dax_always, EXT4_MOUNT_DAX_ALWAYS, - MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, - {Opt_dax_inode, EXT4_MOUNT2_DAX_INODE, - MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, - {Opt_dax_never, EXT4_MOUNT2_DAX_NEVER, - MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, - {Opt_stripe, 0, MOPT_GTE0}, - {Opt_resuid, 0, MOPT_GTE0}, - {Opt_resgid, 0, MOPT_GTE0}, - {Opt_journal_dev, 0, MOPT_NO_EXT2 | MOPT_GTE0}, - {Opt_journal_path, 0, MOPT_NO_EXT2 | MOPT_STRING}, - {Opt_journal_ioprio, 0, MOPT_NO_EXT2 | MOPT_GTE0}, - {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, - {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, - {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, - MOPT_NO_EXT2 | MOPT_DATAJ}, + {Opt_dax_type, 0, MOPT_EXT4_ONLY}, + {Opt_journal_dev, 0, MOPT_NO_EXT2}, + {Opt_journal_path, 0, MOPT_NO_EXT2}, + {Opt_journal_ioprio, 0, MOPT_NO_EXT2}, + {Opt_data, 0, MOPT_NO_EXT2}, {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, #ifdef CONFIG_EXT4_FS_POSIX_ACL @@ -1989,7 +1938,6 @@ static const struct mount_opts { #endif {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, - {Opt_debug_want_extra_isize, 0, MOPT_GTE0}, {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, @@ -2000,499 +1948,987 @@ static const struct mount_opts { {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA), MOPT_CLEAR | MOPT_Q}, - {Opt_usrjquota, 0, MOPT_Q | MOPT_STRING}, - {Opt_grpjquota, 0, MOPT_Q | MOPT_STRING}, - {Opt_offusrjquota, 0, MOPT_Q}, - {Opt_offgrpjquota, 0, MOPT_Q}, - {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, - {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, - {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, - {Opt_max_dir_size_kb, 0, MOPT_GTE0}, - {Opt_test_dummy_encryption, 0, MOPT_STRING}, + {Opt_usrjquota, 0, MOPT_Q}, + {Opt_grpjquota, 0, MOPT_Q}, + {Opt_jqfmt, 0, MOPT_QFMT}, {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS, MOPT_SET}, - {Opt_mb_optimize_scan, EXT4_MOUNT2_MB_OPTIMIZE_SCAN, MOPT_GTE0}, #ifdef CONFIG_EXT4_DEBUG {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, - {Opt_fc_debug_max_replay, 0, MOPT_GTE0}, #endif {Opt_err, 0, 0} }; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) static const struct ext4_sb_encodings { __u16 magic; char *name; - char *version; + 
unsigned int version; } ext4_sb_encoding_map[] = { - {EXT4_ENC_UTF8_12_1, "utf8", "12.1.0"}, + {EXT4_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)}, }; -static int ext4_sb_read_encoding(const struct ext4_super_block *es, - const struct ext4_sb_encodings **encoding, - __u16 *flags) +static const struct ext4_sb_encodings * +ext4_sb_read_encoding(const struct ext4_super_block *es) { __u16 magic = le16_to_cpu(es->s_encoding); int i; for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++) if (magic == ext4_sb_encoding_map[i].magic) - break; - - if (i >= ARRAY_SIZE(ext4_sb_encoding_map)) - return -EINVAL; + return &ext4_sb_encoding_map[i]; - *encoding = &ext4_sb_encoding_map[i]; - *flags = le16_to_cpu(es->s_encoding_flags); - - return 0; + return NULL; } #endif -static int ext4_set_test_dummy_encryption(struct super_block *sb, - const char *opt, - const substring_t *arg, - bool is_remount) +static int ext4_set_test_dummy_encryption(struct super_block *sb, char *arg) { #ifdef CONFIG_FS_ENCRYPTION struct ext4_sb_info *sbi = EXT4_SB(sb); int err; - /* - * This mount option is just for testing, and it's not worthwhile to - * implement the extra complexity (e.g. RCU protection) that would be - * needed to allow it to be set or changed during remount. We do allow - * it to be specified during remount, but only if there is no change. - */ - if (is_remount && !sbi->s_dummy_enc_policy.policy) { - ext4_msg(sb, KERN_WARNING, - "Can't set test_dummy_encryption on remount"); - return -1; - } - err = fscrypt_set_test_dummy_encryption(sb, arg->from, + err = fscrypt_set_test_dummy_encryption(sb, arg, &sbi->s_dummy_enc_policy); if (err) { - if (err == -EEXIST) - ext4_msg(sb, KERN_WARNING, - "Can't change test_dummy_encryption on remount"); - else if (err == -EINVAL) - ext4_msg(sb, KERN_WARNING, - "Value of option \"%s\" is unrecognized", opt); - else - ext4_msg(sb, KERN_WARNING, - "Error processing option \"%s\" [%d]", - opt, err); - return -1; + ext4_msg(sb, KERN_WARNING, + "Error while setting test dummy encryption [%d]", err); + return err; } ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); -#else - ext4_msg(sb, KERN_WARNING, - "Test dummy encryption mount option ignored"); #endif - return 1; + return 0; } -struct ext4_parsed_options { - unsigned long journal_devnum; - unsigned int journal_ioprio; - int mb_optimize_scan; +#define EXT4_SPEC_JQUOTA (1 << 0) +#define EXT4_SPEC_JQFMT (1 << 1) +#define EXT4_SPEC_DATAJ (1 << 2) +#define EXT4_SPEC_SB_BLOCK (1 << 3) +#define EXT4_SPEC_JOURNAL_DEV (1 << 4) +#define EXT4_SPEC_JOURNAL_IOPRIO (1 << 5) +#define EXT4_SPEC_DUMMY_ENCRYPTION (1 << 6) +#define EXT4_SPEC_s_want_extra_isize (1 << 7) +#define EXT4_SPEC_s_max_batch_time (1 << 8) +#define EXT4_SPEC_s_min_batch_time (1 << 9) +#define EXT4_SPEC_s_inode_readahead_blks (1 << 10) +#define EXT4_SPEC_s_li_wait_mult (1 << 11) +#define EXT4_SPEC_s_max_dir_size_kb (1 << 12) +#define EXT4_SPEC_s_stripe (1 << 13) +#define EXT4_SPEC_s_resuid (1 << 14) +#define EXT4_SPEC_s_resgid (1 << 15) +#define EXT4_SPEC_s_commit_interval (1 << 16) +#define EXT4_SPEC_s_fc_debug_max_replay (1 << 17) +#define EXT4_SPEC_s_sb_block (1 << 18) + +struct ext4_fs_context { + char *s_qf_names[EXT4_MAXQUOTAS]; + char *test_dummy_enc_arg; + int s_jquota_fmt; /* Format of quota to use */ + int mb_optimize_scan; +#ifdef CONFIG_EXT4_DEBUG + int s_fc_debug_max_replay; +#endif + unsigned short qname_spec; + unsigned long vals_s_flags; /* Bits to set in s_flags */ + unsigned long mask_s_flags; /* Bits changed in s_flags */ + unsigned long 
journal_devnum; + unsigned long s_commit_interval; + unsigned long s_stripe; + unsigned int s_inode_readahead_blks; + unsigned int s_want_extra_isize; + unsigned int s_li_wait_mult; + unsigned int s_max_dir_size_kb; + unsigned int journal_ioprio; + unsigned int vals_s_mount_opt; + unsigned int mask_s_mount_opt; + unsigned int vals_s_mount_opt2; + unsigned int mask_s_mount_opt2; + unsigned int vals_s_mount_flags; + unsigned int mask_s_mount_flags; + unsigned int opt_flags; /* MOPT flags */ + unsigned int spec; + u32 s_max_batch_time; + u32 s_min_batch_time; + kuid_t s_resuid; + kgid_t s_resgid; + ext4_fsblk_t s_sb_block; }; -static int handle_mount_opt(struct super_block *sb, char *opt, int token, - substring_t *args, struct ext4_parsed_options *parsed_opts, - int is_remount) +static void ext4_fc_free(struct fs_context *fc) { - struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_fs_context *ctx = fc->fs_private; + int i; + + if (!ctx) + return; + + for (i = 0; i < EXT4_MAXQUOTAS; i++) + kfree(ctx->s_qf_names[i]); + + kfree(ctx->test_dummy_enc_arg); + kfree(ctx); +} + +int ext4_init_fs_context(struct fs_context *fc) +{ + struct ext4_fs_context *ctx; + + ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + fc->fs_private = ctx; + fc->ops = &ext4_context_ops; + + return 0; +} + +#ifdef CONFIG_QUOTA +/* + * Note the name of the specified quota file. + */ +static int note_qf_name(struct fs_context *fc, int qtype, + struct fs_parameter *param) +{ + struct ext4_fs_context *ctx = fc->fs_private; + char *qname; + + if (param->size < 1) { + ext4_msg(NULL, KERN_ERR, "Missing quota name"); + return -EINVAL; + } + if (strchr(param->string, '/')) { + ext4_msg(NULL, KERN_ERR, + "quotafile must be on filesystem root"); + return -EINVAL; + } + if (ctx->s_qf_names[qtype]) { + if (strcmp(ctx->s_qf_names[qtype], param->string) != 0) { + ext4_msg(NULL, KERN_ERR, + "%s quota file already specified", + QTYPE2NAME(qtype)); + return -EINVAL; + } + return 0; + } + + qname = kmemdup_nul(param->string, param->size, GFP_KERNEL); + if (!qname) { + ext4_msg(NULL, KERN_ERR, + "Not enough memory for storing quotafile name"); + return -ENOMEM; + } + ctx->s_qf_names[qtype] = qname; + ctx->qname_spec |= 1 << qtype; + ctx->spec |= EXT4_SPEC_JQUOTA; + return 0; +} + +/* + * Clear the name of the specified quota file. 
+ */ +static int unnote_qf_name(struct fs_context *fc, int qtype) +{ + struct ext4_fs_context *ctx = fc->fs_private; + + if (ctx->s_qf_names[qtype]) + kfree(ctx->s_qf_names[qtype]); + + ctx->s_qf_names[qtype] = NULL; + ctx->qname_spec |= 1 << qtype; + ctx->spec |= EXT4_SPEC_JQUOTA; + return 0; +} +#endif + +#define EXT4_SET_CTX(name) \ +static inline void ctx_set_##name(struct ext4_fs_context *ctx, \ + unsigned long flag) \ +{ \ + ctx->mask_s_##name |= flag; \ + ctx->vals_s_##name |= flag; \ +} \ +static inline void ctx_clear_##name(struct ext4_fs_context *ctx, \ + unsigned long flag) \ +{ \ + ctx->mask_s_##name |= flag; \ + ctx->vals_s_##name &= ~flag; \ +} \ +static inline unsigned long \ +ctx_test_##name(struct ext4_fs_context *ctx, unsigned long flag) \ +{ \ + return (ctx->vals_s_##name & flag); \ +} \ + +EXT4_SET_CTX(flags); +EXT4_SET_CTX(mount_opt); +EXT4_SET_CTX(mount_opt2); +EXT4_SET_CTX(mount_flags); + +static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct ext4_fs_context *ctx = fc->fs_private; + struct fs_parse_result result; const struct mount_opts *m; + int is_remount; kuid_t uid; kgid_t gid; - int arg = 0; + int token; -#ifdef CONFIG_QUOTA - if (token == Opt_usrjquota) - return set_qf_name(sb, USRQUOTA, &args[0]); - else if (token == Opt_grpjquota) - return set_qf_name(sb, GRPQUOTA, &args[0]); - else if (token == Opt_offusrjquota) - return clear_qf_name(sb, USRQUOTA); - else if (token == Opt_offgrpjquota) - return clear_qf_name(sb, GRPQUOTA); -#endif - switch (token) { - case Opt_noacl: - case Opt_nouser_xattr: - ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5"); - break; - case Opt_sb: - return 1; /* handled by get_sb_block() */ - case Opt_removed: - ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt); - return 1; - case Opt_abort: - ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); - return 1; - case Opt_i_version: - sb->s_flags |= SB_I_VERSION; - return 1; - case Opt_lazytime: - sb->s_flags |= SB_LAZYTIME; - return 1; - case Opt_nolazytime: - sb->s_flags &= ~SB_LAZYTIME; - return 1; - case Opt_inlinecrypt: -#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT - sb->s_flags |= SB_INLINECRYPT; -#else - ext4_msg(sb, KERN_ERR, "inline encryption not supported"); -#endif - return 1; - } + token = fs_parse(fc, ext4_param_specs, param, &result); + if (token < 0) + return token; + is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE; for (m = ext4_mount_opts; m->token != Opt_err; m++) if (token == m->token) break; - if (m->token == Opt_err) { - ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " - "or missing value", opt); - return -1; - } - - if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) { - ext4_msg(sb, KERN_ERR, - "Mount option \"%s\" incompatible with ext2", opt); - return -1; - } - if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) { - ext4_msg(sb, KERN_ERR, - "Mount option \"%s\" incompatible with ext3", opt); - return -1; - } + ctx->opt_flags |= m->flags; - if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg)) - return -1; - if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) - return -1; if (m->flags & MOPT_EXPLICIT) { if (m->mount_opt & EXT4_MOUNT_DELALLOC) { - set_opt2(sb, EXPLICIT_DELALLOC); + ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_DELALLOC); } else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) { - set_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM); + ctx_set_mount_opt2(ctx, + EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM); } else - return -1; - } - if (m->flags & MOPT_CLEAR_ERR) - clear_opt(sb, ERRORS_MASK); - if 
(token == Opt_noquota && sb_any_quota_loaded(sb)) { - ext4_msg(sb, KERN_ERR, "Cannot change quota " - "options when quota turned on"); - return -1; + return -EINVAL; } if (m->flags & MOPT_NOSUPPORT) { - ext4_msg(sb, KERN_ERR, "%s option not supported", opt); - } else if (token == Opt_commit) { - if (arg == 0) - arg = JBD2_DEFAULT_MAX_COMMIT_AGE; - else if (arg > INT_MAX / HZ) { - ext4_msg(sb, KERN_ERR, + ext4_msg(NULL, KERN_ERR, "%s option not supported", + param->key); + return 0; + } + + switch (token) { +#ifdef CONFIG_QUOTA + case Opt_usrjquota: + if (!*param->string) + return unnote_qf_name(fc, USRQUOTA); + else + return note_qf_name(fc, USRQUOTA, param); + case Opt_grpjquota: + if (!*param->string) + return unnote_qf_name(fc, GRPQUOTA); + else + return note_qf_name(fc, GRPQUOTA, param); +#endif + case Opt_noacl: + case Opt_nouser_xattr: + ext4_msg(NULL, KERN_WARNING, deprecated_msg, param->key, "3.5"); + break; + case Opt_sb: + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + ext4_msg(NULL, KERN_WARNING, + "Ignoring %s option on remount", param->key); + } else { + ctx->s_sb_block = result.uint_32; + ctx->spec |= EXT4_SPEC_s_sb_block; + } + return 0; + case Opt_removed: + ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option", + param->key); + return 0; + case Opt_abort: + ctx_set_mount_flags(ctx, EXT4_MF_FS_ABORTED); + return 0; + case Opt_i_version: + ext4_msg(NULL, KERN_WARNING, deprecated_msg, param->key, "5.20"); + ext4_msg(NULL, KERN_WARNING, "Use iversion instead\n"); + ctx_set_flags(ctx, SB_I_VERSION); + return 0; + case Opt_inlinecrypt: +#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT + ctx_set_flags(ctx, SB_INLINECRYPT); +#else + ext4_msg(NULL, KERN_ERR, "inline encryption not supported"); +#endif + return 0; + case Opt_errors: + ctx_clear_mount_opt(ctx, EXT4_MOUNT_ERRORS_MASK); + ctx_set_mount_opt(ctx, result.uint_32); + return 0; +#ifdef CONFIG_QUOTA + case Opt_jqfmt: + ctx->s_jquota_fmt = result.uint_32; + ctx->spec |= EXT4_SPEC_JQFMT; + return 0; +#endif + case Opt_data: + ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS); + ctx_set_mount_opt(ctx, result.uint_32); + ctx->spec |= EXT4_SPEC_DATAJ; + return 0; + case Opt_commit: + if (result.uint_32 == 0) + ctx->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE; + else if (result.uint_32 > INT_MAX / HZ) { + ext4_msg(NULL, KERN_ERR, "Invalid commit interval %d, " "must be smaller than %d", - arg, INT_MAX / HZ); - return -1; + result.uint_32, INT_MAX / HZ); + return -EINVAL; } - sbi->s_commit_interval = HZ * arg; - } else if (token == Opt_debug_want_extra_isize) { - if ((arg & 1) || - (arg < 4) || - (arg > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE))) { - ext4_msg(sb, KERN_ERR, - "Invalid want_extra_isize %d", arg); - return -1; + ctx->s_commit_interval = HZ * result.uint_32; + ctx->spec |= EXT4_SPEC_s_commit_interval; + return 0; + case Opt_debug_want_extra_isize: + if ((result.uint_32 & 1) || (result.uint_32 < 4)) { + ext4_msg(NULL, KERN_ERR, + "Invalid want_extra_isize %d", result.uint_32); + return -EINVAL; } - sbi->s_want_extra_isize = arg; - } else if (token == Opt_max_batch_time) { - sbi->s_max_batch_time = arg; - } else if (token == Opt_min_batch_time) { - sbi->s_min_batch_time = arg; - } else if (token == Opt_inode_readahead_blks) { - if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) { - ext4_msg(sb, KERN_ERR, + ctx->s_want_extra_isize = result.uint_32; + ctx->spec |= EXT4_SPEC_s_want_extra_isize; + return 0; + case Opt_max_batch_time: + ctx->s_max_batch_time = result.uint_32; + ctx->spec |= 
EXT4_SPEC_s_max_batch_time; + return 0; + case Opt_min_batch_time: + ctx->s_min_batch_time = result.uint_32; + ctx->spec |= EXT4_SPEC_s_min_batch_time; + return 0; + case Opt_inode_readahead_blks: + if (result.uint_32 && + (result.uint_32 > (1 << 30) || + !is_power_of_2(result.uint_32))) { + ext4_msg(NULL, KERN_ERR, "EXT4-fs: inode_readahead_blks must be " "0 or a power of 2 smaller than 2^31"); - return -1; + return -EINVAL; } - sbi->s_inode_readahead_blks = arg; - } else if (token == Opt_init_itable) { - set_opt(sb, INIT_INODE_TABLE); - if (!args->from) - arg = EXT4_DEF_LI_WAIT_MULT; - sbi->s_li_wait_mult = arg; - } else if (token == Opt_max_dir_size_kb) { - sbi->s_max_dir_size_kb = arg; + ctx->s_inode_readahead_blks = result.uint_32; + ctx->spec |= EXT4_SPEC_s_inode_readahead_blks; + return 0; + case Opt_init_itable: + ctx_set_mount_opt(ctx, EXT4_MOUNT_INIT_INODE_TABLE); + ctx->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; + if (param->type == fs_value_is_string) + ctx->s_li_wait_mult = result.uint_32; + ctx->spec |= EXT4_SPEC_s_li_wait_mult; + return 0; + case Opt_max_dir_size_kb: + ctx->s_max_dir_size_kb = result.uint_32; + ctx->spec |= EXT4_SPEC_s_max_dir_size_kb; + return 0; #ifdef CONFIG_EXT4_DEBUG - } else if (token == Opt_fc_debug_max_replay) { - sbi->s_fc_debug_max_replay = arg; + case Opt_fc_debug_max_replay: + ctx->s_fc_debug_max_replay = result.uint_32; + ctx->spec |= EXT4_SPEC_s_fc_debug_max_replay; + return 0; #endif - } else if (token == Opt_stripe) { - sbi->s_stripe = arg; - } else if (token == Opt_resuid) { - uid = make_kuid(current_user_ns(), arg); + case Opt_stripe: + ctx->s_stripe = result.uint_32; + ctx->spec |= EXT4_SPEC_s_stripe; + return 0; + case Opt_resuid: + uid = make_kuid(current_user_ns(), result.uint_32); if (!uid_valid(uid)) { - ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg); - return -1; + ext4_msg(NULL, KERN_ERR, "Invalid uid value %d", + result.uint_32); + return -EINVAL; } - sbi->s_resuid = uid; - } else if (token == Opt_resgid) { - gid = make_kgid(current_user_ns(), arg); + ctx->s_resuid = uid; + ctx->spec |= EXT4_SPEC_s_resuid; + return 0; + case Opt_resgid: + gid = make_kgid(current_user_ns(), result.uint_32); if (!gid_valid(gid)) { - ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg); - return -1; + ext4_msg(NULL, KERN_ERR, "Invalid gid value %d", + result.uint_32); + return -EINVAL; } - sbi->s_resgid = gid; - } else if (token == Opt_journal_dev) { + ctx->s_resgid = gid; + ctx->spec |= EXT4_SPEC_s_resgid; + return 0; + case Opt_journal_dev: if (is_remount) { - ext4_msg(sb, KERN_ERR, + ext4_msg(NULL, KERN_ERR, "Cannot specify journal on remount"); - return -1; + return -EINVAL; } - parsed_opts->journal_devnum = arg; - } else if (token == Opt_journal_path) { - char *journal_path; + ctx->journal_devnum = result.uint_32; + ctx->spec |= EXT4_SPEC_JOURNAL_DEV; + return 0; + case Opt_journal_path: + { struct inode *journal_inode; struct path path; int error; if (is_remount) { - ext4_msg(sb, KERN_ERR, + ext4_msg(NULL, KERN_ERR, "Cannot specify journal on remount"); - return -1; - } - journal_path = match_strdup(&args[0]); - if (!journal_path) { - ext4_msg(sb, KERN_ERR, "error: could not dup " - "journal device string"); - return -1; + return -EINVAL; } - error = kern_path(journal_path, LOOKUP_FOLLOW, &path); + error = fs_lookup_param(fc, param, 1, &path); if (error) { - ext4_msg(sb, KERN_ERR, "error: could not find " - "journal device path: error %d", error); - kfree(journal_path); - return -1; + ext4_msg(NULL, KERN_ERR, "error: could not find " + "journal 
device path"); + return -EINVAL; } journal_inode = d_inode(path.dentry); - if (!S_ISBLK(journal_inode->i_mode)) { - ext4_msg(sb, KERN_ERR, "error: journal path %s " - "is not a block device", journal_path); - path_put(&path); - kfree(journal_path); - return -1; - } - - parsed_opts->journal_devnum = new_encode_dev(journal_inode->i_rdev); + ctx->journal_devnum = new_encode_dev(journal_inode->i_rdev); + ctx->spec |= EXT4_SPEC_JOURNAL_DEV; path_put(&path); - kfree(journal_path); - } else if (token == Opt_journal_ioprio) { - if (arg > 7) { - ext4_msg(sb, KERN_ERR, "Invalid journal IO priority" + return 0; + } + case Opt_journal_ioprio: + if (result.uint_32 > 7) { + ext4_msg(NULL, KERN_ERR, "Invalid journal IO priority" " (must be 0-7)"); - return -1; - } - parsed_opts->journal_ioprio = - IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); - } else if (token == Opt_test_dummy_encryption) { - return ext4_set_test_dummy_encryption(sb, opt, &args[0], - is_remount); - } else if (m->flags & MOPT_DATAJ) { - if (is_remount) { - if (!sbi->s_journal) - ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); - else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) { - ext4_msg(sb, KERN_ERR, - "Cannot change data mode on remount"); - return -1; - } - } else { - clear_opt(sb, DATA_FLAGS); - sbi->s_mount_opt |= m->mount_opt; + return -EINVAL; } -#ifdef CONFIG_QUOTA - } else if (m->flags & MOPT_QFMT) { - if (sb_any_quota_loaded(sb) && - sbi->s_jquota_fmt != m->mount_opt) { - ext4_msg(sb, KERN_ERR, "Cannot change journaled " - "quota options when quota turned on"); - return -1; + ctx->journal_ioprio = + IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, result.uint_32); + ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO; + return 0; + case Opt_test_dummy_encryption: +#ifdef CONFIG_FS_ENCRYPTION + if (param->type == fs_value_is_flag) { + ctx->spec |= EXT4_SPEC_DUMMY_ENCRYPTION; + ctx->test_dummy_enc_arg = NULL; + return 0; } - if (ext4_has_feature_quota(sb)) { - ext4_msg(sb, KERN_INFO, - "Quota format mount options ignored " - "when QUOTA feature is enabled"); - return 1; + if (*param->string && + !(!strcmp(param->string, "v1") || + !strcmp(param->string, "v2"))) { + ext4_msg(NULL, KERN_WARNING, + "Value of option \"%s\" is unrecognized", + param->key); + return -EINVAL; } - sbi->s_jquota_fmt = m->mount_opt; + ctx->spec |= EXT4_SPEC_DUMMY_ENCRYPTION; + ctx->test_dummy_enc_arg = kmemdup_nul(param->string, param->size, + GFP_KERNEL); +#else + ext4_msg(NULL, KERN_WARNING, + "Test dummy encryption mount option ignored"); #endif - } else if (token == Opt_dax || token == Opt_dax_always || - token == Opt_dax_inode || token == Opt_dax_never) { + return 0; + case Opt_dax: + case Opt_dax_type: #ifdef CONFIG_FS_DAX - switch (token) { + { + int type = (token == Opt_dax) ? + Opt_dax : result.uint_32; + + switch (type) { case Opt_dax: case Opt_dax_always: - if (is_remount && - (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || - (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) { - fail_dax_change_remount: - ext4_msg(sb, KERN_ERR, "can't change " - "dax mount option while remounting"); - return -1; - } - if (is_remount && - (test_opt(sb, DATA_FLAGS) == - EXT4_MOUNT_JOURNAL_DATA)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "both data=journal and dax"); - return -1; - } - ext4_msg(sb, KERN_WARNING, - "DAX enabled. 
Warning: EXPERIMENTAL, use at your own risk"); - sbi->s_mount_opt |= EXT4_MOUNT_DAX_ALWAYS; - sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER; + ctx_set_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); + ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); break; case Opt_dax_never: - if (is_remount && - (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || - (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS))) - goto fail_dax_change_remount; - sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER; - sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; + ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); + ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); break; case Opt_dax_inode: - if (is_remount && - ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || - (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || - !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) - goto fail_dax_change_remount; - sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; - sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER; + ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); + ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); /* Strictly for printing options */ - sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_INODE; + ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE); break; } + return 0; + } #else - ext4_msg(sb, KERN_INFO, "dax option not supported"); - sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER; - sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; - return -1; + ext4_msg(NULL, KERN_INFO, "dax option not supported"); + return -EINVAL; #endif - } else if (token == Opt_data_err_abort) { - sbi->s_mount_opt |= m->mount_opt; - } else if (token == Opt_data_err_ignore) { - sbi->s_mount_opt &= ~m->mount_opt; - } else if (token == Opt_mb_optimize_scan) { - if (arg != 0 && arg != 1) { - ext4_msg(sb, KERN_WARNING, + case Opt_data_err: + if (result.uint_32 == Opt_data_err_abort) + ctx_set_mount_opt(ctx, m->mount_opt); + else if (result.uint_32 == Opt_data_err_ignore) + ctx_clear_mount_opt(ctx, m->mount_opt); + return 0; + case Opt_mb_optimize_scan: + if (result.int_32 != 0 && result.int_32 != 1) { + ext4_msg(NULL, KERN_WARNING, "mb_optimize_scan should be set to 0 or 1."); - return -1; + return -EINVAL; } - parsed_opts->mb_optimize_scan = arg; - } else { - if (!args->from) - arg = 1; + ctx->mb_optimize_scan = result.int_32; + return 0; + } + + /* + * At this point we should only be getting options requiring MOPT_SET, + * or MOPT_CLEAR. 
Anything else is a bug + */ + if (m->token == Opt_err) { + ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s", + param->key); + WARN_ON(1); + return -EINVAL; + } + + else { + unsigned int set = 0; + + if ((param->type == fs_value_is_flag) || + result.uint_32 > 0) + set = 1; + if (m->flags & MOPT_CLEAR) - arg = !arg; + set = !set; else if (unlikely(!(m->flags & MOPT_SET))) { - ext4_msg(sb, KERN_WARNING, - "buggy handling of option %s", opt); + ext4_msg(NULL, KERN_WARNING, + "buggy handling of option %s", + param->key); WARN_ON(1); - return -1; + return -EINVAL; } if (m->flags & MOPT_2) { - if (arg != 0) - sbi->s_mount_opt2 |= m->mount_opt; + if (set != 0) + ctx_set_mount_opt2(ctx, m->mount_opt); else - sbi->s_mount_opt2 &= ~m->mount_opt; + ctx_clear_mount_opt2(ctx, m->mount_opt); } else { - if (arg != 0) - sbi->s_mount_opt |= m->mount_opt; + if (set != 0) + ctx_set_mount_opt(ctx, m->mount_opt); else - sbi->s_mount_opt &= ~m->mount_opt; + ctx_clear_mount_opt(ctx, m->mount_opt); } } - return 1; + + return 0; } -static int parse_options(char *options, struct super_block *sb, - struct ext4_parsed_options *ret_opts, - int is_remount) +static int parse_options(struct fs_context *fc, char *options) { - struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb); - char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name; - substring_t args[MAX_OPT_ARGS]; - int token; + struct fs_parameter param; + int ret; + char *key; if (!options) - return 1; + return 0; - while ((p = strsep(&options, ",")) != NULL) { - if (!*p) - continue; - /* - * Initialize args struct so we know whether arg was - * found; some options take optional arguments. - */ - args[0].to = args[0].from = NULL; - token = match_token(p, tokens, args); - if (handle_mount_opt(sb, p, token, args, ret_opts, - is_remount) < 0) - return 0; + while ((key = strsep(&options, ",")) != NULL) { + if (*key) { + size_t v_len = 0; + char *value = strchr(key, '='); + + param.type = fs_value_is_flag; + param.string = NULL; + + if (value) { + if (value == key) + continue; + + *value++ = 0; + v_len = strlen(value); + param.string = kmemdup_nul(value, v_len, + GFP_KERNEL); + if (!param.string) + return -ENOMEM; + param.type = fs_value_is_string; + } + + param.key = key; + param.size = v_len; + + ret = ext4_parse_param(fc, &param); + if (param.string) + kfree(param.string); + if (ret < 0) + return ret; + } + } + + ret = ext4_validate_options(fc); + if (ret < 0) + return ret; + + return 0; +} + +static int parse_apply_sb_mount_options(struct super_block *sb, + struct ext4_fs_context *m_ctx) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + char *s_mount_opts = NULL; + struct ext4_fs_context *s_ctx = NULL; + struct fs_context *fc = NULL; + int ret = -ENOMEM; + + if (!sbi->s_es->s_mount_opts[0]) + return 0; + + s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, + sizeof(sbi->s_es->s_mount_opts), + GFP_KERNEL); + if (!s_mount_opts) + return ret; + + fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL); + if (!fc) + goto out_free; + + s_ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL); + if (!s_ctx) + goto out_free; + + fc->fs_private = s_ctx; + fc->s_fs_info = sbi; + + ret = parse_options(fc, s_mount_opts); + if (ret < 0) + goto parse_failed; + + ret = ext4_check_opt_consistency(fc, sb); + if (ret < 0) { +parse_failed: + ext4_msg(sb, KERN_WARNING, + "failed to parse options in superblock: %s", + s_mount_opts); + ret = 0; + goto out_free; + } + + if (s_ctx->spec & EXT4_SPEC_JOURNAL_DEV) + m_ctx->journal_devnum = s_ctx->journal_devnum; + if (s_ctx->spec
& EXT4_SPEC_JOURNAL_IOPRIO) + m_ctx->journal_ioprio = s_ctx->journal_ioprio; + + ret = ext4_apply_options(fc, sb); + +out_free: + kfree(s_ctx); + kfree(fc); + kfree(s_mount_opts); + return ret; +} + +static void ext4_apply_quota_options(struct fs_context *fc, + struct super_block *sb) +{ +#ifdef CONFIG_QUOTA + bool quota_feature = ext4_has_feature_quota(sb); + struct ext4_fs_context *ctx = fc->fs_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + char *qname; + int i; + + if (quota_feature) + return; + + if (ctx->spec & EXT4_SPEC_JQUOTA) { + for (i = 0; i < EXT4_MAXQUOTAS; i++) { + if (!(ctx->qname_spec & (1 << i))) + continue; + + qname = ctx->s_qf_names[i]; /* May be NULL */ + if (qname) + set_opt(sb, QUOTA); + ctx->s_qf_names[i] = NULL; + qname = rcu_replace_pointer(sbi->s_qf_names[i], qname, + lockdep_is_held(&sb->s_umount)); + if (qname) + kfree_rcu(qname); + } } + + if (ctx->spec & EXT4_SPEC_JQFMT) + sbi->s_jquota_fmt = ctx->s_jquota_fmt; +#endif +} + +/* + * Check quota settings consistency. + */ +static int ext4_check_quota_consistency(struct fs_context *fc, + struct super_block *sb) +{ #ifdef CONFIG_QUOTA + struct ext4_fs_context *ctx = fc->fs_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + bool quota_feature = ext4_has_feature_quota(sb); + bool quota_loaded = sb_any_quota_loaded(sb); + bool usr_qf_name, grp_qf_name, usrquota, grpquota; + int quota_flags, i; + /* * We do the test below only for project quotas. 'usrquota' and * 'grpquota' mount options are allowed even without quota feature * to support legacy quotas in quota files. */ - if (test_opt(sb, PRJQUOTA) && !ext4_has_feature_project(sb)) { - ext4_msg(sb, KERN_ERR, "Project quota feature not enabled. " + if (ctx_test_mount_opt(ctx, EXT4_MOUNT_PRJQUOTA) && + !ext4_has_feature_project(sb)) { + ext4_msg(NULL, KERN_ERR, "Project quota feature not enabled. 
" "Cannot enable project quota enforcement."); - return 0; + return -EINVAL; } - usr_qf_name = get_qf_name(sb, sbi, USRQUOTA); - grp_qf_name = get_qf_name(sb, sbi, GRPQUOTA); - if (usr_qf_name || grp_qf_name) { - if (test_opt(sb, USRQUOTA) && usr_qf_name) - clear_opt(sb, USRQUOTA); - if (test_opt(sb, GRPQUOTA) && grp_qf_name) - clear_opt(sb, GRPQUOTA); + quota_flags = EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | + EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA; + if (quota_loaded && + ctx->mask_s_mount_opt & quota_flags && + !ctx_test_mount_opt(ctx, quota_flags)) + goto err_quota_change; - if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { - ext4_msg(sb, KERN_ERR, "old and new quota " - "format mixing"); + if (ctx->spec & EXT4_SPEC_JQUOTA) { + + for (i = 0; i < EXT4_MAXQUOTAS; i++) { + if (!(ctx->qname_spec & (1 << i))) + continue; + + if (quota_loaded && + !!sbi->s_qf_names[i] != !!ctx->s_qf_names[i]) + goto err_jquota_change; + + if (sbi->s_qf_names[i] && ctx->s_qf_names[i] && + strcmp(get_qf_name(sb, sbi, i), + ctx->s_qf_names[i]) != 0) + goto err_jquota_specified; + } + + if (quota_feature) { + ext4_msg(NULL, KERN_INFO, + "Journaled quota options ignored when " + "QUOTA feature is enabled"); return 0; } + } - if (!sbi->s_jquota_fmt) { - ext4_msg(sb, KERN_ERR, "journaled quota format " - "not specified"); + if (ctx->spec & EXT4_SPEC_JQFMT) { + if (sbi->s_jquota_fmt != ctx->s_jquota_fmt && quota_loaded) + goto err_jquota_change; + if (quota_feature) { + ext4_msg(NULL, KERN_INFO, "Quota format mount options " + "ignored when QUOTA feature is enabled"); return 0; } } + + /* Make sure we don't mix old and new quota format */ + usr_qf_name = (get_qf_name(sb, sbi, USRQUOTA) || + ctx->s_qf_names[USRQUOTA]); + grp_qf_name = (get_qf_name(sb, sbi, GRPQUOTA) || + ctx->s_qf_names[GRPQUOTA]); + + usrquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) || + test_opt(sb, USRQUOTA)); + + grpquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) || + test_opt(sb, GRPQUOTA)); + + if (usr_qf_name) { + ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA); + usrquota = false; + } + if (grp_qf_name) { + ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA); + grpquota = false; + } + + if (usr_qf_name || grp_qf_name) { + if (usrquota || grpquota) { + ext4_msg(NULL, KERN_ERR, "old and new quota " + "format mixing"); + return -EINVAL; + } + + if (!(ctx->spec & EXT4_SPEC_JQFMT || sbi->s_jquota_fmt)) { + ext4_msg(NULL, KERN_ERR, "journaled quota format " + "not specified"); + return -EINVAL; + } + } + + return 0; + +err_quota_change: + ext4_msg(NULL, KERN_ERR, + "Cannot change quota options when quota turned on"); + return -EINVAL; +err_jquota_change: + ext4_msg(NULL, KERN_ERR, "Cannot change journaled quota " + "options when quota turned on"); + return -EINVAL; +err_jquota_specified: + ext4_msg(NULL, KERN_ERR, "%s quota file already specified", + QTYPE2NAME(i)); + return -EINVAL; +#else + return 0; #endif - if (test_opt(sb, DIOREAD_NOLOCK)) { +} + +static int ext4_check_opt_consistency(struct fs_context *fc, + struct super_block *sb) +{ + struct ext4_fs_context *ctx = fc->fs_private; + struct ext4_sb_info *sbi = fc->s_fs_info; + int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE; + + if ((ctx->opt_flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) { + ext4_msg(NULL, KERN_ERR, + "Mount option(s) incompatible with ext2"); + return -EINVAL; + } + if ((ctx->opt_flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) { + ext4_msg(NULL, KERN_ERR, + "Mount option(s) incompatible with ext3"); + return -EINVAL; + } + + if (ctx->s_want_extra_isize > + 
(sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE)) { + ext4_msg(NULL, KERN_ERR, + "Invalid want_extra_isize %d", + ctx->s_want_extra_isize); + return -EINVAL; + } + + if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DIOREAD_NOLOCK)) { int blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); if (blocksize < PAGE_SIZE) - ext4_msg(sb, KERN_WARNING, "Warning: mounting with an " + ext4_msg(NULL, KERN_WARNING, "Warning: mounting with an " "experimental mount option 'dioread_nolock' " "for blocksize < PAGE_SIZE"); } + +#ifdef CONFIG_FS_ENCRYPTION + /* + * This mount option is just for testing, and it's not worthwhile to + * implement the extra complexity (e.g. RCU protection) that would be + * needed to allow it to be set or changed during remount. We do allow + * it to be specified during remount, but only if there is no change. + */ + if ((ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION) && + is_remount && !sbi->s_dummy_enc_policy.policy) { + ext4_msg(NULL, KERN_WARNING, + "Can't set test_dummy_encryption on remount"); + return -1; + } +#endif + + if ((ctx->spec & EXT4_SPEC_DATAJ) && is_remount) { + if (!sbi->s_journal) { + ext4_msg(NULL, KERN_WARNING, + "Remounting file system with no journal " + "so ignoring journalled data option"); + ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS); + } else if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS) != + test_opt(sb, DATA_FLAGS)) { + ext4_msg(NULL, KERN_ERR, "Cannot change data mode " + "on remount"); + return -EINVAL; + } + } + + if (is_remount) { + if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && + (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { + ext4_msg(NULL, KERN_ERR, "can't mount with " + "both data=journal and dax"); + return -EINVAL; + } + + if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && + (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || + (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) { +fail_dax_change_remount: + ext4_msg(NULL, KERN_ERR, "can't change " + "dax mount option while remounting"); + return -EINVAL; + } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER) && + (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || + (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS))) { + goto fail_dax_change_remount; + } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE) && + ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || + (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || + !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) { + goto fail_dax_change_remount; + } + } + + return ext4_check_quota_consistency(fc, sb); +} + +static int ext4_apply_options(struct fs_context *fc, struct super_block *sb) +{ + struct ext4_fs_context *ctx = fc->fs_private; + struct ext4_sb_info *sbi = fc->s_fs_info; + int ret = 0; + + sbi->s_mount_opt &= ~ctx->mask_s_mount_opt; + sbi->s_mount_opt |= ctx->vals_s_mount_opt; + sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2; + sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2; + sbi->s_mount_flags &= ~ctx->mask_s_mount_flags; + sbi->s_mount_flags |= ctx->vals_s_mount_flags; + sb->s_flags &= ~ctx->mask_s_flags; + sb->s_flags |= ctx->vals_s_flags; + + /* + * i_version differs from common mount option iversion so we have + * to let vfs know that it was set, otherwise it would get cleared + * on remount + */ + if (ctx->mask_s_flags & SB_I_VERSION) + fc->sb_flags |= SB_I_VERSION; + +#define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; }) + APPLY(s_commit_interval); + APPLY(s_stripe); + APPLY(s_max_batch_time); + APPLY(s_min_batch_time); + APPLY(s_want_extra_isize); + APPLY(s_inode_readahead_blks); + 
APPLY(s_max_dir_size_kb); + APPLY(s_li_wait_mult); + APPLY(s_resgid); + APPLY(s_resuid); + +#ifdef CONFIG_EXT4_DEBUG + APPLY(s_fc_debug_max_replay); +#endif + + ext4_apply_quota_options(fc, sb); + + if (ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION) + ret = ext4_set_test_dummy_encryption(sb, ctx->test_dummy_enc_arg); + + return ret; +} + + +static int ext4_validate_options(struct fs_context *fc) +{ +#ifdef CONFIG_QUOTA + struct ext4_fs_context *ctx = fc->fs_private; + char *usr_qf_name, *grp_qf_name; + + usr_qf_name = ctx->s_qf_names[USRQUOTA]; + grp_qf_name = ctx->s_qf_names[GRPQUOTA]; + + if (usr_qf_name || grp_qf_name) { + if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) && usr_qf_name) + ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA); + + if (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) && grp_qf_name) + ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA); + + if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) || + ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA)) { + ext4_msg(NULL, KERN_ERR, "old and new quota " + "format mixing"); + return -EINVAL; + } + } +#endif return 1; } @@ -2533,12 +2969,12 @@ static inline void ext4_show_quota_options(struct seq_file *seq, static const char *token2str(int token) { - const struct match_token *t; + const struct fs_parameter_spec *spec; - for (t = tokens; t->token != Opt_err; t++) - if (t->token == token && !strchr(t->pattern, '=')) + for (spec = ext4_param_specs; spec->name != NULL; spec++) + if (spec->opt == token && !spec->type) break; - return t->pattern; + return spec->name; } /* @@ -2564,7 +3000,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, for (m = ext4_mount_opts; m->token != Opt_err; m++) { int want_set = m->flags & MOPT_SET; if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || - (m->flags & MOPT_CLEAR_ERR) || m->flags & MOPT_SKIP) + m->flags & MOPT_SKIP) continue; if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) continue; /* skip if same as the default */ @@ -2712,8 +3148,6 @@ done: EXT4_BLOCKS_PER_GROUP(sb), EXT4_INODES_PER_GROUP(sb), sbi->s_mount_opt, sbi->s_mount_opt2); - - cleancache_init_fs(sb); return err; } @@ -3172,7 +3606,7 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly) return 0; } -#ifndef CONFIG_UNICODE +#if !IS_ENABLED(CONFIG_UNICODE) if (ext4_has_feature_casefold(sb)) { ext4_msg(sb, KERN_ERR, "Filesystem with casefold feature cannot be " @@ -3876,21 +4310,52 @@ static void ext4_setup_csum_trigger(struct super_block *sb, sbi->s_journal_triggers[type].tr_triggers.t_frozen = trigger; } -static int ext4_fill_super(struct super_block *sb, void *data, int silent) +static void ext4_free_sbi(struct ext4_sb_info *sbi) +{ + if (!sbi) + return; + + kfree(sbi->s_blockgroup_lock); + fs_put_dax(sbi->s_daxdev); + kfree(sbi); +} + +static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb) +{ + struct ext4_sb_info *sbi; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return NULL; + + sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off); + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + + if (!sbi->s_blockgroup_lock) + goto err_out; + + sb->s_fs_info = sbi; + sbi->s_sb = sb; + return sbi; +err_out: + fs_put_dax(sbi->s_daxdev); + kfree(sbi); + return NULL; +} + +static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) { - struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev); - char *orig_data = kstrdup(data, GFP_KERNEL); struct buffer_head *bh, **group_desc; struct ext4_super_block *es = NULL; - 
struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + struct ext4_sb_info *sbi = EXT4_SB(sb); struct flex_groups **flex_groups; ext4_fsblk_t block; - ext4_fsblk_t sb_block = get_sb_block(&data); ext4_fsblk_t logical_sb_block; unsigned long offset = 0; unsigned long def_mount_opts; struct inode *root; - const char *descr; int ret = -ENOMEM; int blocksize, clustersize; unsigned int db_count; @@ -3899,32 +4364,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) __u64 blocks_count; int err = 0; ext4_group_t first_not_zeroed; - struct ext4_parsed_options parsed_opts; + struct ext4_fs_context *ctx = fc->fs_private; + int silent = fc->sb_flags & SB_SILENT; /* Set defaults for the variables that will be set during parsing */ - parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO; - parsed_opts.journal_devnum = 0; - parsed_opts.mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN; + ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ctx->mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN; - if ((data && !orig_data) || !sbi) - goto out_free_base; - - sbi->s_daxdev = dax_dev; - sbi->s_blockgroup_lock = - kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); - if (!sbi->s_blockgroup_lock) - goto out_free_base; - - sb->s_fs_info = sbi; - sbi->s_sb = sb; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; - sbi->s_sb_block = sb_block; sbi->s_sectors_written_start = part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); - /* Cleanup superblock name */ - strreplace(sb->s_id, '/', '!'); - /* -EINVAL is default */ ret = -EINVAL; blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); @@ -3938,10 +4388,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * block sizes. We need to calculate the offset from buffer start. */ if (blocksize != EXT4_MIN_BLOCK_SIZE) { - logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; + logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); } else { - logical_sb_block = sb_block; + logical_sb_block = sbi->s_sb_block; } bh = ext4_sb_bread_unmovable(sb, logical_sb_block); @@ -4146,31 +4596,28 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } - if (sbi->s_es->s_mount_opts[0]) { - char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, - sizeof(sbi->s_es->s_mount_opts), - GFP_KERNEL); - if (!s_mount_opts) - goto failed_mount; - if (!parse_options(s_mount_opts, sb, &parsed_opts, 0)) { - ext4_msg(sb, KERN_WARNING, - "failed to parse options in superblock: %s", - s_mount_opts); - } - kfree(s_mount_opts); - } + err = parse_apply_sb_mount_options(sb, ctx); + if (err < 0) + goto failed_mount; + sbi->s_def_mount_opt = sbi->s_mount_opt; - if (!parse_options((char *) data, sb, &parsed_opts, 0)) + + err = ext4_check_opt_consistency(fc, sb); + if (err < 0) goto failed_mount; -#ifdef CONFIG_UNICODE + err = ext4_apply_options(fc, sb); + if (err < 0) + goto failed_mount; + +#if IS_ENABLED(CONFIG_UNICODE) if (ext4_has_feature_casefold(sb) && !sb->s_encoding) { const struct ext4_sb_encodings *encoding_info; struct unicode_map *encoding; - __u16 encoding_flags; + __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags); - if (ext4_sb_read_encoding(es, &encoding_info, - &encoding_flags)) { + encoding_info = ext4_sb_read_encoding(es); + if (!encoding_info) { ext4_msg(sb, KERN_ERR, "Encoding requested by superblock is unknown"); goto failed_mount; @@ -4179,15 +4626,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) encoding = 
utf8_load(encoding_info->version); if (IS_ERR(encoding)) { ext4_msg(sb, KERN_ERR, - "can't mount with superblock charset: %s-%s " + "can't mount with superblock charset: %s-%u.%u.%u " "not supported by the kernel. flags: 0x%x.", - encoding_info->name, encoding_info->version, + encoding_info->name, + unicode_major(encoding_info->version), + unicode_minor(encoding_info->version), + unicode_rev(encoding_info->version), encoding_flags); goto failed_mount; } ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: " - "%s-%s with flags 0x%hx", encoding_info->name, - encoding_info->version?:"\b", encoding_flags); + "%s-%u.%u.%u with flags 0x%hx", encoding_info->name, + unicode_major(encoding_info->version), + unicode_minor(encoding_info->version), + unicode_rev(encoding_info->version), + encoding_flags); sb->s_encoding = encoding; sb->s_encoding_flags = encoding_flags; @@ -4299,9 +4752,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } - if (dax_supported(dax_dev, sb->s_bdev, blocksize, 0, - bdev_nr_sectors(sb->s_bdev))) - set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); + if (sbi->s_daxdev) { + if (blocksize == PAGE_SIZE) + set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); + else + ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); + } if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { if (ext4_has_feature_inline_data(sb)) { @@ -4337,7 +4793,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } - logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; + logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); bh = ext4_sb_bread_unmovable(sb, logical_sb_block); if (IS_ERR(bh)) { @@ -4620,14 +5076,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Initialize fast commit stuff */ atomic_set(&sbi->s_fc_subtid, 0); - atomic_set(&sbi->s_fc_ineligible_updates, 0); INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]); INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]); INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]); INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); sbi->s_fc_bytes = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); - ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); + sbi->s_fc_ineligible_tid = 0; spin_lock_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; @@ -4653,7 +5108,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * root first: it may be modified in the journal! */ if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { - err = ext4_load_journal(sb, es, parsed_opts.journal_devnum); + err = ext4_load_journal(sb, es, ctx->journal_devnum); if (err) goto failed_mount3a; } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && @@ -4753,7 +5208,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount_wq; } - set_task_ioprio(sbi->s_journal->j_task, parsed_opts.journal_ioprio); + set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio); sbi->s_journal->j_submit_inode_data_buffers = ext4_journal_submit_inode_data_buffers; @@ -4865,9 +5320,9 @@ no_journal: * turned off by passing "mb_optimize_scan=0". This can also be * turned on forcefully by passing "mb_optimize_scan=1". 
*/ - if (parsed_opts.mb_optimize_scan == 1) + if (ctx->mb_optimize_scan == 1) set_opt2(sb, MB_OPTIMIZE_SCAN); - else if (parsed_opts.mb_optimize_scan == 0) + else if (ctx->mb_optimize_scan == 0) clear_opt2(sb, MB_OPTIMIZE_SCAN); else if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD) set_opt2(sb, MB_OPTIMIZE_SCAN); @@ -4969,15 +5424,6 @@ no_journal: if (err) goto failed_mount9; } - if (EXT4_SB(sb)->s_journal) { - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) - descr = " journalled data mode"; - else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) - descr = " ordered data mode"; - else - descr = " writeback data mode"; - } else - descr = "out journal"; if (test_opt(sb, DISCARD)) { struct request_queue *q = bdev_get_queue(sb->s_bdev); @@ -4987,14 +5433,6 @@ no_journal: "the device does not support discard"); } - if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) - ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " - "Opts: %.*s%s%s. Quota mode: %s.", descr, - (int) sizeof(sbi->s_es->s_mount_opts), - sbi->s_es->s_mount_opts, - *sbi->s_es->s_mount_opts ? "; " : "", orig_data, - ext4_quota_mode(sb)); - if (es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ @@ -5005,7 +5443,6 @@ no_journal: atomic_set(&sbi->s_warning_count, 0); atomic_set(&sbi->s_msg_count, 0); - kfree(orig_data); return 0; cantfind_ext4: @@ -5077,7 +5514,7 @@ failed_mount: if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif @@ -5091,14 +5528,60 @@ failed_mount: ext4_blkdev_remove(sbi); out_fail: sb->s_fs_info = NULL; - kfree(sbi->s_blockgroup_lock); -out_free_base: - kfree(sbi); - kfree(orig_data); - fs_put_dax(dax_dev); return err ? err : ret; } +static int ext4_fill_super(struct super_block *sb, struct fs_context *fc) +{ + struct ext4_fs_context *ctx = fc->fs_private; + struct ext4_sb_info *sbi; + const char *descr; + int ret; + + sbi = ext4_alloc_sbi(sb); + if (!sbi) + return -ENOMEM; + + fc->s_fs_info = sbi; + + /* Cleanup superblock name */ + strreplace(sb->s_id, '/', '!'); + + sbi->s_sb_block = 1; /* Default super block location */ + if (ctx->spec & EXT4_SPEC_s_sb_block) + sbi->s_sb_block = ctx->s_sb_block; + + ret = __ext4_fill_super(fc, sb); + if (ret < 0) + goto free_sbi; + + if (sbi->s_journal) { + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + descr = " journalled data mode"; + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + descr = " ordered data mode"; + else + descr = " writeback data mode"; + } else + descr = "out journal"; + + if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) + ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " + "Quota mode: %s.", descr, ext4_quota_mode(sb)); + + return 0; + +free_sbi: + ext4_free_sbi(sbi); + fc->s_fs_info = NULL; + return ret; +} + +static int ext4_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, ext4_fill_super); +} + /* * Setup any per-fs journal parameters now. 
We'll do this both on * initial mount, once the journal has been initialised but before we've @@ -5727,11 +6210,12 @@ struct ext4_mount_options { #endif }; -static int ext4_remount(struct super_block *sb, int *flags, char *data) +static int __ext4_remount(struct fs_context *fc, struct super_block *sb) { + struct ext4_fs_context *ctx = fc->fs_private; struct ext4_super_block *es; struct ext4_sb_info *sbi = EXT4_SB(sb); - unsigned long old_sb_flags, vfs_flags; + unsigned long old_sb_flags; struct ext4_mount_options old_opts; ext4_group_t g; int err = 0; @@ -5740,14 +6224,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) int i, j; char *to_free[EXT4_MAXQUOTAS]; #endif - char *orig_data = kstrdup(data, GFP_KERNEL); - struct ext4_parsed_options parsed_opts; - parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO; - parsed_opts.journal_devnum = 0; - - if (data && !orig_data) - return -ENOMEM; + ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; /* Store the original options */ old_sb_flags = sb->s_flags; @@ -5768,28 +6246,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (!old_opts.s_qf_names[i]) { for (j = 0; j < i; j++) kfree(old_opts.s_qf_names[j]); - kfree(orig_data); return -ENOMEM; } } else old_opts.s_qf_names[i] = NULL; #endif if (sbi->s_journal && sbi->s_journal->j_task->io_context) - parsed_opts.journal_ioprio = + ctx->journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; - /* - * Some options can be enabled by ext4 and/or by VFS mount flag - * either way we need to make sure it matches in both *flags and - * s_flags. Copy those selected flags from *flags to s_flags - */ - vfs_flags = SB_LAZYTIME | SB_I_VERSION; - sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags); - - if (!parse_options(data, sb, &parsed_opts, 1)) { - err = -EINVAL; - goto restore_opts; - } + ext4_apply_options(fc, sb); if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ test_opt(sb, JOURNAL_CHECKSUM)) { @@ -5836,19 +6302,19 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal) { ext4_init_journal_params(sb, sbi->s_journal); - set_task_ioprio(sbi->s_journal->j_task, parsed_opts.journal_ioprio); + set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio); } /* Flush outstanding errors before changing fs state */ flush_work(&sbi->s_error_work); - if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { + if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) { err = -EROFS; goto restore_opts; } - if (*flags & SB_RDONLY) { + if (fc->sb_flags & SB_RDONLY) { err = sync_filesystem(sb); if (err < 0) goto restore_opts; @@ -5996,16 +6462,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) ext4_stop_mmpd(sbi); - /* - * Some options can be enabled by ext4 and/or by VFS mount flag - * either way we need to make sure it matches in both *flags and - * s_flags. Copy those selected flags from s_flags to *flags - */ - *flags = (*flags & ~vfs_flags) | (sb->s_flags & vfs_flags); - - ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s. 
Quota mode: %s.", - orig_data, ext4_quota_mode(sb)); - kfree(orig_data); return 0; restore_opts: @@ -6031,10 +6487,30 @@ restore_opts: #endif if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) ext4_stop_mmpd(sbi); - kfree(orig_data); return err; } +static int ext4_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + int ret; + + fc->s_fs_info = EXT4_SB(sb); + + ret = ext4_check_opt_consistency(fc, sb); + if (ret < 0) + return ret; + + ret = __ext4_remount(fc, sb); + if (ret < 0) + return ret; + + ext4_msg(sb, KERN_INFO, "re-mounted. Quota mode: %s.", + ext4_quota_mode(sb)); + + return 0; +} + #ifdef CONFIG_QUOTA static int ext4_statfs_project(struct super_block *sb, kprojid_t projid, struct kstatfs *buf) @@ -6275,10 +6751,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA); err = dquot_quota_on(sb, type, format_id, path); - if (err) { - lockdep_set_quota_inode(path->dentry->d_inode, - I_DATA_SEM_NORMAL); - } else { + if (!err) { struct inode *inode = d_inode(path->dentry); handle_t *handle; @@ -6298,7 +6771,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, ext4_journal_stop(handle); unlock_inode: inode_unlock(inode); + if (err) + dquot_quota_off(sb, type); } + if (err) + lockdep_set_quota_inode(path->dentry->d_inode, + I_DATA_SEM_NORMAL); return err; } @@ -6361,8 +6839,19 @@ int ext4_enable_quotas(struct super_block *sb) "Failed to enable quota tracking " "(type=%d, err=%d). Please run " "e2fsck to fix.", type, err); - for (type--; type >= 0; type--) + for (type--; type >= 0; type--) { + struct inode *inode; + + inode = sb_dqopt(sb)->files[type]; + if (inode) + inode = igrab(inode); dquot_quota_off(sb, type); + if (inode) { + lockdep_set_quota_inode(inode, + I_DATA_SEM_NORMAL); + iput(inode); + } + } return err; } @@ -6466,7 +6955,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, struct buffer_head *bh; handle_t *handle = journal_current_handle(); - if (EXT4_SB(sb)->s_journal && !handle) { + if (!handle) { ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" " cancelled because transaction is not started", (unsigned long long)off, (unsigned long long)len); @@ -6517,12 +7006,6 @@ out: } #endif -static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) -{ - return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super); -} - #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static inline void register_as_ext2(void) { @@ -6580,11 +7063,12 @@ static inline int ext3_feature_set_ok(struct super_block *sb) } static struct file_system_type ext4_fs_type = { - .owner = THIS_MODULE, - .name = "ext4", - .mount = ext4_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .owner = THIS_MODULE, + .name = "ext4", + .init_fs_context = ext4_init_fs_context, + .parameters = ext4_param_specs, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("ext4"); @@ -6649,6 +7133,7 @@ static int __init ext4_init_fs(void) out: unregister_as_ext2(); unregister_as_ext3(); + ext4_fc_destroy_dentry_cache(); out05: destroy_inodecache(); out1: @@ -6675,6 +7160,7 @@ static void __exit ext4_exit_fs(void) unregister_as_ext2(); unregister_as_ext3(); unregister_filesystem(&ext4_fs_type); + ext4_fc_destroy_dentry_cache(); destroy_inodecache(); ext4_exit_mballoc(); 
ext4_exit_sysfs(); diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 2314f7446592..d233c24ea342 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -63,7 +63,7 @@ static ssize_t session_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) { struct super_block *sb = sbi->s_buddy_cache->i_sb; - return snprintf(buf, PAGE_SIZE, "%lu\n", + return sysfs_emit(buf, "%lu\n", (part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - sbi->s_sectors_written_start) >> 1); } @@ -72,7 +72,7 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) { struct super_block *sb = sbi->s_buddy_cache->i_sb; - return snprintf(buf, PAGE_SIZE, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - EXT4_SB(sb)->s_sectors_written_start) >> 1))); @@ -130,8 +130,8 @@ static ssize_t trigger_test_error(struct ext4_sb_info *sbi, static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf) { if (!sbi->s_journal) - return snprintf(buf, PAGE_SIZE, "<none>\n"); - return snprintf(buf, PAGE_SIZE, "%d\n", + return sysfs_emit(buf, "<none>\n"); + return sysfs_emit(buf, "%d\n", task_pid_vnr(sbi->s_journal->j_task)); } @@ -245,6 +245,7 @@ EXT4_ATTR(last_error_time, 0444, last_error_time); EXT4_ATTR(journal_task, 0444, journal_task); EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch); EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit); +EXT4_RW_ATTR_SBI_UL(last_trim_minblks, s_last_trim_minblks); static unsigned int old_bump_val = 128; EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); @@ -295,6 +296,7 @@ static struct attribute *ext4_attrs[] = { #endif ATTR_LIST(mb_prefetch), ATTR_LIST(mb_prefetch_limit), + ATTR_LIST(last_trim_minblks), NULL, }; ATTRIBUTE_GROUPS(ext4); @@ -307,7 +309,7 @@ EXT4_ATTR_FEATURE(meta_bg_resize); EXT4_ATTR_FEATURE(encryption); EXT4_ATTR_FEATURE(test_dummy_encryption_v2); #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) EXT4_ATTR_FEATURE(casefold); #endif #ifdef CONFIG_FS_VERITY @@ -315,7 +317,7 @@ EXT4_ATTR_FEATURE(verity); #endif EXT4_ATTR_FEATURE(metadata_csum_seed); EXT4_ATTR_FEATURE(fast_commit); -#if defined(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) +#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) EXT4_ATTR_FEATURE(encrypted_casefold); #endif @@ -327,7 +329,7 @@ static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(encryption), ATTR_LIST(test_dummy_encryption_v2), #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) ATTR_LIST(casefold), #endif #ifdef CONFIG_FS_VERITY @@ -335,7 +337,7 @@ static struct attribute *ext4_feat_attrs[] = { #endif ATTR_LIST(metadata_csum_seed), ATTR_LIST(fast_commit), -#if defined(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) +#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) ATTR_LIST(encrypted_casefold), #endif NULL, @@ -357,7 +359,7 @@ static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi) static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi) { - return snprintf(buf, PAGE_SIZE, "%lld\n", + return sysfs_emit(buf, "%lld\n", ((time64_t)hi << 32) + le32_to_cpu(lo)); } @@ -374,7 +376,7 @@ static ssize_t ext4_attr_show(struct kobject *kobj, switch (a->attr_id) { case attr_delayed_allocation_blocks: - return snprintf(buf, PAGE_SIZE, "%llu\n", + return sysfs_emit(buf, "%llu\n", (s64) EXT4_C2B(sbi, percpu_counter_sum(&sbi->s_dirtyclusters_counter))); case attr_session_write_kbytes: @@ -382,11 +384,11 @@ static ssize_t ext4_attr_show(struct kobject *kobj, 
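The sysfs show methods in this file (the conversion continues in the hunks that follow) move from snprintf(buf, PAGE_SIZE, ...) to sysfs_emit(buf, ...), the preferred helper for sysfs output: it checks that buf is the page-aligned buffer the sysfs core passed in and it never writes past PAGE_SIZE. A minimal attribute using the same helper might look like the sketch below; "myattr", its backing variable and the kobject it would be attached to are hypothetical:

/*
 * Hypothetical attribute; not part of this patch.
 */
#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

static unsigned long myattr_value;

static ssize_t myattr_show(struct kobject *kobj, struct kobj_attribute *attr,
			   char *buf)
{
	/* output bounded to PAGE_SIZE, buf checked for page alignment */
	return sysfs_emit(buf, "%lu\n", myattr_value);
}

static ssize_t myattr_store(struct kobject *kobj, struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	int err = kstrtoul(buf, 0, &myattr_value);

	return err ? err : count;
}

static struct kobj_attribute myattr_attr =
	__ATTR(myattr, 0644, myattr_show, myattr_store);
/* exposed with sysfs_create_file(kobj, &myattr_attr.attr) */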
case attr_lifetime_write_kbytes: return lifetime_write_kbytes_show(sbi, buf); case attr_reserved_clusters: - return snprintf(buf, PAGE_SIZE, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long) atomic64_read(&sbi->s_resv_clusters)); case attr_sra_exceeded_retry_limit: - return snprintf(buf, PAGE_SIZE, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long) percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit)); case attr_inode_readahead: @@ -394,42 +396,42 @@ static ssize_t ext4_attr_show(struct kobject *kobj, if (!ptr) return 0; if (a->attr_ptr == ptr_ext4_super_block_offset) - return snprintf(buf, PAGE_SIZE, "%u\n", + return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr)); else - return snprintf(buf, PAGE_SIZE, "%u\n", + return sysfs_emit(buf, "%u\n", *((unsigned int *) ptr)); case attr_pointer_ul: if (!ptr) return 0; - return snprintf(buf, PAGE_SIZE, "%lu\n", + return sysfs_emit(buf, "%lu\n", *((unsigned long *) ptr)); case attr_pointer_u8: if (!ptr) return 0; - return snprintf(buf, PAGE_SIZE, "%u\n", + return sysfs_emit(buf, "%u\n", *((unsigned char *) ptr)); case attr_pointer_u64: if (!ptr) return 0; if (a->attr_ptr == ptr_ext4_super_block_offset) - return snprintf(buf, PAGE_SIZE, "%llu\n", + return sysfs_emit(buf, "%llu\n", le64_to_cpup(ptr)); else - return snprintf(buf, PAGE_SIZE, "%llu\n", + return sysfs_emit(buf, "%llu\n", *((unsigned long long *) ptr)); case attr_pointer_string: if (!ptr) return 0; - return snprintf(buf, PAGE_SIZE, "%.*s\n", a->attr_size, + return sysfs_emit(buf, "%.*s\n", a->attr_size, (char *) ptr); case attr_pointer_atomic: if (!ptr) return 0; - return snprintf(buf, PAGE_SIZE, "%d\n", + return sysfs_emit(buf, "%d\n", atomic_read((atomic_t *) ptr)); case attr_feature: - return snprintf(buf, PAGE_SIZE, "supported\n"); + return sysfs_emit(buf, "supported\n"); case attr_first_error_time: return print_tstamp(buf, sbi->s_es, s_first_error_time); case attr_last_error_time: diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 1e0fc1ed845b..042325349098 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2408,7 +2408,7 @@ retry_inode: if (IS_SYNC(inode)) ext4_handle_sync(handle); } - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); cleanup: brelse(is.iloc.bh); @@ -2486,7 +2486,7 @@ retry: if (error == 0) error = error2; } - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, NULL); return error; } @@ -2920,7 +2920,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, error); goto cleanup; } - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); } error = 0; cleanup: diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 7eea3cfd894d..f46a7339d6cf 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -7,6 +7,7 @@ config F2FS_FS select CRYPTO_CRC32 select F2FS_FS_XATTR if FS_ENCRYPTION select FS_ENCRYPTION_ALGS if FS_ENCRYPTION + select FS_IOMAP select LZ4_COMPRESS if F2FS_FS_LZ4 select LZ4_DECOMPRESS if F2FS_FS_LZ4 select LZ4HC_COMPRESS if F2FS_FS_LZ4HC diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f1693d45bb78..982f0170639f 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -664,7 +664,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) /* truncate all the data during iput */ iput(inode); - err = f2fs_get_node_info(sbi, ino, &ni); + err = 
f2fs_get_node_info(sbi, ino, &ni, false); if (err) goto err_out; @@ -1302,8 +1302,8 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) unsigned long flags; if (cpc->reason & CP_UMOUNT) { - if (le32_to_cpu(ckpt->cp_pack_total_block_count) > - sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) { + if (le32_to_cpu(ckpt->cp_pack_total_block_count) + + NM_I(sbi)->nat_bits_blocks > sbi->blocks_per_seg) { clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); f2fs_notice(sbi, "Disable nat_bits due to no space"); } else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) && diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 49121a21f749..d0c3aeba5945 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -154,6 +154,7 @@ void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse) cc->rpages = NULL; cc->nr_rpages = 0; cc->nr_cpages = 0; + cc->valid_nr_cpages = 0; if (!reuse) cc->cluster_idx = NULL_CLUSTER; } @@ -620,7 +621,6 @@ static int f2fs_compress_pages(struct compress_ctx *cc) const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; unsigned int max_len, new_nr_cpages; - struct page **new_cpages; u32 chksum = 0; int i, ret; @@ -635,6 +635,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) max_len = COMPRESS_HEADER_SIZE + cc->clen; cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE); + cc->valid_nr_cpages = cc->nr_cpages; cc->cpages = page_array_alloc(cc->inode, cc->nr_cpages); if (!cc->cpages) { @@ -685,13 +686,6 @@ static int f2fs_compress_pages(struct compress_ctx *cc) new_nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE); - /* Now we're going to cut unnecessary tail pages */ - new_cpages = page_array_alloc(cc->inode, new_nr_cpages); - if (!new_cpages) { - ret = -ENOMEM; - goto out_vunmap_cbuf; - } - /* zero out any unused part of the last page */ memset(&cc->cbuf->cdata[cc->clen], 0, (new_nr_cpages * PAGE_SIZE) - @@ -701,10 +695,8 @@ static int f2fs_compress_pages(struct compress_ctx *cc) vm_unmap_ram(cc->rbuf, cc->cluster_size); for (i = 0; i < cc->nr_cpages; i++) { - if (i < new_nr_cpages) { - new_cpages[i] = cc->cpages[i]; + if (i < new_nr_cpages) continue; - } f2fs_compress_free_page(cc->cpages[i]); cc->cpages[i] = NULL; } @@ -712,9 +704,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) if (cops->destroy_compress_ctx) cops->destroy_compress_ctx(cc); - page_array_free(cc->inode, cc->cpages, cc->nr_cpages); - cc->cpages = new_cpages; - cc->nr_cpages = new_nr_cpages; + cc->valid_nr_cpages = new_nr_cpages; trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, cc->clen, ret); @@ -1296,7 +1286,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, psize = (loff_t)(cc->rpages[last_index]->index + 1) << PAGE_SHIFT; - err = f2fs_get_node_info(fio.sbi, dn.nid, &ni); + err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false); if (err) goto out_put_dnode; @@ -1308,14 +1298,14 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, cic->magic = F2FS_COMPRESSED_PAGE_MAGIC; cic->inode = inode; - atomic_set(&cic->pending_pages, cc->nr_cpages); + atomic_set(&cic->pending_pages, cc->valid_nr_cpages); cic->rpages = page_array_alloc(cc->inode, cc->cluster_size); if (!cic->rpages) goto out_put_cic; cic->nr_rpages = cc->cluster_size; - for (i = 0; i < cc->nr_cpages; i++) { + for (i = 0; i < cc->valid_nr_cpages; i++) { f2fs_set_compressed_page(cc->cpages[i], inode, cc->rpages[i + 1]->index, cic); fio.compressed_page = cc->cpages[i]; @@ -1360,7 +1350,7 @@ static int 
f2fs_write_compressed_pages(struct compress_ctx *cc, if (fio.compr_blocks && __is_valid_data_blkaddr(blkaddr)) fio.compr_blocks++; - if (i > cc->nr_cpages) { + if (i > cc->valid_nr_cpages) { if (__is_valid_data_blkaddr(blkaddr)) { f2fs_invalidate_blocks(sbi, blkaddr); f2fs_update_data_blkaddr(&dn, NEW_ADDR); @@ -1385,8 +1375,8 @@ unlock_continue: if (fio.compr_blocks) f2fs_i_compr_blocks_update(inode, fio.compr_blocks - 1, false); - f2fs_i_compr_blocks_update(inode, cc->nr_cpages, true); - add_compr_block_stat(inode, cc->nr_cpages); + f2fs_i_compr_blocks_update(inode, cc->valid_nr_cpages, true); + add_compr_block_stat(inode, cc->valid_nr_cpages); set_inode_flag(cc->inode, FI_APPEND_WRITE); if (cc->cluster_idx == 0) @@ -1424,9 +1414,7 @@ out_unlock_op: else f2fs_unlock_op(sbi); out_free: - for (i = 0; i < cc->nr_cpages; i++) { - if (!cc->cpages[i]) - continue; + for (i = 0; i < cc->valid_nr_cpages; i++) { f2fs_compress_free_page(cc->cpages[i]); cc->cpages[i] = NULL; } @@ -1468,25 +1456,38 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, enum iostat_type io_type) { struct address_space *mapping = cc->inode->i_mapping; - int _submitted, compr_blocks, ret; - int i = -1, err = 0; + int _submitted, compr_blocks, ret, i; compr_blocks = f2fs_compressed_blocks(cc); - if (compr_blocks < 0) { - err = compr_blocks; - goto out_err; + + for (i = 0; i < cc->cluster_size; i++) { + if (!cc->rpages[i]) + continue; + + redirty_page_for_writepage(wbc, cc->rpages[i]); + unlock_page(cc->rpages[i]); } + if (compr_blocks < 0) + return compr_blocks; + for (i = 0; i < cc->cluster_size; i++) { if (!cc->rpages[i]) continue; retry_write: + lock_page(cc->rpages[i]); + if (cc->rpages[i]->mapping != mapping) { +continue_unlock: unlock_page(cc->rpages[i]); continue; } - BUG_ON(!PageLocked(cc->rpages[i])); + if (!PageDirty(cc->rpages[i])) + goto continue_unlock; + + if (!clear_page_dirty_for_io(cc->rpages[i])) + goto continue_unlock; ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted, NULL, NULL, wbc, io_type, @@ -1501,26 +1502,15 @@ retry_write: * avoid deadlock caused by cluster update race * from foreground operation. 
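The rewritten f2fs_write_raw_pages() (continuing below) first redirties and unlocks every page in the cluster, then reacquires each page and revalidates it before issuing I/O. The lock_page / mapping check / PageDirty check / clear_page_dirty_for_io() sequence is the standard writeback dance for pages that were unlocked in between; a generic sketch of it, with placeholder names not taken from this patch, is shown here:

/*
 * Generic sketch of that sequence; myfs_writeback_page() and
 * my_write_one_page() are placeholders, not functions from this patch.
 */
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>

int my_write_one_page(struct page *page, struct writeback_control *wbc);

static int myfs_writeback_page(struct address_space *mapping,
			       struct page *page,
			       struct writeback_control *wbc)
{
	lock_page(page);

	/* the page may have been truncated or migrated while it was unlocked */
	if (page->mapping != mapping)
		goto out_unlock;

	/* someone else may already have written it back */
	if (!PageDirty(page))
		goto out_unlock;

	/* claim the page for I/O; skip it if we lose the race */
	if (!clear_page_dirty_for_io(page))
		goto out_unlock;

	return my_write_one_page(page, wbc);	/* unlocks the page itself */

out_unlock:
	unlock_page(page);
	return 0;
}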
*/ - if (IS_NOQUOTA(cc->inode)) { - err = 0; - goto out_err; - } + if (IS_NOQUOTA(cc->inode)) + return 0; ret = 0; cond_resched(); congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); - lock_page(cc->rpages[i]); - - if (!PageDirty(cc->rpages[i])) { - unlock_page(cc->rpages[i]); - continue; - } - - clear_page_dirty_for_io(cc->rpages[i]); goto retry_write; } - err = ret; - goto out_err; + return ret; } *submitted += _submitted; @@ -1529,14 +1519,6 @@ retry_write: f2fs_balance_fs(F2FS_M_SB(mapping), true); return 0; -out_err: - for (++i; i < cc->cluster_size; i++) { - if (!cc->rpages[i]) - continue; - redirty_page_for_writepage(wbc, cc->rpages[i]); - unlock_page(cc->rpages[i]); - } - return err; } int f2fs_write_multi_pages(struct compress_ctx *cc, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9f754aaef558..8c417864c66a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -8,9 +8,9 @@ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/buffer_head.h> +#include <linux/sched/mm.h> #include <linux/mpage.h> #include <linux/writeback.h> -#include <linux/backing-dev.h> #include <linux/pagevec.h> #include <linux/blkdev.h> #include <linux/bio.h> @@ -18,9 +18,9 @@ #include <linux/swap.h> #include <linux/prefetch.h> #include <linux/uio.h> -#include <linux/cleancache.h> #include <linux/sched/signal.h> #include <linux/fiemap.h> +#include <linux/iomap.h> #include "f2fs.h" #include "node.h" @@ -1354,7 +1354,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - err = f2fs_get_node_info(sbi, dn->nid, &ni); + err = f2fs_get_node_info(sbi, dn->nid, &ni, false); if (err) return err; @@ -1376,61 +1376,9 @@ alloc: f2fs_invalidate_compress_page(sbi, old_blkaddr); } f2fs_update_data_blkaddr(dn, dn->data_blkaddr); - - /* - * i_size will be updated by direct_IO. Otherwise, we'll get stale - * data from unwritten block via dio_read. - */ return 0; } -int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct f2fs_map_blocks map; - int flag; - int err = 0; - bool direct_io = iocb->ki_flags & IOCB_DIRECT; - - map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); - map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); - if (map.m_len > map.m_lblk) - map.m_len -= map.m_lblk; - else - map.m_len = 0; - - map.m_next_pgofs = NULL; - map.m_next_extent = NULL; - map.m_seg_type = NO_CHECK_TYPE; - map.m_may_create = true; - - if (direct_io) { - map.m_seg_type = f2fs_rw_hint_to_seg_type(iocb->ki_hint); - flag = f2fs_force_buffered_io(inode, iocb, from) ? 
- F2FS_GET_BLOCK_PRE_AIO : - F2FS_GET_BLOCK_PRE_DIO; - goto map_blocks; - } - if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { - err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } - if (f2fs_has_inline_data(inode)) - return err; - - flag = F2FS_GET_BLOCK_PRE_AIO; - -map_blocks: - err = f2fs_map_blocks(inode, &map, 1, flag); - if (map.m_len > 0 && err == -ENOSPC) { - if (!direct_io) - set_inode_flag(inode, FI_NO_PREALLOC); - err = 0; - } - return err; -} - void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) { if (flag == F2FS_GET_BLOCK_PRE_AIO) { @@ -1590,8 +1538,11 @@ next_block: flag != F2FS_GET_BLOCK_DIO); err = __allocate_data_block(&dn, map->m_seg_type); - if (!err) + if (!err) { + if (flag == F2FS_GET_BLOCK_PRE_DIO) + file_need_truncate(inode); set_inode_flag(inode, FI_APPEND_WRITE); + } } if (err) goto sync_out; @@ -1786,50 +1737,6 @@ static inline u64 blks_to_bytes(struct inode *inode, u64 blks) return (blks << inode->i_blkbits); } -static int __get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create, int flag, - pgoff_t *next_pgofs, int seg_type, bool may_write) -{ - struct f2fs_map_blocks map; - int err; - - map.m_lblk = iblock; - map.m_len = bytes_to_blks(inode, bh->b_size); - map.m_next_pgofs = next_pgofs; - map.m_next_extent = NULL; - map.m_seg_type = seg_type; - map.m_may_create = may_write; - - err = f2fs_map_blocks(inode, &map, create, flag); - if (!err) { - map_bh(bh, inode->i_sb, map.m_pblk); - bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; - bh->b_size = blks_to_bytes(inode, map.m_len); - - if (map.m_multidev_dio) - bh->b_bdev = map.m_bdev; - } - return err; -} - -static int get_data_block_dio_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO, NULL, - f2fs_rw_hint_to_seg_type(inode->i_write_hint), - true); -} - -static int get_data_block_dio(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO, NULL, - f2fs_rw_hint_to_seg_type(inode->i_write_hint), - false); -} - static int f2fs_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) { @@ -1849,7 +1756,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!page) return -ENOMEM; - err = f2fs_get_node_info(sbi, inode->i_ino, &ni); + err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false); if (err) { f2fs_put_page(page, 1); return err; @@ -1881,7 +1788,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!page) return -ENOMEM; - err = f2fs_get_node_info(sbi, xnid, &ni); + err = f2fs_get_node_info(sbi, xnid, &ni, false); if (err) { f2fs_put_page(page, 1); return err; @@ -2127,12 +2034,6 @@ got_it: block_nr = map->m_pblk + block_in_file - map->m_lblk; SetPageMappedToDisk(page); - if (!PageUptodate(page) && (!PageSwapCache(page) && - !cleancache_get_page(page))) { - SetPageUptodate(page); - goto confused; - } - if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr, DATA_GENERIC_ENHANCE_READ)) { ret = -EFSCORRUPTED; @@ -2188,12 +2089,6 @@ submit_and_realloc: ClearPageError(page); *last_block_in_bio = block_nr; goto out; -confused: - if (bio) { - __submit_bio(F2FS_I_SB(inode), bio, DATA); - bio = NULL; - } - unlock_page(page); out: *bio_ret = bio; return ret; @@ -2542,7 +2437,7 @@ retry_encrypt: /* flush pending IOs and wait for a while in the ENOMEM case */ if 
(PTR_ERR(fio->encrypted_page) == -ENOMEM) { f2fs_flush_merged_writes(fio->sbi); - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); gfp_flags |= __GFP_NOFAIL; goto retry_encrypt; } @@ -2617,6 +2512,11 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + /* The below cases were checked when setting it. */ + if (f2fs_is_pinned_file(inode)) + return false; + if (fio && is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + return true; if (f2fs_lfs_mode(sbi)) return true; if (S_ISDIR(inode->i_mode)) @@ -2625,8 +2525,6 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) return true; if (f2fs_is_atomic_file(inode)) return true; - if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) - return true; /* swap file is migrating in aligned write mode */ if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) @@ -2738,7 +2636,7 @@ got_it: fio->need_lock = LOCK_REQ; } - err = f2fs_get_node_info(fio->sbi, dn.nid, &ni); + err = f2fs_get_node_info(fio->sbi, dn.nid, &ni, false); if (err) goto out_writepage; @@ -2987,6 +2885,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, .rpages = NULL, .nr_rpages = 0, .cpages = NULL, + .valid_nr_cpages = 0, .rbuf = NULL, .cbuf = NULL, .rlen = PAGE_SIZE * F2FS_I(inode)->i_cluster_size, @@ -3305,7 +3204,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, FS_CP_DATA_IO : FS_DATA_IO); } -static void f2fs_write_failed(struct inode *inode, loff_t to) +void f2fs_write_failed(struct inode *inode, loff_t to) { loff_t i_size = i_size_read(inode); @@ -3339,12 +3238,10 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, int flag; /* - * we already allocated all the blocks, so we don't need to get - * the block addresses when there is no need to fill the page. + * If a whole page is being written and we already preallocated all the + * blocks, then there is no need to get a block address now. */ - if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE && - !is_inode_flag_set(inode, FI_NO_PREALLOC) && - !f2fs_verity_in_progress(inode)) + if (len == PAGE_SIZE && is_inode_flag_set(inode, FI_PREALLOCATED_ALL)) return 0; /* f2fs_lock_op avoids race between write CP and convert_inline_page */ @@ -3595,158 +3492,6 @@ unlock_out: return copied; } -static int check_direct_IO(struct inode *inode, struct iov_iter *iter, - loff_t offset) -{ - unsigned i_blkbits = READ_ONCE(inode->i_blkbits); - unsigned blkbits = i_blkbits; - unsigned blocksize_mask = (1 << blkbits) - 1; - unsigned long align = offset | iov_iter_alignment(iter); - struct block_device *bdev = inode->i_sb->s_bdev; - - if (iov_iter_rw(iter) == READ && offset >= i_size_read(inode)) - return 1; - - if (align & blocksize_mask) { - if (bdev) - blkbits = blksize_bits(bdev_logical_block_size(bdev)); - blocksize_mask = (1 << blkbits) - 1; - if (align & blocksize_mask) - return -EINVAL; - return 1; - } - return 0; -} - -static void f2fs_dio_end_io(struct bio *bio) -{ - struct f2fs_private_dio *dio = bio->bi_private; - - dec_page_count(F2FS_I_SB(dio->inode), - dio->write ? 
F2FS_DIO_WRITE : F2FS_DIO_READ); - - bio->bi_private = dio->orig_private; - bio->bi_end_io = dio->orig_end_io; - - kfree(dio); - - bio_endio(bio); -} - -static void f2fs_dio_submit_bio(struct bio *bio, struct inode *inode, - loff_t file_offset) -{ - struct f2fs_private_dio *dio; - bool write = (bio_op(bio) == REQ_OP_WRITE); - - dio = f2fs_kzalloc(F2FS_I_SB(inode), - sizeof(struct f2fs_private_dio), GFP_NOFS); - if (!dio) - goto out; - - dio->inode = inode; - dio->orig_end_io = bio->bi_end_io; - dio->orig_private = bio->bi_private; - dio->write = write; - - bio->bi_end_io = f2fs_dio_end_io; - bio->bi_private = dio; - - inc_page_count(F2FS_I_SB(inode), - write ? F2FS_DIO_WRITE : F2FS_DIO_READ); - - submit_bio(bio); - return; -out: - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); -} - -static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); - size_t count = iov_iter_count(iter); - loff_t offset = iocb->ki_pos; - int rw = iov_iter_rw(iter); - int err; - enum rw_hint hint = iocb->ki_hint; - int whint_mode = F2FS_OPTION(sbi).whint_mode; - bool do_opu; - - err = check_direct_IO(inode, iter, offset); - if (err) - return err < 0 ? err : 0; - - if (f2fs_force_buffered_io(inode, iocb, iter)) - return 0; - - do_opu = rw == WRITE && f2fs_lfs_mode(sbi); - - trace_f2fs_direct_IO_enter(inode, offset, count, rw); - - if (rw == WRITE && whint_mode == WHINT_MODE_OFF) - iocb->ki_hint = WRITE_LIFE_NOT_SET; - - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!down_read_trylock(&fi->i_gc_rwsem[rw])) { - iocb->ki_hint = hint; - err = -EAGAIN; - goto out; - } - if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) { - up_read(&fi->i_gc_rwsem[rw]); - iocb->ki_hint = hint; - err = -EAGAIN; - goto out; - } - } else { - down_read(&fi->i_gc_rwsem[rw]); - if (do_opu) - down_read(&fi->i_gc_rwsem[READ]); - } - - err = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, - iter, rw == WRITE ? get_data_block_dio_write : - get_data_block_dio, NULL, f2fs_dio_submit_bio, - rw == WRITE ? 
DIO_LOCKING | DIO_SKIP_HOLES : - DIO_SKIP_HOLES); - - if (do_opu) - up_read(&fi->i_gc_rwsem[READ]); - - up_read(&fi->i_gc_rwsem[rw]); - - if (rw == WRITE) { - if (whint_mode == WHINT_MODE_OFF) - iocb->ki_hint = hint; - if (err > 0) { - f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, - err); - if (!do_opu) - set_inode_flag(inode, FI_UPDATE_WRITE); - } else if (err == -EIOCBQUEUED) { - f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, - count - iov_iter_count(iter)); - } else if (err < 0) { - f2fs_write_failed(inode, offset + count); - } - } else { - if (err > 0) - f2fs_update_iostat(sbi, APP_DIRECT_READ_IO, err); - else if (err == -EIOCBQUEUED) - f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_READ_IO, - count - iov_iter_count(iter)); - } - -out: - trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); - - return err; -} - void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length) { @@ -3770,12 +3515,9 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, clear_page_private_gcing(page); - if (test_opt(sbi, COMPRESS_CACHE)) { - if (f2fs_compressed_file(inode)) - f2fs_invalidate_compress_pages(sbi, inode->i_ino); - if (inode->i_ino == F2FS_COMPRESS_INO(sbi)) - clear_page_private_data(page); - } + if (test_opt(sbi, COMPRESS_CACHE) && + inode->i_ino == F2FS_COMPRESS_INO(sbi)) + clear_page_private_data(page); if (page_private_atomic(page)) return f2fs_drop_inmem_page(inode, page); @@ -3795,12 +3537,9 @@ int f2fs_release_page(struct page *page, gfp_t wait) return 0; if (test_opt(F2FS_P_SB(page), COMPRESS_CACHE)) { - struct f2fs_sb_info *sbi = F2FS_P_SB(page); struct inode *inode = page->mapping->host; - if (f2fs_compressed_file(inode)) - f2fs_invalidate_compress_pages(sbi, inode->i_ino); - if (inode->i_ino == F2FS_COMPRESS_INO(sbi)) + if (inode->i_ino == F2FS_COMPRESS_INO(F2FS_I_SB(inode))) clear_page_private_data(page); } @@ -4202,7 +3941,7 @@ const struct address_space_operations f2fs_dblock_aops = { .set_page_dirty = f2fs_set_data_page_dirty, .invalidatepage = f2fs_invalidate_page, .releasepage = f2fs_release_page, - .direct_IO = f2fs_direct_IO, + .direct_IO = noop_direct_IO, .bmap = f2fs_bmap, .swap_activate = f2fs_swap_activate, .swap_deactivate = f2fs_swap_deactivate, @@ -4282,3 +4021,58 @@ void f2fs_destroy_bio_entry_cache(void) { kmem_cache_destroy(bio_entry_slab); } + +static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) +{ + struct f2fs_map_blocks map = {}; + pgoff_t next_pgofs = 0; + int err; + + map.m_lblk = bytes_to_blks(inode, offset); + map.m_len = bytes_to_blks(inode, offset + length - 1) - map.m_lblk + 1; + map.m_next_pgofs = &next_pgofs; + map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint); + if (flags & IOMAP_WRITE) + map.m_may_create = true; + + err = f2fs_map_blocks(inode, &map, flags & IOMAP_WRITE, + F2FS_GET_BLOCK_DIO); + if (err) + return err; + + iomap->offset = blks_to_bytes(inode, map.m_lblk); + + if (map.m_flags & (F2FS_MAP_MAPPED | F2FS_MAP_UNWRITTEN)) { + iomap->length = blks_to_bytes(inode, map.m_len); + if (map.m_flags & F2FS_MAP_MAPPED) { + iomap->type = IOMAP_MAPPED; + iomap->flags |= IOMAP_F_MERGED; + } else { + iomap->type = IOMAP_UNWRITTEN; + } + if (WARN_ON_ONCE(!__is_valid_data_blkaddr(map.m_pblk))) + return -EINVAL; + + iomap->bdev = map.m_bdev; + iomap->addr = blks_to_bytes(inode, map.m_pblk); + } else { + iomap->length = blks_to_bytes(inode, next_pgofs) - + iomap->offset; + iomap->type = IOMAP_HOLE; + 
iomap->addr = IOMAP_NULL_ADDR; + } + + if (map.m_flags & F2FS_MAP_NEW) + iomap->flags |= IOMAP_F_NEW; + if ((inode->i_state & I_DIRTY_DATASYNC) || + offset + length > i_size_read(inode)) + iomap->flags |= IOMAP_F_DIRTY; + + return 0; +} + +const struct iomap_ops f2fs_iomap_ops = { + .iomap_begin = f2fs_iomap_begin, +}; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 1820e9c106f7..166f08623362 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -16,7 +16,7 @@ #include "xattr.h" #include <trace/events/f2fs.h> -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) extern struct kmem_cache *f2fs_cf_name_slab; #endif @@ -79,7 +79,7 @@ unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de) int f2fs_init_casefolded_name(const struct inode *dir, struct f2fs_filename *fname) { -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) struct super_block *sb = dir->i_sb; if (IS_CASEFOLDED(dir)) { @@ -174,7 +174,7 @@ void f2fs_free_filename(struct f2fs_filename *fname) kfree(fname->crypto_buf.name); fname->crypto_buf.name = NULL; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (fname->cf_name.name) { kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); fname->cf_name.name = NULL; @@ -208,7 +208,7 @@ static struct f2fs_dir_entry *find_in_block(struct inode *dir, return f2fs_find_target_dentry(&d, fname, max_slots); } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* * Test whether a case-insensitive directory entry matches the filename * being searched for. @@ -266,7 +266,7 @@ static inline int f2fs_match_name(const struct inode *dir, { struct fscrypt_name f; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (fname->cf_name.name) { struct qstr cf = FSTR_TO_QSTR(&fname->cf_name); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ce9fc9f13000..68b44015514f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -28,6 +28,8 @@ #include <linux/fscrypt.h> #include <linux/fsverity.h> +struct pagevec; + #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) #else @@ -56,6 +58,7 @@ enum { FAULT_WRITE_IO, FAULT_SLAB_ALLOC, FAULT_DQUOT_INIT, + FAULT_LOCK_OP, FAULT_MAX, }; @@ -485,7 +488,7 @@ struct f2fs_filename { */ struct fscrypt_str crypto_buf; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* * For casefolded directories: the casefolded name, but it's left NULL * if the original name is not valid Unicode, if the directory is both @@ -654,6 +657,7 @@ enum { #define FADVISE_KEEP_SIZE_BIT 0x10 #define FADVISE_HOT_BIT 0x20 #define FADVISE_VERITY_BIT 0x40 +#define FADVISE_TRUNC_BIT 0x80 #define FADVISE_MODIFIABLE_BITS (FADVISE_COLD_BIT | FADVISE_HOT_BIT) @@ -681,6 +685,10 @@ enum { #define file_is_verity(inode) is_file(inode, FADVISE_VERITY_BIT) #define file_set_verity(inode) set_file(inode, FADVISE_VERITY_BIT) +#define file_should_truncate(inode) is_file(inode, FADVISE_TRUNC_BIT) +#define file_need_truncate(inode) set_file(inode, FADVISE_TRUNC_BIT) +#define file_dont_truncate(inode) clear_file(inode, FADVISE_TRUNC_BIT) + #define DEF_DIR_LEVEL 0 enum { @@ -715,7 +723,7 @@ enum { FI_INLINE_DOTS, /* indicate inline dot dentries */ FI_DO_DEFRAG, /* indicate defragment is running */ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ - FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ + FI_PREALLOCATED_ALL, /* all blocks for write were preallocated */ FI_HOT_DATA, /* indicate file is hot */ FI_EXTRA_ATTR, /* indicate file has extra attribute */ FI_PROJ_INHERIT, /* indicate file inherits projectid */ @@ -1018,6 +1026,7 @@ 
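f2fs_iomap_begin() and f2fs_iomap_ops, added above, translate an f2fs block mapping into the generic struct iomap, which is what lets the direct I/O paths later in this patch hand bio construction to fs/iomap instead of the removed __blockdev_direct_IO() code. For a filesystem that does not need f2fs's extra in-flight page accounting, the usual entry point is the higher-level iomap_dio_rw() helper; a rough sketch of a read path built on the same kind of ->iomap_begin() callback (the myfs names, the NULL dio_ops and the shared-lock choice are assumptions, not code from this patch):

#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/uio.h>

extern const struct iomap_ops myfs_iomap_ops;	/* ->iomap_begin like the one above */

static ssize_t myfs_dio_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock_shared(inode);
	/*
	 * iomap_dio_rw() repeatedly calls ->iomap_begin() to learn the
	 * extent layout and builds/submits the bios itself; the trailing
	 * 0, 0 mean no extra dio flags and no bytes completed beforehand.
	 */
	ret = iomap_dio_rw(iocb, to, &myfs_iomap_ops, NULL, 0, 0);
	inode_unlock_shared(inode);

	file_accessed(iocb->ki_filp);
	return ret;
}

f2fs itself uses the lower-level __iomap_dio_rw()/iomap_dio_complete() pair later in this patch so that its F2FS_DIO_READ/F2FS_DIO_WRITE page counters stay exact even when a request is queued asynchronously.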
struct f2fs_sm_info { unsigned int segment_count; /* total # of segments */ unsigned int main_segments; /* # of segments in main area */ unsigned int reserved_segments; /* # of reserved segments */ + unsigned int additional_reserved_segments;/* reserved segs for IO align feature */ unsigned int ovp_segments; /* # of overprovision segments */ /* a threshold to reclaim prefree segments */ @@ -1486,6 +1495,7 @@ struct compress_ctx { unsigned int nr_rpages; /* total page number in rpages */ struct page **cpages; /* pages store compressed data in cluster */ unsigned int nr_cpages; /* total page number in cpages */ + unsigned int valid_nr_cpages; /* valid page number in cpages */ void *rbuf; /* virtual mapped address on rpages */ struct compress_data *cbuf; /* virtual mapped address on cpages */ size_t rlen; /* valid data length in rbuf */ @@ -1677,6 +1687,9 @@ struct f2fs_sb_info { unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ unsigned int next_victim_seg[2]; /* next segment in victim section */ + spinlock_t gc_urgent_high_lock; + bool gc_urgent_high_limited; /* indicates having limited trial count */ + unsigned int gc_urgent_high_remaining; /* remaining trial count for GC_URGENT_HIGH */ /* for skip statistic */ unsigned int atomic_files; /* # of opened atomic file */ @@ -1801,13 +1814,6 @@ struct f2fs_sb_info { #endif }; -struct f2fs_private_dio { - struct inode *inode; - void *orig_private; - bio_end_io_t *orig_end_io; - bool write; -}; - #ifdef CONFIG_F2FS_FAULT_INJECTION #define f2fs_show_injection_info(sbi, type) \ printk_ratelimited("%sF2FS-fs (%s) : inject %s in %s of %pS\n", \ @@ -2093,6 +2099,10 @@ static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) { + if (time_to_inject(sbi, FAULT_LOCK_OP)) { + f2fs_show_injection_info(sbi, FAULT_LOCK_OP); + return 0; + } return down_read_trylock(&sbi->cp_rwsem); } @@ -2198,6 +2208,11 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, if (!__allow_reserved_blocks(sbi, inode, true)) avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; + + if (F2FS_IO_ALIGNED(sbi)) + avail_user_block_count -= sbi->blocks_per_seg * + SM_I(sbi)->additional_reserved_segments; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { if (avail_user_block_count > sbi->unusable_block_count) avail_user_block_count -= sbi->unusable_block_count; @@ -2444,6 +2459,11 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, if (!__allow_reserved_blocks(sbi, inode, false)) valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks; + + if (F2FS_IO_ALIGNED(sbi)) + valid_block_count += sbi->blocks_per_seg * + SM_I(sbi)->additional_reserved_segments; + user_block_count = sbi->user_block_count; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) user_block_count -= sbi->unusable_block_count; @@ -3116,12 +3136,16 @@ static inline int is_file(struct inode *inode, int type) static inline void set_file(struct inode *inode, int type) { + if (is_file(inode, type)) + return; F2FS_I(inode)->i_advise |= type; f2fs_mark_inode_dirty_sync(inode, true); } static inline void clear_file(struct inode *inode, int type) { + if (!is_file(inode, type)) + return; F2FS_I(inode)->i_advise &= ~type; f2fs_mark_inode_dirty_sync(inode, true); } @@ -3406,7 +3430,7 @@ int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_need_inode_block_update(struct 
f2fs_sb_info *sbi, nid_t ino); int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, - struct node_info *ni); + struct node_info *ni, bool checkpoint_context); pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from); @@ -3614,7 +3638,6 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); -int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, int op_flags, bool for_write); @@ -3637,6 +3660,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, struct writeback_control *wbc, enum iostat_type io_type, int compr_blocks, bool allow_balance); +void f2fs_write_failed(struct inode *inode, loff_t to); void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length); int f2fs_release_page(struct page *page, gfp_t wait); @@ -3650,6 +3674,7 @@ int f2fs_init_post_read_processing(void); void f2fs_destroy_post_read_processing(void); int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi); void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); +extern const struct iomap_ops f2fs_iomap_ops; /* * gc.c diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 92ec2699bc85..3c98ef6af97d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -24,6 +24,7 @@ #include <linux/sched/signal.h> #include <linux/fileattr.h> #include <linux/fadvise.h> +#include <linux/iomap.h> #include "f2fs.h" #include "node.h" @@ -1232,7 +1233,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, if (ret) return ret; - ret = f2fs_get_node_info(sbi, dn.nid, &ni); + ret = f2fs_get_node_info(sbi, dn.nid, &ni, false); if (ret) { f2fs_put_dnode(&dn); return ret; @@ -1687,6 +1688,7 @@ next_alloc: map.m_seg_type = CURSEG_COLD_DATA_PINNED; err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); + file_dont_truncate(inode); up_write(&sbi->pin_sem); @@ -1748,7 +1750,11 @@ static long f2fs_fallocate(struct file *file, int mode, (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; - if (f2fs_compressed_file(inode) && + /* + * Pinned file should not support partial trucation since the block + * can be used by applications. 
+ */ + if ((f2fs_compressed_file(inode) || f2fs_is_pinned_file(inode)) && (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; @@ -3143,17 +3149,17 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) inode_lock(inode); - if (f2fs_should_update_outplace(inode, NULL)) { - ret = -EINVAL; - goto out; - } - if (!pin) { clear_inode_flag(inode, FI_PIN_FILE); f2fs_i_gc_failures_write(inode, 0); goto done; } + if (f2fs_should_update_outplace(inode, NULL)) { + ret = -EINVAL; + goto out; + } + if (f2fs_pin_file_control(inode, false)) { ret = -EAGAIN; goto out; @@ -4218,27 +4224,385 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return __f2fs_ioctl(filp, cmd, arg); } -static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +/* + * Return %true if the given read or write request should use direct I/O, or + * %false if it should use buffered I/O. + */ +static bool f2fs_should_use_dio(struct inode *inode, struct kiocb *iocb, + struct iov_iter *iter) +{ + unsigned int align; + + if (!(iocb->ki_flags & IOCB_DIRECT)) + return false; + + if (f2fs_force_buffered_io(inode, iocb, iter)) + return false; + + /* + * Direct I/O not aligned to the disk's logical_block_size will be + * attempted, but will fail with -EINVAL. + * + * f2fs additionally requires that direct I/O be aligned to the + * filesystem block size, which is often a stricter requirement. + * However, f2fs traditionally falls back to buffered I/O on requests + * that are logical_block_size-aligned but not fs-block aligned. + * + * The below logic implements this behavior. + */ + align = iocb->ki_pos | iov_iter_alignment(iter); + if (!IS_ALIGNED(align, i_blocksize(inode)) && + IS_ALIGNED(align, bdev_logical_block_size(inode->i_sb->s_bdev))) + return false; + + return true; +} + +static int f2fs_dio_read_end_io(struct kiocb *iocb, ssize_t size, int error, + unsigned int flags) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(iocb->ki_filp)); + + dec_page_count(sbi, F2FS_DIO_READ); + if (error) + return error; + f2fs_update_iostat(sbi, APP_DIRECT_READ_IO, size); + return 0; +} + +static const struct iomap_dio_ops f2fs_iomap_dio_read_ops = { + .end_io = f2fs_dio_read_end_io, +}; + +static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - int ret; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + const loff_t pos = iocb->ki_pos; + const size_t count = iov_iter_count(to); + struct iomap_dio *dio; + ssize_t ret; + + if (count == 0) + return 0; /* skip atime update */ + + trace_f2fs_direct_IO_enter(inode, iocb, count, READ); + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!down_read_trylock(&fi->i_gc_rwsem[READ])) { + ret = -EAGAIN; + goto out; + } + } else { + down_read(&fi->i_gc_rwsem[READ]); + } + + /* + * We have to use __iomap_dio_rw() and iomap_dio_complete() instead of + * the higher-level function iomap_dio_rw() in order to ensure that the + * F2FS_DIO_READ counter will be decremented correctly in all cases. 
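f2fs_should_use_dio() above folds the file position and the iovec alignment into a single value and honours IOCB_DIRECT only when that value is aligned to the filesystem block size; if it is aligned to the device's logical block size but not to the fs block size, the request silently falls back to buffered I/O, while anything less aligned than the logical block size is left to fail with -EINVAL later. A standalone illustration of that rule, with made-up sizes:

/* Standalone illustration of the alignment rule; the sizes are made up. */
#include <stdbool.h>
#include <stdio.h>

#define IS_ALIGNED(x, a)	(((x) & ((a) - 1)) == 0)

static bool use_dio(unsigned long long pos, unsigned long long iov_align,
		    unsigned int fs_block, unsigned int logical_block)
{
	unsigned long long align = pos | iov_align;

	/* logical-block aligned but not fs-block aligned: buffered fallback */
	if (!IS_ALIGNED(align, fs_block) && IS_ALIGNED(align, logical_block))
		return false;
	return true;
}

int main(void)
{
	/* 512-byte-aligned request on a 4 KiB-block fs, 512 B logical sectors */
	printf("%d\n", use_dio(512, 512, 4096, 512));	/* 0 -> buffered I/O */
	/* fully 4 KiB aligned request */
	printf("%d\n", use_dio(8192, 4096, 4096, 512));	/* 1 -> direct I/O */
	return 0;
}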
+ */ + inc_page_count(sbi, F2FS_DIO_READ); + dio = __iomap_dio_rw(iocb, to, &f2fs_iomap_ops, + &f2fs_iomap_dio_read_ops, 0, 0); + if (IS_ERR_OR_NULL(dio)) { + ret = PTR_ERR_OR_ZERO(dio); + if (ret != -EIOCBQUEUED) + dec_page_count(sbi, F2FS_DIO_READ); + } else { + ret = iomap_dio_complete(dio); + } + + up_read(&fi->i_gc_rwsem[READ]); + + file_accessed(file); +out: + trace_f2fs_direct_IO_exit(inode, pos, count, READ, ret); + return ret; +} + +static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; if (!f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - ret = generic_file_read_iter(iocb, iter); + if (f2fs_should_use_dio(inode, iocb, to)) + return f2fs_dio_read_iter(iocb, to); + ret = filemap_read(iocb, to, 0); if (ret > 0) - f2fs_update_iostat(F2FS_I_SB(inode), APP_READ_IO, ret); + f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret); + return ret; +} + +static ssize_t f2fs_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + ssize_t count; + int err; + + if (IS_IMMUTABLE(inode)) + return -EPERM; + + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) + return -EPERM; + + count = generic_write_checks(iocb, from); + if (count <= 0) + return count; + + err = file_modified(file); + if (err) + return err; + return count; +} + +/* + * Preallocate blocks for a write request, if it is possible and helpful to do + * so. Returns a positive number if blocks may have been preallocated, 0 if no + * blocks were preallocated, or a negative errno value if something went + * seriously wrong. Also sets FI_PREALLOCATED_ALL on the inode if *all* the + * requested blocks (not just some of them) have been allocated. + */ +static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter, + bool dio) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + const loff_t pos = iocb->ki_pos; + const size_t count = iov_iter_count(iter); + struct f2fs_map_blocks map = {}; + int flag; + int ret; + + /* If it will be an out-of-place direct write, don't bother. */ + if (dio && f2fs_lfs_mode(sbi)) + return 0; + /* + * Don't preallocate holes aligned to DIO_SKIP_HOLES which turns into + * buffered IO, if DIO meets any holes. + */ + if (dio && i_size_read(inode) && + (F2FS_BYTES_TO_BLK(pos) < F2FS_BLK_ALIGN(i_size_read(inode)))) + return 0; + + /* No-wait I/O can't allocate blocks. */ + if (iocb->ki_flags & IOCB_NOWAIT) + return 0; + + /* If it will be a short write, don't bother. */ + if (fault_in_iov_iter_readable(iter, count)) + return 0; + + if (f2fs_has_inline_data(inode)) { + /* If the data will fit inline, don't bother. */ + if (pos + count <= MAX_INLINE_DATA(inode)) + return 0; + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + } + + /* Do not preallocate blocks that will be written partially in 4KB. */ + map.m_lblk = F2FS_BLK_ALIGN(pos); + map.m_len = F2FS_BYTES_TO_BLK(pos + count); + if (map.m_len > map.m_lblk) + map.m_len -= map.m_lblk; + else + map.m_len = 0; + map.m_may_create = true; + if (dio) { + map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint); + flag = F2FS_GET_BLOCK_PRE_DIO; + } else { + map.m_seg_type = NO_CHECK_TYPE; + flag = F2FS_GET_BLOCK_PRE_AIO; + } + + ret = f2fs_map_blocks(inode, &map, 1, flag); + /* -ENOSPC|-EDQUOT are fine to report the number of allocated blocks. 
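The map.m_lblk/map.m_len arithmetic in f2fs_preallocate_blocks() above deliberately rounds the start up and the end down, so only blocks that the write covers completely are preallocated. A small standalone example with invented numbers (4 KiB blocks, a 10000-byte write at offset 5000) shows the effect:

/* Invented numbers only; shows which blocks the helper would map. */
#include <stdio.h>

#define BLKSIZE		4096ULL
#define BYTES_TO_BLK(b)	((b) / BLKSIZE)			/* round down */
#define BLK_ALIGN(b)	(((b) + BLKSIZE - 1) / BLKSIZE)	/* round up   */

int main(void)
{
	unsigned long long pos = 5000, count = 10000;
	unsigned long long first = BLK_ALIGN(pos);		/* block 2 */
	unsigned long long end = BYTES_TO_BLK(pos + count);	/* block 3 */
	unsigned long long len = end > first ? end - first : 0;

	/*
	 * Only block 2 (bytes 8192..12287) lies entirely inside the write,
	 * so only that one block is preallocated; the partially written
	 * head and tail blocks are left alone.
	 */
	printf("preallocate %llu block(s) from block %llu\n", len, first);
	return 0;
}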
*/ + if (ret < 0 && !((ret == -ENOSPC || ret == -EDQUOT) && map.m_len > 0)) + return ret; + if (ret == 0) + set_inode_flag(inode, FI_PREALLOCATED_ALL); + return map.m_len; +} + +static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb, + struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + ssize_t ret; + if (iocb->ki_flags & IOCB_NOWAIT) + return -EOPNOTSUPP; + + current->backing_dev_info = inode_to_bdi(inode); + ret = generic_perform_write(file, from, iocb->ki_pos); + current->backing_dev_info = NULL; + + if (ret > 0) { + iocb->ki_pos += ret; + f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_IO, ret); + } return ret; } -static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +static int f2fs_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, + unsigned int flags) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(iocb->ki_filp)); + + dec_page_count(sbi, F2FS_DIO_WRITE); + if (error) + return error; + f2fs_update_iostat(sbi, APP_DIRECT_IO, size); + return 0; +} + +static const struct iomap_dio_ops f2fs_iomap_dio_write_ops = { + .end_io = f2fs_dio_write_end_io, +}; + +static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, + bool *may_need_sync) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + const bool do_opu = f2fs_lfs_mode(sbi); + const int whint_mode = F2FS_OPTION(sbi).whint_mode; + const loff_t pos = iocb->ki_pos; + const ssize_t count = iov_iter_count(from); + const enum rw_hint hint = iocb->ki_hint; + unsigned int dio_flags; + struct iomap_dio *dio; + ssize_t ret; + + trace_f2fs_direct_IO_enter(inode, iocb, count, WRITE); + + if (iocb->ki_flags & IOCB_NOWAIT) { + /* f2fs_convert_inline_inode() and block allocation can block */ + if (f2fs_has_inline_data(inode) || + !f2fs_overwrite_io(inode, pos, count)) { + ret = -EAGAIN; + goto out; + } + + if (!down_read_trylock(&fi->i_gc_rwsem[WRITE])) { + ret = -EAGAIN; + goto out; + } + if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) { + up_read(&fi->i_gc_rwsem[WRITE]); + ret = -EAGAIN; + goto out; + } + } else { + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + down_read(&fi->i_gc_rwsem[WRITE]); + if (do_opu) + down_read(&fi->i_gc_rwsem[READ]); + } + if (whint_mode == WHINT_MODE_OFF) + iocb->ki_hint = WRITE_LIFE_NOT_SET; + + /* + * We have to use __iomap_dio_rw() and iomap_dio_complete() instead of + * the higher-level function iomap_dio_rw() in order to ensure that the + * F2FS_DIO_WRITE counter will be decremented correctly in all cases. 
+ */ + inc_page_count(sbi, F2FS_DIO_WRITE); + dio_flags = 0; + if (pos + count > inode->i_size) + dio_flags |= IOMAP_DIO_FORCE_WAIT; + dio = __iomap_dio_rw(iocb, from, &f2fs_iomap_ops, + &f2fs_iomap_dio_write_ops, dio_flags, 0); + if (IS_ERR_OR_NULL(dio)) { + ret = PTR_ERR_OR_ZERO(dio); + if (ret == -ENOTBLK) + ret = 0; + if (ret != -EIOCBQUEUED) + dec_page_count(sbi, F2FS_DIO_WRITE); + } else { + ret = iomap_dio_complete(dio); + } + + if (whint_mode == WHINT_MODE_OFF) + iocb->ki_hint = hint; + if (do_opu) + up_read(&fi->i_gc_rwsem[READ]); + up_read(&fi->i_gc_rwsem[WRITE]); + + if (ret < 0) + goto out; + if (pos + ret > inode->i_size) + f2fs_i_size_write(inode, pos + ret); + if (!do_opu) + set_inode_flag(inode, FI_UPDATE_WRITE); + + if (iov_iter_count(from)) { + ssize_t ret2; + loff_t bufio_start_pos = iocb->ki_pos; + + /* + * The direct write was partial, so we need to fall back to a + * buffered write for the remainder. + */ + + ret2 = f2fs_buffered_write_iter(iocb, from); + if (iov_iter_count(from)) + f2fs_write_failed(inode, iocb->ki_pos); + if (ret2 < 0) + goto out; + + /* + * Ensure that the pagecache pages are written to disk and + * invalidated to preserve the expected O_DIRECT semantics. + */ + if (ret2 > 0) { + loff_t bufio_end_pos = bufio_start_pos + ret2 - 1; + + ret += ret2; + + ret2 = filemap_write_and_wait_range(file->f_mapping, + bufio_start_pos, + bufio_end_pos); + if (ret2 < 0) + goto out; + invalidate_mapping_pages(file->f_mapping, + bufio_start_pos >> PAGE_SHIFT, + bufio_end_pos >> PAGE_SHIFT); + } + } else { + /* iomap_dio_rw() already handled the generic_write_sync(). */ + *may_need_sync = false; + } +out: + trace_f2fs_direct_IO_exit(inode, pos, count, WRITE, ret); + return ret; +} + +static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + const loff_t orig_pos = iocb->ki_pos; + const size_t orig_count = iov_iter_count(from); + loff_t target_size; + bool dio; + bool may_need_sync = true; + int preallocated; ssize_t ret; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { @@ -4260,91 +4624,42 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); } - if (unlikely(IS_IMMUTABLE(inode))) { - ret = -EPERM; - goto unlock; - } - - if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { - ret = -EPERM; - goto unlock; - } - - ret = generic_write_checks(iocb, from); - if (ret > 0) { - bool preallocated = false; - size_t target_size = 0; - int err; - - if (fault_in_iov_iter_readable(from, iov_iter_count(from))) - set_inode_flag(inode, FI_NO_PREALLOC); - - if ((iocb->ki_flags & IOCB_NOWAIT)) { - if (!f2fs_overwrite_io(inode, iocb->ki_pos, - iov_iter_count(from)) || - f2fs_has_inline_data(inode) || - f2fs_force_buffered_io(inode, iocb, from)) { - clear_inode_flag(inode, FI_NO_PREALLOC); - inode_unlock(inode); - ret = -EAGAIN; - goto out; - } - goto write; - } - - if (is_inode_flag_set(inode, FI_NO_PREALLOC)) - goto write; - - if (iocb->ki_flags & IOCB_DIRECT) { - /* - * Convert inline data for Direct I/O before entering - * f2fs_direct_IO(). - */ - err = f2fs_convert_inline_inode(inode); - if (err) - goto out_err; - /* - * If force_buffere_io() is true, we have to allocate - * blocks all the time, since f2fs_direct_IO will fall - * back to buffered IO. 
- */ - if (!f2fs_force_buffered_io(inode, iocb, from) && - f2fs_lfs_mode(F2FS_I_SB(inode))) - goto write; - } - preallocated = true; - target_size = iocb->ki_pos + iov_iter_count(from); + ret = f2fs_write_checks(iocb, from); + if (ret <= 0) + goto out_unlock; - err = f2fs_preallocate_blocks(iocb, from); - if (err) { -out_err: - clear_inode_flag(inode, FI_NO_PREALLOC); - inode_unlock(inode); - ret = err; - goto out; - } -write: - ret = __generic_file_write_iter(iocb, from); - clear_inode_flag(inode, FI_NO_PREALLOC); + /* Determine whether we will do a direct write or a buffered write. */ + dio = f2fs_should_use_dio(inode, iocb, from); - /* if we couldn't write data, we should deallocate blocks. */ - if (preallocated && i_size_read(inode) < target_size) { - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - filemap_invalidate_lock(inode->i_mapping); - f2fs_truncate(inode); - filemap_invalidate_unlock(inode->i_mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - } + /* Possibly preallocate the blocks for the write. */ + target_size = iocb->ki_pos + iov_iter_count(from); + preallocated = f2fs_preallocate_blocks(iocb, from, dio); + if (preallocated < 0) + ret = preallocated; + else + /* Do the actual write. */ + ret = dio ? + f2fs_dio_write_iter(iocb, from, &may_need_sync): + f2fs_buffered_write_iter(iocb, from); - if (ret > 0) - f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); + /* Don't leave any preallocated blocks around past i_size. */ + if (preallocated && i_size_read(inode) < target_size) { + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); + if (!f2fs_truncate(inode)) + file_dont_truncate(inode); + filemap_invalidate_unlock(inode->i_mapping); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + } else { + file_dont_truncate(inode); } -unlock: + + clear_inode_flag(inode, FI_PREALLOCATED_ALL); +out_unlock: inode_unlock(inode); out: - trace_f2fs_file_write_iter(inode, iocb->ki_pos, - iov_iter_count(from), ret); - if (ret > 0) + trace_f2fs_file_write_iter(inode, orig_pos, orig_count, ret); + if (ret > 0 && may_need_sync) ret = generic_write_sync(iocb, ret); return ret; } @@ -4352,12 +4667,12 @@ out: static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len, int advice) { - struct inode *inode; struct address_space *mapping; struct backing_dev_info *bdi; + struct inode *inode = file_inode(filp); + int err; if (advice == POSIX_FADV_SEQUENTIAL) { - inode = file_inode(filp); if (S_ISFIFO(inode->i_mode)) return -ESPIPE; @@ -4374,7 +4689,13 @@ static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len, return 0; } - return generic_fadvise(filp, offset, len, advice); + err = generic_fadvise(filp, offset, len, advice); + if (!err && advice == POSIX_FADV_DONTNEED && + test_opt(F2FS_I_SB(inode), COMPRESS_CACHE) && + f2fs_compressed_file(inode)) + f2fs_invalidate_compress_pages(F2FS_I_SB(inode), inode->i_ino); + + return err; } #ifdef CONFIG_COMPAT diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index a946ce0ead34..ee308a8de432 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -7,7 +7,6 @@ */ #include <linux/fs.h> #include <linux/module.h> -#include <linux/backing-dev.h> #include <linux/init.h> #include <linux/f2fs_fs.h> #include <linux/kthread.h> @@ -15,6 +14,7 @@ #include <linux/freezer.h> #include <linux/sched/signal.h> #include <linux/random.h> +#include <linux/sched/mm.h> #include "f2fs.h" #include "node.h" @@ -92,6 +92,18 @@ static int gc_thread_func(void *data) * So, I'd like to wait some time to collect dirty segments. 
*/ if (sbi->gc_mode == GC_URGENT_HIGH) { + spin_lock(&sbi->gc_urgent_high_lock); + if (sbi->gc_urgent_high_limited) { + if (!sbi->gc_urgent_high_remaining) { + sbi->gc_urgent_high_limited = false; + spin_unlock(&sbi->gc_urgent_high_lock); + sbi->gc_mode = GC_NORMAL; + continue; + } + sbi->gc_urgent_high_remaining--; + } + spin_unlock(&sbi->gc_urgent_high_lock); + wait_ms = gc_th->urgent_sleep_time; down_write(&sbi->gc_lock); goto do_gc; @@ -947,7 +959,7 @@ next_step: continue; } - if (f2fs_get_node_info(sbi, nid, &ni)) { + if (f2fs_get_node_info(sbi, nid, &ni, false)) { f2fs_put_page(node_page, 1); continue; } @@ -1015,7 +1027,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (IS_ERR(node_page)) return false; - if (f2fs_get_node_info(sbi, nid, dni)) { + if (f2fs_get_node_info(sbi, nid, dni, false)) { f2fs_put_page(node_page, 1); return false; } @@ -1026,6 +1038,9 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, set_sbi_flag(sbi, SBI_NEED_FSCK); } + if (f2fs_check_nid_range(sbi, dni->ino)) + return false; + *nofs = ofs_of_node(node_page); source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); @@ -1039,7 +1054,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (!test_and_set_bit(segno, SIT_I(sbi)->invalid_segmap)) { f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u", blkaddr, source_blkaddr, segno); - f2fs_bug_on(sbi, 1); + set_sbi_flag(sbi, SBI_NEED_FSCK); } } #endif @@ -1206,7 +1221,7 @@ static int move_data_block(struct inode *inode, block_t bidx, f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); - err = f2fs_get_node_info(fio.sbi, dn.nid, &ni); + err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false); if (err) goto put_out; @@ -1375,8 +1390,7 @@ retry: if (err) { clear_page_private_gcing(page); if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto retry; } if (is_dirty) @@ -1457,7 +1471,8 @@ next_step: if (phase == 3) { inode = f2fs_iget(sb, dni.ino); - if (IS_ERR(inode) || is_bad_inode(inode)) + if (IS_ERR(inode) || is_bad_inode(inode) || + special_file(inode->i_mode)) continue; if (!down_write_trylock( diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index e3beac546c63..3cb1e7a24740 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -105,7 +105,7 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) return; } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (IS_CASEFOLDED(dir)) { /* * If the casefolded name is provided, hash it instead of the diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index ea08f0dfa1bd..4b5cefa3f90c 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -131,7 +131,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) if (err) return err; - err = f2fs_get_node_info(fio.sbi, dn->nid, &ni); + err = f2fs_get_node_info(fio.sbi, dn->nid, &ni, false); if (err) { f2fs_truncate_data_blocks_range(dn, 1); f2fs_put_dnode(dn); @@ -786,7 +786,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, ilen = start + len; ilen -= start; - err = f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); + err = f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni, false); if (err) goto out; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 0f8b2df3e1e0..0ec8e32a00b4 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -8,8 +8,8 @@ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/buffer_head.h> -#include 
<linux/backing-dev.h> #include <linux/writeback.h> +#include <linux/sched/mm.h> #include "f2fs.h" #include "node.h" @@ -516,6 +516,11 @@ make_now: } else if (ino == F2FS_COMPRESS_INO(sbi)) { #ifdef CONFIG_F2FS_FS_COMPRESSION inode->i_mapping->a_ops = &f2fs_compress_aops; + /* + * generic_error_remove_page only truncates pages of regular + * inode + */ + inode->i_mode |= S_IFREG; #endif mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE); @@ -544,6 +549,14 @@ make_now: goto bad_inode; } f2fs_set_inode_flags(inode); + + if (file_should_truncate(inode)) { + ret = f2fs_truncate(inode); + if (ret) + goto bad_inode; + file_dont_truncate(inode); + } + unlock_new_inode(inode); trace_f2fs_iget(inode); return inode; @@ -562,7 +575,7 @@ retry: inode = f2fs_iget(sb, ino); if (IS_ERR(inode)) { if (PTR_ERR(inode) == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto retry; } } @@ -738,7 +751,8 @@ void f2fs_evict_inode(struct inode *inode) trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); - if (test_opt(sbi, COMPRESS_CACHE) && f2fs_compressed_file(inode)) + if ((inode->i_nlink || is_bad_inode(inode)) && + test_opt(sbi, COMPRESS_CACHE) && f2fs_compressed_file(inode)) f2fs_invalidate_compress_pages(sbi, inode->i_ino); if (inode->i_ino == F2FS_NODE_INO(sbi) || @@ -868,7 +882,7 @@ void f2fs_handle_failed_inode(struct inode *inode) * so we can prevent losing this orphan when encoutering checkpoint * and following suddenly power-off. */ - err = f2fs_get_node_info(sbi, inode->i_ino, &ni); + err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false); if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "May loss orphan inode, run fsck to fix."); diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c index cdcf54ae0db8..be599f31d3c4 100644 --- a/fs/f2fs/iostat.c +++ b/fs/f2fs/iostat.c @@ -92,7 +92,7 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi) struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; struct iostat_lat_info *io_lat = sbi->iostat_io_lat; - spin_lock_irq(&sbi->iostat_lat_lock); + spin_lock_bh(&sbi->iostat_lat_lock); for (idx = 0; idx < MAX_IO_TYPE; idx++) { for (io = 0; io < NR_PAGE_TYPE; io++) { cnt = io_lat->bio_cnt[idx][io]; @@ -106,7 +106,7 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi) io_lat->bio_cnt[idx][io] = 0; } } - spin_unlock_irq(&sbi->iostat_lat_lock); + spin_unlock_bh(&sbi->iostat_lat_lock); trace_f2fs_iostat_latency(sbi, iostat_lat); } @@ -120,9 +120,9 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) return; /* Need double check under the lock */ - spin_lock(&sbi->iostat_lock); + spin_lock_bh(&sbi->iostat_lock); if (time_is_after_jiffies(sbi->iostat_next_period)) { - spin_unlock(&sbi->iostat_lock); + spin_unlock_bh(&sbi->iostat_lock); return; } sbi->iostat_next_period = jiffies + @@ -133,7 +133,7 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) sbi->prev_rw_iostat[i]; sbi->prev_rw_iostat[i] = sbi->rw_iostat[i]; } - spin_unlock(&sbi->iostat_lock); + spin_unlock_bh(&sbi->iostat_lock); trace_f2fs_iostat(sbi, iostat_diff); @@ -145,16 +145,16 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi) struct iostat_lat_info *io_lat = sbi->iostat_io_lat; int i; - spin_lock(&sbi->iostat_lock); + spin_lock_bh(&sbi->iostat_lock); for (i = 0; i < NR_IO_TYPE; i++) { sbi->rw_iostat[i] = 0; sbi->prev_rw_iostat[i] = 0; } - spin_unlock(&sbi->iostat_lock); + spin_unlock_bh(&sbi->iostat_lock); - 
spin_lock_irq(&sbi->iostat_lat_lock); + spin_lock_bh(&sbi->iostat_lat_lock); memset(io_lat, 0, sizeof(struct iostat_lat_info)); - spin_unlock_irq(&sbi->iostat_lat_lock); + spin_unlock_bh(&sbi->iostat_lat_lock); } void f2fs_update_iostat(struct f2fs_sb_info *sbi, @@ -163,19 +163,16 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi, if (!sbi->iostat_enable) return; - spin_lock(&sbi->iostat_lock); + spin_lock_bh(&sbi->iostat_lock); sbi->rw_iostat[type] += io_bytes; - if (type == APP_WRITE_IO || type == APP_DIRECT_IO) - sbi->rw_iostat[APP_BUFFERED_IO] = - sbi->rw_iostat[APP_WRITE_IO] - - sbi->rw_iostat[APP_DIRECT_IO]; + if (type == APP_BUFFERED_IO || type == APP_DIRECT_IO) + sbi->rw_iostat[APP_WRITE_IO] += io_bytes; - if (type == APP_READ_IO || type == APP_DIRECT_READ_IO) - sbi->rw_iostat[APP_BUFFERED_READ_IO] = - sbi->rw_iostat[APP_READ_IO] - - sbi->rw_iostat[APP_DIRECT_READ_IO]; - spin_unlock(&sbi->iostat_lock); + if (type == APP_BUFFERED_READ_IO || type == APP_DIRECT_READ_IO) + sbi->rw_iostat[APP_READ_IO] += io_bytes; + + spin_unlock_bh(&sbi->iostat_lock); f2fs_record_iostat(sbi); } @@ -185,7 +182,6 @@ static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx, { unsigned long ts_diff; unsigned int iotype = iostat_ctx->type; - unsigned long flags; struct f2fs_sb_info *sbi = iostat_ctx->sbi; struct iostat_lat_info *io_lat = sbi->iostat_io_lat; int idx; @@ -206,12 +202,12 @@ static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx, idx = WRITE_ASYNC_IO; } - spin_lock_irqsave(&sbi->iostat_lat_lock, flags); + spin_lock_bh(&sbi->iostat_lat_lock); io_lat->sum_lat[idx][iotype] += ts_diff; io_lat->bio_cnt[idx][iotype]++; if (ts_diff > io_lat->peak_lat[idx][iotype]) io_lat->peak_lat[idx][iotype] = ts_diff; - spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags); + spin_unlock_bh(&sbi->iostat_lat_lock); } void iostat_update_and_unbind_ctx(struct bio *bio, int rw) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a728a0af9ce0..5f213f05556d 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -561,7 +561,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, goto out_iput; } out_splice: -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (!inode && IS_CASEFOLDED(dir)) { /* Eventually we want to call d_add_ci(dentry, NULL) * for negative dentries in the encoding case as @@ -622,7 +622,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) goto fail; } f2fs_delete_entry(de, page, dir, inode); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. 
Eventually we'll want avoid * invalidating the dentries here, alongside with returning the diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 556fcd8457f3..50b2874e758c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -8,7 +8,7 @@ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/mpage.h> -#include <linux/backing-dev.h> +#include <linux/sched/mm.h> #include <linux/blkdev.h> #include <linux/pagevec.h> #include <linux/swap.h> @@ -430,6 +430,10 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *new, *e; + /* Let's mitigate lock contention of nat_tree_lock during checkpoint */ + if (rwsem_is_locked(&sbi->cp_global_sem)) + return; + new = __alloc_nat_entry(sbi, nid, false); if (!new) return; @@ -539,7 +543,7 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) } int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, - struct node_info *ni) + struct node_info *ni, bool checkpoint_context) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -572,9 +576,10 @@ retry: * nat_tree_lock. Therefore, we should retry, if we failed to grab here * while not bothering checkpoint. */ - if (!rwsem_is_locked(&sbi->cp_global_sem)) { + if (!rwsem_is_locked(&sbi->cp_global_sem) || checkpoint_context) { down_read(&curseg->journal_rwsem); - } else if (!down_read_trylock(&curseg->journal_rwsem)) { + } else if (rwsem_is_contended(&nm_i->nat_tree_lock) || + !down_read_trylock(&curseg->journal_rwsem)) { up_read(&nm_i->nat_tree_lock); goto retry; } @@ -887,7 +892,7 @@ static int truncate_node(struct dnode_of_data *dn) int err; pgoff_t index; - err = f2fs_get_node_info(sbi, dn->nid, &ni); + err = f2fs_get_node_info(sbi, dn->nid, &ni, false); if (err) return err; @@ -1286,7 +1291,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) goto fail; #ifdef CONFIG_F2FS_CHECK_FS - err = f2fs_get_node_info(sbi, dn->nid, &new_ni); + err = f2fs_get_node_info(sbi, dn->nid, &new_ni, false); if (err) { dec_valid_node_count(sbi, dn->inode, !ofs); goto fail; @@ -1348,7 +1353,7 @@ static int read_node_page(struct page *page, int op_flags) return LOCKED_PAGE; } - err = f2fs_get_node_info(sbi, page->index, &ni); + err = f2fs_get_node_info(sbi, page->index, &ni, false); if (err) return err; @@ -1600,7 +1605,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, nid = nid_of_node(page); f2fs_bug_on(sbi, page->index != nid); - if (f2fs_get_node_info(sbi, nid, &ni)) + if (f2fs_get_node_info(sbi, nid, &ni, !do_balance)) goto redirty_out; if (wbc->for_reclaim) { @@ -2701,7 +2706,7 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page) goto recover_xnid; /* 1: invalidate the previous xattr nid */ - err = f2fs_get_node_info(sbi, prev_xnid, &ni); + err = f2fs_get_node_info(sbi, prev_xnid, &ni, false); if (err) return err; @@ -2741,7 +2746,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) struct page *ipage; int err; - err = f2fs_get_node_info(sbi, ino, &old_ni); + err = f2fs_get_node_info(sbi, ino, &old_ni, false); if (err) return err; @@ -2750,7 +2755,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) retry: ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false); if (!ipage) { - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto retry; } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 
6a1b4668d933..79773d322c47 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -8,6 +8,7 @@ #include <asm/unaligned.h> #include <linux/fs.h> #include <linux/f2fs_fs.h> +#include <linux/sched/mm.h> #include "f2fs.h" #include "node.h" #include "segment.h" @@ -45,7 +46,7 @@ static struct kmem_cache *fsync_entry_slab; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) extern struct kmem_cache *f2fs_cf_name_slab; #endif @@ -148,7 +149,7 @@ static int init_recovered_filename(const struct inode *dir, if (err) return err; f2fs_hash_filename(dir, fname); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* Case-sensitive match is fine for recovery */ kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); fname->cf_name.name = NULL; @@ -587,7 +588,7 @@ retry_dn: err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto retry_dn; } goto out; @@ -595,7 +596,7 @@ retry_dn: f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true); - err = f2fs_get_node_info(sbi, dn.nid, &ni); + err = f2fs_get_node_info(sbi, dn.nid, &ni, false); if (err) goto err; @@ -670,8 +671,7 @@ retry_prev: err = check_index_in_prev_nodes(sbi, dest, &dn); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto retry_prev; } goto err; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index df9ed75f0b7a..1dabc8244083 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -9,6 +9,7 @@ #include <linux/f2fs_fs.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/sched/mm.h> #include <linux/prefetch.h> #include <linux/kthread.h> #include <linux/swap.h> @@ -245,16 +246,14 @@ retry: LOOKUP_NODE); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); - cond_resched(); + memalloc_retry_wait(GFP_NOFS); goto retry; } err = -EAGAIN; goto next; } - err = f2fs_get_node_info(sbi, dn.nid, &ni); + err = f2fs_get_node_info(sbi, dn.nid, &ni, false); if (err) { f2fs_put_dnode(&dn); return err; @@ -424,9 +423,7 @@ retry: err = f2fs_do_write_data_page(&fio); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); - cond_resched(); + memalloc_retry_wait(GFP_NOFS); goto retry; } unlock_page(page); @@ -2558,8 +2555,8 @@ find_other_zone: secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); if (secno >= MAIN_SECS(sbi)) { if (dir == ALLOC_RIGHT) { - secno = find_next_zero_bit(free_i->free_secmap, - MAIN_SECS(sbi), 0); + secno = find_first_zero_bit(free_i->free_secmap, + MAIN_SECS(sbi)); f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi)); } else { go_left = 1; @@ -2574,8 +2571,8 @@ find_other_zone: left_start--; continue; } - left_start = find_next_zero_bit(free_i->free_secmap, - MAIN_SECS(sbi), 0); + left_start = find_first_zero_bit(free_i->free_secmap, + MAIN_SECS(sbi)); f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi)); break; } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 46fde9f3f28e..0291cd55cf09 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -538,7 +538,8 @@ static inline unsigned int free_segments(struct f2fs_sb_info *sbi) static inline unsigned int reserved_segments(struct f2fs_sb_info *sbi) { - return SM_I(sbi)->reserved_segments; + return SM_I(sbi)->reserved_segments + + SM_I(sbi)->additional_reserved_segments; } static inline unsigned int free_sections(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 
040b6d02e1d8..baefd398ec1a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -8,9 +8,9 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/fs.h> +#include <linux/sched/mm.h> #include <linux/statfs.h> #include <linux/buffer_head.h> -#include <linux/backing-dev.h> #include <linux/kthread.h> #include <linux/parser.h> #include <linux/mount.h> @@ -59,6 +59,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_WRITE_IO] = "write IO error", [FAULT_SLAB_ALLOC] = "slab alloc", [FAULT_DQUOT_INIT] = "dquot initialize", + [FAULT_LOCK_OP] = "lock_op", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, @@ -256,33 +257,26 @@ void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...) va_end(args); } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) static const struct f2fs_sb_encodings { __u16 magic; char *name; - char *version; + unsigned int version; } f2fs_sb_encoding_map[] = { - {F2FS_ENC_UTF8_12_1, "utf8", "12.1.0"}, + {F2FS_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)}, }; -static int f2fs_sb_read_encoding(const struct f2fs_super_block *sb, - const struct f2fs_sb_encodings **encoding, - __u16 *flags) +static const struct f2fs_sb_encodings * +f2fs_sb_read_encoding(const struct f2fs_super_block *sb) { __u16 magic = le16_to_cpu(sb->s_encoding); int i; for (i = 0; i < ARRAY_SIZE(f2fs_sb_encoding_map); i++) if (magic == f2fs_sb_encoding_map[i].magic) - break; - - if (i >= ARRAY_SIZE(f2fs_sb_encoding_map)) - return -EINVAL; - - *encoding = &f2fs_sb_encoding_map[i]; - *flags = le16_to_cpu(sb->s_encoding_flags); + return &f2fs_sb_encoding_map[i]; - return 0; + return NULL; } struct kmem_cache *f2fs_cf_name_slab; @@ -328,6 +322,46 @@ static inline void limit_reserve_root(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).s_resgid)); } +static inline int adjust_reserved_segment(struct f2fs_sb_info *sbi) +{ + unsigned int sec_blks = sbi->blocks_per_seg * sbi->segs_per_sec; + unsigned int avg_vblocks; + unsigned int wanted_reserved_segments; + block_t avail_user_block_count; + + if (!F2FS_IO_ALIGNED(sbi)) + return 0; + + /* average valid block count in section in worst case */ + avg_vblocks = sec_blks / F2FS_IO_SIZE(sbi); + + /* + * we need enough free space when migrating one section in worst case + */ + wanted_reserved_segments = (F2FS_IO_SIZE(sbi) / avg_vblocks) * + reserved_segments(sbi); + wanted_reserved_segments -= reserved_segments(sbi); + + avail_user_block_count = sbi->user_block_count - + sbi->current_reserved_blocks - + F2FS_OPTION(sbi).root_reserved_blocks; + + if (wanted_reserved_segments * sbi->blocks_per_seg > + avail_user_block_count) { + f2fs_err(sbi, "IO align feature can't grab additional reserved segment: %u, available segments: %u", + wanted_reserved_segments, + avail_user_block_count >> sbi->log_blocks_per_seg); + return -ENOSPC; + } + + SM_I(sbi)->additional_reserved_segments = wanted_reserved_segments; + + f2fs_info(sbi, "IO align feature needs additional reserved segment: %u", + wanted_reserved_segments); + + return 0; +} + static inline void adjust_unusable_cap_perc(struct f2fs_sb_info *sbi) { if (!F2FS_OPTION(sbi).unusable_cap_perc) @@ -1225,7 +1259,7 @@ default_check: return -EINVAL; } #endif -#ifndef CONFIG_UNICODE +#if !IS_ENABLED(CONFIG_UNICODE) if (f2fs_sb_has_casefold(sbi)) { f2fs_err(sbi, "Filesystem with casefold feature cannot be mounted without CONFIG_UNICODE"); @@ -1585,7 +1619,7 @@ static void f2fs_put_super(struct super_block *sb) f2fs_destroy_iostat(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); 
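(Aside: the f2fs_sb_encoding_map hunk above switches the utf8 version from the string "12.1.0" to a packed integer built with UNICODE_AGE(12, 1, 0), which encoding_show() and f2fs_setup_casefold() further down in this patch unpack via unicode_major/minor/rev, i.e. >>16, >>8 and &0xff. A minimal standalone sketch of that packing, assuming UNICODE_AGE packs as (major << 16) | (minor << 8) | rev consistently with that decode:)

/* Standalone sketch; assumption: UNICODE_AGE packs the version as
 * (major << 16) | (minor << 8) | rev, matching the >>16 / >>8 / &0xff
 * decode used by encoding_show() later in this patch. */
#include <stdio.h>

#define UNICODE_AGE(maj, min, rev) \
    (((unsigned int)(maj) << 16) | ((unsigned int)(min) << 8) | (unsigned int)(rev))

static unsigned int unicode_major(unsigned int v) { return (v >> 16) & 0xff; }
static unsigned int unicode_minor(unsigned int v) { return (v >> 8) & 0xff; }
static unsigned int unicode_rev(unsigned int v)   { return v & 0xff; }

int main(void)
{
    unsigned int v = UNICODE_AGE(12, 1, 0);

    /* Prints: packed=0x0c0100 -> 12.1.0 */
    printf("packed=0x%06x -> %u.%u.%u\n",
           v, unicode_major(v), unicode_minor(v), unicode_rev(v));
    return 0;
}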
-#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif kfree(sbi); @@ -2415,8 +2449,7 @@ repeat: page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS); if (IS_ERR(page)) { if (PTR_ERR(page) == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto repeat; } set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); @@ -3548,6 +3581,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->seq_file_ra_mul = MIN_RA_MUL; sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE; sbi->max_fragment_hole = DEF_FRAGMENT_SIZE; + spin_lock_init(&sbi->gc_urgent_high_lock); sbi->dir_level = DEF_DIR_LEVEL; sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; @@ -3869,31 +3903,38 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) { -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (f2fs_sb_has_casefold(sbi) && !sbi->sb->s_encoding) { const struct f2fs_sb_encodings *encoding_info; struct unicode_map *encoding; __u16 encoding_flags; - if (f2fs_sb_read_encoding(sbi->raw_super, &encoding_info, - &encoding_flags)) { + encoding_info = f2fs_sb_read_encoding(sbi->raw_super); + if (!encoding_info) { f2fs_err(sbi, "Encoding requested by superblock is unknown"); return -EINVAL; } + encoding_flags = le16_to_cpu(sbi->raw_super->s_encoding_flags); encoding = utf8_load(encoding_info->version); if (IS_ERR(encoding)) { f2fs_err(sbi, - "can't mount with superblock charset: %s-%s " + "can't mount with superblock charset: %s-%u.%u.%u " "not supported by the kernel. flags: 0x%x.", - encoding_info->name, encoding_info->version, + encoding_info->name, + unicode_major(encoding_info->version), + unicode_minor(encoding_info->version), + unicode_rev(encoding_info->version), encoding_flags); return PTR_ERR(encoding); } f2fs_info(sbi, "Using encoding defined by superblock: " - "%s-%s with flags 0x%hx", encoding_info->name, - encoding_info->version?:"\b", encoding_flags); + "%s-%u.%u.%u with flags 0x%hx", encoding_info->name, + unicode_major(encoding_info->version), + unicode_minor(encoding_info->version), + unicode_rev(encoding_info->version), + encoding_flags); sbi->sb->s_encoding = encoding; sbi->sb->s_encoding_flags = encoding_flags; @@ -4180,6 +4221,10 @@ try_onemore: goto free_nm; } + err = adjust_reserved_segment(sbi); + if (err) + goto free_nm; + /* For write statistics */ sbi->sectors_written_start = f2fs_get_sectors_written(sbi); @@ -4413,7 +4458,7 @@ free_bio_info: for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); sb->s_encoding = NULL; #endif diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 7d289249cd7e..8ac506671245 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -118,6 +118,15 @@ static ssize_t sb_status_show(struct f2fs_attr *a, return sprintf(buf, "%lx\n", sbi->s_flag); } +static ssize_t pending_discard_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + if (!SM_I(sbi)->dcc_info) + return -EINVAL; + return sprintf(buf, "%llu\n", (unsigned long long)atomic_read( + &SM_I(sbi)->dcc_info->discard_cmd_cnt)); +} + static ssize_t features_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -192,12 +201,11 @@ static ssize_t unusable_show(struct f2fs_attr *a, static ssize_t encoding_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) struct super_block *sb = sbi->sb; if 
(f2fs_sb_has_casefold(sbi)) - return sysfs_emit(buf, "%s (%d.%d.%d)\n", - sb->s_encoding->charset, + return sysfs_emit(buf, "UTF-8 (%d.%d.%d)\n", (sb->s_encoding->version >> 16) & 0xff, (sb->s_encoding->version >> 8) & 0xff, sb->s_encoding->version & 0xff); @@ -415,7 +423,9 @@ out: if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); if (t > (unsigned long)(sbi->user_block_count - - F2FS_OPTION(sbi).root_reserved_blocks)) { + F2FS_OPTION(sbi).root_reserved_blocks - + sbi->blocks_per_seg * + SM_I(sbi)->additional_reserved_segments)) { spin_unlock(&sbi->stat_lock); return -EINVAL; } @@ -478,6 +488,15 @@ out: return count; } + if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) { + spin_lock(&sbi->gc_urgent_high_lock); + sbi->gc_urgent_high_limited = t != 0; + sbi->gc_urgent_high_remaining = t; + spin_unlock(&sbi->gc_urgent_high_lock); + + return count; + } + #ifdef CONFIG_F2FS_IOSTAT if (!strcmp(a->attr.name, "iostat_enable")) { sbi->iostat_enable = !!t; @@ -733,6 +752,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent_high_remaining, gc_urgent_high_remaining); F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); @@ -744,6 +764,7 @@ F2FS_GENERAL_RO_ATTR(unusable); F2FS_GENERAL_RO_ATTR(encoding); F2FS_GENERAL_RO_ATTR(mounted_time_sec); F2FS_GENERAL_RO_ATTR(main_blkaddr); +F2FS_GENERAL_RO_ATTR(pending_discard); #ifdef CONFIG_F2FS_STAT_FS F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count); F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count); @@ -757,7 +778,7 @@ F2FS_GENERAL_RO_ATTR(avg_vblocks); #ifdef CONFIG_FS_ENCRYPTION F2FS_FEATURE_RO_ATTR(encryption); F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) F2FS_FEATURE_RO_ATTR(encrypted_casefold); #endif #endif /* CONFIG_FS_ENCRYPTION */ @@ -776,7 +797,7 @@ F2FS_FEATURE_RO_ATTR(lost_found); F2FS_FEATURE_RO_ATTR(verity); #endif F2FS_FEATURE_RO_ATTR(sb_checksum); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) F2FS_FEATURE_RO_ATTR(casefold); #endif F2FS_FEATURE_RO_ATTR(readonly); @@ -812,6 +833,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(main_blkaddr), ATTR_LIST(max_small_discards), ATTR_LIST(discard_granularity), + ATTR_LIST(pending_discard), ATTR_LIST(batched_trim_sections), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), @@ -844,6 +866,7 @@ static struct attribute *f2fs_attrs[] = { #endif ATTR_LIST(data_io_flag), ATTR_LIST(node_io_flag), + ATTR_LIST(gc_urgent_high_remaining), ATTR_LIST(ckpt_thread_ioprio), ATTR_LIST(dirty_segments), ATTR_LIST(free_segments), @@ -887,7 +910,7 @@ static struct attribute *f2fs_feat_attrs[] = { #ifdef CONFIG_FS_ENCRYPTION ATTR_LIST(encryption), ATTR_LIST(test_dummy_encryption_v2), -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) ATTR_LIST(encrypted_casefold), #endif #endif /* CONFIG_FS_ENCRYPTION */ @@ -906,7 +929,7 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(verity), #endif ATTR_LIST(sb_checksum), -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) ATTR_LIST(casefold), #endif ATTR_LIST(readonly), diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index e348f33bcb2b..8e5cd9c916ff 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -226,15 
+226,18 @@ static inline const struct xattr_handler *f2fs_xattr_handler(int index) } static struct f2fs_xattr_entry *__find_xattr(void *base_addr, - void *last_base_addr, int index, - size_t len, const char *name) + void *last_base_addr, void **last_addr, + int index, size_t len, const char *name) { struct f2fs_xattr_entry *entry; list_for_each_xattr(entry, base_addr) { if ((void *)(entry) + sizeof(__u32) > last_base_addr || - (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) + (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) { + if (last_addr) + *last_addr = entry; return NULL; + } if (entry->e_name_index != index) continue; @@ -254,19 +257,9 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, unsigned int inline_size = inline_xattr_size(inode); void *max_addr = base_addr + inline_size; - list_for_each_xattr(entry, base_addr) { - if ((void *)entry + sizeof(__u32) > max_addr || - (void *)XATTR_NEXT_ENTRY(entry) > max_addr) { - *last_addr = entry; - return NULL; - } - if (entry->e_name_index != index) - continue; - if (entry->e_name_len != len) - continue; - if (!memcmp(entry->e_name, name, len)) - break; - } + entry = __find_xattr(base_addr, max_addr, last_addr, index, len, name); + if (!entry) + return NULL; /* inline xattr header or entry across max inline xattr size */ if (IS_XATTR_LAST_ENTRY(entry) && @@ -368,7 +361,7 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, else cur_addr = txattr_addr; - *xe = __find_xattr(cur_addr, last_txattr_addr, index, len, name); + *xe = __find_xattr(cur_addr, last_txattr_addr, NULL, index, len, name); if (!*xe) { f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", inode->i_ino); @@ -659,7 +652,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, last_base_addr = (void *)base_addr + XATTR_SIZE(inode); /* find entry with wanted name. 
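(Aside: the __find_xattr() rework above, and the bounded while loop added to __f2fs_setxattr() just below, follow the same defensive pattern: before dereferencing a variable-length entry, check that its fixed header and then the whole entry still lie below last_base_addr. A standalone sketch of that pattern; the record layout here is invented for illustration, only the bounds checks mirror the patch:)

/* Sketch of a bounds-checked walk over packed variable-length records.
 * The record format is made up; the two checks (header fits, whole entry
 * fits) are the point. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct entry {
    uint8_t name_len;   /* length of name[] */
    char    name[];     /* name_len bytes, not NUL-terminated */
};

#define ENTRY_SIZE(e)  (sizeof(struct entry) + (e)->name_len)
#define NEXT_ENTRY(e)  ((const struct entry *)((const char *)(e) + ENTRY_SIZE(e)))

static const struct entry *find_entry(const void *base, const void *end,
                                      const char *name, size_t len)
{
    const struct entry *e = base;

    while ((const char *)e + sizeof(*e) <= (const char *)end) {
        if ((const char *)NEXT_ENTRY(e) > (const char *)end)
            return NULL;                 /* entry runs past the buffer */
        if (e->name_len == 0)
            return NULL;                 /* terminator */
        if (e->name_len == len && memcmp(e->name, name, len) == 0)
            return e;
        e = NEXT_ENTRY(e);
    }
    return NULL;                         /* header itself out of bounds */
}

int main(void)
{
    /* Two packed entries ("abc", "de") followed by a 0-length terminator. */
    unsigned char buf[] = { 3, 'a', 'b', 'c', 2, 'd', 'e', 0 };
    const struct entry *e = find_entry(buf, buf + sizeof(buf), "de", 2);

    printf("found: %s\n", e ? "yes" : "no");
    return 0;
}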
*/ - here = __find_xattr(base_addr, last_base_addr, index, len, name); + here = __find_xattr(base_addr, last_base_addr, NULL, index, len, name); if (!here) { f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", inode->i_ino); @@ -684,8 +677,17 @@ static int __f2fs_setxattr(struct inode *inode, int index, } last = here; - while (!IS_XATTR_LAST_ENTRY(last)) + while (!IS_XATTR_LAST_ENTRY(last)) { + if ((void *)(last) + sizeof(__u32) > last_base_addr || + (void *)XATTR_NEXT_ENTRY(last) > last_base_addr) { + f2fs_err(F2FS_I_SB(inode), "inode (%lu) has invalid last xattr entry, entry_size: %zu", + inode->i_ino, ENTRY_SIZE(last)); + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + error = -EFSCORRUPTED; + goto exit; + } last = XATTR_NEXT_ENTRY(last); + } newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + len + size); diff --git a/fs/fat/file.c b/fs/fat/file.c index 13855ba49cd9..a5a309fcc7fa 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -175,9 +175,10 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) static int fat_file_release(struct inode *inode, struct file *filp) { if ((filp->f_mode & FMODE_WRITE) && - MSDOS_SB(inode->i_sb)->options.flush) { + MSDOS_SB(inode->i_sb)->options.flush) { fat_flush_inodes(inode->i_sb, inode, NULL); - congestion_wait(BLK_RW_ASYNC, HZ/10); + set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(HZ/10); } return 0; } diff --git a/fs/file_table.c b/fs/file_table.c index 45437f8e1003..7d2e692b66a9 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -27,13 +27,14 @@ #include <linux/task_work.h> #include <linux/ima.h> #include <linux/swap.h> +#include <linux/kmemleak.h> #include <linux/atomic.h> #include "internal.h" /* sysctl tunables... */ -struct files_stat_struct files_stat = { +static struct files_stat_struct files_stat = { .max_files = NR_FILE }; @@ -75,22 +76,58 @@ unsigned long get_max_files(void) } EXPORT_SYMBOL_GPL(get_max_files); +#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) + /* * Handle nr_files sysctl */ -#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) -int proc_nr_files(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int proc_nr_files(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } -#else -int proc_nr_files(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) + +static struct ctl_table fs_stat_sysctls[] = { + { + .procname = "file-nr", + .data = &files_stat, + .maxlen = sizeof(files_stat), + .mode = 0444, + .proc_handler = proc_nr_files, + }, + { + .procname = "file-max", + .data = &files_stat.max_files, + .maxlen = sizeof(files_stat.max_files), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = SYSCTL_LONG_ZERO, + .extra2 = SYSCTL_LONG_MAX, + }, + { + .procname = "nr_open", + .data = &sysctl_nr_open, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &sysctl_nr_open_min, + .extra2 = &sysctl_nr_open_max, + }, + { } +}; + +static int __init init_fs_stat_sysctls(void) { - return -ENOSYS; + register_sysctl_init("fs", fs_stat_sysctls); + if (IS_ENABLED(CONFIG_BINFMT_MISC)) { + struct ctl_table_header *hdr; + hdr = register_sysctl_mount_point("fs/binfmt_misc"); + kmemleak_not_leak(hdr); + } + return 0; } +fs_initcall(init_fs_stat_sysctls); #endif static struct file *__alloc_file(int flags, const struct cred *cred) 
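(Aside: the file_table.c hunk above now registers its own sysctl table via register_sysctl_init("fs", ...), so file-nr, file-max and nr_open still appear under /proc/sys/fs/. A small userspace reader as a sanity check; it assumes the traditional three-field layout of /proc/sys/fs/file-nr, i.e. allocated handles, unused handles, file-max:)

/* Userspace check of the knobs registered above. Assumes the usual
 * three-number "file-nr" format. */
#include <stdio.h>

int main(void)
{
    unsigned long nr = 0, unused = 0, max = 0;
    FILE *f = fopen("/proc/sys/fs/file-nr", "r");

    if (!f) {
        perror("fopen");
        return 1;
    }
    if (fscanf(f, "%lu %lu %lu", &nr, &unused, &max) != 3) {
        fprintf(stderr, "unexpected file-nr format\n");
        fclose(f);
        return 1;
    }
    fclose(f);
    printf("open files: %lu (unused: %lu), file-max: %lu\n", nr, unused, max);
    return 0;
}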
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 67f0e88eed01..f8d7fe6db989 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -372,7 +372,7 @@ static bool inode_do_switch_wbs(struct inode *inode, { struct address_space *mapping = inode->i_mapping; XA_STATE(xas, &mapping->i_pages, 0); - struct page *page; + struct folio *folio; bool switched = false; spin_lock(&inode->i_lock); @@ -389,21 +389,23 @@ static bool inode_do_switch_wbs(struct inode *inode, /* * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points - * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to - * pages actually under writeback. + * to possibly dirty folios while PAGECACHE_TAG_WRITEBACK points to + * folios actually under writeback. */ - xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) { - if (PageDirty(page)) { - dec_wb_stat(old_wb, WB_RECLAIMABLE); - inc_wb_stat(new_wb, WB_RECLAIMABLE); + xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_DIRTY) { + if (folio_test_dirty(folio)) { + long nr = folio_nr_pages(folio); + wb_stat_mod(old_wb, WB_RECLAIMABLE, -nr); + wb_stat_mod(new_wb, WB_RECLAIMABLE, nr); } } xas_set(&xas, 0); - xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) { - WARN_ON_ONCE(!PageWriteback(page)); - dec_wb_stat(old_wb, WB_WRITEBACK); - inc_wb_stat(new_wb, WB_WRITEBACK); + xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) { + long nr = folio_nr_pages(folio); + WARN_ON_ONCE(!folio_test_writeback(folio)); + wb_stat_mod(old_wb, WB_WRITEBACK, -nr); + wb_stat_mod(new_wb, WB_WRITEBACK, nr); } if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { @@ -1666,6 +1668,13 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) inode->i_state |= I_DIRTY_PAGES; + else if (unlikely(inode->i_state & I_PINNING_FSCACHE_WB)) { + if (!(inode->i_state & I_DIRTY_PAGES)) { + inode->i_state &= ~I_PINNING_FSCACHE_WB; + wbc->unpinned_fscache_wb = true; + dirty |= I_PINNING_FSCACHE_WB; /* Cause write_inode */ + } + } spin_unlock(&inode->i_lock); @@ -1675,6 +1684,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) if (ret == 0) ret = err; } + wbc->unpinned_fscache_wb = false; trace_writeback_single_inode(inode, wbc, nr_to_write); return ret; } diff --git a/fs/fs_context.c b/fs/fs_context.c index b7e43a780a62..24ce12f0db32 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -548,7 +548,7 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param) param->key); } - if (len > PAGE_SIZE - 2 - size) + if (size + len + 2 > PAGE_SIZE) return invalf(fc, "VFS: Legacy: Cumulative options too large"); if (strchr(param->key, ',') || (param->type == fs_value_is_string && diff --git a/fs/fs_parser.c b/fs/fs_parser.c index 3df07c0e32b3..ed40ce5742fd 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -199,6 +199,8 @@ int fs_param_is_bool(struct p_log *log, const struct fs_parameter_spec *p, int b; if (param->type != fs_value_is_string) return fs_param_bad_value(log, param); + if (!*param->string && (p->flags & fs_param_can_be_empty)) + return 0; b = lookup_constant(bool_names, param->string, -1); if (b == -1) return fs_param_bad_value(log, param); @@ -211,8 +213,11 @@ int fs_param_is_u32(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { int base = (unsigned long)p->data; - if (param->type != fs_value_is_string || - kstrtouint(param->string, base, 
&result->uint_32) < 0) + if (param->type != fs_value_is_string) + return fs_param_bad_value(log, param); + if (!*param->string && (p->flags & fs_param_can_be_empty)) + return 0; + if (kstrtouint(param->string, base, &result->uint_32) < 0) return fs_param_bad_value(log, param); return 0; } @@ -221,8 +226,11 @@ EXPORT_SYMBOL(fs_param_is_u32); int fs_param_is_s32(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { - if (param->type != fs_value_is_string || - kstrtoint(param->string, 0, &result->int_32) < 0) + if (param->type != fs_value_is_string) + return fs_param_bad_value(log, param); + if (!*param->string && (p->flags & fs_param_can_be_empty)) + return 0; + if (kstrtoint(param->string, 0, &result->int_32) < 0) return fs_param_bad_value(log, param); return 0; } @@ -231,8 +239,11 @@ EXPORT_SYMBOL(fs_param_is_s32); int fs_param_is_u64(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { - if (param->type != fs_value_is_string || - kstrtoull(param->string, 0, &result->uint_64) < 0) + if (param->type != fs_value_is_string) + return fs_param_bad_value(log, param); + if (!*param->string && (p->flags & fs_param_can_be_empty)) + return 0; + if (kstrtoull(param->string, 0, &result->uint_64) < 0) return fs_param_bad_value(log, param); return 0; } @@ -244,6 +255,8 @@ int fs_param_is_enum(struct p_log *log, const struct fs_parameter_spec *p, const struct constant_table *c; if (param->type != fs_value_is_string) return fs_param_bad_value(log, param); + if (!*param->string && (p->flags & fs_param_can_be_empty)) + return 0; c = __lookup_constant(p->data, param->string); if (!c) return fs_param_bad_value(log, param); @@ -255,7 +268,8 @@ EXPORT_SYMBOL(fs_param_is_enum); int fs_param_is_string(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { - if (param->type != fs_value_is_string || !*param->string) + if (param->type != fs_value_is_string || + (!*param->string && !(p->flags & fs_param_can_be_empty))) return fs_param_bad_value(log, param); return 0; } @@ -275,7 +289,8 @@ int fs_param_is_fd(struct p_log *log, const struct fs_parameter_spec *p, { switch (param->type) { case fs_value_is_string: - if (kstrtouint(param->string, 0, &result->uint_32) < 0) + if ((!*param->string && !(p->flags & fs_param_can_be_empty)) || + kstrtouint(param->string, 0, &result->uint_32) < 0) break; if (result->uint_32 <= INT_MAX) return 0; diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig index b313a978ae0a..76316c4a3fb7 100644 --- a/fs/fscache/Kconfig +++ b/fs/fscache/Kconfig @@ -38,3 +38,6 @@ config FSCACHE_DEBUG enabled by setting bits in /sys/modules/fscache/parameter/debug. See Documentation/filesystems/caching/fscache.rst for more information. + +config FSCACHE_OLD_API + bool diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile index 03a871d689bb..afb090ea16c4 100644 --- a/fs/fscache/Makefile +++ b/fs/fscache/Makefile @@ -6,13 +6,9 @@ fscache-y := \ cache.o \ cookie.o \ - fsdef.o \ io.o \ main.o \ - netfs.o \ - object.o \ - operation.o \ - page.o + volume.o fscache-$(CONFIG_PROC_FS) += proc.o fscache-$(CONFIG_FSCACHE_STATS) += stats.o diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index bd4f44c1cce0..2749933852a9 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -1,209 +1,229 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* FS-Cache cache handling * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define FSCACHE_DEBUG_LEVEL CACHE -#include <linux/module.h> +#include <linux/export.h> #include <linux/slab.h> #include "internal.h" -LIST_HEAD(fscache_cache_list); +static LIST_HEAD(fscache_caches); DECLARE_RWSEM(fscache_addremove_sem); -DECLARE_WAIT_QUEUE_HEAD(fscache_cache_cleared_wq); -EXPORT_SYMBOL(fscache_cache_cleared_wq); +EXPORT_SYMBOL(fscache_addremove_sem); +DECLARE_WAIT_QUEUE_HEAD(fscache_clearance_waiters); +EXPORT_SYMBOL(fscache_clearance_waiters); -static LIST_HEAD(fscache_cache_tag_list); +static atomic_t fscache_cache_debug_id; /* - * look up a cache tag + * Allocate a cache cookie. */ -struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *name) +static struct fscache_cache *fscache_alloc_cache(const char *name) { - struct fscache_cache_tag *tag, *xtag; - - /* firstly check for the existence of the tag under read lock */ - down_read(&fscache_addremove_sem); - - list_for_each_entry(tag, &fscache_cache_tag_list, link) { - if (strcmp(tag->name, name) == 0) { - atomic_inc(&tag->usage); - up_read(&fscache_addremove_sem); - return tag; - } - } - - up_read(&fscache_addremove_sem); - - /* the tag does not exist - create a candidate */ - xtag = kzalloc(sizeof(*xtag) + strlen(name) + 1, GFP_KERNEL); - if (!xtag) - /* return a dummy tag if out of memory */ - return ERR_PTR(-ENOMEM); - - atomic_set(&xtag->usage, 1); - strcpy(xtag->name, name); - - /* write lock, search again and add if still not present */ - down_write(&fscache_addremove_sem); + struct fscache_cache *cache; - list_for_each_entry(tag, &fscache_cache_tag_list, link) { - if (strcmp(tag->name, name) == 0) { - atomic_inc(&tag->usage); - up_write(&fscache_addremove_sem); - kfree(xtag); - return tag; + cache = kzalloc(sizeof(*cache), GFP_KERNEL); + if (cache) { + if (name) { + cache->name = kstrdup(name, GFP_KERNEL); + if (!cache->name) { + kfree(cache); + return NULL; + } } + refcount_set(&cache->ref, 1); + INIT_LIST_HEAD(&cache->cache_link); + cache->debug_id = atomic_inc_return(&fscache_cache_debug_id); } - - list_add_tail(&xtag->link, &fscache_cache_tag_list); - up_write(&fscache_addremove_sem); - return xtag; + return cache; } -/* - * release a reference to a cache tag - */ -void __fscache_release_cache_tag(struct fscache_cache_tag *tag) +static bool fscache_get_cache_maybe(struct fscache_cache *cache, + enum fscache_cache_trace where) { - if (tag != ERR_PTR(-ENOMEM)) { - down_write(&fscache_addremove_sem); + bool success; + int ref; - if (atomic_dec_and_test(&tag->usage)) - list_del_init(&tag->link); - else - tag = NULL; - - up_write(&fscache_addremove_sem); - - kfree(tag); - } + success = __refcount_inc_not_zero(&cache->ref, &ref); + if (success) + trace_fscache_cache(cache->debug_id, ref + 1, where); + return success; } /* - * select a cache in which to store an object - * - the cache addremove semaphore must be at least read-locked by the caller - * - the object will never be an index + * Look up a cache cookie. 
*/ -struct fscache_cache *fscache_select_cache_for_object( - struct fscache_cookie *cookie) +struct fscache_cache *fscache_lookup_cache(const char *name, bool is_cache) { - struct fscache_cache_tag *tag; - struct fscache_object *object; - struct fscache_cache *cache; + struct fscache_cache *candidate, *cache, *unnamed = NULL; - _enter(""); + /* firstly check for the existence of the cache under read lock */ + down_read(&fscache_addremove_sem); - if (list_empty(&fscache_cache_list)) { - _leave(" = NULL [no cache]"); - return NULL; + list_for_each_entry(cache, &fscache_caches, cache_link) { + if (cache->name && name && strcmp(cache->name, name) == 0 && + fscache_get_cache_maybe(cache, fscache_cache_get_acquire)) + goto got_cache_r; + if (!cache->name && !name && + fscache_get_cache_maybe(cache, fscache_cache_get_acquire)) + goto got_cache_r; } - /* we check the parent to determine the cache to use */ - spin_lock(&cookie->lock); + if (!name) { + list_for_each_entry(cache, &fscache_caches, cache_link) { + if (cache->name && + fscache_get_cache_maybe(cache, fscache_cache_get_acquire)) + goto got_cache_r; + } + } - /* the first in the parent's backing list should be the preferred - * cache */ - if (!hlist_empty(&cookie->backing_objects)) { - object = hlist_entry(cookie->backing_objects.first, - struct fscache_object, cookie_link); + up_read(&fscache_addremove_sem); - cache = object->cache; - if (fscache_object_is_dying(object) || - test_bit(FSCACHE_IOERROR, &cache->flags)) - cache = NULL; + /* the cache does not exist - create a candidate */ + candidate = fscache_alloc_cache(name); + if (!candidate) + return ERR_PTR(-ENOMEM); - spin_unlock(&cookie->lock); - _leave(" = %s [parent]", cache ? cache->tag->name : "NULL"); - return cache; - } + /* write lock, search again and add if still not present */ + down_write(&fscache_addremove_sem); - /* the parent is unbacked */ - if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) { - /* cookie not an index and is unbacked */ - spin_unlock(&cookie->lock); - _leave(" = NULL [cookie ub,ni]"); - return NULL; + list_for_each_entry(cache, &fscache_caches, cache_link) { + if (cache->name && name && strcmp(cache->name, name) == 0 && + fscache_get_cache_maybe(cache, fscache_cache_get_acquire)) + goto got_cache_w; + if (!cache->name) { + unnamed = cache; + if (!name && + fscache_get_cache_maybe(cache, fscache_cache_get_acquire)) + goto got_cache_w; + } } - spin_unlock(&cookie->lock); + if (unnamed && is_cache && + fscache_get_cache_maybe(unnamed, fscache_cache_get_acquire)) + goto use_unnamed_cache; - if (!cookie->def->select_cache) - goto no_preference; + if (!name) { + list_for_each_entry(cache, &fscache_caches, cache_link) { + if (cache->name && + fscache_get_cache_maybe(cache, fscache_cache_get_acquire)) + goto got_cache_w; + } + } - /* ask the netfs for its preference */ - tag = cookie->def->select_cache(cookie->parent->netfs_data, - cookie->netfs_data); - if (!tag) - goto no_preference; + list_add_tail(&candidate->cache_link, &fscache_caches); + trace_fscache_cache(candidate->debug_id, + refcount_read(&candidate->ref), + fscache_cache_new_acquire); + up_write(&fscache_addremove_sem); + return candidate; - if (tag == ERR_PTR(-ENOMEM)) { - _leave(" = NULL [nomem tag]"); - return NULL; - } +got_cache_r: + up_read(&fscache_addremove_sem); + return cache; +use_unnamed_cache: + cache = unnamed; + cache->name = candidate->name; + candidate->name = NULL; +got_cache_w: + up_write(&fscache_addremove_sem); + kfree(candidate->name); + kfree(candidate); + return cache; +} - if 
(!tag->cache) { - _leave(" = NULL [unbacked tag]"); - return NULL; - } +/** + * fscache_acquire_cache - Acquire a cache-level cookie. + * @name: The name of the cache. + * + * Get a cookie to represent an actual cache. If a name is given and there is + * a nameless cache record available, this will acquire that and set its name, + * directing all the volumes using it to this cache. + * + * The cache will be switched over to the preparing state if not currently in + * use, otherwise -EBUSY will be returned. + */ +struct fscache_cache *fscache_acquire_cache(const char *name) +{ + struct fscache_cache *cache; - if (test_bit(FSCACHE_IOERROR, &tag->cache->flags)) - return NULL; + ASSERT(name); + cache = fscache_lookup_cache(name, true); + if (IS_ERR(cache)) + return cache; - _leave(" = %s [specific]", tag->name); - return tag->cache; + if (!fscache_set_cache_state_maybe(cache, + FSCACHE_CACHE_IS_NOT_PRESENT, + FSCACHE_CACHE_IS_PREPARING)) { + pr_warn("Cache tag %s in use\n", name); + fscache_put_cache(cache, fscache_cache_put_cache); + return ERR_PTR(-EBUSY); + } -no_preference: - /* netfs has no preference - just select first cache */ - cache = list_entry(fscache_cache_list.next, - struct fscache_cache, link); - _leave(" = %s [first]", cache->tag->name); return cache; } +EXPORT_SYMBOL(fscache_acquire_cache); /** - * fscache_init_cache - Initialise a cache record - * @cache: The cache record to be initialised - * @ops: The cache operations to be installed in that record - * @idfmt: Format string to define identifier - * @...: sprintf-style arguments + * fscache_put_cache - Release a cache-level cookie. + * @cache: The cache cookie to be released + * @where: An indication of where the release happened * - * Initialise a record of a cache and fill in the name. - * - * See Documentation/filesystems/caching/backend-api.rst for a complete - * description. + * Release the caller's reference on a cache-level cookie. The @where + * indication should give information about the circumstances in which the call + * occurs and will be logged through a tracepoint. */ -void fscache_init_cache(struct fscache_cache *cache, - const struct fscache_cache_ops *ops, - const char *idfmt, - ...) +void fscache_put_cache(struct fscache_cache *cache, + enum fscache_cache_trace where) { - va_list va; + unsigned int debug_id = cache->debug_id; + bool zero; + int ref; - memset(cache, 0, sizeof(*cache)); + if (IS_ERR_OR_NULL(cache)) + return; - cache->ops = ops; + zero = __refcount_dec_and_test(&cache->ref, &ref); + trace_fscache_cache(debug_id, ref - 1, where); - va_start(va, idfmt); - vsnprintf(cache->identifier, sizeof(cache->identifier), idfmt, va); - va_end(va); + if (zero) { + down_write(&fscache_addremove_sem); + list_del_init(&cache->cache_link); + up_write(&fscache_addremove_sem); + kfree(cache->name); + kfree(cache); + } +} - INIT_WORK(&cache->op_gc, fscache_operation_gc); - INIT_LIST_HEAD(&cache->link); - INIT_LIST_HEAD(&cache->object_list); - INIT_LIST_HEAD(&cache->op_gc_list); - spin_lock_init(&cache->object_list_lock); - spin_lock_init(&cache->op_gc_list_lock); +/** + * fscache_relinquish_cache - Reset cache state and release cookie + * @cache: The cache cookie to be released + * + * Reset the state of a cache and release the caller's reference on a cache + * cookie. + */ +void fscache_relinquish_cache(struct fscache_cache *cache) +{ + enum fscache_cache_trace where = + (cache->state == FSCACHE_CACHE_IS_PREPARING) ? 
+ fscache_cache_put_prep_failed : + fscache_cache_put_relinquish; + + cache->ops = NULL; + cache->cache_priv = NULL; + smp_store_release(&cache->state, FSCACHE_CACHE_IS_NOT_PRESENT); + fscache_put_cache(cache, where); } -EXPORT_SYMBOL(fscache_init_cache); +EXPORT_SYMBOL(fscache_relinquish_cache); /** * fscache_add_cache - Declare a cache as being open for business - * @cache: The record describing the cache - * @ifsdef: The record of the cache object describing the top-level index - * @tagname: The tag describing this cache + * @cache: The cache-level cookie representing the cache + * @ops: Table of cache operations to use + * @cache_priv: Private data for the cache record * * Add a cache to the system, making it available for netfs's to use. * @@ -211,93 +231,97 @@ EXPORT_SYMBOL(fscache_init_cache); * description. */ int fscache_add_cache(struct fscache_cache *cache, - struct fscache_object *ifsdef, - const char *tagname) + const struct fscache_cache_ops *ops, + void *cache_priv) { - struct fscache_cache_tag *tag; - - ASSERTCMP(ifsdef->cookie, ==, &fscache_fsdef_index); - BUG_ON(!cache->ops); - BUG_ON(!ifsdef); + int n_accesses; - cache->flags = 0; - ifsdef->event_mask = - ((1 << NR_FSCACHE_OBJECT_EVENTS) - 1) & - ~(1 << FSCACHE_OBJECT_EV_CLEARED); - __set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &ifsdef->flags); + _enter("{%s,%s}", ops->name, cache->name); - if (!tagname) - tagname = cache->identifier; + BUG_ON(fscache_cache_state(cache) != FSCACHE_CACHE_IS_PREPARING); - BUG_ON(!tagname[0]); - - _enter("{%s.%s},,%s", cache->ops->name, cache->identifier, tagname); - - /* we use the cache tag to uniquely identify caches */ - tag = __fscache_lookup_cache_tag(tagname); - if (IS_ERR(tag)) - goto nomem; - - if (test_and_set_bit(FSCACHE_TAG_RESERVED, &tag->flags)) - goto tag_in_use; - - cache->kobj = kobject_create_and_add(tagname, fscache_root); - if (!cache->kobj) - goto error; - - ifsdef->cache = cache; - cache->fsdef = ifsdef; + /* Get a ref on the cache cookie and keep its n_accesses counter raised + * by 1 to prevent wakeups from transitioning it to 0 until we're + * withdrawing caching services from it. 
+ */ + n_accesses = atomic_inc_return(&cache->n_accesses); + trace_fscache_access_cache(cache->debug_id, refcount_read(&cache->ref), + n_accesses, fscache_access_cache_pin); down_write(&fscache_addremove_sem); - tag->cache = cache; - cache->tag = tag; - - /* add the cache to the list */ - list_add(&cache->link, &fscache_cache_list); - - /* add the cache's netfs definition index object to the cache's - * list */ - spin_lock(&cache->object_list_lock); - list_add_tail(&ifsdef->cache_link, &cache->object_list); - spin_unlock(&cache->object_list_lock); - - /* add the cache's netfs definition index object to the top level index - * cookie as a known backing object */ - spin_lock(&fscache_fsdef_index.lock); - - hlist_add_head(&ifsdef->cookie_link, - &fscache_fsdef_index.backing_objects); - - refcount_inc(&fscache_fsdef_index.ref); + cache->ops = ops; + cache->cache_priv = cache_priv; + fscache_set_cache_state(cache, FSCACHE_CACHE_IS_ACTIVE); - /* done */ - spin_unlock(&fscache_fsdef_index.lock); up_write(&fscache_addremove_sem); - - pr_notice("Cache \"%s\" added (type %s)\n", - cache->tag->name, cache->ops->name); - kobject_uevent(cache->kobj, KOBJ_ADD); - - _leave(" = 0 [%s]", cache->identifier); + pr_notice("Cache \"%s\" added (type %s)\n", cache->name, ops->name); + _leave(" = 0 [%s]", cache->name); return 0; +} +EXPORT_SYMBOL(fscache_add_cache); -tag_in_use: - pr_err("Cache tag '%s' already in use\n", tagname); - __fscache_release_cache_tag(tag); - _leave(" = -EXIST"); - return -EEXIST; - -error: - __fscache_release_cache_tag(tag); - _leave(" = -EINVAL"); - return -EINVAL; +/** + * fscache_begin_cache_access - Pin a cache so it can be accessed + * @cache: The cache-level cookie + * @why: An indication of the circumstances of the access for tracing + * + * Attempt to pin the cache to prevent it from going away whilst we're + * accessing it and returns true if successful. This works as follows: + * + * (1) If the cache tests as not live (state is not FSCACHE_CACHE_IS_ACTIVE), + * then we return false to indicate access was not permitted. + * + * (2) If the cache tests as live, then we increment the n_accesses count and + * then recheck the liveness, ending the access if it ceased to be live. + * + * (3) When we end the access, we decrement n_accesses and wake up the any + * waiters if it reaches 0. + * + * (4) Whilst the cache is caching, n_accesses is kept artificially + * incremented to prevent wakeups from happening. + * + * (5) When the cache is taken offline, the state is changed to prevent new + * accesses, n_accesses is decremented and we wait for n_accesses to + * become 0. + */ +bool fscache_begin_cache_access(struct fscache_cache *cache, enum fscache_access_trace why) +{ + int n_accesses; + + if (!fscache_cache_is_live(cache)) + return false; + + n_accesses = atomic_inc_return(&cache->n_accesses); + smp_mb__after_atomic(); /* Reread live flag after n_accesses */ + trace_fscache_access_cache(cache->debug_id, refcount_read(&cache->ref), + n_accesses, why); + if (!fscache_cache_is_live(cache)) { + fscache_end_cache_access(cache, fscache_access_unlive); + return false; + } + return true; +} -nomem: - _leave(" = -ENOMEM"); - return -ENOMEM; +/** + * fscache_end_cache_access - Unpin a cache at the end of an access. + * @cache: The cache-level cookie + * @why: An indication of the circumstances of the access for tracing + * + * Unpin a cache after we've accessed it. The @why indicator is merely + * provided for tracing purposes. 
+ */ +void fscache_end_cache_access(struct fscache_cache *cache, enum fscache_access_trace why) +{ + int n_accesses; + + smp_mb__before_atomic(); + n_accesses = atomic_dec_return(&cache->n_accesses); + trace_fscache_access_cache(cache->debug_id, refcount_read(&cache->ref), + n_accesses, why); + if (n_accesses == 0) + wake_up_var(&cache->n_accesses); } -EXPORT_SYMBOL(fscache_add_cache); /** * fscache_io_error - Note a cache I/O error @@ -311,106 +335,94 @@ EXPORT_SYMBOL(fscache_add_cache); */ void fscache_io_error(struct fscache_cache *cache) { - if (!test_and_set_bit(FSCACHE_IOERROR, &cache->flags)) + if (fscache_set_cache_state_maybe(cache, + FSCACHE_CACHE_IS_ACTIVE, + FSCACHE_CACHE_GOT_IOERROR)) pr_err("Cache '%s' stopped due to I/O error\n", - cache->ops->name); + cache->name); } EXPORT_SYMBOL(fscache_io_error); -/* - * request withdrawal of all the objects in a cache - * - all the objects being withdrawn are moved onto the supplied list +/** + * fscache_withdraw_cache - Withdraw a cache from the active service + * @cache: The cache cookie + * + * Begin the process of withdrawing a cache from service. This stops new + * cache-level and volume-level accesses from taking place and waits for + * currently ongoing cache-level accesses to end. */ -static void fscache_withdraw_all_objects(struct fscache_cache *cache, - struct list_head *dying_objects) +void fscache_withdraw_cache(struct fscache_cache *cache) { - struct fscache_object *object; + int n_accesses; - while (!list_empty(&cache->object_list)) { - spin_lock(&cache->object_list_lock); + pr_notice("Withdrawing cache \"%s\" (%u objs)\n", + cache->name, atomic_read(&cache->object_count)); - if (!list_empty(&cache->object_list)) { - object = list_entry(cache->object_list.next, - struct fscache_object, cache_link); - list_move_tail(&object->cache_link, dying_objects); + fscache_set_cache_state(cache, FSCACHE_CACHE_IS_WITHDRAWN); - _debug("withdraw %x", object->cookie->debug_id); + /* Allow wakeups on dec-to-0 */ + n_accesses = atomic_dec_return(&cache->n_accesses); + trace_fscache_access_cache(cache->debug_id, refcount_read(&cache->ref), + n_accesses, fscache_access_cache_unpin); - /* This must be done under object_list_lock to prevent - * a race with fscache_drop_object(). - */ - fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL); - } - - spin_unlock(&cache->object_list_lock); - cond_resched(); - } + wait_var_event(&cache->n_accesses, + atomic_read(&cache->n_accesses) == 0); } +EXPORT_SYMBOL(fscache_withdraw_cache); -/** - * fscache_withdraw_cache - Withdraw a cache from the active service - * @cache: The record describing the cache - * - * Withdraw a cache from service, unbinding all its cache objects from the - * netfs cookies they're currently representing. - * - * See Documentation/filesystems/caching/backend-api.rst for a complete - * description. 
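(Aside: fscache_begin_cache_access()/fscache_end_cache_access() above implement a generic pattern: count each in-flight access, re-check the live flag after bumping the count, and on withdrawal clear the flag first and then wait for the counter to drain to zero. Below is a hedged userspace sketch of that pattern using C11 atomics, with a pthread condition variable standing in for wait_var_event()/wake_up_var(); the struct and function names are illustrative only, not the fscache API. Build with: cc -pthread.)

/* Userspace sketch of the pin/drain pattern described above. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct cache {
    atomic_bool     live;
    atomic_int      n_accesses;
    pthread_mutex_t lock;
    pthread_cond_t  drained;
};

static void end_access(struct cache *c)
{
    if (atomic_fetch_sub(&c->n_accesses, 1) == 1) {   /* dropped to zero */
        pthread_mutex_lock(&c->lock);
        pthread_cond_broadcast(&c->drained);           /* wake the drainer */
        pthread_mutex_unlock(&c->lock);
    }
}

static bool begin_access(struct cache *c)
{
    if (!atomic_load(&c->live))                /* not live: refuse */
        return false;
    atomic_fetch_add(&c->n_accesses, 1);       /* count the access... */
    if (!atomic_load(&c->live)) {              /* ...and re-check liveness */
        end_access(c);
        return false;
    }
    return true;
}

static void withdraw(struct cache *c)
{
    atomic_store(&c->live, false);             /* stop new accesses */
    pthread_mutex_lock(&c->lock);
    while (atomic_load(&c->n_accesses) != 0)   /* wait for in-flight ones */
        pthread_cond_wait(&c->drained, &c->lock);
    pthread_mutex_unlock(&c->lock);
}

int main(void)
{
    struct cache c;

    atomic_init(&c.live, true);
    atomic_init(&c.n_accesses, 0);
    pthread_mutex_init(&c.lock, NULL);
    pthread_cond_init(&c.drained, NULL);

    if (begin_access(&c)) {
        /* ... use the cache ... */
        end_access(&c);
    }
    withdraw(&c);
    printf("accesses after withdraw: %d\n", atomic_load(&c.n_accesses));
    return 0;
}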
+#ifdef CONFIG_PROC_FS +static const char fscache_cache_states[NR__FSCACHE_CACHE_STATE] = "-PAEW"; + +/* + * Generate a list of caches in /proc/fs/fscache/caches */ -void fscache_withdraw_cache(struct fscache_cache *cache) +static int fscache_caches_seq_show(struct seq_file *m, void *v) { - LIST_HEAD(dying_objects); + struct fscache_cache *cache; - _enter(""); + if (v == &fscache_caches) { + seq_puts(m, + "CACHE REF VOLS OBJS ACCES S NAME\n" + "======== ===== ===== ===== ===== = ===============\n" + ); + return 0; + } - pr_notice("Withdrawing cache \"%s\"\n", - cache->tag->name); + cache = list_entry(v, struct fscache_cache, cache_link); + seq_printf(m, + "%08x %5d %5d %5d %5d %c %s\n", + cache->debug_id, + refcount_read(&cache->ref), + atomic_read(&cache->n_volumes), + atomic_read(&cache->object_count), + atomic_read(&cache->n_accesses), + fscache_cache_states[cache->state], + cache->name ?: "-"); + return 0; +} - /* make the cache unavailable for cookie acquisition */ - if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags)) - BUG(); +static void *fscache_caches_seq_start(struct seq_file *m, loff_t *_pos) + __acquires(fscache_addremove_sem) +{ + down_read(&fscache_addremove_sem); + return seq_list_start_head(&fscache_caches, *_pos); +} - down_write(&fscache_addremove_sem); - list_del_init(&cache->link); - cache->tag->cache = NULL; - up_write(&fscache_addremove_sem); +static void *fscache_caches_seq_next(struct seq_file *m, void *v, loff_t *_pos) +{ + return seq_list_next(v, &fscache_caches, _pos); +} - /* make sure all pages pinned by operations on behalf of the netfs are - * written to disk */ - fscache_stat(&fscache_n_cop_sync_cache); - cache->ops->sync_cache(cache); - fscache_stat_d(&fscache_n_cop_sync_cache); - - /* dissociate all the netfs pages backed by this cache from the block - * mappings in the cache */ - fscache_stat(&fscache_n_cop_dissociate_pages); - cache->ops->dissociate_pages(cache); - fscache_stat_d(&fscache_n_cop_dissociate_pages); - - /* we now have to destroy all the active objects pertaining to this - * cache - which we do by passing them off to thread pool to be - * disposed of */ - _debug("destroy"); - - fscache_withdraw_all_objects(cache, &dying_objects); - - /* wait for all extant objects to finish their outstanding operations - * and go away */ - _debug("wait for finish"); - wait_event(fscache_cache_cleared_wq, - atomic_read(&cache->object_count) == 0); - _debug("wait for clearance"); - wait_event(fscache_cache_cleared_wq, - list_empty(&cache->object_list)); - _debug("cleared"); - ASSERT(list_empty(&dying_objects)); - - kobject_put(cache->kobj); - - clear_bit(FSCACHE_TAG_RESERVED, &cache->tag->flags); - fscache_release_cache_tag(cache->tag); - cache->tag = NULL; - - _leave(""); +static void fscache_caches_seq_stop(struct seq_file *m, void *v) + __releases(fscache_addremove_sem) +{ + up_read(&fscache_addremove_sem); } -EXPORT_SYMBOL(fscache_withdraw_cache); + +const struct seq_operations fscache_caches_seq_ops = { + .start = fscache_caches_seq_start, + .next = fscache_caches_seq_next, + .stop = fscache_caches_seq_stop, + .show = fscache_caches_seq_show, +}; +#endif /* CONFIG_PROC_FS */ diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index cd42be646ed3..9bb1ab5fe5ed 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* netfs cookie management * - * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. 
* Written by David Howells (dhowells@redhat.com) * * See Documentation/filesystems/caching/netfs-api.rst for more information on @@ -15,70 +15,258 @@ struct kmem_cache *fscache_cookie_jar; -static atomic_t fscache_object_debug_id = ATOMIC_INIT(0); +static void fscache_cookie_lru_timed_out(struct timer_list *timer); +static void fscache_cookie_lru_worker(struct work_struct *work); +static void fscache_cookie_worker(struct work_struct *work); +static void fscache_unhash_cookie(struct fscache_cookie *cookie); +static void fscache_perform_invalidation(struct fscache_cookie *cookie); #define fscache_cookie_hash_shift 15 static struct hlist_bl_head fscache_cookie_hash[1 << fscache_cookie_hash_shift]; static LIST_HEAD(fscache_cookies); static DEFINE_RWLOCK(fscache_cookies_lock); - -static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie, - loff_t object_size); -static int fscache_alloc_object(struct fscache_cache *cache, - struct fscache_cookie *cookie); -static int fscache_attach_object(struct fscache_cookie *cookie, - struct fscache_object *object); - -static void fscache_print_cookie(struct fscache_cookie *cookie, char prefix) +static LIST_HEAD(fscache_cookie_lru); +static DEFINE_SPINLOCK(fscache_cookie_lru_lock); +DEFINE_TIMER(fscache_cookie_lru_timer, fscache_cookie_lru_timed_out); +static DECLARE_WORK(fscache_cookie_lru_work, fscache_cookie_lru_worker); +static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] = "-LCAIFUWRD"; +unsigned int fscache_lru_cookie_timeout = 10 * HZ; + +void fscache_print_cookie(struct fscache_cookie *cookie, char prefix) { - struct fscache_object *object; - struct hlist_node *o; const u8 *k; - unsigned loop; - pr_err("%c-cookie c=%08x [p=%08x fl=%lx nc=%u na=%u]\n", + pr_err("%c-cookie c=%08x [fl=%lx na=%u nA=%u s=%c]\n", prefix, cookie->debug_id, - cookie->parent ? cookie->parent->debug_id : 0, cookie->flags, - atomic_read(&cookie->n_children), - atomic_read(&cookie->n_active)); - pr_err("%c-cookie d=%p{%s} n=%p\n", + atomic_read(&cookie->n_active), + atomic_read(&cookie->n_accesses), + fscache_cookie_states[cookie->state]); + pr_err("%c-cookie V=%08x [%s]\n", prefix, - cookie->def, - cookie->def ? cookie->def->name : "?", - cookie->netfs_data); - - o = READ_ONCE(cookie->backing_objects.first); - if (o) { - object = hlist_entry(o, struct fscache_object, cookie_link); - pr_err("%c-cookie o=%u\n", prefix, object->debug_id); - } + cookie->volume->debug_id, + cookie->volume->key); - pr_err("%c-key=[%u] '", prefix, cookie->key_len); k = (cookie->key_len <= sizeof(cookie->inline_key)) ? 
cookie->inline_key : cookie->key; - for (loop = 0; loop < cookie->key_len; loop++) - pr_cont("%02x", k[loop]); - pr_cont("'\n"); + pr_err("%c-key=[%u] '%*phN'\n", prefix, cookie->key_len, cookie->key_len, k); } -void fscache_free_cookie(struct fscache_cookie *cookie) +static void fscache_free_cookie(struct fscache_cookie *cookie) { - if (cookie) { - BUG_ON(!hlist_empty(&cookie->backing_objects)); - write_lock(&fscache_cookies_lock); - list_del(&cookie->proc_link); - write_unlock(&fscache_cookies_lock); - if (cookie->aux_len > sizeof(cookie->inline_aux)) - kfree(cookie->aux); - if (cookie->key_len > sizeof(cookie->inline_key)) - kfree(cookie->key); - kmem_cache_free(fscache_cookie_jar, cookie); + if (WARN_ON_ONCE(!list_empty(&cookie->commit_link))) { + spin_lock(&fscache_cookie_lru_lock); + list_del_init(&cookie->commit_link); + spin_unlock(&fscache_cookie_lru_lock); + fscache_stat_d(&fscache_n_cookies_lru); + fscache_stat(&fscache_n_cookies_lru_removed); + } + + if (WARN_ON_ONCE(test_bit(FSCACHE_COOKIE_IS_HASHED, &cookie->flags))) { + fscache_print_cookie(cookie, 'F'); + return; } + + write_lock(&fscache_cookies_lock); + list_del(&cookie->proc_link); + write_unlock(&fscache_cookies_lock); + if (cookie->aux_len > sizeof(cookie->inline_aux)) + kfree(cookie->aux); + if (cookie->key_len > sizeof(cookie->inline_key)) + kfree(cookie->key); + fscache_stat_d(&fscache_n_cookies); + kmem_cache_free(fscache_cookie_jar, cookie); +} + +static void __fscache_queue_cookie(struct fscache_cookie *cookie) +{ + if (!queue_work(fscache_wq, &cookie->work)) + fscache_put_cookie(cookie, fscache_cookie_put_over_queued); +} + +static void fscache_queue_cookie(struct fscache_cookie *cookie, + enum fscache_cookie_trace where) +{ + fscache_get_cookie(cookie, where); + __fscache_queue_cookie(cookie); } /* + * Initialise the access gate on a cookie by setting a flag to prevent the + * state machine from being queued when the access counter transitions to 0. + * We're only interested in this when we withdraw caching services from the + * cookie. + */ +static void fscache_init_access_gate(struct fscache_cookie *cookie) +{ + int n_accesses; + + n_accesses = atomic_read(&cookie->n_accesses); + trace_fscache_access(cookie->debug_id, refcount_read(&cookie->ref), + n_accesses, fscache_access_cache_pin); + set_bit(FSCACHE_COOKIE_NO_ACCESS_WAKE, &cookie->flags); +} + +/** + * fscache_end_cookie_access - Unpin a cache at the end of an access. + * @cookie: A data file cookie + * @why: An indication of the circumstances of the access for tracing + * + * Unpin a cache cookie after we've accessed it and bring a deferred + * relinquishment or withdrawal state into effect. + * + * The @why indicator is provided for tracing purposes. + */ +void fscache_end_cookie_access(struct fscache_cookie *cookie, + enum fscache_access_trace why) +{ + int n_accesses; + + smp_mb__before_atomic(); + n_accesses = atomic_dec_return(&cookie->n_accesses); + trace_fscache_access(cookie->debug_id, refcount_read(&cookie->ref), + n_accesses, why); + if (n_accesses == 0 && + !test_bit(FSCACHE_COOKIE_NO_ACCESS_WAKE, &cookie->flags)) + fscache_queue_cookie(cookie, fscache_cookie_get_end_access); +} +EXPORT_SYMBOL(fscache_end_cookie_access); + +/* + * Pin the cache behind a cookie so that we can access it. + */ +static void __fscache_begin_cookie_access(struct fscache_cookie *cookie, + enum fscache_access_trace why) +{ + int n_accesses; + + n_accesses = atomic_inc_return(&cookie->n_accesses); + smp_mb__after_atomic(); /* (Future) read state after is-caching. 
+ * Reread n_accesses after is-caching + */ + trace_fscache_access(cookie->debug_id, refcount_read(&cookie->ref), + n_accesses, why); +} + +/** + * fscache_begin_cookie_access - Pin a cache so data can be accessed + * @cookie: A data file cookie + * @why: An indication of the circumstances of the access for tracing + * + * Attempt to pin the cache to prevent it from going away whilst we're + * accessing data and returns true if successful. This works as follows: + * + * (1) If the cookie is not being cached (ie. FSCACHE_COOKIE_IS_CACHING is not + * set), we return false to indicate access was not permitted. + * + * (2) If the cookie is being cached, we increment its n_accesses count and + * then recheck the IS_CACHING flag, ending the access if it got cleared. + * + * (3) When we end the access, we decrement the cookie's n_accesses and wake + * up the any waiters if it reaches 0. + * + * (4) Whilst the cookie is actively being cached, its n_accesses is kept + * artificially incremented to prevent wakeups from happening. + * + * (5) When the cache is taken offline or if the cookie is culled, the flag is + * cleared to prevent new accesses, the cookie's n_accesses is decremented + * and we wait for it to become 0. + * + * The @why indicator are merely provided for tracing purposes. + */ +bool fscache_begin_cookie_access(struct fscache_cookie *cookie, + enum fscache_access_trace why) +{ + if (!test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) + return false; + __fscache_begin_cookie_access(cookie, why); + if (!test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags) || + !fscache_cache_is_live(cookie->volume->cache)) { + fscache_end_cookie_access(cookie, fscache_access_unlive); + return false; + } + return true; +} + +static inline void wake_up_cookie_state(struct fscache_cookie *cookie) +{ + /* Use a barrier to ensure that waiters see the state variable + * change, as spin_unlock doesn't guarantee a barrier. + * + * See comments over wake_up_bit() and waitqueue_active(). + */ + smp_mb(); + wake_up_var(&cookie->state); +} + +/* + * Change the state a cookie is at and wake up anyone waiting for that. Impose + * an ordering between the stuff stored in the cookie and the state member. + * Paired with fscache_cookie_state(). + */ +static void __fscache_set_cookie_state(struct fscache_cookie *cookie, + enum fscache_cookie_state state) +{ + smp_store_release(&cookie->state, state); +} + +static void fscache_set_cookie_state(struct fscache_cookie *cookie, + enum fscache_cookie_state state) +{ + spin_lock(&cookie->lock); + __fscache_set_cookie_state(cookie, state); + spin_unlock(&cookie->lock); + wake_up_cookie_state(cookie); +} + +/** + * fscache_cookie_lookup_negative - Note negative lookup + * @cookie: The cookie that was being looked up + * + * Note that some part of the metadata path in the cache doesn't exist and so + * we can release any waiting readers in the certain knowledge that there's + * nothing for them to actually read. + * + * This function uses no locking and must only be called from the state machine. + */ +void fscache_cookie_lookup_negative(struct fscache_cookie *cookie) +{ + set_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags); + fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_CREATING); +} +EXPORT_SYMBOL(fscache_cookie_lookup_negative); + +/** + * fscache_resume_after_invalidation - Allow I/O to resume after invalidation + * @cookie: The cookie that was invalidated + * + * Tell fscache that invalidation is sufficiently complete that I/O can be + * allowed again. 
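[Aside, not part of the patch] The five-step protocol documented for fscache_begin_cookie_access() above boils down to: check the caching flag, pin the counter, then recheck the flag so a racing withdrawal cannot be missed. A compressed userspace sketch of that check/increment/recheck ordering (an atomic_bool stands in for FSCACHE_COOKIE_IS_CACHING; this is an analogy, not the kernel implementation, and it omits the wakeup on the final decrement):

/* Sketch of the pin-then-recheck access protocol; illustrative only. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct gate {
        atomic_bool caching;            /* stands in for FSCACHE_COOKIE_IS_CACHING */
        atomic_int  n_accesses;
};

static void end_access(struct gate *g)
{
        /* Real code would also wake a waiter on the transition to zero. */
        atomic_fetch_sub_explicit(&g->n_accesses, 1, memory_order_release);
}

static bool begin_access(struct gate *g)
{
        if (!atomic_load(&g->caching))
                return false;                   /* (1) not being cached */
        atomic_fetch_add(&g->n_accesses, 1);    /* (2) pin ... */
        if (!atomic_load(&g->caching)) {        /* ... then recheck the flag */
                end_access(g);                  /* lost the race with withdrawal */
                return false;
        }
        return true;                            /* pinned; caller must end_access() */
}

int main(void)
{
        struct gate g = { .caching = true, .n_accesses = 0 };

        if (begin_access(&g)) {
                puts("access permitted");
                end_access(&g);
        }

        atomic_store(&g.caching, false);        /* withdrawal clears the flag */
        printf("after withdrawal: %s\n",
               begin_access(&g) ? "permitted" : "refused");
        return 0;
}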
+ */ +void fscache_resume_after_invalidation(struct fscache_cookie *cookie) +{ + fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_ACTIVE); +} +EXPORT_SYMBOL(fscache_resume_after_invalidation); + +/** + * fscache_caching_failed - Report that a failure stopped caching on a cookie + * @cookie: The cookie that was affected + * + * Tell fscache that caching on a cookie needs to be stopped due to some sort + * of failure. + * + * This function uses no locking and must only be called from the state machine. + */ +void fscache_caching_failed(struct fscache_cookie *cookie) +{ + clear_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags); + fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_FAILED); +} +EXPORT_SYMBOL(fscache_caching_failed); + +/* * Set the index key in a cookie. The cookie struct has space for a 16-byte * key plus length and hash, but if that's not big enough, it's instead a * pointer to a buffer containing 3 bytes of hash, 1 byte of length and then @@ -87,38 +275,35 @@ void fscache_free_cookie(struct fscache_cookie *cookie) static int fscache_set_key(struct fscache_cookie *cookie, const void *index_key, size_t index_key_len) { - u32 *buf; - int bufs; + void *buf; + size_t buf_size; - bufs = DIV_ROUND_UP(index_key_len, sizeof(*buf)); + buf_size = round_up(index_key_len, sizeof(__le32)); if (index_key_len > sizeof(cookie->inline_key)) { - buf = kcalloc(bufs, sizeof(*buf), GFP_KERNEL); + buf = kzalloc(buf_size, GFP_KERNEL); if (!buf) return -ENOMEM; cookie->key = buf; } else { - buf = (u32 *)cookie->inline_key; + buf = cookie->inline_key; } memcpy(buf, index_key, index_key_len); - cookie->key_hash = fscache_hash(0, buf, bufs); + cookie->key_hash = fscache_hash(cookie->volume->key_hash, + buf, buf_size); return 0; } -static long fscache_compare_cookie(const struct fscache_cookie *a, - const struct fscache_cookie *b) +static bool fscache_cookie_same(const struct fscache_cookie *a, + const struct fscache_cookie *b) { const void *ka, *kb; - if (a->key_hash != b->key_hash) - return (long)a->key_hash - (long)b->key_hash; - if (a->parent != b->parent) - return (long)a->parent - (long)b->parent; - if (a->key_len != b->key_len) - return (long)a->key_len - (long)b->key_len; - if (a->type != b->type) - return (long)a->type - (long)b->type; + if (a->key_hash != b->key_hash || + a->volume != b->volume || + a->key_len != b->key_len) + return false; if (a->key_len <= sizeof(a->inline_key)) { ka = &a->inline_key; @@ -127,7 +312,7 @@ static long fscache_compare_cookie(const struct fscache_cookie *a, ka = a->key; kb = b->key; } - return memcmp(ka, kb, a->key_len); + return memcmp(ka, kb, a->key_len) == 0; } static atomic_t fscache_cookie_debug_id = ATOMIC_INIT(1); @@ -135,12 +320,11 @@ static atomic_t fscache_cookie_debug_id = ATOMIC_INIT(1); /* * Allocate a cookie. 
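[Aside, not part of the patch] fscache_set_key() above pads the key length up to a multiple of sizeof(__le32), keeps short keys inline in the cookie, and only allocates for longer ones, zeroing the buffer so the padding bytes hash consistently. A rough userspace equivalent of that storage decision (the struct layout and the 16-byte inline limit are assumptions made for the sketch):

/* Inline-vs-allocated key storage, padded to a 4-byte multiple; a sketch. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define INLINE_KEY_LEN 16

struct cookie_key {
        uint8_t  key_len;
        uint8_t *key;                           /* used when the key doesn't fit inline */
        uint8_t  inline_key[INLINE_KEY_LEN];
};

static size_t round_up4(size_t n)
{
        return (n + 3) & ~(size_t)3;            /* pad to a multiple of 4 bytes */
}

static int set_key(struct cookie_key *c, const void *key, size_t len)
{
        size_t buf_size = round_up4(len);
        void *buf;

        if (len > sizeof(c->inline_key)) {
                buf = calloc(1, buf_size);      /* zeroed, so padding hashes stably */
                if (!buf)
                        return -1;
                c->key = buf;
        } else {
                buf = c->inline_key;
        }
        memcpy(buf, key, len);
        c->key_len = (uint8_t)len;
        return 0;
}

int main(void)
{
        struct cookie_key c = { 0 };

        set_key(&c, "abcde", 5);                /* 5 bytes: stored inline, padded to 8 */
        printf("inline key, padded size %zu\n", round_up4(c.key_len));

        set_key(&c, "a-much-longer-key-than-sixteen-bytes", 36);
        printf("heap key at %p, padded size %zu\n",
               (void *)c.key, round_up4(c.key_len));
        free(c.key);
        return 0;
}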
*/ -struct fscache_cookie *fscache_alloc_cookie( - struct fscache_cookie *parent, - const struct fscache_cookie_def *def, +static struct fscache_cookie *fscache_alloc_cookie( + struct fscache_volume *volume, + u8 advice, const void *index_key, size_t index_key_len, const void *aux_data, size_t aux_data_len, - void *netfs_data, loff_t object_size) { struct fscache_cookie *cookie; @@ -149,9 +333,15 @@ struct fscache_cookie *fscache_alloc_cookie( cookie = kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL); if (!cookie) return NULL; + fscache_stat(&fscache_n_cookies); - cookie->key_len = index_key_len; - cookie->aux_len = aux_data_len; + cookie->volume = volume; + cookie->advice = advice; + cookie->key_len = index_key_len; + cookie->aux_len = aux_data_len; + cookie->object_size = object_size; + if (object_size == 0) + __set_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags); if (fscache_set_key(cookie, index_key, index_key_len) < 0) goto nomem; @@ -165,30 +355,16 @@ struct fscache_cookie *fscache_alloc_cookie( } refcount_set(&cookie->ref, 1); - atomic_set(&cookie->n_children, 0); cookie->debug_id = atomic_inc_return(&fscache_cookie_debug_id); - - /* We keep the active count elevated until relinquishment to prevent an - * attempt to wake up every time the object operations queue quiesces. - */ - atomic_set(&cookie->n_active, 1); - - cookie->def = def; - cookie->parent = parent; - cookie->netfs_data = netfs_data; - cookie->flags = (1 << FSCACHE_COOKIE_NO_DATA_YET); - cookie->type = def->type; spin_lock_init(&cookie->lock); - spin_lock_init(&cookie->stores_lock); - INIT_HLIST_HEAD(&cookie->backing_objects); - - /* radix tree insertion won't use the preallocation pool unless it's - * told it may not wait */ - INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); + INIT_LIST_HEAD(&cookie->commit_link); + INIT_WORK(&cookie->work, fscache_cookie_worker); + __fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_QUIESCENT); write_lock(&fscache_cookies_lock); list_add_tail(&cookie->proc_link, &fscache_cookies); write_unlock(&fscache_cookies_lock); + fscache_see_cookie(cookie, fscache_cookie_new_acquire); return cookie; nomem: @@ -196,13 +372,28 @@ nomem: return NULL; } +static void fscache_wait_on_collision(struct fscache_cookie *candidate, + struct fscache_cookie *wait_for) +{ + enum fscache_cookie_state *statep = &wait_for->state; + + wait_var_event_timeout(statep, READ_ONCE(*statep) == FSCACHE_COOKIE_STATE_DROPPED, + 20 * HZ); + if (READ_ONCE(*statep) != FSCACHE_COOKIE_STATE_DROPPED) { + pr_notice("Potential collision c=%08x old: c=%08x", + candidate->debug_id, wait_for->debug_id); + wait_var_event(statep, READ_ONCE(*statep) == FSCACHE_COOKIE_STATE_DROPPED); + } +} + /* * Attempt to insert the new cookie into the hash. If there's a collision, we - * return the old cookie if it's not in use and an error otherwise. + * wait for the old cookie to complete if it's being relinquished and an error + * otherwise. 
*/ -struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *candidate) +static bool fscache_hash_cookie(struct fscache_cookie *candidate) { - struct fscache_cookie *cursor; + struct fscache_cookie *cursor, *wait_for = NULL; struct hlist_bl_head *h; struct hlist_bl_node *p; unsigned int bucket; @@ -212,64 +403,53 @@ struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *candidate) hlist_bl_lock(h); hlist_bl_for_each_entry(cursor, p, h, hash_link) { - if (fscache_compare_cookie(candidate, cursor) == 0) - goto collision; + if (fscache_cookie_same(candidate, cursor)) { + if (!test_bit(FSCACHE_COOKIE_RELINQUISHED, &cursor->flags)) + goto collision; + wait_for = fscache_get_cookie(cursor, + fscache_cookie_get_hash_collision); + break; + } } - __set_bit(FSCACHE_COOKIE_ACQUIRED, &candidate->flags); - fscache_cookie_get(candidate->parent, fscache_cookie_get_acquire_parent); - atomic_inc(&candidate->parent->n_children); + fscache_get_volume(candidate->volume, fscache_volume_get_cookie); + atomic_inc(&candidate->volume->n_cookies); hlist_bl_add_head(&candidate->hash_link, h); + set_bit(FSCACHE_COOKIE_IS_HASHED, &candidate->flags); hlist_bl_unlock(h); - return candidate; -collision: - if (test_and_set_bit(FSCACHE_COOKIE_ACQUIRED, &cursor->flags)) { - trace_fscache_cookie(cursor->debug_id, refcount_read(&cursor->ref), - fscache_cookie_collision); - pr_err("Duplicate cookie detected\n"); - fscache_print_cookie(cursor, 'O'); - fscache_print_cookie(candidate, 'N'); - hlist_bl_unlock(h); - return NULL; + if (wait_for) { + fscache_wait_on_collision(candidate, wait_for); + fscache_put_cookie(wait_for, fscache_cookie_put_hash_collision); } + return true; - fscache_cookie_get(cursor, fscache_cookie_get_reacquire); +collision: + trace_fscache_cookie(cursor->debug_id, refcount_read(&cursor->ref), + fscache_cookie_collision); + pr_err("Duplicate cookie detected\n"); + fscache_print_cookie(cursor, 'O'); + fscache_print_cookie(candidate, 'N'); hlist_bl_unlock(h); - return cursor; + return false; } /* - * request a cookie to represent an object (index, datafile, xattr, etc) - * - parent specifies the parent object - * - the top level index cookie for each netfs is stored in the fscache_netfs - * struct upon registration - * - def points to the definition - * - the netfs_data will be passed to the functions pointed to in *def - * - all attached caches will be searched to see if they contain this object - * - index objects aren't stored on disk until there's a dependent file that - * needs storing - * - other objects are stored in a selected cache immediately, and all the - * indices forming the path to it are instantiated if necessary - * - we never let on to the netfs about errors - * - we may set a negative cookie pointer, but that's okay + * Request a cookie to represent a data storage object within a volume. + * + * We never let on to the netfs about errors. We may set a negative cookie + * pointer, but that's okay */ struct fscache_cookie *__fscache_acquire_cookie( - struct fscache_cookie *parent, - const struct fscache_cookie_def *def, + struct fscache_volume *volume, + u8 advice, const void *index_key, size_t index_key_len, const void *aux_data, size_t aux_data_len, - void *netfs_data, - loff_t object_size, - bool enable) + loff_t object_size) { - struct fscache_cookie *candidate, *cookie; - - BUG_ON(!def); + struct fscache_cookie *cookie; - _enter("{%s},{%s},%p,%u", - parent ? 
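[Aside, not part of the patch] fscache_hash_cookie()/fscache_wait_on_collision() above wait up to 20 seconds for a relinquished cookie with the same key to reach the DROPPED state, log a potential collision if it does not get there in time, and then keep waiting without a limit. The timed-wait-then-warn shape, sketched with pthread_cond_timedwait() standing in for wait_var_event_timeout() (illustrative only; the thread and timings are contrived so the warning path actually runs):

/* Timed wait for an old object to drop, then warn and wait on; a sketch. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static bool dropped;

static void wait_for_drop(int timeout_sec)
{
        struct timespec deadline;

        clock_gettime(CLOCK_REALTIME, &deadline);
        deadline.tv_sec += timeout_sec;

        pthread_mutex_lock(&lock);
        while (!dropped) {
                if (pthread_cond_timedwait(&cond, &lock, &deadline) != 0) {
                        /* Timed out: warn once, then wait without a limit. */
                        fprintf(stderr, "potential collision: old object still live\n");
                        while (!dropped)
                                pthread_cond_wait(&cond, &lock);
                        break;
                }
        }
        pthread_mutex_unlock(&lock);
}

static void *dropper(void *arg)
{
        (void)arg;
        sleep(2);                       /* the old cookie finishing relinquishment */
        pthread_mutex_lock(&lock);
        dropped = true;
        pthread_cond_broadcast(&cond);
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, dropper, NULL);
        wait_for_drop(1);               /* short timeout so the warning path runs */
        pthread_join(&t, NULL);
        puts("old object dropped; new cookie may proceed");
        return 0;
}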
(char *) parent->def->name : "<no-parent>", - def->name, netfs_data, enable); + _enter("V=%x", volume->debug_id); if (!index_key || !index_key_len || index_key_len > 255 || aux_data_len > 255) return NULL; @@ -280,563 +460,440 @@ struct fscache_cookie *__fscache_acquire_cookie( fscache_stat(&fscache_n_acquires); - /* if there's no parent cookie, then we don't create one here either */ - if (!parent) { - fscache_stat(&fscache_n_acquires_null); - _leave(" [no parent]"); - return NULL; - } - - /* validate the definition */ - BUG_ON(!def->name[0]); - - BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX && - parent->type != FSCACHE_COOKIE_TYPE_INDEX); - - candidate = fscache_alloc_cookie(parent, def, - index_key, index_key_len, - aux_data, aux_data_len, - netfs_data, object_size); - if (!candidate) { + cookie = fscache_alloc_cookie(volume, advice, + index_key, index_key_len, + aux_data, aux_data_len, + object_size); + if (!cookie) { fscache_stat(&fscache_n_acquires_oom); - _leave(" [ENOMEM]"); return NULL; } - cookie = fscache_hash_cookie(candidate); - if (!cookie) { - trace_fscache_cookie(candidate->debug_id, 1, - fscache_cookie_discard); - goto out; - } - - if (cookie == candidate) - candidate = NULL; - - switch (cookie->type) { - case FSCACHE_COOKIE_TYPE_INDEX: - fscache_stat(&fscache_n_cookie_index); - break; - case FSCACHE_COOKIE_TYPE_DATAFILE: - fscache_stat(&fscache_n_cookie_data); - break; - default: - fscache_stat(&fscache_n_cookie_special); - break; + if (!fscache_hash_cookie(cookie)) { + fscache_see_cookie(cookie, fscache_cookie_discard); + fscache_free_cookie(cookie); + return NULL; } trace_fscache_acquire(cookie); - - if (enable) { - /* if the object is an index then we need do nothing more here - * - we create indices on disk when we need them as an index - * may exist in multiple caches */ - if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) { - if (fscache_acquire_non_index_cookie(cookie, object_size) == 0) { - set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); - } else { - atomic_dec(&parent->n_children); - fscache_cookie_put(cookie, - fscache_cookie_put_acquire_nobufs); - fscache_stat(&fscache_n_acquires_nobufs); - _leave(" = NULL"); - return NULL; - } - } else { - set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); - } - } - fscache_stat(&fscache_n_acquires_ok); - -out: - fscache_free_cookie(candidate); + _leave(" = c=%08x", cookie->debug_id); return cookie; } EXPORT_SYMBOL(__fscache_acquire_cookie); /* - * Enable a cookie to permit it to accept new operations. + * Prepare a cache object to be written to. 
*/ -void __fscache_enable_cookie(struct fscache_cookie *cookie, - const void *aux_data, - loff_t object_size, - bool (*can_enable)(void *data), - void *data) +static void fscache_prepare_to_write(struct fscache_cookie *cookie) { - _enter("%x", cookie->debug_id); - - trace_fscache_enable(cookie); - - wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, - TASK_UNINTERRUPTIBLE); - - fscache_update_aux(cookie, aux_data); - - if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) - goto out_unlock; - - if (can_enable && !can_enable(data)) { - /* The netfs decided it didn't want to enable after all */ - } else if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) { - /* Wait for outstanding disablement to complete */ - __fscache_wait_on_invalidate(cookie); - - if (fscache_acquire_non_index_cookie(cookie, object_size) == 0) - set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); - } else { - set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); - } - -out_unlock: - clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags); - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK); + cookie->volume->cache->ops->prepare_to_write(cookie); } -EXPORT_SYMBOL(__fscache_enable_cookie); /* - * acquire a non-index cookie - * - this must make sure the index chain is instantiated and instantiate the - * object representation too + * Look up a cookie in the cache. */ -static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie, - loff_t object_size) +static void fscache_perform_lookup(struct fscache_cookie *cookie) { - struct fscache_object *object; - struct fscache_cache *cache; - int ret; + enum fscache_access_trace trace = fscache_access_lookup_cookie_end_failed; + bool need_withdraw = false; _enter(""); - set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); - - /* now we need to see whether the backing objects for this cookie yet - * exist, if not there'll be nothing to search */ - down_read(&fscache_addremove_sem); - - if (list_empty(&fscache_cache_list)) { - up_read(&fscache_addremove_sem); - _leave(" = 0 [no caches]"); - return 0; - } - - /* select a cache in which to store the object */ - cache = fscache_select_cache_for_object(cookie->parent); - if (!cache) { - up_read(&fscache_addremove_sem); - fscache_stat(&fscache_n_acquires_no_cache); - _leave(" = -ENOMEDIUM [no cache]"); - return -ENOMEDIUM; - } - - _debug("cache %s", cache->tag->name); - - set_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); - - /* ask the cache to allocate objects for this cookie and its parent - * chain */ - ret = fscache_alloc_object(cache, cookie); - if (ret < 0) { - up_read(&fscache_addremove_sem); - _leave(" = %d", ret); - return ret; - } - - spin_lock(&cookie->lock); - if (hlist_empty(&cookie->backing_objects)) { - spin_unlock(&cookie->lock); - goto unavailable; + if (!cookie->volume->cache_priv) { + fscache_create_volume(cookie->volume, true); + if (!cookie->volume->cache_priv) { + fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_QUIESCENT); + goto out; + } } - object = hlist_entry(cookie->backing_objects.first, - struct fscache_object, cookie_link); - - fscache_set_store_limit(object, object_size); - - /* initiate the process of looking up all the objects in the chain - * (done by fscache_initialise_object()) */ - fscache_raise_event(object, FSCACHE_OBJECT_EV_NEW_CHILD); - - spin_unlock(&cookie->lock); - - /* we may be required to wait for lookup to complete at this point */ - if (!fscache_defer_lookup) { - wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, - TASK_UNINTERRUPTIBLE); - if 
(test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags)) - goto unavailable; + if (!cookie->volume->cache->ops->lookup_cookie(cookie)) { + if (cookie->state != FSCACHE_COOKIE_STATE_FAILED) + fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_QUIESCENT); + need_withdraw = true; + _leave(" [fail]"); + goto out; } - up_read(&fscache_addremove_sem); - _leave(" = 0 [deferred]"); - return 0; + fscache_see_cookie(cookie, fscache_cookie_see_active); + fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_ACTIVE); + trace = fscache_access_lookup_cookie_end; -unavailable: - up_read(&fscache_addremove_sem); - _leave(" = -ENOBUFS"); - return -ENOBUFS; +out: + fscache_end_cookie_access(cookie, trace); + if (need_withdraw) + fscache_withdraw_cookie(cookie); + fscache_end_volume_access(cookie->volume, cookie, trace); } /* - * recursively allocate cache object records for a cookie/cache combination - * - caller must be holding the addremove sem + * Begin the process of looking up a cookie. We offload the actual process to + * a worker thread. */ -static int fscache_alloc_object(struct fscache_cache *cache, - struct fscache_cookie *cookie) +static bool fscache_begin_lookup(struct fscache_cookie *cookie, bool will_modify) { - struct fscache_object *object; - int ret; - - _enter("%s,%x{%s}", cache->tag->name, cookie->debug_id, cookie->def->name); - - spin_lock(&cookie->lock); - hlist_for_each_entry(object, &cookie->backing_objects, - cookie_link) { - if (object->cache == cache) - goto object_already_extant; + if (will_modify) { + set_bit(FSCACHE_COOKIE_LOCAL_WRITE, &cookie->flags); + set_bit(FSCACHE_COOKIE_DO_PREP_TO_WRITE, &cookie->flags); } - spin_unlock(&cookie->lock); - - /* ask the cache to allocate an object (we may end up with duplicate - * objects at this stage, but we sort that out later) */ - fscache_stat(&fscache_n_cop_alloc_object); - object = cache->ops->alloc_object(cache, cookie); - fscache_stat_d(&fscache_n_cop_alloc_object); - if (IS_ERR(object)) { - fscache_stat(&fscache_n_object_no_alloc); - ret = PTR_ERR(object); - goto error; - } - - ASSERTCMP(object->cookie, ==, cookie); - fscache_stat(&fscache_n_object_alloc); - - object->debug_id = atomic_inc_return(&fscache_object_debug_id); - - _debug("ALLOC OBJ%x: %s {%lx}", - object->debug_id, cookie->def->name, object->events); - - ret = fscache_alloc_object(cache, cookie->parent); - if (ret < 0) - goto error_put; - - /* only attach if we managed to allocate all we needed, otherwise - * discard the object we just allocated and instead use the one - * attached to the cookie */ - if (fscache_attach_object(cookie, object) < 0) { - fscache_stat(&fscache_n_cop_put_object); - cache->ops->put_object(object, fscache_obj_put_attach_fail); - fscache_stat_d(&fscache_n_cop_put_object); - } - - _leave(" = 0"); - return 0; - -object_already_extant: - ret = -ENOBUFS; - if (fscache_object_is_dying(object) || - fscache_cache_is_broken(object)) { - spin_unlock(&cookie->lock); - goto error; - } - spin_unlock(&cookie->lock); - _leave(" = 0 [found]"); - return 0; - -error_put: - fscache_stat(&fscache_n_cop_put_object); - cache->ops->put_object(object, fscache_obj_put_alloc_fail); - fscache_stat_d(&fscache_n_cop_put_object); -error: - _leave(" = %d", ret); - return ret; + if (!fscache_begin_volume_access(cookie->volume, cookie, + fscache_access_lookup_cookie)) + return false; + + __fscache_begin_cookie_access(cookie, fscache_access_lookup_cookie); + __fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_LOOKING_UP); + set_bit(FSCACHE_COOKIE_IS_CACHING, 
&cookie->flags); + set_bit(FSCACHE_COOKIE_HAS_BEEN_CACHED, &cookie->flags); + return true; } /* - * attach a cache object to a cookie + * Start using the cookie for I/O. This prevents the backing object from being + * reaped by VM pressure. */ -static int fscache_attach_object(struct fscache_cookie *cookie, - struct fscache_object *object) +void __fscache_use_cookie(struct fscache_cookie *cookie, bool will_modify) { - struct fscache_object *p; - struct fscache_cache *cache = object->cache; - int ret; + enum fscache_cookie_state state; + bool queue = false; + int n_active; - _enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id); + _enter("c=%08x", cookie->debug_id); - ASSERTCMP(object->cookie, ==, cookie); + if (WARN(test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags), + "Trying to use relinquished cookie\n")) + return; spin_lock(&cookie->lock); - /* there may be multiple initial creations of this object, but we only - * want one */ - ret = -EEXIST; - hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) { - if (p->cache == object->cache) { - if (fscache_object_is_dying(p)) - ret = -ENOBUFS; - goto cant_attach_object; - } - } + n_active = atomic_inc_return(&cookie->n_active); + trace_fscache_active(cookie->debug_id, refcount_read(&cookie->ref), + n_active, atomic_read(&cookie->n_accesses), + will_modify ? + fscache_active_use_modify : fscache_active_use); + +again: + state = fscache_cookie_state(cookie); + switch (state) { + case FSCACHE_COOKIE_STATE_QUIESCENT: + queue = fscache_begin_lookup(cookie, will_modify); + break; - /* pin the parent object */ - spin_lock_nested(&cookie->parent->lock, 1); - hlist_for_each_entry(p, &cookie->parent->backing_objects, - cookie_link) { - if (p->cache == object->cache) { - if (fscache_object_is_dying(p)) { - ret = -ENOBUFS; - spin_unlock(&cookie->parent->lock); - goto cant_attach_object; - } - object->parent = p; - spin_lock(&p->lock); - p->n_children++; - spin_unlock(&p->lock); - break; + case FSCACHE_COOKIE_STATE_LOOKING_UP: + case FSCACHE_COOKIE_STATE_CREATING: + if (will_modify) + set_bit(FSCACHE_COOKIE_LOCAL_WRITE, &cookie->flags); + break; + case FSCACHE_COOKIE_STATE_ACTIVE: + case FSCACHE_COOKIE_STATE_INVALIDATING: + if (will_modify && + !test_and_set_bit(FSCACHE_COOKIE_LOCAL_WRITE, &cookie->flags)) { + set_bit(FSCACHE_COOKIE_DO_PREP_TO_WRITE, &cookie->flags); + queue = true; } - } - spin_unlock(&cookie->parent->lock); - - /* attach to the cache's object list */ - if (list_empty(&object->cache_link)) { - spin_lock(&cache->object_list_lock); - list_add(&object->cache_link, &cache->object_list); - spin_unlock(&cache->object_list_lock); - } - - /* Attach to the cookie. The object already has a ref on it. */ - hlist_add_head(&object->cookie_link, &cookie->backing_objects); - ret = 0; - -cant_attach_object: - spin_unlock(&cookie->lock); - _leave(" = %d", ret); - return ret; -} - -/* - * Invalidate an object. Callable with spinlocks held. - */ -void __fscache_invalidate(struct fscache_cookie *cookie) -{ - struct fscache_object *object; - - _enter("{%s}", cookie->def->name); - - fscache_stat(&fscache_n_invalidates); + break; - /* Only permit invalidation of data files. Invalidating an index will - * require the caller to release all its attachments to the tree rooted - * there, and if it's doing that, it may as well just retire the - * cookie. 
- */ - ASSERTCMP(cookie->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE); + case FSCACHE_COOKIE_STATE_FAILED: + case FSCACHE_COOKIE_STATE_WITHDRAWING: + break; - /* If there's an object, we tell the object state machine to handle the - * invalidation on our behalf, otherwise there's nothing to do. - */ - if (!hlist_empty(&cookie->backing_objects)) { + case FSCACHE_COOKIE_STATE_LRU_DISCARDING: + spin_unlock(&cookie->lock); + wait_var_event(&cookie->state, + fscache_cookie_state(cookie) != + FSCACHE_COOKIE_STATE_LRU_DISCARDING); spin_lock(&cookie->lock); + goto again; - if (fscache_cookie_enabled(cookie) && - !hlist_empty(&cookie->backing_objects) && - !test_and_set_bit(FSCACHE_COOKIE_INVALIDATING, - &cookie->flags)) { - object = hlist_entry(cookie->backing_objects.first, - struct fscache_object, - cookie_link); - if (fscache_object_is_live(object)) - fscache_raise_event( - object, FSCACHE_OBJECT_EV_INVALIDATE); - } - - spin_unlock(&cookie->lock); + case FSCACHE_COOKIE_STATE_DROPPED: + case FSCACHE_COOKIE_STATE_RELINQUISHING: + WARN(1, "Can't use cookie in state %u\n", state); + break; } + spin_unlock(&cookie->lock); + if (queue) + fscache_queue_cookie(cookie, fscache_cookie_get_use_work); _leave(""); } -EXPORT_SYMBOL(__fscache_invalidate); +EXPORT_SYMBOL(__fscache_use_cookie); -/* - * Wait for object invalidation to complete. - */ -void __fscache_wait_on_invalidate(struct fscache_cookie *cookie) +static void fscache_unuse_cookie_locked(struct fscache_cookie *cookie) { - _enter("%x", cookie->debug_id); + clear_bit(FSCACHE_COOKIE_DISABLED, &cookie->flags); + if (!test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) + return; - wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING, - TASK_UNINTERRUPTIBLE); + cookie->unused_at = jiffies; + spin_lock(&fscache_cookie_lru_lock); + if (list_empty(&cookie->commit_link)) { + fscache_get_cookie(cookie, fscache_cookie_get_lru); + fscache_stat(&fscache_n_cookies_lru); + } + list_move_tail(&cookie->commit_link, &fscache_cookie_lru); - _leave(""); + spin_unlock(&fscache_cookie_lru_lock); + timer_reduce(&fscache_cookie_lru_timer, + jiffies + fscache_lru_cookie_timeout); } -EXPORT_SYMBOL(__fscache_wait_on_invalidate); /* - * update the index entries backing a cookie + * Stop using the cookie for I/O. */ -void __fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data) +void __fscache_unuse_cookie(struct fscache_cookie *cookie, + const void *aux_data, const loff_t *object_size) { - struct fscache_object *object; - - fscache_stat(&fscache_n_updates); - - if (!cookie) { - fscache_stat(&fscache_n_updates_null); - _leave(" [no cookie]"); + unsigned int debug_id = cookie->debug_id; + unsigned int r = refcount_read(&cookie->ref); + unsigned int a = atomic_read(&cookie->n_accesses); + unsigned int c; + + if (aux_data || object_size) + __fscache_update_cookie(cookie, aux_data, object_size); + + /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ + c = atomic_fetch_add_unless(&cookie->n_active, -1, 1); + if (c != 1) { + trace_fscache_active(debug_id, r, c - 1, a, fscache_active_unuse); return; } - _enter("{%s}", cookie->def->name); - spin_lock(&cookie->lock); - - fscache_update_aux(cookie, aux_data); - - if (fscache_cookie_enabled(cookie)) { - /* update the index entry on disk in each cache backing this - * cookie. 
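[Aside, not part of the patch] __fscache_unuse_cookie() above drops n_active with atomic_fetch_add_unless(..., -1, 1) so that only the 1 -> 0 transition falls through to the locked slow path. For readers unfamiliar with that primitive, an equivalent compare-and-swap loop in C11 atomics looks roughly like this (a sketch of the semantics, not the kernel's implementation):

/* fetch_add_unless(): add 'a' unless the counter equals 'unless'; a sketch. */
#include <stdatomic.h>
#include <stdio.h>

static int fetch_add_unless(atomic_int *v, int a, int unless)
{
        int old = atomic_load(v);

        /* Retry until we either observe 'unless' or our CAS wins. */
        while (old != unless &&
               !atomic_compare_exchange_weak(v, &old, old + a))
                ;
        return old;             /* the value seen before any modification */
}

int main(void)
{
        atomic_int n_active = 2;

        /* Fast path: 2 -> 1, the caller was not the last user. */
        printf("was %d, now %d\n", fetch_add_unless(&n_active, -1, 1),
               atomic_load(&n_active));

        /* Refused: the counter is 1, so the drop to 0 is left to the slow
         * path, which in the patch happens under cookie->lock. */
        printf("was %d, now %d\n", fetch_add_unless(&n_active, -1, 1),
               atomic_load(&n_active));
        return 0;
}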
- */ - hlist_for_each_entry(object, - &cookie->backing_objects, cookie_link) { - fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE); - } - } - + r = refcount_read(&cookie->ref); + a = atomic_read(&cookie->n_accesses); + c = atomic_dec_return(&cookie->n_active); + trace_fscache_active(debug_id, r, c, a, fscache_active_unuse); + if (c == 0) + fscache_unuse_cookie_locked(cookie); spin_unlock(&cookie->lock); - _leave(""); } -EXPORT_SYMBOL(__fscache_update_cookie); +EXPORT_SYMBOL(__fscache_unuse_cookie); /* - * Disable a cookie to stop it from accepting new requests from the netfs. + * Perform work upon the cookie, such as committing its cache state, + * relinquishing it or withdrawing the backing cache. We're protected from the + * cache going away under us as object withdrawal must come through this + * non-reentrant work item. */ -void __fscache_disable_cookie(struct fscache_cookie *cookie, - const void *aux_data, - bool invalidate) +static void fscache_cookie_state_machine(struct fscache_cookie *cookie) { - struct fscache_object *object; - bool awaken = false; + enum fscache_cookie_state state; + bool wake = false; - _enter("%x,%u", cookie->debug_id, invalidate); + _enter("c=%x", cookie->debug_id); - trace_fscache_disable(cookie); - - ASSERTCMP(atomic_read(&cookie->n_active), >, 0); - - if (atomic_read(&cookie->n_children) != 0) { - pr_err("Cookie '%s' still has children\n", - cookie->def->name); - BUG(); - } +again: + spin_lock(&cookie->lock); +again_locked: + state = cookie->state; + switch (state) { + case FSCACHE_COOKIE_STATE_QUIESCENT: + /* The QUIESCENT state is jumped to the LOOKING_UP state by + * fscache_use_cookie(). + */ - wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, - TASK_UNINTERRUPTIBLE); + if (atomic_read(&cookie->n_accesses) == 0 && + test_bit(FSCACHE_COOKIE_DO_RELINQUISH, &cookie->flags)) { + __fscache_set_cookie_state(cookie, + FSCACHE_COOKIE_STATE_RELINQUISHING); + wake = true; + goto again_locked; + } + break; - fscache_update_aux(cookie, aux_data); + case FSCACHE_COOKIE_STATE_LOOKING_UP: + spin_unlock(&cookie->lock); + fscache_init_access_gate(cookie); + fscache_perform_lookup(cookie); + goto again; - if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) - goto out_unlock_enable; + case FSCACHE_COOKIE_STATE_INVALIDATING: + spin_unlock(&cookie->lock); + fscache_perform_invalidation(cookie); + goto again; + + case FSCACHE_COOKIE_STATE_ACTIVE: + if (test_and_clear_bit(FSCACHE_COOKIE_DO_PREP_TO_WRITE, &cookie->flags)) { + spin_unlock(&cookie->lock); + fscache_prepare_to_write(cookie); + spin_lock(&cookie->lock); + } + if (test_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags)) { + __fscache_set_cookie_state(cookie, + FSCACHE_COOKIE_STATE_LRU_DISCARDING); + wake = true; + goto again_locked; + } + fallthrough; - /* If the cookie is being invalidated, wait for that to complete first - * so that we can reuse the flag. 
- */ - __fscache_wait_on_invalidate(cookie); + case FSCACHE_COOKIE_STATE_FAILED: + if (atomic_read(&cookie->n_accesses) != 0) + break; + if (test_bit(FSCACHE_COOKIE_DO_RELINQUISH, &cookie->flags)) { + __fscache_set_cookie_state(cookie, + FSCACHE_COOKIE_STATE_RELINQUISHING); + wake = true; + goto again_locked; + } + if (test_bit(FSCACHE_COOKIE_DO_WITHDRAW, &cookie->flags)) { + __fscache_set_cookie_state(cookie, + FSCACHE_COOKIE_STATE_WITHDRAWING); + wake = true; + goto again_locked; + } + break; - /* Dispose of the backing objects */ - set_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags); + case FSCACHE_COOKIE_STATE_LRU_DISCARDING: + case FSCACHE_COOKIE_STATE_RELINQUISHING: + case FSCACHE_COOKIE_STATE_WITHDRAWING: + if (cookie->cache_priv) { + spin_unlock(&cookie->lock); + cookie->volume->cache->ops->withdraw_cookie(cookie); + spin_lock(&cookie->lock); + } - spin_lock(&cookie->lock); - if (!hlist_empty(&cookie->backing_objects)) { - hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) { - if (invalidate) - set_bit(FSCACHE_OBJECT_RETIRED, &object->flags); - clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); - fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL); + switch (state) { + case FSCACHE_COOKIE_STATE_RELINQUISHING: + fscache_see_cookie(cookie, fscache_cookie_see_relinquish); + fscache_unhash_cookie(cookie); + __fscache_set_cookie_state(cookie, + FSCACHE_COOKIE_STATE_DROPPED); + wake = true; + goto out; + case FSCACHE_COOKIE_STATE_LRU_DISCARDING: + fscache_see_cookie(cookie, fscache_cookie_see_lru_discard); + break; + case FSCACHE_COOKIE_STATE_WITHDRAWING: + fscache_see_cookie(cookie, fscache_cookie_see_withdraw); + break; + default: + BUG(); } - } else { - if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) - awaken = true; - } - spin_unlock(&cookie->lock); - if (awaken) - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); - /* Wait for cessation of activity requiring access to the netfs (when - * n_active reaches 0). This makes sure outstanding reads and writes - * have completed. - */ - if (!atomic_dec_and_test(&cookie->n_active)) { - wait_var_event(&cookie->n_active, - !atomic_read(&cookie->n_active)); - } + clear_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &cookie->flags); + clear_bit(FSCACHE_COOKIE_DO_WITHDRAW, &cookie->flags); + clear_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags); + clear_bit(FSCACHE_COOKIE_DO_PREP_TO_WRITE, &cookie->flags); + set_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags); + __fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_QUIESCENT); + wake = true; + goto again_locked; - /* Make sure any pending writes are cancelled. 
*/ - if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) - fscache_invalidate_writes(cookie); + case FSCACHE_COOKIE_STATE_DROPPED: + break; - /* Reset the cookie state if it wasn't relinquished */ - if (!test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags)) { - atomic_inc(&cookie->n_active); - set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + default: + WARN_ONCE(1, "Cookie %x in unexpected state %u\n", + cookie->debug_id, state); + break; } -out_unlock_enable: - clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags); - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK); +out: + spin_unlock(&cookie->lock); + if (wake) + wake_up_cookie_state(cookie); _leave(""); } -EXPORT_SYMBOL(__fscache_disable_cookie); + +static void fscache_cookie_worker(struct work_struct *work) +{ + struct fscache_cookie *cookie = container_of(work, struct fscache_cookie, work); + + fscache_see_cookie(cookie, fscache_cookie_see_work); + fscache_cookie_state_machine(cookie); + fscache_put_cookie(cookie, fscache_cookie_put_work); +} /* - * release a cookie back to the cache - * - the object will be marked as recyclable on disk if retire is true - * - all dependents of this cookie must have already been unregistered - * (indices/files/pages) + * Wait for the object to become inactive. The cookie's work item will be + * scheduled when someone transitions n_accesses to 0 - but if someone's + * already done that, schedule it anyway. */ -void __fscache_relinquish_cookie(struct fscache_cookie *cookie, - const void *aux_data, - bool retire) +static void __fscache_withdraw_cookie(struct fscache_cookie *cookie) { - fscache_stat(&fscache_n_relinquishes); - if (retire) - fscache_stat(&fscache_n_relinquishes_retire); + int n_accesses; + bool unpinned; + + unpinned = test_and_clear_bit(FSCACHE_COOKIE_NO_ACCESS_WAKE, &cookie->flags); + + /* Need to read the access count after unpinning */ + n_accesses = atomic_read(&cookie->n_accesses); + if (unpinned) + trace_fscache_access(cookie->debug_id, refcount_read(&cookie->ref), + n_accesses, fscache_access_cache_unpin); + if (n_accesses == 0) + fscache_queue_cookie(cookie, fscache_cookie_get_end_access); +} - if (!cookie) { - fscache_stat(&fscache_n_relinquishes_null); - _leave(" [no cookie]"); - return; - } +static void fscache_cookie_lru_do_one(struct fscache_cookie *cookie) +{ + fscache_see_cookie(cookie, fscache_cookie_see_lru_do_one); - _enter("%x{%s,%d},%d", - cookie->debug_id, cookie->def->name, - atomic_read(&cookie->n_active), retire); + spin_lock(&cookie->lock); + if (cookie->state != FSCACHE_COOKIE_STATE_ACTIVE || + time_before(jiffies, cookie->unused_at + fscache_lru_cookie_timeout) || + atomic_read(&cookie->n_active) > 0) { + spin_unlock(&cookie->lock); + fscache_stat(&fscache_n_cookies_lru_removed); + } else { + set_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags); + spin_unlock(&cookie->lock); + fscache_stat(&fscache_n_cookies_lru_expired); + _debug("lru c=%x", cookie->debug_id); + __fscache_withdraw_cookie(cookie); + } - trace_fscache_relinquish(cookie, retire); + fscache_put_cookie(cookie, fscache_cookie_put_lru); +} - /* No further netfs-accessing operations on this cookie permitted */ - if (test_and_set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags)) - BUG(); +static void fscache_cookie_lru_worker(struct work_struct *work) +{ + struct fscache_cookie *cookie; + unsigned long unused_at; - __fscache_disable_cookie(cookie, aux_data, retire); + spin_lock(&fscache_cookie_lru_lock); - /* Clear pointers back to the netfs */ - cookie->netfs_data = NULL; - 
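[Aside, not part of the patch] The cookie state machine above follows one pattern throughout: take cookie->lock, switch on the state, either make a transition under the lock and jump back to re-evaluate, or drop the lock for blocking work and restart from the top. A stripped-down sketch of that control flow (the states and the work function are invented for illustration and do not mirror the real transitions):

/* Minimal lock/switch/re-evaluate state machine loop; illustrative only. */
#include <pthread.h>
#include <stdio.h>

enum state { ST_QUIESCENT, ST_LOOKING_UP, ST_ACTIVE, ST_WITHDRAWING, ST_DROPPED };

struct object {
        pthread_mutex_t lock;
        enum state state;
};

static void do_lookup(struct object *o)
{
        /* Blocking work is done with the lock dropped. */
        puts("looking up backing object");
        pthread_mutex_lock(&o->lock);
        o->state = ST_ACTIVE;
        pthread_mutex_unlock(&o->lock);
}

static void state_machine(struct object *o)
{
again:
        pthread_mutex_lock(&o->lock);
again_locked:
        switch (o->state) {
        case ST_QUIESCENT:
                o->state = ST_LOOKING_UP;       /* transition under the lock ... */
                goto again_locked;              /* ... and re-evaluate immediately */

        case ST_LOOKING_UP:
                pthread_mutex_unlock(&o->lock);
                do_lookup(o);                   /* blocking work without the lock */
                goto again;

        case ST_ACTIVE:
                puts("active: nothing to do");
                break;

        case ST_WITHDRAWING:
                o->state = ST_DROPPED;
                goto again_locked;

        case ST_DROPPED:
                break;
        }
        pthread_mutex_unlock(&o->lock);
}

int main(void)
{
        struct object o = { .lock = PTHREAD_MUTEX_INITIALIZER, .state = ST_QUIESCENT };

        state_machine(&o);
        printf("final state %d\n", o.state);
        return 0;
}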
cookie->def = NULL; - BUG_ON(!radix_tree_empty(&cookie->stores)); + while (!list_empty(&fscache_cookie_lru)) { + cookie = list_first_entry(&fscache_cookie_lru, + struct fscache_cookie, commit_link); + unused_at = cookie->unused_at + fscache_lru_cookie_timeout; + if (time_before(jiffies, unused_at)) { + timer_reduce(&fscache_cookie_lru_timer, unused_at); + break; + } - if (cookie->parent) { - ASSERTCMP(refcount_read(&cookie->parent->ref), >, 0); - ASSERTCMP(atomic_read(&cookie->parent->n_children), >, 0); - atomic_dec(&cookie->parent->n_children); + list_del_init(&cookie->commit_link); + fscache_stat_d(&fscache_n_cookies_lru); + spin_unlock(&fscache_cookie_lru_lock); + fscache_cookie_lru_do_one(cookie); + spin_lock(&fscache_cookie_lru_lock); } - /* Dispose of the netfs's link to the cookie */ - fscache_cookie_put(cookie, fscache_cookie_put_relinquish); + spin_unlock(&fscache_cookie_lru_lock); +} - _leave(""); +static void fscache_cookie_lru_timed_out(struct timer_list *timer) +{ + queue_work(fscache_wq, &fscache_cookie_lru_work); +} + +static void fscache_cookie_drop_from_lru(struct fscache_cookie *cookie) +{ + bool need_put = false; + + if (!list_empty(&cookie->commit_link)) { + spin_lock(&fscache_cookie_lru_lock); + if (!list_empty(&cookie->commit_link)) { + list_del_init(&cookie->commit_link); + fscache_stat_d(&fscache_n_cookies_lru); + fscache_stat(&fscache_n_cookies_lru_dropped); + need_put = true; + } + spin_unlock(&fscache_cookie_lru_lock); + if (need_put) + fscache_put_cookie(cookie, fscache_cookie_put_lru); + } } -EXPORT_SYMBOL(__fscache_relinquish_cookie); /* * Remove a cookie from the hash table. @@ -851,43 +908,91 @@ static void fscache_unhash_cookie(struct fscache_cookie *cookie) hlist_bl_lock(h); hlist_bl_del(&cookie->hash_link); + clear_bit(FSCACHE_COOKIE_IS_HASHED, &cookie->flags); hlist_bl_unlock(h); + fscache_stat(&fscache_n_relinquishes_dropped); } +static void fscache_drop_withdraw_cookie(struct fscache_cookie *cookie) +{ + fscache_cookie_drop_from_lru(cookie); + __fscache_withdraw_cookie(cookie); +} + +/** + * fscache_withdraw_cookie - Mark a cookie for withdrawal + * @cookie: The cookie to be withdrawn. + * + * Allow the cache backend to withdraw the backing for a cookie for its own + * reasons, even if that cookie is in active use. + */ +void fscache_withdraw_cookie(struct fscache_cookie *cookie) +{ + set_bit(FSCACHE_COOKIE_DO_WITHDRAW, &cookie->flags); + fscache_drop_withdraw_cookie(cookie); +} +EXPORT_SYMBOL(fscache_withdraw_cookie); + /* - * Drop a reference to a cookie. + * Allow the netfs to release a cookie back to the cache. 
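[Aside, not part of the patch] The cookie LRU above records unused_at when the last user goes away, queues the cookie at the tail of a list, and has the worker discard entries idle for longer than fscache_lru_cookie_timeout, re-arming the timer for the first entry that is not yet due. A self-contained sketch of that expiry walk (an array and wall-clock seconds stand in for the kernel list, jiffies and the timer):

/* Expire idle entries older than a timeout, oldest first; a sketch. */
#include <stdio.h>
#include <time.h>

#define TIMEOUT_SEC 10

struct entry {
        const char *name;
        time_t unused_at;               /* when the last user went away */
};

/* Kept in order of unused_at, oldest first, like a list_move_tail() LRU. */
static struct entry lru[] = {
        { "cookie-a", 0 },              /* filled in by main() */
        { "cookie-b", 0 },
        { "cookie-c", 0 },
};

static void lru_worker(time_t now)
{
        for (size_t i = 0; i < sizeof(lru) / sizeof(lru[0]); i++) {
                time_t expires = lru[i].unused_at + TIMEOUT_SEC;

                if (now < expires) {
                        /* Not yet due: this is where the kernel re-arms the
                         * timer for 'expires' and stops walking the list. */
                        printf("stop: %s due in %lds\n",
                               lru[i].name, (long)(expires - now));
                        return;
                }
                printf("discard %s (idle %lds)\n",
                       lru[i].name, (long)(now - lru[i].unused_at));
        }
}

int main(void)
{
        time_t now = time(NULL);

        lru[0].unused_at = now - 30;    /* long idle: discarded */
        lru[1].unused_at = now - 15;    /* past the timeout: discarded */
        lru[2].unused_at = now - 3;     /* recently used: worker stops here */

        lru_worker(now);
        return 0;
}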
+ * - the object will be marked as recyclable on disk if retire is true */ -void fscache_cookie_put(struct fscache_cookie *cookie, - enum fscache_cookie_trace where) +void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire) { - struct fscache_cookie *parent; - int ref; + fscache_stat(&fscache_n_relinquishes); + if (retire) + fscache_stat(&fscache_n_relinquishes_retire); + + _enter("c=%08x{%d},%d", + cookie->debug_id, atomic_read(&cookie->n_active), retire); - _enter("%x", cookie->debug_id); + if (WARN(test_and_set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags), + "Cookie c=%x already relinquished\n", cookie->debug_id)) + return; - do { - unsigned int cookie_debug_id = cookie->debug_id; - bool zero = __refcount_dec_and_test(&cookie->ref, &ref); + if (retire) + set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags); + trace_fscache_relinquish(cookie, retire); - trace_fscache_cookie(cookie_debug_id, ref - 1, where); - if (!zero) - return; + ASSERTCMP(atomic_read(&cookie->n_active), ==, 0); + ASSERTCMP(atomic_read(&cookie->volume->n_cookies), >, 0); + atomic_dec(&cookie->volume->n_cookies); - parent = cookie->parent; + if (test_bit(FSCACHE_COOKIE_HAS_BEEN_CACHED, &cookie->flags)) { + set_bit(FSCACHE_COOKIE_DO_RELINQUISH, &cookie->flags); + fscache_drop_withdraw_cookie(cookie); + } else { + fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_DROPPED); fscache_unhash_cookie(cookie); - fscache_free_cookie(cookie); + } + fscache_put_cookie(cookie, fscache_cookie_put_relinquish); +} +EXPORT_SYMBOL(__fscache_relinquish_cookie); - cookie = parent; - where = fscache_cookie_put_parent; - } while (cookie); +/* + * Drop a reference to a cookie. + */ +void fscache_put_cookie(struct fscache_cookie *cookie, + enum fscache_cookie_trace where) +{ + struct fscache_volume *volume = cookie->volume; + unsigned int cookie_debug_id = cookie->debug_id; + bool zero; + int ref; - _leave(""); + zero = __refcount_dec_and_test(&cookie->ref, &ref); + trace_fscache_cookie(cookie_debug_id, ref - 1, where); + if (zero) { + fscache_free_cookie(cookie); + fscache_put_volume(volume, fscache_volume_put_cookie); + } } +EXPORT_SYMBOL(fscache_put_cookie); /* * Get a reference to a cookie. */ -struct fscache_cookie *fscache_cookie_get(struct fscache_cookie *cookie, +struct fscache_cookie *fscache_get_cookie(struct fscache_cookie *cookie, enum fscache_cookie_trace where) { int ref; @@ -896,85 +1001,73 @@ struct fscache_cookie *fscache_cookie_get(struct fscache_cookie *cookie, trace_fscache_cookie(cookie->debug_id, ref + 1, where); return cookie; } +EXPORT_SYMBOL(fscache_get_cookie); /* - * check the consistency between the netfs inode and the backing cache - * - * NOTE: it only serves no-index type + * Ask the cache to effect invalidation of a cookie. */ -int __fscache_check_consistency(struct fscache_cookie *cookie, - const void *aux_data) +static void fscache_perform_invalidation(struct fscache_cookie *cookie) { - struct fscache_operation *op; - struct fscache_object *object; - bool wake_cookie = false; - int ret; - - _enter("%p,", cookie); + if (!cookie->volume->cache->ops->invalidate_cookie(cookie)) + fscache_caching_failed(cookie); + fscache_end_cookie_access(cookie, fscache_access_invalidate_cookie_end); +} - ASSERTCMP(cookie->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE); +/* + * Invalidate an object. 
+ */ +void __fscache_invalidate(struct fscache_cookie *cookie, + const void *aux_data, loff_t new_size, + unsigned int flags) +{ + bool is_caching; - if (fscache_wait_for_deferred_lookup(cookie) < 0) - return -ERESTARTSYS; + _enter("c=%x", cookie->debug_id); - if (hlist_empty(&cookie->backing_objects)) - return 0; + fscache_stat(&fscache_n_invalidates); - op = kzalloc(sizeof(*op), GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY); - if (!op) - return -ENOMEM; + if (WARN(test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags), + "Trying to invalidate relinquished cookie\n")) + return; - fscache_operation_init(cookie, op, NULL, NULL, NULL); - op->flags = FSCACHE_OP_MYTHREAD | - (1 << FSCACHE_OP_WAITING) | - (1 << FSCACHE_OP_UNUSE_COOKIE); - trace_fscache_page_op(cookie, NULL, op, fscache_page_op_check_consistency); + if ((flags & FSCACHE_INVAL_DIO_WRITE) && + test_and_set_bit(FSCACHE_COOKIE_DISABLED, &cookie->flags)) + return; spin_lock(&cookie->lock); + set_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags); + fscache_update_aux(cookie, aux_data, &new_size); + cookie->inval_counter++; + trace_fscache_invalidate(cookie, new_size); - fscache_update_aux(cookie, aux_data); - - if (!fscache_cookie_enabled(cookie) || - hlist_empty(&cookie->backing_objects)) - goto inconsistent; - object = hlist_entry(cookie->backing_objects.first, - struct fscache_object, cookie_link); - if (test_bit(FSCACHE_IOERROR, &object->cache->flags)) - goto inconsistent; - - op->debug_id = atomic_inc_return(&fscache_op_debug_id); + switch (cookie->state) { + case FSCACHE_COOKIE_STATE_INVALIDATING: /* is_still_valid will catch it */ + default: + spin_unlock(&cookie->lock); + _leave(" [no %u]", cookie->state); + return; - __fscache_use_cookie(cookie); - if (fscache_submit_op(object, op) < 0) - goto submit_failed; + case FSCACHE_COOKIE_STATE_LOOKING_UP: + case FSCACHE_COOKIE_STATE_CREATING: + spin_unlock(&cookie->lock); + _leave(" [look %x]", cookie->inval_counter); + return; - /* the work queue now carries its own ref on the object */ - spin_unlock(&cookie->lock); + case FSCACHE_COOKIE_STATE_ACTIVE: + is_caching = fscache_begin_cookie_access( + cookie, fscache_access_invalidate_cookie); + if (is_caching) + __fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_INVALIDATING); + spin_unlock(&cookie->lock); + wake_up_cookie_state(cookie); - ret = fscache_wait_for_operation_activation(object, op, NULL, NULL); - if (ret == 0) { - /* ask the cache to honour the operation */ - ret = object->cache->ops->check_consistency(op); - fscache_op_complete(op, false); - } else if (ret == -ENOBUFS) { - ret = 0; + if (is_caching) + fscache_queue_cookie(cookie, fscache_cookie_get_inval_work); + _leave(" [inv]"); + return; } - - fscache_put_operation(op); - _leave(" = %d", ret); - return ret; - -submit_failed: - wake_cookie = __fscache_unuse_cookie(cookie); -inconsistent: - spin_unlock(&cookie->lock); - if (wake_cookie) - __fscache_wake_unused_cookie(cookie); - kfree(op); - _leave(" = -ESTALE"); - return -ESTALE; } -EXPORT_SYMBOL(__fscache_check_consistency); +EXPORT_SYMBOL(__fscache_invalidate); /* * Generate a list of extant cookies in /proc/fs/fscache/cookies @@ -983,44 +1076,27 @@ static int fscache_cookies_seq_show(struct seq_file *m, void *v) { struct fscache_cookie *cookie; unsigned int keylen = 0, auxlen = 0; - char _type[3], *type; u8 *p; if (v == &fscache_cookies) { seq_puts(m, - "COOKIE PARENT USAGE CHILD ACT TY FL DEF NETFS_DATA\n" - "======== ======== ===== ===== === == === ================ ==========\n" + "COOKIE VOLUME REF ACT ACC S FL 
DEF \n" + "======== ======== === === === = == ================\n" ); return 0; } cookie = list_entry(v, struct fscache_cookie, proc_link); - switch (cookie->type) { - case 0: - type = "IX"; - break; - case 1: - type = "DT"; - break; - default: - snprintf(_type, sizeof(_type), "%02u", - cookie->type); - type = _type; - break; - } - seq_printf(m, - "%08x %08x %5u %5u %3u %s %03lx %-16s %px", + "%08x %08x %3d %3d %3d %c %02lx", cookie->debug_id, - cookie->parent ? cookie->parent->debug_id : 0, + cookie->volume->debug_id, refcount_read(&cookie->ref), - atomic_read(&cookie->n_children), atomic_read(&cookie->n_active), - type, - cookie->flags, - cookie->def->name, - cookie->netfs_data); + atomic_read(&cookie->n_accesses), + fscache_cookie_states[cookie->state], + cookie->flags); keylen = cookie->key_len; auxlen = cookie->aux_len; diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c deleted file mode 100644 index 0402673c680e..000000000000 --- a/fs/fscache/fsdef.c +++ /dev/null @@ -1,98 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* Filesystem index definition - * - * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#define FSCACHE_DEBUG_LEVEL CACHE -#include <linux/module.h> -#include "internal.h" - -static -enum fscache_checkaux fscache_fsdef_netfs_check_aux(void *cookie_netfs_data, - const void *data, - uint16_t datalen, - loff_t object_size); - -/* - * The root index is owned by FS-Cache itself. - * - * When a netfs requests caching facilities, FS-Cache will, if one doesn't - * already exist, create an entry in the root index with the key being the name - * of the netfs ("AFS" for example), and the auxiliary data holding the index - * structure version supplied by the netfs: - * - * FSDEF - * | - * +-----------+ - * | | - * NFS AFS - * [v=1] [v=1] - * - * If an entry with the appropriate name does already exist, the version is - * compared. If the version is different, the entire subtree from that entry - * will be discarded and a new entry created. - * - * The new entry will be an index, and a cookie referring to it will be passed - * to the netfs. This is then the root handle by which the netfs accesses the - * cache. It can create whatever objects it likes in that index, including - * further indices. - */ -static struct fscache_cookie_def fscache_fsdef_index_def = { - .name = ".FS-Cache", - .type = FSCACHE_COOKIE_TYPE_INDEX, -}; - -struct fscache_cookie fscache_fsdef_index = { - .debug_id = 1, - .ref = REFCOUNT_INIT(1), - .n_active = ATOMIC_INIT(1), - .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock), - .backing_objects = HLIST_HEAD_INIT, - .def = &fscache_fsdef_index_def, - .flags = 1 << FSCACHE_COOKIE_ENABLED, - .type = FSCACHE_COOKIE_TYPE_INDEX, -}; -EXPORT_SYMBOL(fscache_fsdef_index); - -/* - * Definition of an entry in the root index. Each entry is an index, keyed to - * a specific netfs and only applicable to a particular version of the index - * structure used by that netfs. 
- */ -struct fscache_cookie_def fscache_fsdef_netfs_def = { - .name = "FSDEF.netfs", - .type = FSCACHE_COOKIE_TYPE_INDEX, - .check_aux = fscache_fsdef_netfs_check_aux, -}; - -/* - * check that the index structure version number stored in the auxiliary data - * matches the one the netfs gave us - */ -static enum fscache_checkaux fscache_fsdef_netfs_check_aux( - void *cookie_netfs_data, - const void *data, - uint16_t datalen, - loff_t object_size) -{ - struct fscache_netfs *netfs = cookie_netfs_data; - uint32_t version; - - _enter("{%s},,%hu", netfs->name, datalen); - - if (datalen != sizeof(version)) { - _leave(" = OBSOLETE [dl=%d v=%zu]", datalen, sizeof(version)); - return FSCACHE_CHECKAUX_OBSOLETE; - } - - memcpy(&version, data, sizeof(version)); - if (version != netfs->version) { - _leave(" = OBSOLETE [ver=%x net=%x]", version, netfs->version); - return FSCACHE_CHECKAUX_OBSOLETE; - } - - _leave(" = OKAY"); - return FSCACHE_CHECKAUX_OKAY; -} diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index c3e4804b8fcb..f121c21590dc 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -1,65 +1,69 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Internal definitions for FS-Cache * - * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ -/* - * Lock order, in the order in which multiple locks should be obtained: - * - fscache_addremove_sem - * - cookie->lock - * - cookie->parent->lock - * - cache->object_list_lock - * - object->lock - * - object->parent->lock - * - cookie->stores_lock - * - fscache_thread_lock - * - */ - #ifdef pr_fmt #undef pr_fmt #endif #define pr_fmt(fmt) "FS-Cache: " fmt +#include <linux/slab.h> #include <linux/fscache-cache.h> #include <trace/events/fscache.h> #include <linux/sched.h> #include <linux/seq_file.h> -#define FSCACHE_MIN_THREADS 4 -#define FSCACHE_MAX_THREADS 32 - /* * cache.c */ -extern struct list_head fscache_cache_list; -extern struct rw_semaphore fscache_addremove_sem; +#ifdef CONFIG_PROC_FS +extern const struct seq_operations fscache_caches_seq_ops; +#endif +bool fscache_begin_cache_access(struct fscache_cache *cache, enum fscache_access_trace why); +void fscache_end_cache_access(struct fscache_cache *cache, enum fscache_access_trace why); +struct fscache_cache *fscache_lookup_cache(const char *name, bool is_cache); +void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where); + +static inline enum fscache_cache_state fscache_cache_state(const struct fscache_cache *cache) +{ + return smp_load_acquire(&cache->state); +} + +static inline bool fscache_cache_is_live(const struct fscache_cache *cache) +{ + return fscache_cache_state(cache) == FSCACHE_CACHE_IS_ACTIVE; +} -extern struct fscache_cache *fscache_select_cache_for_object( - struct fscache_cookie *); +static inline void fscache_set_cache_state(struct fscache_cache *cache, + enum fscache_cache_state new_state) +{ + smp_store_release(&cache->state, new_state); + +} + +static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache, + enum fscache_cache_state old_state, + enum fscache_cache_state new_state) +{ + return try_cmpxchg_release(&cache->state, &old_state, new_state); +} /* * cookie.c */ extern struct kmem_cache *fscache_cookie_jar; extern const struct seq_operations fscache_cookies_seq_ops; +extern struct timer_list fscache_cookie_lru_timer; + +extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix); 
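[Aside, not part of the patch] The internal.h helpers above publish the cache state with release semantics, read it back with acquire semantics, and use try_cmpxchg_release() so a transition such as ACTIVE -> GOT_IOERROR happens exactly once, which is why fscache_io_error() only logs on the first error. The same shape in C11 atomics (state names shortened; a sketch, not the kernel macros):

/* Acquire/release state word with a conditional (cmpxchg) transition; sketch. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum cache_state { CACHE_NOT_READY, CACHE_ACTIVE, CACHE_IOERROR, CACHE_WITHDRAWN };

static _Atomic enum cache_state state = CACHE_ACTIVE;

static enum cache_state get_state(void)
{
        return atomic_load_explicit(&state, memory_order_acquire);
}

static void set_state(enum cache_state new_state)
{
        atomic_store_explicit(&state, new_state, memory_order_release);
}

static bool set_state_maybe(enum cache_state old_state, enum cache_state new_state)
{
        /* Only transition if the state is still 'old_state'; returns whether
         * this caller performed the transition. */
        return atomic_compare_exchange_strong_explicit(&state, &old_state, new_state,
                                                       memory_order_release,
                                                       memory_order_relaxed);
}

int main(void)
{
        /* The first I/O error wins the transition and would log; later ones don't. */
        if (set_state_maybe(CACHE_ACTIVE, CACHE_IOERROR))
                puts("cache stopped due to I/O error");
        if (!set_state_maybe(CACHE_ACTIVE, CACHE_IOERROR))
                puts("already stopped; nothing to report");

        printf("state now %d, live=%d\n", get_state(), get_state() == CACHE_ACTIVE);
        set_state(CACHE_WITHDRAWN);
        return 0;
}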
+extern bool fscache_begin_cookie_access(struct fscache_cookie *cookie, + enum fscache_access_trace why); -extern void fscache_free_cookie(struct fscache_cookie *); -extern struct fscache_cookie *fscache_alloc_cookie(struct fscache_cookie *, - const struct fscache_cookie_def *, - const void *, size_t, - const void *, size_t, - void *, loff_t); -extern struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *); -extern struct fscache_cookie *fscache_cookie_get(struct fscache_cookie *, - enum fscache_cookie_trace); -extern void fscache_cookie_put(struct fscache_cookie *, - enum fscache_cookie_trace); - -static inline void fscache_cookie_see(struct fscache_cookie *cookie, +static inline void fscache_see_cookie(struct fscache_cookie *cookie, enum fscache_cookie_trace where) { trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref), @@ -67,60 +71,22 @@ static inline void fscache_cookie_see(struct fscache_cookie *cookie, } /* - * fsdef.c + * io.c */ -extern struct fscache_cookie fscache_fsdef_index; -extern struct fscache_cookie_def fscache_fsdef_netfs_def; - -/* - * main.c - */ -extern unsigned fscache_defer_lookup; -extern unsigned fscache_defer_create; -extern unsigned fscache_debug; -extern struct kobject *fscache_root; -extern struct workqueue_struct *fscache_object_wq; -extern struct workqueue_struct *fscache_op_wq; -DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); - -extern unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n); - -static inline bool fscache_object_congested(void) +static inline void fscache_end_operation(struct netfs_cache_resources *cres) { - return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq); + const struct netfs_cache_ops *ops = fscache_operation_valid(cres); + + if (ops) + ops->end_operation(cres); } /* - * object.c + * main.c */ -extern void fscache_enqueue_object(struct fscache_object *); +extern unsigned fscache_debug; -/* - * operation.c - */ -extern int fscache_submit_exclusive_op(struct fscache_object *, - struct fscache_operation *); -extern int fscache_submit_op(struct fscache_object *, - struct fscache_operation *); -extern int fscache_cancel_op(struct fscache_operation *, bool); -extern void fscache_cancel_all_ops(struct fscache_object *); -extern void fscache_abort_object(struct fscache_object *); -extern void fscache_start_operations(struct fscache_object *); -extern void fscache_operation_gc(struct work_struct *); - -/* - * page.c - */ -extern int fscache_wait_for_deferred_lookup(struct fscache_cookie *); -extern int fscache_wait_for_operation_activation(struct fscache_object *, - struct fscache_operation *, - atomic_t *, - atomic_t *); -extern void fscache_invalidate_writes(struct fscache_cookie *); -struct fscache_retrieval *fscache_alloc_retrieval(struct fscache_cookie *cookie, - struct address_space *mapping, - fscache_rw_complete_t end_io_func, - void *context); +extern unsigned int fscache_hash(unsigned int salt, const void *data, size_t len); /* * proc.c @@ -137,125 +103,27 @@ extern void fscache_proc_cleanup(void); * stats.c */ #ifdef CONFIG_FSCACHE_STATS -extern atomic_t fscache_n_ops_processed[FSCACHE_MAX_THREADS]; -extern atomic_t fscache_n_objs_processed[FSCACHE_MAX_THREADS]; - -extern atomic_t fscache_n_op_pend; -extern atomic_t fscache_n_op_run; -extern atomic_t fscache_n_op_enqueue; -extern atomic_t fscache_n_op_deferred_release; -extern atomic_t fscache_n_op_initialised; -extern atomic_t fscache_n_op_release; -extern atomic_t fscache_n_op_gc; -extern atomic_t 
fscache_n_op_cancelled; -extern atomic_t fscache_n_op_rejected; - -extern atomic_t fscache_n_attr_changed; -extern atomic_t fscache_n_attr_changed_ok; -extern atomic_t fscache_n_attr_changed_nobufs; -extern atomic_t fscache_n_attr_changed_nomem; -extern atomic_t fscache_n_attr_changed_calls; - -extern atomic_t fscache_n_allocs; -extern atomic_t fscache_n_allocs_ok; -extern atomic_t fscache_n_allocs_wait; -extern atomic_t fscache_n_allocs_nobufs; -extern atomic_t fscache_n_allocs_intr; -extern atomic_t fscache_n_allocs_object_dead; -extern atomic_t fscache_n_alloc_ops; -extern atomic_t fscache_n_alloc_op_waits; - -extern atomic_t fscache_n_retrievals; -extern atomic_t fscache_n_retrievals_ok; -extern atomic_t fscache_n_retrievals_wait; -extern atomic_t fscache_n_retrievals_nodata; -extern atomic_t fscache_n_retrievals_nobufs; -extern atomic_t fscache_n_retrievals_intr; -extern atomic_t fscache_n_retrievals_nomem; -extern atomic_t fscache_n_retrievals_object_dead; -extern atomic_t fscache_n_retrieval_ops; -extern atomic_t fscache_n_retrieval_op_waits; - -extern atomic_t fscache_n_stores; -extern atomic_t fscache_n_stores_ok; -extern atomic_t fscache_n_stores_again; -extern atomic_t fscache_n_stores_nobufs; -extern atomic_t fscache_n_stores_oom; -extern atomic_t fscache_n_store_ops; -extern atomic_t fscache_n_store_calls; -extern atomic_t fscache_n_store_pages; -extern atomic_t fscache_n_store_radix_deletes; -extern atomic_t fscache_n_store_pages_over_limit; - -extern atomic_t fscache_n_store_vmscan_not_storing; -extern atomic_t fscache_n_store_vmscan_gone; -extern atomic_t fscache_n_store_vmscan_busy; -extern atomic_t fscache_n_store_vmscan_cancelled; -extern atomic_t fscache_n_store_vmscan_wait; - -extern atomic_t fscache_n_marks; -extern atomic_t fscache_n_uncaches; +extern atomic_t fscache_n_volumes; +extern atomic_t fscache_n_volumes_collision; +extern atomic_t fscache_n_volumes_nomem; +extern atomic_t fscache_n_cookies; +extern atomic_t fscache_n_cookies_lru; +extern atomic_t fscache_n_cookies_lru_expired; +extern atomic_t fscache_n_cookies_lru_removed; +extern atomic_t fscache_n_cookies_lru_dropped; extern atomic_t fscache_n_acquires; -extern atomic_t fscache_n_acquires_null; -extern atomic_t fscache_n_acquires_no_cache; extern atomic_t fscache_n_acquires_ok; -extern atomic_t fscache_n_acquires_nobufs; extern atomic_t fscache_n_acquires_oom; extern atomic_t fscache_n_invalidates; -extern atomic_t fscache_n_invalidates_run; - -extern atomic_t fscache_n_updates; -extern atomic_t fscache_n_updates_null; -extern atomic_t fscache_n_updates_run; extern atomic_t fscache_n_relinquishes; -extern atomic_t fscache_n_relinquishes_null; -extern atomic_t fscache_n_relinquishes_waitcrt; extern atomic_t fscache_n_relinquishes_retire; +extern atomic_t fscache_n_relinquishes_dropped; -extern atomic_t fscache_n_cookie_index; -extern atomic_t fscache_n_cookie_data; -extern atomic_t fscache_n_cookie_special; - -extern atomic_t fscache_n_object_alloc; -extern atomic_t fscache_n_object_no_alloc; -extern atomic_t fscache_n_object_lookups; -extern atomic_t fscache_n_object_lookups_negative; -extern atomic_t fscache_n_object_lookups_positive; -extern atomic_t fscache_n_object_lookups_timed_out; -extern atomic_t fscache_n_object_created; -extern atomic_t fscache_n_object_avail; -extern atomic_t fscache_n_object_dead; - -extern atomic_t fscache_n_checkaux_none; -extern atomic_t fscache_n_checkaux_okay; -extern atomic_t fscache_n_checkaux_update; -extern atomic_t fscache_n_checkaux_obsolete; - -extern atomic_t 
fscache_n_cop_alloc_object; -extern atomic_t fscache_n_cop_lookup_object; -extern atomic_t fscache_n_cop_lookup_complete; -extern atomic_t fscache_n_cop_grab_object; -extern atomic_t fscache_n_cop_invalidate_object; -extern atomic_t fscache_n_cop_update_object; -extern atomic_t fscache_n_cop_drop_object; -extern atomic_t fscache_n_cop_put_object; -extern atomic_t fscache_n_cop_sync_cache; -extern atomic_t fscache_n_cop_attr_changed; -extern atomic_t fscache_n_cop_read_or_alloc_page; -extern atomic_t fscache_n_cop_read_or_alloc_pages; -extern atomic_t fscache_n_cop_allocate_page; -extern atomic_t fscache_n_cop_allocate_pages; -extern atomic_t fscache_n_cop_write_page; -extern atomic_t fscache_n_cop_uncache_page; -extern atomic_t fscache_n_cop_dissociate_pages; - -extern atomic_t fscache_n_cache_no_space_reject; -extern atomic_t fscache_n_cache_stale_objects; -extern atomic_t fscache_n_cache_retired_objects; -extern atomic_t fscache_n_cache_culled_objects; +extern atomic_t fscache_n_resizes; +extern atomic_t fscache_n_resizes_null; static inline void fscache_stat(atomic_t *stat) { @@ -278,71 +146,26 @@ int fscache_stats_show(struct seq_file *m, void *v); #endif /* - * raise an event on an object - * - if the event is not masked for that object, then the object is - * queued for attention by the thread pool. - */ -static inline void fscache_raise_event(struct fscache_object *object, - unsigned event) -{ - BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS); -#if 0 - printk("*** fscache_raise_event(OBJ%d{%lx},%x)\n", - object->debug_id, object->event_mask, (1 << event)); -#endif - if (!test_and_set_bit(event, &object->events) && - test_bit(event, &object->event_mask)) - fscache_enqueue_object(object); -} - -/* - * get an extra reference to a netfs retrieval context + * volume.c */ -static inline -void *fscache_get_context(struct fscache_cookie *cookie, void *context) -{ - if (cookie->def->get_context) - cookie->def->get_context(cookie->netfs_data, context); - return context; -} +extern const struct seq_operations fscache_volumes_seq_ops; -/* - * release a reference to a netfs retrieval context - */ -static inline -void fscache_put_context(struct fscache_cookie *cookie, void *context) -{ - if (cookie->def->put_context) - cookie->def->put_context(cookie->netfs_data, context); -} +struct fscache_volume *fscache_get_volume(struct fscache_volume *volume, + enum fscache_volume_trace where); +void fscache_put_volume(struct fscache_volume *volume, + enum fscache_volume_trace where); +bool fscache_begin_volume_access(struct fscache_volume *volume, + struct fscache_cookie *cookie, + enum fscache_access_trace why); +void fscache_create_volume(struct fscache_volume *volume, bool wait); -/* - * Update the auxiliary data on a cookie. - */ -static inline -void fscache_update_aux(struct fscache_cookie *cookie, const void *aux_data) -{ - void *p; - - if (!aux_data) - return; - if (cookie->aux_len <= sizeof(cookie->inline_aux)) - p = cookie->inline_aux; - else - p = cookie->aux; - - if (memcmp(p, aux_data, cookie->aux_len) != 0) { - memcpy(p, aux_data, cookie->aux_len); - set_bit(FSCACHE_COOKIE_AUX_UPDATED, &cookie->flags); - } -} /*****************************************************************************/ /* * debug tracing */ #define dbgprintk(FMT, ...) \ - printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) + printk("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) #define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) #define kleave(FMT, ...) 
dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) @@ -395,7 +218,7 @@ do { \ #define FSCACHE_DEBUG_CACHE 0 #define FSCACHE_DEBUG_COOKIE 1 -#define FSCACHE_DEBUG_PAGE 2 +#define FSCACHE_DEBUG_OBJECT 2 #define FSCACHE_DEBUG_OPERATION 3 #define FSCACHE_POINT_ENTER 1 diff --git a/fs/fscache/io.c b/fs/fscache/io.c index 8ecc1141802f..7a769ea57720 100644 --- a/fs/fscache/io.c +++ b/fs/fscache/io.c @@ -4,113 +4,323 @@ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ - -#define FSCACHE_DEBUG_LEVEL PAGE -#include <linux/module.h> -#define FSCACHE_USE_NEW_IO_API +#define FSCACHE_DEBUG_LEVEL OPERATION #include <linux/fscache-cache.h> +#include <linux/uio.h> +#include <linux/bvec.h> #include <linux/slab.h> -#include <linux/netfs.h> +#include <linux/uio.h> #include "internal.h" -/* - * Start a cache read operation. - * - we return: - * -ENOMEM - out of memory, some pages may be being read - * -ERESTARTSYS - interrupted, some pages may be being read - * -ENOBUFS - no backing object or space available in which to cache any - * pages not being read - * -ENODATA - no data available in the backing object for some or all of - * the pages - * 0 - dispatched a read on all pages +/** + * fscache_wait_for_operation - Wait for an object to become accessible + * @cres: The cache resources for the operation being performed + * @want_state: The minimum state the object must be at + * + * See if the target cache object is at the specified minimum state of + * accessibility yet, and if not, wait for it. */ -int __fscache_begin_read_operation(struct netfs_read_request *rreq, - struct fscache_cookie *cookie) +bool fscache_wait_for_operation(struct netfs_cache_resources *cres, + enum fscache_want_state want_state) { - struct fscache_retrieval *op; - struct fscache_object *object; - bool wake_cookie = false; - int ret; + struct fscache_cookie *cookie = fscache_cres_cookie(cres); + enum fscache_cookie_state state; - _enter("rr=%08x", rreq->debug_id); +again: + if (!fscache_cache_is_live(cookie->volume->cache)) { + _leave(" [broken]"); + return false; + } - fscache_stat(&fscache_n_retrievals); + state = fscache_cookie_state(cookie); + _enter("c=%08x{%u},%x", cookie->debug_id, state, want_state); - if (hlist_empty(&cookie->backing_objects)) - goto nobufs; + switch (state) { + case FSCACHE_COOKIE_STATE_CREATING: + case FSCACHE_COOKIE_STATE_INVALIDATING: + if (want_state == FSCACHE_WANT_PARAMS) + goto ready; /* There can be no content */ + fallthrough; + case FSCACHE_COOKIE_STATE_LOOKING_UP: + case FSCACHE_COOKIE_STATE_LRU_DISCARDING: + wait_var_event(&cookie->state, + fscache_cookie_state(cookie) != state); + goto again; - if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { - _leave(" = -ENOBUFS [invalidating]"); - return -ENOBUFS; + case FSCACHE_COOKIE_STATE_ACTIVE: + goto ready; + case FSCACHE_COOKIE_STATE_DROPPED: + case FSCACHE_COOKIE_STATE_RELINQUISHING: + default: + _leave(" [not live]"); + return false; } - ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); +ready: + if (!cres->cache_priv2) + return cookie->volume->cache->ops->begin_operation(cres, want_state); + return true; +} +EXPORT_SYMBOL(fscache_wait_for_operation); + +/* + * Begin an I/O operation on the cache, waiting till we reach the right state. + * + * Attaches the resources required to the operation resources record.
+ */ +static int fscache_begin_operation(struct netfs_cache_resources *cres, + struct fscache_cookie *cookie, + enum fscache_want_state want_state, + enum fscache_access_trace why) +{ + enum fscache_cookie_state state; + long timeo; + bool once_only = false; - if (fscache_wait_for_deferred_lookup(cookie) < 0) - return -ERESTARTSYS; + cres->ops = NULL; + cres->cache_priv = cookie; + cres->cache_priv2 = NULL; + cres->debug_id = cookie->debug_id; + cres->inval_counter = cookie->inval_counter; - op = fscache_alloc_retrieval(cookie, NULL, NULL, NULL); - if (!op) - return -ENOMEM; - trace_fscache_page_op(cookie, NULL, &op->op, fscache_page_op_retr_multi); + if (!fscache_begin_cookie_access(cookie, why)) + return -ENOBUFS; +again: spin_lock(&cookie->lock); - if (!fscache_cookie_enabled(cookie) || - hlist_empty(&cookie->backing_objects)) - goto nobufs_unlock; - object = hlist_entry(cookie->backing_objects.first, - struct fscache_object, cookie_link); + state = fscache_cookie_state(cookie); + _enter("c=%08x{%u},%x", cookie->debug_id, state, want_state); - __fscache_use_cookie(cookie); - atomic_inc(&object->n_reads); - __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); + switch (state) { + case FSCACHE_COOKIE_STATE_LOOKING_UP: + case FSCACHE_COOKIE_STATE_LRU_DISCARDING: + case FSCACHE_COOKIE_STATE_INVALIDATING: + goto wait_for_file_wrangling; + case FSCACHE_COOKIE_STATE_CREATING: + if (want_state == FSCACHE_WANT_PARAMS) + goto ready; /* There can be no content */ + goto wait_for_file_wrangling; + case FSCACHE_COOKIE_STATE_ACTIVE: + goto ready; + case FSCACHE_COOKIE_STATE_DROPPED: + case FSCACHE_COOKIE_STATE_RELINQUISHING: + WARN(1, "Can't use cookie in state %u\n", cookie->state); + goto not_live; + default: + goto not_live; + } - if (fscache_submit_op(object, &op->op) < 0) - goto nobufs_unlock_dec; +ready: spin_unlock(&cookie->lock); + if (!cookie->volume->cache->ops->begin_operation(cres, want_state)) + goto failed; + return 0; - fscache_stat(&fscache_n_retrieval_ops); +wait_for_file_wrangling: + spin_unlock(&cookie->lock); + trace_fscache_access(cookie->debug_id, refcount_read(&cookie->ref), + atomic_read(&cookie->n_accesses), + fscache_access_io_wait); + timeo = wait_var_event_timeout(&cookie->state, + fscache_cookie_state(cookie) != state, 20 * HZ); + if (timeo <= 1 && !once_only) { + pr_warn("%s: cookie state change wait timed out: cookie->state=%u state=%u", + __func__, fscache_cookie_state(cookie), state); + fscache_print_cookie(cookie, 'O'); + once_only = true; + } + goto again; - /* we wait for the operation to become active, and then process it - * *here*, in this thread, and not in the thread pool */ - ret = fscache_wait_for_operation_activation( - object, &op->op, - __fscache_stat(&fscache_n_retrieval_op_waits), - __fscache_stat(&fscache_n_retrievals_object_dead)); - if (ret < 0) - goto error; - - /* ask the cache to honour the operation */ - ret = object->cache->ops->begin_read_operation(rreq, op); - -error: - if (ret == -ENOMEM) - fscache_stat(&fscache_n_retrievals_nomem); - else if (ret == -ERESTARTSYS) - fscache_stat(&fscache_n_retrievals_intr); - else if (ret == -ENODATA) - fscache_stat(&fscache_n_retrievals_nodata); - else if (ret < 0) - fscache_stat(&fscache_n_retrievals_nobufs); - else - fscache_stat(&fscache_n_retrievals_ok); - - fscache_put_retrieval(op); - _leave(" = %d", ret); - return ret; - -nobufs_unlock_dec: - atomic_dec(&object->n_reads); - wake_cookie = __fscache_unuse_cookie(cookie); -nobufs_unlock: +not_live: spin_unlock(&cookie->lock); - fscache_put_retrieval(op); - 
if (wake_cookie) - __fscache_wake_unused_cookie(cookie); -nobufs: - fscache_stat(&fscache_n_retrievals_nobufs); +failed: + cres->cache_priv = NULL; + cres->ops = NULL; + fscache_end_cookie_access(cookie, fscache_access_io_not_live); _leave(" = -ENOBUFS"); return -ENOBUFS; } + +int __fscache_begin_read_operation(struct netfs_cache_resources *cres, + struct fscache_cookie *cookie) +{ + return fscache_begin_operation(cres, cookie, FSCACHE_WANT_PARAMS, + fscache_access_io_read); +} EXPORT_SYMBOL(__fscache_begin_read_operation); + +int __fscache_begin_write_operation(struct netfs_cache_resources *cres, + struct fscache_cookie *cookie) +{ + return fscache_begin_operation(cres, cookie, FSCACHE_WANT_PARAMS, + fscache_access_io_write); +} +EXPORT_SYMBOL(__fscache_begin_write_operation); + +/** + * fscache_set_page_dirty - Mark page dirty and pin a cache object for writeback + * @page: The page being dirtied + * @cookie: The cookie referring to the cache object + * + * Set the dirty flag on a page and pin an in-use cache object in memory when + * dirtying a page so that writeback can later write to it. This is intended + * to be called from the filesystem's ->set_page_dirty() method. + * + * Returns 1 if PG_dirty was set on the page, 0 otherwise. + */ +int fscache_set_page_dirty(struct page *page, struct fscache_cookie *cookie) +{ + struct inode *inode = page->mapping->host; + bool need_use = false; + + _enter(""); + + if (!__set_page_dirty_nobuffers(page)) + return 0; + if (!fscache_cookie_valid(cookie)) + return 1; + + if (!(inode->i_state & I_PINNING_FSCACHE_WB)) { + spin_lock(&inode->i_lock); + if (!(inode->i_state & I_PINNING_FSCACHE_WB)) { + inode->i_state |= I_PINNING_FSCACHE_WB; + need_use = true; + } + spin_unlock(&inode->i_lock); + + if (need_use) + fscache_use_cookie(cookie, true); + } + return 1; +} +EXPORT_SYMBOL(fscache_set_page_dirty); + +struct fscache_write_request { + struct netfs_cache_resources cache_resources; + struct address_space *mapping; + loff_t start; + size_t len; + bool set_bits; + netfs_io_terminated_t term_func; + void *term_func_priv; +}; + +void __fscache_clear_page_bits(struct address_space *mapping, + loff_t start, size_t len) +{ + pgoff_t first = start / PAGE_SIZE; + pgoff_t last = (start + len - 1) / PAGE_SIZE; + struct page *page; + + if (len) { + XA_STATE(xas, &mapping->i_pages, first); + + rcu_read_lock(); + xas_for_each(&xas, page, last) { + end_page_fscache(page); + } + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(__fscache_clear_page_bits); + +/* + * Deal with the completion of writing the data to the cache. 
+ */ +static void fscache_wreq_done(void *priv, ssize_t transferred_or_error, + bool was_async) +{ + struct fscache_write_request *wreq = priv; + + fscache_clear_page_bits(fscache_cres_cookie(&wreq->cache_resources), + wreq->mapping, wreq->start, wreq->len, + wreq->set_bits); + + if (wreq->term_func) + wreq->term_func(wreq->term_func_priv, transferred_or_error, + was_async); + fscache_end_operation(&wreq->cache_resources); + kfree(wreq); +} + +void __fscache_write_to_cache(struct fscache_cookie *cookie, + struct address_space *mapping, + loff_t start, size_t len, loff_t i_size, + netfs_io_terminated_t term_func, + void *term_func_priv, + bool cond) +{ + struct fscache_write_request *wreq; + struct netfs_cache_resources *cres; + struct iov_iter iter; + int ret = -ENOBUFS; + + if (len == 0) + goto abandon; + + _enter("%llx,%zx", start, len); + + wreq = kzalloc(sizeof(struct fscache_write_request), GFP_NOFS); + if (!wreq) + goto abandon; + wreq->mapping = mapping; + wreq->start = start; + wreq->len = len; + wreq->set_bits = cond; + wreq->term_func = term_func; + wreq->term_func_priv = term_func_priv; + + cres = &wreq->cache_resources; + if (fscache_begin_operation(cres, cookie, FSCACHE_WANT_WRITE, + fscache_access_io_write) < 0) + goto abandon_free; + + ret = cres->ops->prepare_write(cres, &start, &len, i_size, false); + if (ret < 0) + goto abandon_end; + + /* TODO: Consider clearing page bits now for space the write isn't + * covering. This is more complicated than it appears when THPs are + * taken into account. + */ + + iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len); + fscache_write(cres, start, &iter, fscache_wreq_done, wreq); + return; + +abandon_end: + return fscache_wreq_done(wreq, ret, false); +abandon_free: + kfree(wreq); +abandon: + fscache_clear_page_bits(cookie, mapping, start, len, cond); + if (term_func) + term_func(term_func_priv, ret, false); +} +EXPORT_SYMBOL(__fscache_write_to_cache); + +/* + * Change the size of a backing object. + */ +void __fscache_resize_cookie(struct fscache_cookie *cookie, loff_t new_size) +{ + struct netfs_cache_resources cres; + + trace_fscache_resize(cookie, new_size); + if (fscache_begin_operation(&cres, cookie, FSCACHE_WANT_WRITE, + fscache_access_io_resize) == 0) { + fscache_stat(&fscache_n_resizes); + set_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &cookie->flags); + + /* We cannot defer a resize as we need to do it inside the + * netfs's inode lock so that we're serialised with respect to + * writes. + */ + cookie->volume->cache->ops->resize_cookie(&cres, new_size); + fscache_end_operation(&cres); + } else { + fscache_stat(&fscache_n_resizes_null); + } +} +EXPORT_SYMBOL(__fscache_resize_cookie); diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 4207f98e405f..dad85fd84f6f 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -1,17 +1,13 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* General filesystem local caching manager * - * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. 
* Written by David Howells (dhowells@redhat.com) */ #define FSCACHE_DEBUG_LEVEL CACHE #include <linux/module.h> #include <linux/init.h> -#include <linux/sched.h> -#include <linux/completion.h> -#include <linux/slab.h> -#include <linux/seq_file.h> #define CREATE_TRACE_POINTS #include "internal.h" @@ -19,79 +15,18 @@ MODULE_DESCRIPTION("FS Cache Manager"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); -unsigned fscache_defer_lookup = 1; -module_param_named(defer_lookup, fscache_defer_lookup, uint, - S_IWUSR | S_IRUGO); -MODULE_PARM_DESC(fscache_defer_lookup, - "Defer cookie lookup to background thread"); - -unsigned fscache_defer_create = 1; -module_param_named(defer_create, fscache_defer_create, uint, - S_IWUSR | S_IRUGO); -MODULE_PARM_DESC(fscache_defer_create, - "Defer cookie creation to background thread"); - unsigned fscache_debug; module_param_named(debug, fscache_debug, uint, S_IWUSR | S_IRUGO); MODULE_PARM_DESC(fscache_debug, "FS-Cache debugging mask"); -struct kobject *fscache_root; -struct workqueue_struct *fscache_object_wq; -struct workqueue_struct *fscache_op_wq; - -DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); +EXPORT_TRACEPOINT_SYMBOL(fscache_access_cache); +EXPORT_TRACEPOINT_SYMBOL(fscache_access_volume); +EXPORT_TRACEPOINT_SYMBOL(fscache_access); -/* these values serve as lower bounds, will be adjusted in fscache_init() */ -static unsigned fscache_object_max_active = 4; -static unsigned fscache_op_max_active = 2; - -#ifdef CONFIG_SYSCTL -static struct ctl_table_header *fscache_sysctl_header; - -static int fscache_max_active_sysctl(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - struct workqueue_struct **wqp = table->extra1; - unsigned int *datap = table->data; - int ret; - - ret = proc_dointvec(table, write, buffer, lenp, ppos); - if (ret == 0) - workqueue_set_max_active(*wqp, *datap); - return ret; -} - -static struct ctl_table fscache_sysctls[] = { - { - .procname = "object_max_active", - .data = &fscache_object_max_active, - .maxlen = sizeof(unsigned), - .mode = 0644, - .proc_handler = fscache_max_active_sysctl, - .extra1 = &fscache_object_wq, - }, - { - .procname = "operation_max_active", - .data = &fscache_op_max_active, - .maxlen = sizeof(unsigned), - .mode = 0644, - .proc_handler = fscache_max_active_sysctl, - .extra1 = &fscache_op_wq, - }, - {} -}; - -static struct ctl_table fscache_sysctls_root[] = { - { - .procname = "fscache", - .mode = 0555, - .child = fscache_sysctls, - }, - {} -}; -#endif +struct workqueue_struct *fscache_wq; +EXPORT_SYMBOL(fscache_wq); /* * Mixing scores (in bits) for (7,20): @@ -118,15 +53,16 @@ static inline unsigned int fold_hash(unsigned long x, unsigned long y) /* * Generate a hash. This is derived from full_name_hash(), but we want to be * sure it is arch independent and that it doesn't change as bits of the - * computed hash value might appear on disk. The caller also guarantees that - * the hashed data will be a series of aligned 32-bit words. + * computed hash value might appear on disk. The caller must guarantee that + * the source data is a multiple of four bytes in size. 
*/ -unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n) +unsigned int fscache_hash(unsigned int salt, const void *data, size_t len) { - unsigned int a, x = 0, y = salt; + const __le32 *p = data; + unsigned int a, x = 0, y = salt, n = len / sizeof(__le32); for (; n; n--) { - a = *data++; + a = le32_to_cpu(*p++); HASH_MIX(x, y, a); } return fold_hash(x, y); @@ -137,44 +73,16 @@ unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n) */ static int __init fscache_init(void) { - unsigned int nr_cpus = num_possible_cpus(); - unsigned int cpu; - int ret; - - fscache_object_max_active = - clamp_val(nr_cpus, - fscache_object_max_active, WQ_UNBOUND_MAX_ACTIVE); - - ret = -ENOMEM; - fscache_object_wq = alloc_workqueue("fscache_object", WQ_UNBOUND, - fscache_object_max_active); - if (!fscache_object_wq) - goto error_object_wq; - - fscache_op_max_active = - clamp_val(fscache_object_max_active / 2, - fscache_op_max_active, WQ_UNBOUND_MAX_ACTIVE); + int ret = -ENOMEM; - ret = -ENOMEM; - fscache_op_wq = alloc_workqueue("fscache_operation", WQ_UNBOUND, - fscache_op_max_active); - if (!fscache_op_wq) - goto error_op_wq; - - for_each_possible_cpu(cpu) - init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu)); + fscache_wq = alloc_workqueue("fscache", WQ_UNBOUND | WQ_FREEZABLE, 0); + if (!fscache_wq) + goto error_wq; ret = fscache_proc_init(); if (ret < 0) goto error_proc; -#ifdef CONFIG_SYSCTL - ret = -ENOMEM; - fscache_sysctl_header = register_sysctl_table(fscache_sysctls_root); - if (!fscache_sysctl_header) - goto error_sysctl; -#endif - fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar", sizeof(struct fscache_cookie), 0, 0, NULL); @@ -184,26 +92,14 @@ static int __init fscache_init(void) goto error_cookie_jar; } - fscache_root = kobject_create_and_add("fscache", kernel_kobj); - if (!fscache_root) - goto error_kobj; - pr_notice("Loaded\n"); return 0; -error_kobj: - kmem_cache_destroy(fscache_cookie_jar); error_cookie_jar: -#ifdef CONFIG_SYSCTL - unregister_sysctl_table(fscache_sysctl_header); -error_sysctl: -#endif fscache_proc_cleanup(); error_proc: - destroy_workqueue(fscache_op_wq); -error_op_wq: - destroy_workqueue(fscache_object_wq); -error_object_wq: + destroy_workqueue(fscache_wq); +error_wq: return ret; } @@ -216,14 +112,9 @@ static void __exit fscache_exit(void) { _enter(""); - kobject_put(fscache_root); kmem_cache_destroy(fscache_cookie_jar); -#ifdef CONFIG_SYSCTL - unregister_sysctl_table(fscache_sysctl_header); -#endif fscache_proc_cleanup(); - destroy_workqueue(fscache_op_wq); - destroy_workqueue(fscache_object_wq); + destroy_workqueue(fscache_wq); pr_notice("Unloaded\n"); } diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c deleted file mode 100644 index d6bdb7b5e723..000000000000 --- a/fs/fscache/netfs.c +++ /dev/null @@ -1,74 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* FS-Cache netfs (client) registration - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 
- * Written by David Howells (dhowells@redhat.com) - */ - -#define FSCACHE_DEBUG_LEVEL COOKIE -#include <linux/module.h> -#include <linux/slab.h> -#include "internal.h" - -/* - * register a network filesystem for caching - */ -int __fscache_register_netfs(struct fscache_netfs *netfs) -{ - struct fscache_cookie *candidate, *cookie; - - _enter("{%s}", netfs->name); - - /* allocate a cookie for the primary index */ - candidate = fscache_alloc_cookie(&fscache_fsdef_index, - &fscache_fsdef_netfs_def, - netfs->name, strlen(netfs->name), - &netfs->version, sizeof(netfs->version), - netfs, 0); - if (!candidate) { - _leave(" = -ENOMEM"); - return -ENOMEM; - } - - candidate->flags = 1 << FSCACHE_COOKIE_ENABLED; - - /* check the netfs type is not already present */ - cookie = fscache_hash_cookie(candidate); - if (!cookie) - goto already_registered; - if (cookie != candidate) { - trace_fscache_cookie(candidate->debug_id, 1, fscache_cookie_discard); - fscache_free_cookie(candidate); - } - - fscache_cookie_get(cookie->parent, fscache_cookie_get_register_netfs); - atomic_inc(&cookie->parent->n_children); - - netfs->primary_index = cookie; - - pr_notice("Netfs '%s' registered for caching\n", netfs->name); - trace_fscache_netfs(netfs); - _leave(" = 0"); - return 0; - -already_registered: - fscache_cookie_put(candidate, fscache_cookie_put_dup_netfs); - _leave(" = -EEXIST"); - return -EEXIST; -} -EXPORT_SYMBOL(__fscache_register_netfs); - -/* - * unregister a network filesystem from the cache - * - all cookies must have been released first - */ -void __fscache_unregister_netfs(struct fscache_netfs *netfs) -{ - _enter("{%s.%u}", netfs->name, netfs->version); - - fscache_relinquish_cookie(netfs->primary_index, NULL, false); - pr_notice("Netfs '%s' unregistered from caching\n", netfs->name); - - _leave(""); -} -EXPORT_SYMBOL(__fscache_unregister_netfs); diff --git a/fs/fscache/object.c b/fs/fscache/object.c deleted file mode 100644 index 6a675652129b..000000000000 --- a/fs/fscache/object.c +++ /dev/null @@ -1,1125 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* FS-Cache object state machine handler - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * See Documentation/filesystems/caching/object.rst for a description of the - * object state machine and the in-kernel representations. 
- */ - -#define FSCACHE_DEBUG_LEVEL COOKIE -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/prefetch.h> -#include "internal.h" - -static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *, int); -static const struct fscache_state *fscache_kill_dependents(struct fscache_object *, int); -static const struct fscache_state *fscache_drop_object(struct fscache_object *, int); -static const struct fscache_state *fscache_initialise_object(struct fscache_object *, int); -static const struct fscache_state *fscache_invalidate_object(struct fscache_object *, int); -static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *, int); -static const struct fscache_state *fscache_kill_object(struct fscache_object *, int); -static const struct fscache_state *fscache_lookup_failure(struct fscache_object *, int); -static const struct fscache_state *fscache_look_up_object(struct fscache_object *, int); -static const struct fscache_state *fscache_object_available(struct fscache_object *, int); -static const struct fscache_state *fscache_parent_ready(struct fscache_object *, int); -static const struct fscache_state *fscache_update_object(struct fscache_object *, int); -static const struct fscache_state *fscache_object_dead(struct fscache_object *, int); - -#define __STATE_NAME(n) fscache_osm_##n -#define STATE(n) (&__STATE_NAME(n)) - -/* - * Define a work state. Work states are execution states. No event processing - * is performed by them. The function attached to a work state returns a - * pointer indicating the next state to which the state machine should - * transition. Returning NO_TRANSIT repeats the current state, but goes back - * to the scheduler first. - */ -#define WORK_STATE(n, sn, f) \ - const struct fscache_state __STATE_NAME(n) = { \ - .name = #n, \ - .short_name = sn, \ - .work = f \ - } - -/* - * Returns from work states. - */ -#define transit_to(state) ({ prefetch(&STATE(state)->work); STATE(state); }) - -#define NO_TRANSIT ((struct fscache_state *)NULL) - -/* - * Define a wait state. Wait states are event processing states. No execution - * is performed by them. Wait states are just tables of "if event X occurs, - * clear it and transition to state Y". The dispatcher returns to the - * scheduler if none of the events in which the wait state has an interest are - * currently pending. - */ -#define WAIT_STATE(n, sn, ...) \ - const struct fscache_state __STATE_NAME(n) = { \ - .name = #n, \ - .short_name = sn, \ - .work = NULL, \ - .transitions = { __VA_ARGS__, { 0, NULL } } \ - } - -#define TRANSIT_TO(state, emask) \ - { .events = (emask), .transit_to = STATE(state) } - -/* - * The object state machine. 
- */ -static WORK_STATE(INIT_OBJECT, "INIT", fscache_initialise_object); -static WORK_STATE(PARENT_READY, "PRDY", fscache_parent_ready); -static WORK_STATE(ABORT_INIT, "ABRT", fscache_abort_initialisation); -static WORK_STATE(LOOK_UP_OBJECT, "LOOK", fscache_look_up_object); -static WORK_STATE(OBJECT_AVAILABLE, "AVBL", fscache_object_available); -static WORK_STATE(JUMPSTART_DEPS, "JUMP", fscache_jumpstart_dependents); - -static WORK_STATE(INVALIDATE_OBJECT, "INVL", fscache_invalidate_object); -static WORK_STATE(UPDATE_OBJECT, "UPDT", fscache_update_object); - -static WORK_STATE(LOOKUP_FAILURE, "LCFL", fscache_lookup_failure); -static WORK_STATE(KILL_OBJECT, "KILL", fscache_kill_object); -static WORK_STATE(KILL_DEPENDENTS, "KDEP", fscache_kill_dependents); -static WORK_STATE(DROP_OBJECT, "DROP", fscache_drop_object); -static WORK_STATE(OBJECT_DEAD, "DEAD", fscache_object_dead); - -static WAIT_STATE(WAIT_FOR_INIT, "?INI", - TRANSIT_TO(INIT_OBJECT, 1 << FSCACHE_OBJECT_EV_NEW_CHILD)); - -static WAIT_STATE(WAIT_FOR_PARENT, "?PRN", - TRANSIT_TO(PARENT_READY, 1 << FSCACHE_OBJECT_EV_PARENT_READY)); - -static WAIT_STATE(WAIT_FOR_CMD, "?CMD", - TRANSIT_TO(INVALIDATE_OBJECT, 1 << FSCACHE_OBJECT_EV_INVALIDATE), - TRANSIT_TO(UPDATE_OBJECT, 1 << FSCACHE_OBJECT_EV_UPDATE), - TRANSIT_TO(JUMPSTART_DEPS, 1 << FSCACHE_OBJECT_EV_NEW_CHILD)); - -static WAIT_STATE(WAIT_FOR_CLEARANCE, "?CLR", - TRANSIT_TO(KILL_OBJECT, 1 << FSCACHE_OBJECT_EV_CLEARED)); - -/* - * Out-of-band event transition tables. These are for handling unexpected - * events, such as an I/O error. If an OOB event occurs, the state machine - * clears and disables the event and forces a transition to the nominated work - * state (acurrently executing work states will complete first). - * - * In such a situation, object->state remembers the state the machine should - * have been in/gone to and returning NO_TRANSIT returns to that. - */ -static const struct fscache_transition fscache_osm_init_oob[] = { - TRANSIT_TO(ABORT_INIT, - (1 << FSCACHE_OBJECT_EV_ERROR) | - (1 << FSCACHE_OBJECT_EV_KILL)), - { 0, NULL } -}; - -static const struct fscache_transition fscache_osm_lookup_oob[] = { - TRANSIT_TO(LOOKUP_FAILURE, - (1 << FSCACHE_OBJECT_EV_ERROR) | - (1 << FSCACHE_OBJECT_EV_KILL)), - { 0, NULL } -}; - -static const struct fscache_transition fscache_osm_run_oob[] = { - TRANSIT_TO(KILL_OBJECT, - (1 << FSCACHE_OBJECT_EV_ERROR) | - (1 << FSCACHE_OBJECT_EV_KILL)), - { 0, NULL } -}; - -static int fscache_get_object(struct fscache_object *, - enum fscache_obj_ref_trace); -static void fscache_put_object(struct fscache_object *, - enum fscache_obj_ref_trace); -static bool fscache_enqueue_dependents(struct fscache_object *, int); -static void fscache_dequeue_object(struct fscache_object *); -static void fscache_update_aux_data(struct fscache_object *); - -/* - * we need to notify the parent when an op completes that we had outstanding - * upon it - */ -static inline void fscache_done_parent_op(struct fscache_object *object) -{ - struct fscache_object *parent = object->parent; - - _enter("OBJ%x {OBJ%x,%x}", - object->debug_id, parent->debug_id, parent->n_ops); - - spin_lock_nested(&parent->lock, 1); - parent->n_obj_ops--; - parent->n_ops--; - if (parent->n_ops == 0) - fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); - spin_unlock(&parent->lock); -} - -/* - * Object state machine dispatcher. 
- */ -static void fscache_object_sm_dispatcher(struct fscache_object *object) -{ - const struct fscache_transition *t; - const struct fscache_state *state, *new_state; - unsigned long events, event_mask; - bool oob; - int event = -1; - - ASSERT(object != NULL); - - _enter("{OBJ%x,%s,%lx}", - object->debug_id, object->state->name, object->events); - - event_mask = object->event_mask; -restart: - object->event_mask = 0; /* Mask normal event handling */ - state = object->state; -restart_masked: - events = object->events; - - /* Handle any out-of-band events (typically an error) */ - if (events & object->oob_event_mask) { - _debug("{OBJ%x} oob %lx", - object->debug_id, events & object->oob_event_mask); - oob = true; - for (t = object->oob_table; t->events; t++) { - if (events & t->events) { - state = t->transit_to; - ASSERT(state->work != NULL); - event = fls(events & t->events) - 1; - __clear_bit(event, &object->oob_event_mask); - clear_bit(event, &object->events); - goto execute_work_state; - } - } - } - oob = false; - - /* Wait states are just transition tables */ - if (!state->work) { - if (events & event_mask) { - for (t = state->transitions; t->events; t++) { - if (events & t->events) { - new_state = t->transit_to; - event = fls(events & t->events) - 1; - trace_fscache_osm(object, state, - true, false, event); - clear_bit(event, &object->events); - _debug("{OBJ%x} ev %d: %s -> %s", - object->debug_id, event, - state->name, new_state->name); - object->state = state = new_state; - goto execute_work_state; - } - } - - /* The event mask didn't include all the tabled bits */ - BUG(); - } - /* Randomly woke up */ - goto unmask_events; - } - -execute_work_state: - _debug("{OBJ%x} exec %s", object->debug_id, state->name); - - trace_fscache_osm(object, state, false, oob, event); - new_state = state->work(object, event); - event = -1; - if (new_state == NO_TRANSIT) { - _debug("{OBJ%x} %s notrans", object->debug_id, state->name); - if (unlikely(state == STATE(OBJECT_DEAD))) { - _leave(" [dead]"); - return; - } - fscache_enqueue_object(object); - event_mask = object->oob_event_mask; - goto unmask_events; - } - - _debug("{OBJ%x} %s -> %s", - object->debug_id, state->name, new_state->name); - object->state = state = new_state; - - if (state->work) { - if (unlikely(state == STATE(OBJECT_DEAD))) { - _leave(" [dead]"); - return; - } - goto restart_masked; - } - - /* Transited to wait state */ - event_mask = object->oob_event_mask; - for (t = state->transitions; t->events; t++) - event_mask |= t->events; - -unmask_events: - object->event_mask = event_mask; - smp_mb(); - events = object->events; - if (events & event_mask) - goto restart; - _leave(" [msk %lx]", event_mask); -} - -/* - * execute an object - */ -static void fscache_object_work_func(struct work_struct *work) -{ - struct fscache_object *object = - container_of(work, struct fscache_object, work); - - _enter("{OBJ%x}", object->debug_id); - - fscache_object_sm_dispatcher(object); - fscache_put_object(object, fscache_obj_put_work); -} - -/** - * fscache_object_init - Initialise a cache object description - * @object: Object description - * @cookie: Cookie object will be attached to - * @cache: Cache in which backing object will be found - * - * Initialise a cache object description to its basic values. - * - * See Documentation/filesystems/caching/backend-api.rst for a complete - * description. 
- */ -void fscache_object_init(struct fscache_object *object, - struct fscache_cookie *cookie, - struct fscache_cache *cache) -{ - const struct fscache_transition *t; - - atomic_inc(&cache->object_count); - - object->state = STATE(WAIT_FOR_INIT); - object->oob_table = fscache_osm_init_oob; - object->flags = 1 << FSCACHE_OBJECT_IS_LIVE; - spin_lock_init(&object->lock); - INIT_LIST_HEAD(&object->cache_link); - INIT_HLIST_NODE(&object->cookie_link); - INIT_WORK(&object->work, fscache_object_work_func); - INIT_LIST_HEAD(&object->dependents); - INIT_LIST_HEAD(&object->dep_link); - INIT_LIST_HEAD(&object->pending_ops); - object->n_children = 0; - object->n_ops = object->n_in_progress = object->n_exclusive = 0; - object->events = 0; - object->store_limit = 0; - object->store_limit_l = 0; - object->cache = cache; - object->cookie = cookie; - fscache_cookie_get(cookie, fscache_cookie_get_attach_object); - object->parent = NULL; -#ifdef CONFIG_FSCACHE_OBJECT_LIST - RB_CLEAR_NODE(&object->objlist_link); -#endif - - object->oob_event_mask = 0; - for (t = object->oob_table; t->events; t++) - object->oob_event_mask |= t->events; - object->event_mask = object->oob_event_mask; - for (t = object->state->transitions; t->events; t++) - object->event_mask |= t->events; -} -EXPORT_SYMBOL(fscache_object_init); - -/* - * Mark the object as no longer being live, making sure that we synchronise - * against op submission. - */ -static inline void fscache_mark_object_dead(struct fscache_object *object) -{ - spin_lock(&object->lock); - clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); - spin_unlock(&object->lock); -} - -/* - * Abort object initialisation before we start it. - */ -static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *object, - int event) -{ - _enter("{OBJ%x},%d", object->debug_id, event); - - object->oob_event_mask = 0; - fscache_dequeue_object(object); - return transit_to(KILL_OBJECT); -} - -/* - * initialise an object - * - check the specified object's parent to see if we can make use of it - * immediately to do a creation - * - we may need to start the process of creating a parent and we need to wait - * for the parent's lookup and creation to complete if it's not there yet - */ -static const struct fscache_state *fscache_initialise_object(struct fscache_object *object, - int event) -{ - struct fscache_object *parent; - bool success; - - _enter("{OBJ%x},%d", object->debug_id, event); - - ASSERT(list_empty(&object->dep_link)); - - parent = object->parent; - if (!parent) { - _leave(" [no parent]"); - return transit_to(DROP_OBJECT); - } - - _debug("parent: %s of:%lx", parent->state->name, parent->flags); - - if (fscache_object_is_dying(parent)) { - _leave(" [bad parent]"); - return transit_to(DROP_OBJECT); - } - - if (fscache_object_is_available(parent)) { - _leave(" [ready]"); - return transit_to(PARENT_READY); - } - - _debug("wait"); - - spin_lock(&parent->lock); - fscache_stat(&fscache_n_cop_grab_object); - success = false; - if (fscache_object_is_live(parent) && - object->cache->ops->grab_object(object, fscache_obj_get_add_to_deps)) { - list_add(&object->dep_link, &parent->dependents); - success = true; - } - fscache_stat_d(&fscache_n_cop_grab_object); - spin_unlock(&parent->lock); - if (!success) { - _leave(" [grab failed]"); - return transit_to(DROP_OBJECT); - } - - /* fscache_acquire_non_index_cookie() uses this - * to wake the chain up */ - fscache_raise_event(parent, FSCACHE_OBJECT_EV_NEW_CHILD); - _leave(" [wait]"); - return transit_to(WAIT_FOR_PARENT); -} - 
-/* - * Once the parent object is ready, we should kick off our lookup op. - */ -static const struct fscache_state *fscache_parent_ready(struct fscache_object *object, - int event) -{ - struct fscache_object *parent = object->parent; - - _enter("{OBJ%x},%d", object->debug_id, event); - - ASSERT(parent != NULL); - - spin_lock(&parent->lock); - parent->n_ops++; - parent->n_obj_ops++; - spin_unlock(&parent->lock); - - _leave(""); - return transit_to(LOOK_UP_OBJECT); -} - -/* - * look an object up in the cache from which it was allocated - * - we hold an "access lock" on the parent object, so the parent object cannot - * be withdrawn by either party till we've finished - */ -static const struct fscache_state *fscache_look_up_object(struct fscache_object *object, - int event) -{ - struct fscache_cookie *cookie = object->cookie; - struct fscache_object *parent = object->parent; - int ret; - - _enter("{OBJ%x},%d", object->debug_id, event); - - object->oob_table = fscache_osm_lookup_oob; - - ASSERT(parent != NULL); - ASSERTCMP(parent->n_ops, >, 0); - ASSERTCMP(parent->n_obj_ops, >, 0); - - /* make sure the parent is still available */ - ASSERT(fscache_object_is_available(parent)); - - if (fscache_object_is_dying(parent) || - test_bit(FSCACHE_IOERROR, &object->cache->flags) || - !fscache_use_cookie(object)) { - _leave(" [unavailable]"); - return transit_to(LOOKUP_FAILURE); - } - - _debug("LOOKUP \"%s\" in \"%s\"", - cookie->def->name, object->cache->tag->name); - - fscache_stat(&fscache_n_object_lookups); - fscache_stat(&fscache_n_cop_lookup_object); - ret = object->cache->ops->lookup_object(object); - fscache_stat_d(&fscache_n_cop_lookup_object); - - fscache_unuse_cookie(object); - - if (ret == -ETIMEDOUT) { - /* probably stuck behind another object, so move this one to - * the back of the queue */ - fscache_stat(&fscache_n_object_lookups_timed_out); - _leave(" [timeout]"); - return NO_TRANSIT; - } - - if (ret < 0) { - _leave(" [error]"); - return transit_to(LOOKUP_FAILURE); - } - - _leave(" [ok]"); - return transit_to(OBJECT_AVAILABLE); -} - -/** - * fscache_object_lookup_negative - Note negative cookie lookup - * @object: Object pointing to cookie to mark - * - * Note negative lookup, permitting those waiting to read data from an already - * existing backing object to continue as there's no data for them to read. - */ -void fscache_object_lookup_negative(struct fscache_object *object) -{ - struct fscache_cookie *cookie = object->cookie; - - _enter("{OBJ%x,%s}", object->debug_id, object->state->name); - - if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { - fscache_stat(&fscache_n_object_lookups_negative); - - /* Allow write requests to begin stacking up and read requests to begin - * returning ENODATA. - */ - set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); - clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); - - clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); - } - _leave(""); -} -EXPORT_SYMBOL(fscache_object_lookup_negative); - -/** - * fscache_obtained_object - Note successful object lookup or creation - * @object: Object pointing to cookie to mark - * - * Note successful lookup and/or creation, permitting those waiting to write - * data to a backing object to continue. - * - * Note that after calling this, an object's cookie may be relinquished by the - * netfs, and so must be accessed with object lock held. 
- */ -void fscache_obtained_object(struct fscache_object *object) -{ - struct fscache_cookie *cookie = object->cookie; - - _enter("{OBJ%x,%s}", object->debug_id, object->state->name); - - /* if we were still looking up, then we must have a positive lookup - * result, in which case there may be data available */ - if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { - fscache_stat(&fscache_n_object_lookups_positive); - - /* We do (presumably) have data */ - clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); - clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); - - /* Allow write requests to begin stacking up and read requests - * to begin shovelling data. - */ - clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); - } else { - fscache_stat(&fscache_n_object_created); - } - - set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags); - _leave(""); -} -EXPORT_SYMBOL(fscache_obtained_object); - -/* - * handle an object that has just become available - */ -static const struct fscache_state *fscache_object_available(struct fscache_object *object, - int event) -{ - _enter("{OBJ%x},%d", object->debug_id, event); - - object->oob_table = fscache_osm_run_oob; - - spin_lock(&object->lock); - - fscache_done_parent_op(object); - if (object->n_in_progress == 0) { - if (object->n_ops > 0) { - ASSERTCMP(object->n_ops, >=, object->n_obj_ops); - fscache_start_operations(object); - } else { - ASSERT(list_empty(&object->pending_ops)); - } - } - spin_unlock(&object->lock); - - fscache_stat(&fscache_n_cop_lookup_complete); - object->cache->ops->lookup_complete(object); - fscache_stat_d(&fscache_n_cop_lookup_complete); - - fscache_stat(&fscache_n_object_avail); - - _leave(""); - return transit_to(JUMPSTART_DEPS); -} - -/* - * Wake up this object's dependent objects now that we've become available. - */ -static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *object, - int event) -{ - _enter("{OBJ%x},%d", object->debug_id, event); - - if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_PARENT_READY)) - return NO_TRANSIT; /* Not finished; requeue */ - return transit_to(WAIT_FOR_CMD); -} - -/* - * Handle lookup or creation failute. - */ -static const struct fscache_state *fscache_lookup_failure(struct fscache_object *object, - int event) -{ - struct fscache_cookie *cookie; - - _enter("{OBJ%x},%d", object->debug_id, event); - - object->oob_event_mask = 0; - - fscache_stat(&fscache_n_cop_lookup_complete); - object->cache->ops->lookup_complete(object); - fscache_stat_d(&fscache_n_cop_lookup_complete); - - set_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->flags); - - cookie = object->cookie; - set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); - if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); - - fscache_done_parent_op(object); - return transit_to(KILL_OBJECT); -} - -/* - * Wait for completion of all active operations on this object and the death of - * all child objects of this object. - */ -static const struct fscache_state *fscache_kill_object(struct fscache_object *object, - int event) -{ - _enter("{OBJ%x,%d,%d},%d", - object->debug_id, object->n_ops, object->n_children, event); - - fscache_mark_object_dead(object); - object->oob_event_mask = 0; - - if (test_bit(FSCACHE_OBJECT_RETIRED, &object->flags)) { - /* Reject any new read/write ops and abort any that are pending. 
*/ - clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); - fscache_cancel_all_ops(object); - } - - if (list_empty(&object->dependents) && - object->n_ops == 0 && - object->n_children == 0) - return transit_to(DROP_OBJECT); - - if (object->n_in_progress == 0) { - spin_lock(&object->lock); - if (object->n_ops > 0 && object->n_in_progress == 0) - fscache_start_operations(object); - spin_unlock(&object->lock); - } - - if (!list_empty(&object->dependents)) - return transit_to(KILL_DEPENDENTS); - - return transit_to(WAIT_FOR_CLEARANCE); -} - -/* - * Kill dependent objects. - */ -static const struct fscache_state *fscache_kill_dependents(struct fscache_object *object, - int event) -{ - _enter("{OBJ%x},%d", object->debug_id, event); - - if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_KILL)) - return NO_TRANSIT; /* Not finished */ - return transit_to(WAIT_FOR_CLEARANCE); -} - -/* - * Drop an object's attachments - */ -static const struct fscache_state *fscache_drop_object(struct fscache_object *object, - int event) -{ - struct fscache_object *parent = object->parent; - struct fscache_cookie *cookie = object->cookie; - struct fscache_cache *cache = object->cache; - bool awaken = false; - - _enter("{OBJ%x,%d},%d", object->debug_id, object->n_children, event); - - ASSERT(cookie != NULL); - ASSERT(!hlist_unhashed(&object->cookie_link)); - - if (test_bit(FSCACHE_COOKIE_AUX_UPDATED, &cookie->flags)) { - _debug("final update"); - fscache_update_aux_data(object); - } - - /* Make sure the cookie no longer points here and that the netfs isn't - * waiting for us. - */ - spin_lock(&cookie->lock); - hlist_del_init(&object->cookie_link); - if (hlist_empty(&cookie->backing_objects) && - test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) - awaken = true; - spin_unlock(&cookie->lock); - - if (awaken) - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); - if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); - - - /* Prevent a race with our last child, which has to signal EV_CLEARED - * before dropping our spinlock. - */ - spin_lock(&object->lock); - spin_unlock(&object->lock); - - /* Discard from the cache's collection of objects */ - spin_lock(&cache->object_list_lock); - list_del_init(&object->cache_link); - spin_unlock(&cache->object_list_lock); - - fscache_stat(&fscache_n_cop_drop_object); - cache->ops->drop_object(object); - fscache_stat_d(&fscache_n_cop_drop_object); - - /* The parent object wants to know when all it dependents have gone */ - if (parent) { - _debug("release parent OBJ%x {%d}", - parent->debug_id, parent->n_children); - - spin_lock(&parent->lock); - parent->n_children--; - if (parent->n_children == 0) - fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); - spin_unlock(&parent->lock); - object->parent = NULL; - } - - /* this just shifts the object release to the work processor */ - fscache_put_object(object, fscache_obj_put_drop_obj); - fscache_stat(&fscache_n_object_dead); - - _leave(""); - return transit_to(OBJECT_DEAD); -} - -/* - * get a ref on an object - */ -static int fscache_get_object(struct fscache_object *object, - enum fscache_obj_ref_trace why) -{ - int ret; - - fscache_stat(&fscache_n_cop_grab_object); - ret = object->cache->ops->grab_object(object, why) ? 
0 : -EAGAIN; - fscache_stat_d(&fscache_n_cop_grab_object); - return ret; -} - -/* - * Discard a ref on an object - */ -static void fscache_put_object(struct fscache_object *object, - enum fscache_obj_ref_trace why) -{ - fscache_stat(&fscache_n_cop_put_object); - object->cache->ops->put_object(object, why); - fscache_stat_d(&fscache_n_cop_put_object); -} - -/** - * fscache_object_destroy - Note that a cache object is about to be destroyed - * @object: The object to be destroyed - * - * Note the imminent destruction and deallocation of a cache object record. - */ -void fscache_object_destroy(struct fscache_object *object) -{ - /* We can get rid of the cookie now */ - fscache_cookie_put(object->cookie, fscache_cookie_put_object); - object->cookie = NULL; -} -EXPORT_SYMBOL(fscache_object_destroy); - -/* - * enqueue an object for metadata-type processing - */ -void fscache_enqueue_object(struct fscache_object *object) -{ - _enter("{OBJ%x}", object->debug_id); - - if (fscache_get_object(object, fscache_obj_get_queue) >= 0) { - wait_queue_head_t *cong_wq = - &get_cpu_var(fscache_object_cong_wait); - - if (queue_work(fscache_object_wq, &object->work)) { - if (fscache_object_congested()) - wake_up(cong_wq); - } else - fscache_put_object(object, fscache_obj_put_queue); - - put_cpu_var(fscache_object_cong_wait); - } -} - -/** - * fscache_object_sleep_till_congested - Sleep until object wq is congested - * @timeoutp: Scheduler sleep timeout - * - * Allow an object handler to sleep until the object workqueue is congested. - * - * The caller must set up a wake up event before calling this and must have set - * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own - * condition before calling this function as no test is made here. - * - * %true is returned if the object wq is congested, %false otherwise. - */ -bool fscache_object_sleep_till_congested(signed long *timeoutp) -{ - wait_queue_head_t *cong_wq = this_cpu_ptr(&fscache_object_cong_wait); - DEFINE_WAIT(wait); - - if (fscache_object_congested()) - return true; - - add_wait_queue_exclusive(cong_wq, &wait); - if (!fscache_object_congested()) - *timeoutp = schedule_timeout(*timeoutp); - finish_wait(cong_wq, &wait); - - return fscache_object_congested(); -} -EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested); - -/* - * Enqueue the dependents of an object for metadata-type processing. - * - * If we don't manage to finish the list before the scheduler wants to run - * again then return false immediately. We return true if the list was - * cleared. 
- */ -static bool fscache_enqueue_dependents(struct fscache_object *object, int event) -{ - struct fscache_object *dep; - bool ret = true; - - _enter("{OBJ%x}", object->debug_id); - - if (list_empty(&object->dependents)) - return true; - - spin_lock(&object->lock); - - while (!list_empty(&object->dependents)) { - dep = list_entry(object->dependents.next, - struct fscache_object, dep_link); - list_del_init(&dep->dep_link); - - fscache_raise_event(dep, event); - fscache_put_object(dep, fscache_obj_put_enq_dep); - - if (!list_empty(&object->dependents) && need_resched()) { - ret = false; - break; - } - } - - spin_unlock(&object->lock); - return ret; -} - -/* - * remove an object from whatever queue it's waiting on - */ -static void fscache_dequeue_object(struct fscache_object *object) -{ - _enter("{OBJ%x}", object->debug_id); - - if (!list_empty(&object->dep_link)) { - spin_lock(&object->parent->lock); - list_del_init(&object->dep_link); - spin_unlock(&object->parent->lock); - } - - _leave(""); -} - -/** - * fscache_check_aux - Ask the netfs whether an object on disk is still valid - * @object: The object to ask about - * @data: The auxiliary data for the object - * @datalen: The size of the auxiliary data - * @object_size: The size of the object according to the server. - * - * This function consults the netfs about the coherency state of an object. - * The caller must be holding a ref on cookie->n_active (held by - * fscache_look_up_object() on behalf of the cache backend during object lookup - * and creation). - */ -enum fscache_checkaux fscache_check_aux(struct fscache_object *object, - const void *data, uint16_t datalen, - loff_t object_size) -{ - enum fscache_checkaux result; - - if (!object->cookie->def->check_aux) { - fscache_stat(&fscache_n_checkaux_none); - return FSCACHE_CHECKAUX_OKAY; - } - - result = object->cookie->def->check_aux(object->cookie->netfs_data, - data, datalen, object_size); - switch (result) { - /* entry okay as is */ - case FSCACHE_CHECKAUX_OKAY: - fscache_stat(&fscache_n_checkaux_okay); - break; - - /* entry requires update */ - case FSCACHE_CHECKAUX_NEEDS_UPDATE: - fscache_stat(&fscache_n_checkaux_update); - break; - - /* entry requires deletion */ - case FSCACHE_CHECKAUX_OBSOLETE: - fscache_stat(&fscache_n_checkaux_obsolete); - break; - - default: - BUG(); - } - - return result; -} -EXPORT_SYMBOL(fscache_check_aux); - -/* - * Asynchronously invalidate an object. - */ -static const struct fscache_state *_fscache_invalidate_object(struct fscache_object *object, - int event) -{ - struct fscache_operation *op; - struct fscache_cookie *cookie = object->cookie; - - _enter("{OBJ%x},%d", object->debug_id, event); - - /* We're going to need the cookie. If the cookie is not available then - * retire the object instead. - */ - if (!fscache_use_cookie(object)) { - ASSERT(radix_tree_empty(&object->cookie->stores)); - set_bit(FSCACHE_OBJECT_RETIRED, &object->flags); - _leave(" [no cookie]"); - return transit_to(KILL_OBJECT); - } - - /* Reject any new read/write ops and abort any that are pending. 
*/ - fscache_invalidate_writes(cookie); - clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); - fscache_cancel_all_ops(object); - - /* Now we have to wait for in-progress reads and writes */ - op = kzalloc(sizeof(*op), GFP_KERNEL); - if (!op) - goto nomem; - - fscache_operation_init(cookie, op, object->cache->ops->invalidate_object, - NULL, NULL); - op->flags = FSCACHE_OP_ASYNC | - (1 << FSCACHE_OP_EXCLUSIVE) | - (1 << FSCACHE_OP_UNUSE_COOKIE); - trace_fscache_page_op(cookie, NULL, op, fscache_page_op_invalidate); - - spin_lock(&cookie->lock); - if (fscache_submit_exclusive_op(object, op) < 0) - goto submit_op_failed; - spin_unlock(&cookie->lock); - fscache_put_operation(op); - - /* Once we've completed the invalidation, we know there will be no data - * stored in the cache and thus we can reinstate the data-check-skip - * optimisation. - */ - set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); - - /* We can allow read and write requests to come in once again. They'll - * queue up behind our exclusive invalidation operation. - */ - if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); - _leave(" [ok]"); - return transit_to(UPDATE_OBJECT); - -nomem: - fscache_mark_object_dead(object); - fscache_unuse_cookie(object); - _leave(" [ENOMEM]"); - return transit_to(KILL_OBJECT); - -submit_op_failed: - fscache_mark_object_dead(object); - spin_unlock(&cookie->lock); - fscache_unuse_cookie(object); - kfree(op); - _leave(" [EIO]"); - return transit_to(KILL_OBJECT); -} - -static const struct fscache_state *fscache_invalidate_object(struct fscache_object *object, - int event) -{ - const struct fscache_state *s; - - fscache_stat(&fscache_n_invalidates_run); - fscache_stat(&fscache_n_cop_invalidate_object); - s = _fscache_invalidate_object(object, event); - fscache_stat_d(&fscache_n_cop_invalidate_object); - return s; -} - -/* - * Update auxiliary data. - */ -static void fscache_update_aux_data(struct fscache_object *object) -{ - fscache_stat(&fscache_n_updates_run); - fscache_stat(&fscache_n_cop_update_object); - object->cache->ops->update_object(object); - fscache_stat_d(&fscache_n_cop_update_object); -} - -/* - * Asynchronously update an object. - */ -static const struct fscache_state *fscache_update_object(struct fscache_object *object, - int event) -{ - _enter("{OBJ%x},%d", object->debug_id, event); - - fscache_update_aux_data(object); - - _leave(""); - return transit_to(WAIT_FOR_CMD); -} - -/** - * fscache_object_retrying_stale - Note retrying stale object - * @object: The object that will be retried - * - * Note that an object lookup found an on-disk object that was adjudged to be - * stale and has been deleted. The lookup will be retried. - */ -void fscache_object_retrying_stale(struct fscache_object *object) -{ - fscache_stat(&fscache_n_cache_no_space_reject); -} -EXPORT_SYMBOL(fscache_object_retrying_stale); - -/** - * fscache_object_mark_killed - Note that an object was killed - * @object: The object that was culled - * @why: The reason the object was killed. - * - * Note that an object was killed. Returns true if the object was - * already marked killed, false if it wasn't. 
- */ -void fscache_object_mark_killed(struct fscache_object *object, - enum fscache_why_object_killed why) -{ - if (test_and_set_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->flags)) { - pr_err("Error: Object already killed by cache [%s]\n", - object->cache->identifier); - return; - } - - switch (why) { - case FSCACHE_OBJECT_NO_SPACE: - fscache_stat(&fscache_n_cache_no_space_reject); - break; - case FSCACHE_OBJECT_IS_STALE: - fscache_stat(&fscache_n_cache_stale_objects); - break; - case FSCACHE_OBJECT_WAS_RETIRED: - fscache_stat(&fscache_n_cache_retired_objects); - break; - case FSCACHE_OBJECT_WAS_CULLED: - fscache_stat(&fscache_n_cache_culled_objects); - break; - } -} -EXPORT_SYMBOL(fscache_object_mark_killed); - -/* - * The object is dead. We can get here if an object gets queued by an event - * that would lead to its death (such as EV_KILL) when the dispatcher is - * already running (and so can be requeued) but hasn't yet cleared the event - * mask. - */ -static const struct fscache_state *fscache_object_dead(struct fscache_object *object, - int event) -{ - if (!test_and_set_bit(FSCACHE_OBJECT_RUN_AFTER_DEAD, - &object->flags)) - return NO_TRANSIT; - - WARN(true, "FS-Cache object redispatched after death"); - return NO_TRANSIT; -} diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c deleted file mode 100644 index e002cdfaf3cc..000000000000 --- a/fs/fscache/operation.c +++ /dev/null @@ -1,633 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* FS-Cache worker operation management routines - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * See Documentation/filesystems/caching/operations.rst - */ - -#define FSCACHE_DEBUG_LEVEL OPERATION -#include <linux/module.h> -#include <linux/seq_file.h> -#include <linux/slab.h> -#include "internal.h" - -atomic_t fscache_op_debug_id; -EXPORT_SYMBOL(fscache_op_debug_id); - -static void fscache_operation_dummy_cancel(struct fscache_operation *op) -{ -} - -/** - * fscache_operation_init - Do basic initialisation of an operation - * @cookie: The cookie to operate on - * @op: The operation to initialise - * @processor: The function to perform the operation - * @cancel: A function to handle operation cancellation - * @release: The release function to assign - * - * Do basic initialisation of an operation. The caller must still set flags, - * object and processor if needed. - */ -void fscache_operation_init(struct fscache_cookie *cookie, - struct fscache_operation *op, - fscache_operation_processor_t processor, - fscache_operation_cancel_t cancel, - fscache_operation_release_t release) -{ - INIT_WORK(&op->work, fscache_op_work_func); - atomic_set(&op->usage, 1); - op->state = FSCACHE_OP_ST_INITIALISED; - op->debug_id = atomic_inc_return(&fscache_op_debug_id); - op->processor = processor; - op->cancel = cancel ?: fscache_operation_dummy_cancel; - op->release = release; - INIT_LIST_HEAD(&op->pend_link); - fscache_stat(&fscache_n_op_initialised); - trace_fscache_op(cookie, op, fscache_op_init); -} -EXPORT_SYMBOL(fscache_operation_init); - -/** - * fscache_enqueue_operation - Enqueue an operation for processing - * @op: The operation to enqueue - * - * Enqueue an operation for processing by the FS-Cache thread pool. - * - * This will get its own ref on the object. 
- */ -void fscache_enqueue_operation(struct fscache_operation *op) -{ - struct fscache_cookie *cookie = op->object->cookie; - - _enter("{OBJ%x OP%x,%u}", - op->object->debug_id, op->debug_id, atomic_read(&op->usage)); - - ASSERT(list_empty(&op->pend_link)); - ASSERT(op->processor != NULL); - ASSERT(fscache_object_is_available(op->object)); - ASSERTCMP(atomic_read(&op->usage), >, 0); - ASSERTIFCMP(op->state != FSCACHE_OP_ST_IN_PROGRESS, - op->state, ==, FSCACHE_OP_ST_CANCELLED); - - fscache_stat(&fscache_n_op_enqueue); - switch (op->flags & FSCACHE_OP_TYPE) { - case FSCACHE_OP_ASYNC: - trace_fscache_op(cookie, op, fscache_op_enqueue_async); - _debug("queue async"); - atomic_inc(&op->usage); - if (!queue_work(fscache_op_wq, &op->work)) - fscache_put_operation(op); - break; - case FSCACHE_OP_MYTHREAD: - trace_fscache_op(cookie, op, fscache_op_enqueue_mythread); - _debug("queue for caller's attention"); - break; - default: - pr_err("Unexpected op type %lx", op->flags); - BUG(); - break; - } -} -EXPORT_SYMBOL(fscache_enqueue_operation); - -/* - * start an op running - */ -static void fscache_run_op(struct fscache_object *object, - struct fscache_operation *op) -{ - ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING); - - op->state = FSCACHE_OP_ST_IN_PROGRESS; - object->n_in_progress++; - if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) - wake_up_bit(&op->flags, FSCACHE_OP_WAITING); - if (op->processor) - fscache_enqueue_operation(op); - else - trace_fscache_op(object->cookie, op, fscache_op_run); - fscache_stat(&fscache_n_op_run); -} - -/* - * report an unexpected submission - */ -static void fscache_report_unexpected_submission(struct fscache_object *object, - struct fscache_operation *op, - const struct fscache_state *ostate) -{ - static bool once_only; - struct fscache_operation *p; - unsigned n; - - if (once_only) - return; - once_only = true; - - kdebug("unexpected submission OP%x [OBJ%x %s]", - op->debug_id, object->debug_id, object->state->name); - kdebug("objstate=%s [%s]", object->state->name, ostate->name); - kdebug("objflags=%lx", object->flags); - kdebug("objevent=%lx [%lx]", object->events, object->event_mask); - kdebug("ops=%u inp=%u exc=%u", - object->n_ops, object->n_in_progress, object->n_exclusive); - - if (!list_empty(&object->pending_ops)) { - n = 0; - list_for_each_entry(p, &object->pending_ops, pend_link) { - ASSERTCMP(p->object, ==, object); - kdebug("%p %p", op->processor, op->release); - n++; - } - - kdebug("n=%u", n); - } - - dump_stack(); -} - -/* - * submit an exclusive operation for an object - * - other ops are excluded from running simultaneously with this one - * - this gets any extra refs it needs on an op - */ -int fscache_submit_exclusive_op(struct fscache_object *object, - struct fscache_operation *op) -{ - const struct fscache_state *ostate; - unsigned long flags; - int ret; - - _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); - - trace_fscache_op(object->cookie, op, fscache_op_submit_ex); - - ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED); - ASSERTCMP(atomic_read(&op->usage), >, 0); - - spin_lock(&object->lock); - ASSERTCMP(object->n_ops, >=, object->n_in_progress); - ASSERTCMP(object->n_ops, >=, object->n_exclusive); - ASSERT(list_empty(&op->pend_link)); - - ostate = object->state; - smp_rmb(); - - op->state = FSCACHE_OP_ST_PENDING; - flags = READ_ONCE(object->flags); - if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) { - fscache_stat(&fscache_n_op_rejected); - op->cancel(op); - op->state = FSCACHE_OP_ST_CANCELLED; - ret = -ENOBUFS; - 
} else if (unlikely(fscache_cache_is_broken(object))) { - op->cancel(op); - op->state = FSCACHE_OP_ST_CANCELLED; - ret = -EIO; - } else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) { - op->object = object; - object->n_ops++; - object->n_exclusive++; /* reads and writes must wait */ - - if (object->n_in_progress > 0) { - atomic_inc(&op->usage); - list_add_tail(&op->pend_link, &object->pending_ops); - fscache_stat(&fscache_n_op_pend); - } else if (!list_empty(&object->pending_ops)) { - atomic_inc(&op->usage); - list_add_tail(&op->pend_link, &object->pending_ops); - fscache_stat(&fscache_n_op_pend); - fscache_start_operations(object); - } else { - ASSERTCMP(object->n_in_progress, ==, 0); - fscache_run_op(object, op); - } - - /* need to issue a new write op after this */ - clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); - ret = 0; - } else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) { - op->object = object; - object->n_ops++; - object->n_exclusive++; /* reads and writes must wait */ - atomic_inc(&op->usage); - list_add_tail(&op->pend_link, &object->pending_ops); - fscache_stat(&fscache_n_op_pend); - ret = 0; - } else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) { - op->cancel(op); - op->state = FSCACHE_OP_ST_CANCELLED; - ret = -ENOBUFS; - } else { - fscache_report_unexpected_submission(object, op, ostate); - op->cancel(op); - op->state = FSCACHE_OP_ST_CANCELLED; - ret = -ENOBUFS; - } - - spin_unlock(&object->lock); - return ret; -} - -/* - * submit an operation for an object - * - objects may be submitted only in the following states: - * - during object creation (write ops may be submitted) - * - whilst the object is active - * - after an I/O error incurred in one of the two above states (op rejected) - * - this gets any extra refs it needs on an op - */ -int fscache_submit_op(struct fscache_object *object, - struct fscache_operation *op) -{ - const struct fscache_state *ostate; - unsigned long flags; - int ret; - - _enter("{OBJ%x OP%x},{%u}", - object->debug_id, op->debug_id, atomic_read(&op->usage)); - - trace_fscache_op(object->cookie, op, fscache_op_submit); - - ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED); - ASSERTCMP(atomic_read(&op->usage), >, 0); - - spin_lock(&object->lock); - ASSERTCMP(object->n_ops, >=, object->n_in_progress); - ASSERTCMP(object->n_ops, >=, object->n_exclusive); - ASSERT(list_empty(&op->pend_link)); - - ostate = object->state; - smp_rmb(); - - op->state = FSCACHE_OP_ST_PENDING; - flags = READ_ONCE(object->flags); - if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) { - fscache_stat(&fscache_n_op_rejected); - op->cancel(op); - op->state = FSCACHE_OP_ST_CANCELLED; - ret = -ENOBUFS; - } else if (unlikely(fscache_cache_is_broken(object))) { - op->cancel(op); - op->state = FSCACHE_OP_ST_CANCELLED; - ret = -EIO; - } else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) { - op->object = object; - object->n_ops++; - - if (object->n_exclusive > 0) { - atomic_inc(&op->usage); - list_add_tail(&op->pend_link, &object->pending_ops); - fscache_stat(&fscache_n_op_pend); - } else if (!list_empty(&object->pending_ops)) { - atomic_inc(&op->usage); - list_add_tail(&op->pend_link, &object->pending_ops); - fscache_stat(&fscache_n_op_pend); - fscache_start_operations(object); - } else { - ASSERTCMP(object->n_exclusive, ==, 0); - fscache_run_op(object, op); - } - ret = 0; - } else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) { - op->object = object; - object->n_ops++; - atomic_inc(&op->usage); - list_add_tail(&op->pend_link, &object->pending_ops); - 
fscache_stat(&fscache_n_op_pend); - ret = 0; - } else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) { - op->cancel(op); - op->state = FSCACHE_OP_ST_CANCELLED; - ret = -ENOBUFS; - } else { - fscache_report_unexpected_submission(object, op, ostate); - ASSERT(!fscache_object_is_active(object)); - op->cancel(op); - op->state = FSCACHE_OP_ST_CANCELLED; - ret = -ENOBUFS; - } - - spin_unlock(&object->lock); - return ret; -} - -/* - * queue an object for withdrawal on error, aborting all following asynchronous - * operations - */ -void fscache_abort_object(struct fscache_object *object) -{ - _enter("{OBJ%x}", object->debug_id); - - fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR); -} - -/* - * Jump start the operation processing on an object. The caller must hold - * object->lock. - */ -void fscache_start_operations(struct fscache_object *object) -{ - struct fscache_operation *op; - bool stop = false; - - while (!list_empty(&object->pending_ops) && !stop) { - op = list_entry(object->pending_ops.next, - struct fscache_operation, pend_link); - - if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) { - if (object->n_in_progress > 0) - break; - stop = true; - } - list_del_init(&op->pend_link); - fscache_run_op(object, op); - - /* the pending queue was holding a ref on the object */ - fscache_put_operation(op); - } - - ASSERTCMP(object->n_in_progress, <=, object->n_ops); - - _debug("woke %d ops on OBJ%x", - object->n_in_progress, object->debug_id); -} - -/* - * cancel an operation that's pending on an object - */ -int fscache_cancel_op(struct fscache_operation *op, - bool cancel_in_progress_op) -{ - struct fscache_object *object = op->object; - bool put = false; - int ret; - - _enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id); - - trace_fscache_op(object->cookie, op, fscache_op_cancel); - - ASSERTCMP(op->state, >=, FSCACHE_OP_ST_PENDING); - ASSERTCMP(op->state, !=, FSCACHE_OP_ST_CANCELLED); - ASSERTCMP(atomic_read(&op->usage), >, 0); - - spin_lock(&object->lock); - - ret = -EBUSY; - if (op->state == FSCACHE_OP_ST_PENDING) { - ASSERT(!list_empty(&op->pend_link)); - list_del_init(&op->pend_link); - put = true; - - fscache_stat(&fscache_n_op_cancelled); - op->cancel(op); - op->state = FSCACHE_OP_ST_CANCELLED; - if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) - object->n_exclusive--; - if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) - wake_up_bit(&op->flags, FSCACHE_OP_WAITING); - ret = 0; - } else if (op->state == FSCACHE_OP_ST_IN_PROGRESS && cancel_in_progress_op) { - ASSERTCMP(object->n_in_progress, >, 0); - if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) - object->n_exclusive--; - object->n_in_progress--; - if (object->n_in_progress == 0) - fscache_start_operations(object); - - fscache_stat(&fscache_n_op_cancelled); - op->cancel(op); - op->state = FSCACHE_OP_ST_CANCELLED; - if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) - object->n_exclusive--; - if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) - wake_up_bit(&op->flags, FSCACHE_OP_WAITING); - ret = 0; - } - - if (put) - fscache_put_operation(op); - spin_unlock(&object->lock); - _leave(" = %d", ret); - return ret; -} - -/* - * Cancel all pending operations on an object - */ -void fscache_cancel_all_ops(struct fscache_object *object) -{ - struct fscache_operation *op; - - _enter("OBJ%x", object->debug_id); - - spin_lock(&object->lock); - - while (!list_empty(&object->pending_ops)) { - op = list_entry(object->pending_ops.next, - struct fscache_operation, pend_link); - fscache_stat(&fscache_n_op_cancelled); - 
list_del_init(&op->pend_link); - - trace_fscache_op(object->cookie, op, fscache_op_cancel_all); - - ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING); - op->cancel(op); - op->state = FSCACHE_OP_ST_CANCELLED; - - if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) - object->n_exclusive--; - if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) - wake_up_bit(&op->flags, FSCACHE_OP_WAITING); - fscache_put_operation(op); - cond_resched_lock(&object->lock); - } - - spin_unlock(&object->lock); - _leave(""); -} - -/* - * Record the completion or cancellation of an in-progress operation. - */ -void fscache_op_complete(struct fscache_operation *op, bool cancelled) -{ - struct fscache_object *object = op->object; - - _enter("OBJ%x", object->debug_id); - - ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS); - ASSERTCMP(object->n_in_progress, >, 0); - ASSERTIFCMP(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags), - object->n_exclusive, >, 0); - ASSERTIFCMP(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags), - object->n_in_progress, ==, 1); - - spin_lock(&object->lock); - - if (!cancelled) { - trace_fscache_op(object->cookie, op, fscache_op_completed); - op->state = FSCACHE_OP_ST_COMPLETE; - } else { - op->cancel(op); - trace_fscache_op(object->cookie, op, fscache_op_cancelled); - op->state = FSCACHE_OP_ST_CANCELLED; - } - - if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) - object->n_exclusive--; - object->n_in_progress--; - if (object->n_in_progress == 0) - fscache_start_operations(object); - - spin_unlock(&object->lock); - _leave(""); -} -EXPORT_SYMBOL(fscache_op_complete); - -/* - * release an operation - * - queues pending ops if this is the last in-progress op - */ -void fscache_put_operation(struct fscache_operation *op) -{ - struct fscache_object *object; - struct fscache_cache *cache; - - _enter("{OBJ%x OP%x,%d}", - op->object ? op->object->debug_id : 0, - op->debug_id, atomic_read(&op->usage)); - - ASSERTCMP(atomic_read(&op->usage), >, 0); - - if (!atomic_dec_and_test(&op->usage)) - return; - - trace_fscache_op(op->object ? op->object->cookie : NULL, op, fscache_op_put); - - _debug("PUT OP"); - ASSERTIFCMP(op->state != FSCACHE_OP_ST_INITIALISED && - op->state != FSCACHE_OP_ST_COMPLETE, - op->state, ==, FSCACHE_OP_ST_CANCELLED); - - fscache_stat(&fscache_n_op_release); - - if (op->release) { - op->release(op); - op->release = NULL; - } - op->state = FSCACHE_OP_ST_DEAD; - - object = op->object; - if (likely(object)) { - if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) - atomic_dec(&object->n_reads); - if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags)) - fscache_unuse_cookie(object); - - /* now... 
we may get called with the object spinlock held, so we - * complete the cleanup here only if we can immediately acquire the - * lock, and defer it otherwise */ - if (!spin_trylock(&object->lock)) { - _debug("defer put"); - fscache_stat(&fscache_n_op_deferred_release); - - cache = object->cache; - spin_lock(&cache->op_gc_list_lock); - list_add_tail(&op->pend_link, &cache->op_gc_list); - spin_unlock(&cache->op_gc_list_lock); - schedule_work(&cache->op_gc); - _leave(" [defer]"); - return; - } - - ASSERTCMP(object->n_ops, >, 0); - object->n_ops--; - if (object->n_ops == 0) - fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); - - spin_unlock(&object->lock); - } - - kfree(op); - _leave(" [done]"); -} -EXPORT_SYMBOL(fscache_put_operation); - -/* - * garbage collect operations that have had their release deferred - */ -void fscache_operation_gc(struct work_struct *work) -{ - struct fscache_operation *op; - struct fscache_object *object; - struct fscache_cache *cache = - container_of(work, struct fscache_cache, op_gc); - int count = 0; - - _enter(""); - - do { - spin_lock(&cache->op_gc_list_lock); - if (list_empty(&cache->op_gc_list)) { - spin_unlock(&cache->op_gc_list_lock); - break; - } - - op = list_entry(cache->op_gc_list.next, - struct fscache_operation, pend_link); - list_del(&op->pend_link); - spin_unlock(&cache->op_gc_list_lock); - - object = op->object; - trace_fscache_op(object->cookie, op, fscache_op_gc); - - spin_lock(&object->lock); - - _debug("GC DEFERRED REL OBJ%x OP%x", - object->debug_id, op->debug_id); - fscache_stat(&fscache_n_op_gc); - - ASSERTCMP(atomic_read(&op->usage), ==, 0); - ASSERTCMP(op->state, ==, FSCACHE_OP_ST_DEAD); - - ASSERTCMP(object->n_ops, >, 0); - object->n_ops--; - if (object->n_ops == 0) - fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); - - spin_unlock(&object->lock); - kfree(op); - - } while (count++ < 20); - - if (!list_empty(&cache->op_gc_list)) - schedule_work(&cache->op_gc); - - _leave(""); -} - -/* - * execute an operation using fs_op_wq to provide processing context - - * the caller holds a ref to this object, so we don't need to hold one - */ -void fscache_op_work_func(struct work_struct *work) -{ - struct fscache_operation *op = - container_of(work, struct fscache_operation, work); - - _enter("{OBJ%x OP%x,%d}", - op->object->debug_id, op->debug_id, atomic_read(&op->usage)); - - trace_fscache_op(op->object->cookie, op, fscache_op_work); - - ASSERT(op->processor != NULL); - op->processor(op); - fscache_put_operation(op); - - _leave(""); -} diff --git a/fs/fscache/page.c b/fs/fscache/page.c deleted file mode 100644 index 27df94ef0e0b..000000000000 --- a/fs/fscache/page.c +++ /dev/null @@ -1,1242 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* Cache page management and data I/O routines - * - * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved. 
- * Written by David Howells (dhowells@redhat.com) - */ - -#define FSCACHE_DEBUG_LEVEL PAGE -#include <linux/module.h> -#include <linux/fscache-cache.h> -#include <linux/buffer_head.h> -#include <linux/pagevec.h> -#include <linux/slab.h> -#include "internal.h" - -/* - * check to see if a page is being written to the cache - */ -bool __fscache_check_page_write(struct fscache_cookie *cookie, struct page *page) -{ - void *val; - - rcu_read_lock(); - val = radix_tree_lookup(&cookie->stores, page->index); - rcu_read_unlock(); - trace_fscache_check_page(cookie, page, val, 0); - - return val != NULL; -} -EXPORT_SYMBOL(__fscache_check_page_write); - -/* - * wait for a page to finish being written to the cache - */ -void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *page) -{ - wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0); - - trace_fscache_page(cookie, page, fscache_page_write_wait); - - wait_event(*wq, !__fscache_check_page_write(cookie, page)); -} -EXPORT_SYMBOL(__fscache_wait_on_page_write); - -/* - * wait for a page to finish being written to the cache. Put a timeout here - * since we might be called recursively via parent fs. - */ -static -bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page) -{ - wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0); - - return wait_event_timeout(*wq, !__fscache_check_page_write(cookie, page), - HZ); -} - -/* - * decide whether a page can be released, possibly by cancelling a store to it - * - we're allowed to sleep if __GFP_DIRECT_RECLAIM is flagged - */ -bool __fscache_maybe_release_page(struct fscache_cookie *cookie, - struct page *page, - gfp_t gfp) -{ - struct page *xpage; - void *val; - - _enter("%p,%p,%x", cookie, page, gfp); - - trace_fscache_page(cookie, page, fscache_page_maybe_release); - -try_again: - rcu_read_lock(); - val = radix_tree_lookup(&cookie->stores, page->index); - if (!val) { - rcu_read_unlock(); - fscache_stat(&fscache_n_store_vmscan_not_storing); - __fscache_uncache_page(cookie, page); - return true; - } - - /* see if the page is actually undergoing storage - if so we can't get - * rid of it till the cache has finished with it */ - if (radix_tree_tag_get(&cookie->stores, page->index, - FSCACHE_COOKIE_STORING_TAG)) { - rcu_read_unlock(); - goto page_busy; - } - - /* the page is pending storage, so we attempt to cancel the store and - * discard the store request so that the page can be reclaimed */ - spin_lock(&cookie->stores_lock); - rcu_read_unlock(); - - if (radix_tree_tag_get(&cookie->stores, page->index, - FSCACHE_COOKIE_STORING_TAG)) { - /* the page started to undergo storage whilst we were looking, - * so now we can only wait or return */ - spin_unlock(&cookie->stores_lock); - goto page_busy; - } - - xpage = radix_tree_delete(&cookie->stores, page->index); - trace_fscache_page(cookie, page, fscache_page_radix_delete); - spin_unlock(&cookie->stores_lock); - - if (xpage) { - fscache_stat(&fscache_n_store_vmscan_cancelled); - fscache_stat(&fscache_n_store_radix_deletes); - ASSERTCMP(xpage, ==, page); - } else { - fscache_stat(&fscache_n_store_vmscan_gone); - } - - wake_up_bit(&cookie->flags, 0); - trace_fscache_wake_cookie(cookie); - if (xpage) - put_page(xpage); - __fscache_uncache_page(cookie, page); - return true; - -page_busy: - /* We will wait here if we're allowed to, but that could deadlock the - * allocator as the work threads writing to the cache may all end up - * sleeping on memory allocation, so we may need to impose a timeout - * too. 
*/ - if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) { - fscache_stat(&fscache_n_store_vmscan_busy); - return false; - } - - fscache_stat(&fscache_n_store_vmscan_wait); - if (!release_page_wait_timeout(cookie, page)) - _debug("fscache writeout timeout page: %p{%lx}", - page, page->index); - - gfp &= ~__GFP_DIRECT_RECLAIM; - goto try_again; -} -EXPORT_SYMBOL(__fscache_maybe_release_page); - -/* - * note that a page has finished being written to the cache - */ -static void fscache_end_page_write(struct fscache_object *object, - struct page *page) -{ - struct fscache_cookie *cookie; - struct page *xpage = NULL, *val; - - spin_lock(&object->lock); - cookie = object->cookie; - if (cookie) { - /* delete the page from the tree if it is now no longer - * pending */ - spin_lock(&cookie->stores_lock); - radix_tree_tag_clear(&cookie->stores, page->index, - FSCACHE_COOKIE_STORING_TAG); - trace_fscache_page(cookie, page, fscache_page_radix_clear_store); - if (!radix_tree_tag_get(&cookie->stores, page->index, - FSCACHE_COOKIE_PENDING_TAG)) { - fscache_stat(&fscache_n_store_radix_deletes); - xpage = radix_tree_delete(&cookie->stores, page->index); - trace_fscache_page(cookie, page, fscache_page_radix_delete); - trace_fscache_page(cookie, page, fscache_page_write_end); - - val = radix_tree_lookup(&cookie->stores, page->index); - trace_fscache_check_page(cookie, page, val, 1); - } else { - trace_fscache_page(cookie, page, fscache_page_write_end_pend); - } - spin_unlock(&cookie->stores_lock); - wake_up_bit(&cookie->flags, 0); - trace_fscache_wake_cookie(cookie); - } else { - trace_fscache_page(cookie, page, fscache_page_write_end_noc); - } - spin_unlock(&object->lock); - if (xpage) - put_page(xpage); -} - -/* - * actually apply the changed attributes to a cache object - */ -static void fscache_attr_changed_op(struct fscache_operation *op) -{ - struct fscache_object *object = op->object; - int ret; - - _enter("{OBJ%x OP%x}", object->debug_id, op->debug_id); - - fscache_stat(&fscache_n_attr_changed_calls); - - if (fscache_object_is_active(object)) { - fscache_stat(&fscache_n_cop_attr_changed); - ret = object->cache->ops->attr_changed(object); - fscache_stat_d(&fscache_n_cop_attr_changed); - if (ret < 0) - fscache_abort_object(object); - fscache_op_complete(op, ret < 0); - } else { - fscache_op_complete(op, true); - } - - _leave(""); -} - -/* - * notification that the attributes on an object have changed - */ -int __fscache_attr_changed(struct fscache_cookie *cookie) -{ - struct fscache_operation *op; - struct fscache_object *object; - bool wake_cookie = false; - - _enter("%p", cookie); - - ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); - - fscache_stat(&fscache_n_attr_changed); - - op = kzalloc(sizeof(*op), GFP_KERNEL); - if (!op) { - fscache_stat(&fscache_n_attr_changed_nomem); - _leave(" = -ENOMEM"); - return -ENOMEM; - } - - fscache_operation_init(cookie, op, fscache_attr_changed_op, NULL, NULL); - trace_fscache_page_op(cookie, NULL, op, fscache_page_op_attr_changed); - op->flags = FSCACHE_OP_ASYNC | - (1 << FSCACHE_OP_EXCLUSIVE) | - (1 << FSCACHE_OP_UNUSE_COOKIE); - - spin_lock(&cookie->lock); - - if (!fscache_cookie_enabled(cookie) || - hlist_empty(&cookie->backing_objects)) - goto nobufs; - object = hlist_entry(cookie->backing_objects.first, - struct fscache_object, cookie_link); - - __fscache_use_cookie(cookie); - if (fscache_submit_exclusive_op(object, op) < 0) - goto nobufs_dec; - spin_unlock(&cookie->lock); - fscache_stat(&fscache_n_attr_changed_ok); - 
fscache_put_operation(op); - _leave(" = 0"); - return 0; - -nobufs_dec: - wake_cookie = __fscache_unuse_cookie(cookie); -nobufs: - spin_unlock(&cookie->lock); - fscache_put_operation(op); - if (wake_cookie) - __fscache_wake_unused_cookie(cookie); - fscache_stat(&fscache_n_attr_changed_nobufs); - _leave(" = %d", -ENOBUFS); - return -ENOBUFS; -} -EXPORT_SYMBOL(__fscache_attr_changed); - -/* - * Handle cancellation of a pending retrieval op - */ -static void fscache_do_cancel_retrieval(struct fscache_operation *_op) -{ - struct fscache_retrieval *op = - container_of(_op, struct fscache_retrieval, op); - - atomic_set(&op->n_pages, 0); -} - -/* - * release a retrieval op reference - */ -static void fscache_release_retrieval_op(struct fscache_operation *_op) -{ - struct fscache_retrieval *op = - container_of(_op, struct fscache_retrieval, op); - - _enter("{OP%x}", op->op.debug_id); - - ASSERTIFCMP(op->op.state != FSCACHE_OP_ST_INITIALISED, - atomic_read(&op->n_pages), ==, 0); - - if (op->context) - fscache_put_context(op->cookie, op->context); - - _leave(""); -} - -/* - * allocate a retrieval op - */ -struct fscache_retrieval *fscache_alloc_retrieval( - struct fscache_cookie *cookie, - struct address_space *mapping, - fscache_rw_complete_t end_io_func, - void *context) -{ - struct fscache_retrieval *op; - - /* allocate a retrieval operation and attempt to submit it */ - op = kzalloc(sizeof(*op), GFP_NOIO); - if (!op) { - fscache_stat(&fscache_n_retrievals_nomem); - return NULL; - } - - fscache_operation_init(cookie, &op->op, NULL, - fscache_do_cancel_retrieval, - fscache_release_retrieval_op); - op->op.flags = FSCACHE_OP_MYTHREAD | - (1UL << FSCACHE_OP_WAITING) | - (1UL << FSCACHE_OP_UNUSE_COOKIE); - op->cookie = cookie; - op->mapping = mapping; - op->end_io_func = end_io_func; - op->context = context; - INIT_LIST_HEAD(&op->to_do); - - /* Pin the netfs read context in case we need to do the actual netfs - * read because we've encountered a cache read failure. 
- */ - if (context) - fscache_get_context(op->cookie, context); - return op; -} - -/* - * wait for a deferred lookup to complete - */ -int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) -{ - _enter(""); - - if (!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) { - _leave(" = 0 [imm]"); - return 0; - } - - fscache_stat(&fscache_n_retrievals_wait); - - if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, - TASK_INTERRUPTIBLE) != 0) { - fscache_stat(&fscache_n_retrievals_intr); - _leave(" = -ERESTARTSYS"); - return -ERESTARTSYS; - } - - ASSERT(!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)); - - smp_rmb(); - _leave(" = 0 [dly]"); - return 0; -} - -/* - * wait for an object to become active (or dead) - */ -int fscache_wait_for_operation_activation(struct fscache_object *object, - struct fscache_operation *op, - atomic_t *stat_op_waits, - atomic_t *stat_object_dead) -{ - int ret; - - if (!test_bit(FSCACHE_OP_WAITING, &op->flags)) - goto check_if_dead; - - _debug(">>> WT"); - if (stat_op_waits) - fscache_stat(stat_op_waits); - if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING, - TASK_INTERRUPTIBLE) != 0) { - trace_fscache_op(object->cookie, op, fscache_op_signal); - ret = fscache_cancel_op(op, false); - if (ret == 0) - return -ERESTARTSYS; - - /* it's been removed from the pending queue by another party, - * so we should get to run shortly */ - wait_on_bit(&op->flags, FSCACHE_OP_WAITING, - TASK_UNINTERRUPTIBLE); - } - _debug("<<< GO"); - -check_if_dead: - if (op->state == FSCACHE_OP_ST_CANCELLED) { - if (stat_object_dead) - fscache_stat(stat_object_dead); - _leave(" = -ENOBUFS [cancelled]"); - return -ENOBUFS; - } - if (unlikely(fscache_object_is_dying(object) || - fscache_cache_is_broken(object))) { - enum fscache_operation_state state = op->state; - trace_fscache_op(object->cookie, op, fscache_op_signal); - fscache_cancel_op(op, true); - if (stat_object_dead) - fscache_stat(stat_object_dead); - _leave(" = -ENOBUFS [obj dead %d]", state); - return -ENOBUFS; - } - return 0; -} - -/* - * read a page from the cache or allocate a block in which to store it - * - we return: - * -ENOMEM - out of memory, nothing done - * -ERESTARTSYS - interrupted - * -ENOBUFS - no backing object available in which to cache the block - * -ENODATA - no data available in the backing object for this block - * 0 - dispatched a read - it'll call end_io_func() when finished - */ -int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, - struct page *page, - fscache_rw_complete_t end_io_func, - void *context, - gfp_t gfp) -{ - struct fscache_retrieval *op; - struct fscache_object *object; - bool wake_cookie = false; - int ret; - - _enter("%p,%p,,,", cookie, page); - - fscache_stat(&fscache_n_retrievals); - - if (hlist_empty(&cookie->backing_objects)) - goto nobufs; - - if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { - _leave(" = -ENOBUFS [invalidating]"); - return -ENOBUFS; - } - - ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); - ASSERTCMP(page, !=, NULL); - - if (fscache_wait_for_deferred_lookup(cookie) < 0) - return -ERESTARTSYS; - - op = fscache_alloc_retrieval(cookie, page->mapping, - end_io_func, context); - if (!op) { - _leave(" = -ENOMEM"); - return -ENOMEM; - } - atomic_set(&op->n_pages, 1); - trace_fscache_page_op(cookie, page, &op->op, fscache_page_op_retr_one); - - spin_lock(&cookie->lock); - - if (!fscache_cookie_enabled(cookie) || - hlist_empty(&cookie->backing_objects)) - goto nobufs_unlock; - object = hlist_entry(cookie->backing_objects.first, - 
struct fscache_object, cookie_link); - - ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)); - - __fscache_use_cookie(cookie); - atomic_inc(&object->n_reads); - __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); - - if (fscache_submit_op(object, &op->op) < 0) - goto nobufs_unlock_dec; - spin_unlock(&cookie->lock); - - fscache_stat(&fscache_n_retrieval_ops); - - /* we wait for the operation to become active, and then process it - * *here*, in this thread, and not in the thread pool */ - ret = fscache_wait_for_operation_activation( - object, &op->op, - __fscache_stat(&fscache_n_retrieval_op_waits), - __fscache_stat(&fscache_n_retrievals_object_dead)); - if (ret < 0) - goto error; - - /* ask the cache to honour the operation */ - if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) { - fscache_stat(&fscache_n_cop_allocate_page); - ret = object->cache->ops->allocate_page(op, page, gfp); - fscache_stat_d(&fscache_n_cop_allocate_page); - if (ret == 0) - ret = -ENODATA; - } else { - fscache_stat(&fscache_n_cop_read_or_alloc_page); - ret = object->cache->ops->read_or_alloc_page(op, page, gfp); - fscache_stat_d(&fscache_n_cop_read_or_alloc_page); - } - -error: - if (ret == -ENOMEM) - fscache_stat(&fscache_n_retrievals_nomem); - else if (ret == -ERESTARTSYS) - fscache_stat(&fscache_n_retrievals_intr); - else if (ret == -ENODATA) - fscache_stat(&fscache_n_retrievals_nodata); - else if (ret < 0) - fscache_stat(&fscache_n_retrievals_nobufs); - else - fscache_stat(&fscache_n_retrievals_ok); - - fscache_put_retrieval(op); - _leave(" = %d", ret); - return ret; - -nobufs_unlock_dec: - atomic_dec(&object->n_reads); - wake_cookie = __fscache_unuse_cookie(cookie); -nobufs_unlock: - spin_unlock(&cookie->lock); - if (wake_cookie) - __fscache_wake_unused_cookie(cookie); - fscache_put_retrieval(op); -nobufs: - fscache_stat(&fscache_n_retrievals_nobufs); - _leave(" = -ENOBUFS"); - return -ENOBUFS; -} -EXPORT_SYMBOL(__fscache_read_or_alloc_page); - -/* - * read a list of page from the cache or allocate a block in which to store - * them - * - we return: - * -ENOMEM - out of memory, some pages may be being read - * -ERESTARTSYS - interrupted, some pages may be being read - * -ENOBUFS - no backing object or space available in which to cache any - * pages not being read - * -ENODATA - no data available in the backing object for some or all of - * the pages - * 0 - dispatched a read on all pages - * - * end_io_func() will be called for each page read from the cache as it is - * finishes being read - * - * any pages for which a read is dispatched will be removed from pages and - * nr_pages - */ -int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages, - fscache_rw_complete_t end_io_func, - void *context, - gfp_t gfp) -{ - struct fscache_retrieval *op; - struct fscache_object *object; - bool wake_cookie = false; - int ret; - - _enter("%p,,%d,,,", cookie, *nr_pages); - - fscache_stat(&fscache_n_retrievals); - - if (hlist_empty(&cookie->backing_objects)) - goto nobufs; - - if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { - _leave(" = -ENOBUFS [invalidating]"); - return -ENOBUFS; - } - - ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); - ASSERTCMP(*nr_pages, >, 0); - ASSERT(!list_empty(pages)); - - if (fscache_wait_for_deferred_lookup(cookie) < 0) - return -ERESTARTSYS; - - op = fscache_alloc_retrieval(cookie, mapping, end_io_func, context); - if (!op) - return -ENOMEM; - 
atomic_set(&op->n_pages, *nr_pages); - trace_fscache_page_op(cookie, NULL, &op->op, fscache_page_op_retr_multi); - - spin_lock(&cookie->lock); - - if (!fscache_cookie_enabled(cookie) || - hlist_empty(&cookie->backing_objects)) - goto nobufs_unlock; - object = hlist_entry(cookie->backing_objects.first, - struct fscache_object, cookie_link); - - __fscache_use_cookie(cookie); - atomic_inc(&object->n_reads); - __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); - - if (fscache_submit_op(object, &op->op) < 0) - goto nobufs_unlock_dec; - spin_unlock(&cookie->lock); - - fscache_stat(&fscache_n_retrieval_ops); - - /* we wait for the operation to become active, and then process it - * *here*, in this thread, and not in the thread pool */ - ret = fscache_wait_for_operation_activation( - object, &op->op, - __fscache_stat(&fscache_n_retrieval_op_waits), - __fscache_stat(&fscache_n_retrievals_object_dead)); - if (ret < 0) - goto error; - - /* ask the cache to honour the operation */ - if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) { - fscache_stat(&fscache_n_cop_allocate_pages); - ret = object->cache->ops->allocate_pages( - op, pages, nr_pages, gfp); - fscache_stat_d(&fscache_n_cop_allocate_pages); - } else { - fscache_stat(&fscache_n_cop_read_or_alloc_pages); - ret = object->cache->ops->read_or_alloc_pages( - op, pages, nr_pages, gfp); - fscache_stat_d(&fscache_n_cop_read_or_alloc_pages); - } - -error: - if (ret == -ENOMEM) - fscache_stat(&fscache_n_retrievals_nomem); - else if (ret == -ERESTARTSYS) - fscache_stat(&fscache_n_retrievals_intr); - else if (ret == -ENODATA) - fscache_stat(&fscache_n_retrievals_nodata); - else if (ret < 0) - fscache_stat(&fscache_n_retrievals_nobufs); - else - fscache_stat(&fscache_n_retrievals_ok); - - fscache_put_retrieval(op); - _leave(" = %d", ret); - return ret; - -nobufs_unlock_dec: - atomic_dec(&object->n_reads); - wake_cookie = __fscache_unuse_cookie(cookie); -nobufs_unlock: - spin_unlock(&cookie->lock); - fscache_put_retrieval(op); - if (wake_cookie) - __fscache_wake_unused_cookie(cookie); -nobufs: - fscache_stat(&fscache_n_retrievals_nobufs); - _leave(" = -ENOBUFS"); - return -ENOBUFS; -} -EXPORT_SYMBOL(__fscache_read_or_alloc_pages); - -/* - * allocate a block in the cache on which to store a page - * - we return: - * -ENOMEM - out of memory, nothing done - * -ERESTARTSYS - interrupted - * -ENOBUFS - no backing object available in which to cache the block - * 0 - block allocated - */ -int __fscache_alloc_page(struct fscache_cookie *cookie, - struct page *page, - gfp_t gfp) -{ - struct fscache_retrieval *op; - struct fscache_object *object; - bool wake_cookie = false; - int ret; - - _enter("%p,%p,,,", cookie, page); - - fscache_stat(&fscache_n_allocs); - - if (hlist_empty(&cookie->backing_objects)) - goto nobufs; - - ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); - ASSERTCMP(page, !=, NULL); - - if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { - _leave(" = -ENOBUFS [invalidating]"); - return -ENOBUFS; - } - - if (fscache_wait_for_deferred_lookup(cookie) < 0) - return -ERESTARTSYS; - - op = fscache_alloc_retrieval(cookie, page->mapping, NULL, NULL); - if (!op) - return -ENOMEM; - atomic_set(&op->n_pages, 1); - trace_fscache_page_op(cookie, page, &op->op, fscache_page_op_alloc_one); - - spin_lock(&cookie->lock); - - if (!fscache_cookie_enabled(cookie) || - hlist_empty(&cookie->backing_objects)) - goto nobufs_unlock; - object = hlist_entry(cookie->backing_objects.first, - struct fscache_object, cookie_link); - - 
__fscache_use_cookie(cookie); - if (fscache_submit_op(object, &op->op) < 0) - goto nobufs_unlock_dec; - spin_unlock(&cookie->lock); - - fscache_stat(&fscache_n_alloc_ops); - - ret = fscache_wait_for_operation_activation( - object, &op->op, - __fscache_stat(&fscache_n_alloc_op_waits), - __fscache_stat(&fscache_n_allocs_object_dead)); - if (ret < 0) - goto error; - - /* ask the cache to honour the operation */ - fscache_stat(&fscache_n_cop_allocate_page); - ret = object->cache->ops->allocate_page(op, page, gfp); - fscache_stat_d(&fscache_n_cop_allocate_page); - -error: - if (ret == -ERESTARTSYS) - fscache_stat(&fscache_n_allocs_intr); - else if (ret < 0) - fscache_stat(&fscache_n_allocs_nobufs); - else - fscache_stat(&fscache_n_allocs_ok); - - fscache_put_retrieval(op); - _leave(" = %d", ret); - return ret; - -nobufs_unlock_dec: - wake_cookie = __fscache_unuse_cookie(cookie); -nobufs_unlock: - spin_unlock(&cookie->lock); - fscache_put_retrieval(op); - if (wake_cookie) - __fscache_wake_unused_cookie(cookie); -nobufs: - fscache_stat(&fscache_n_allocs_nobufs); - _leave(" = -ENOBUFS"); - return -ENOBUFS; -} -EXPORT_SYMBOL(__fscache_alloc_page); - -/* - * Unmark pages allocate in the readahead code path (via: - * fscache_readpages_or_alloc) after delegating to the base filesystem - */ -void __fscache_readpages_cancel(struct fscache_cookie *cookie, - struct list_head *pages) -{ - struct page *page; - - list_for_each_entry(page, pages, lru) { - if (PageFsCache(page)) - __fscache_uncache_page(cookie, page); - } -} -EXPORT_SYMBOL(__fscache_readpages_cancel); - -/* - * release a write op reference - */ -static void fscache_release_write_op(struct fscache_operation *_op) -{ - _enter("{OP%x}", _op->debug_id); -} - -/* - * perform the background storage of a page into the cache - */ -static void fscache_write_op(struct fscache_operation *_op) -{ - struct fscache_storage *op = - container_of(_op, struct fscache_storage, op); - struct fscache_object *object = op->op.object; - struct fscache_cookie *cookie; - struct page *page; - unsigned n; - void *results[1]; - int ret; - - _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage)); - -again: - spin_lock(&object->lock); - cookie = object->cookie; - - if (!fscache_object_is_active(object)) { - /* If we get here, then the on-disk cache object likely no - * longer exists, so we should just cancel this write - * operation. - */ - spin_unlock(&object->lock); - fscache_op_complete(&op->op, true); - _leave(" [inactive]"); - return; - } - - if (!cookie) { - /* If we get here, then the cookie belonging to the object was - * detached, probably by the cookie being withdrawn due to - * memory pressure, which means that the pages we might write - * to the cache from no longer exist - therefore, we can just - * cancel this write operation. 
- */ - spin_unlock(&object->lock); - fscache_op_complete(&op->op, true); - _leave(" [cancel] op{f=%lx s=%u} obj{s=%s f=%lx}", - _op->flags, _op->state, object->state->short_name, - object->flags); - return; - } - - spin_lock(&cookie->stores_lock); - - fscache_stat(&fscache_n_store_calls); - - /* find a page to store */ - results[0] = NULL; - page = NULL; - n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 1, - FSCACHE_COOKIE_PENDING_TAG); - trace_fscache_gang_lookup(cookie, &op->op, results, n, op->store_limit); - if (n != 1) - goto superseded; - page = results[0]; - _debug("gang %d [%lx]", n, page->index); - - radix_tree_tag_set(&cookie->stores, page->index, - FSCACHE_COOKIE_STORING_TAG); - radix_tree_tag_clear(&cookie->stores, page->index, - FSCACHE_COOKIE_PENDING_TAG); - trace_fscache_page(cookie, page, fscache_page_radix_pend2store); - - spin_unlock(&cookie->stores_lock); - spin_unlock(&object->lock); - - if (page->index >= op->store_limit) - goto discard_page; - - fscache_stat(&fscache_n_store_pages); - fscache_stat(&fscache_n_cop_write_page); - ret = object->cache->ops->write_page(op, page); - fscache_stat_d(&fscache_n_cop_write_page); - trace_fscache_wrote_page(cookie, page, &op->op, ret); - fscache_end_page_write(object, page); - if (ret < 0) { - fscache_abort_object(object); - fscache_op_complete(&op->op, true); - } else { - fscache_enqueue_operation(&op->op); - } - - _leave(""); - return; - -discard_page: - fscache_stat(&fscache_n_store_pages_over_limit); - trace_fscache_wrote_page(cookie, page, &op->op, -ENOBUFS); - fscache_end_page_write(object, page); - goto again; - -superseded: - /* this writer is going away and there aren't any more things to - * write */ - _debug("cease"); - spin_unlock(&cookie->stores_lock); - clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); - spin_unlock(&object->lock); - fscache_op_complete(&op->op, false); - _leave(""); -} - -/* - * Clear the pages pending writing for invalidation - */ -void fscache_invalidate_writes(struct fscache_cookie *cookie) -{ - struct page *page; - void *results[16]; - int n, i; - - _enter(""); - - for (;;) { - spin_lock(&cookie->stores_lock); - n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, - ARRAY_SIZE(results), - FSCACHE_COOKIE_PENDING_TAG); - if (n == 0) { - spin_unlock(&cookie->stores_lock); - break; - } - - for (i = n - 1; i >= 0; i--) { - page = results[i]; - radix_tree_delete(&cookie->stores, page->index); - trace_fscache_page(cookie, page, fscache_page_radix_delete); - trace_fscache_page(cookie, page, fscache_page_inval); - } - - spin_unlock(&cookie->stores_lock); - - for (i = n - 1; i >= 0; i--) - put_page(results[i]); - } - - wake_up_bit(&cookie->flags, 0); - trace_fscache_wake_cookie(cookie); - - _leave(""); -} - -/* - * request a page be stored in the cache - * - returns: - * -ENOMEM - out of memory, nothing done - * -ENOBUFS - no backing object available in which to cache the page - * 0 - dispatched a write - it'll call end_io_func() when finished - * - * if the cookie still has a backing object at this point, that object can be - * in one of a few states with respect to storage processing: - * - * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is - * set) - * - * (a) no writes yet - * - * (b) writes deferred till post-creation (mark page for writing and - * return immediately) - * - * (2) negative lookup, object created, initial fill being made from netfs - * - * (a) fill point not yet reached this page (mark page for writing and - * return) - * - * (b) fill 
point passed this page (queue op to store this page) - * - * (3) object extant (queue op to store this page) - * - * any other state is invalid - */ -int __fscache_write_page(struct fscache_cookie *cookie, - struct page *page, - loff_t object_size, - gfp_t gfp) -{ - struct fscache_storage *op; - struct fscache_object *object; - bool wake_cookie = false; - int ret; - - _enter("%p,%x,", cookie, (u32) page->flags); - - ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); - ASSERT(PageFsCache(page)); - - fscache_stat(&fscache_n_stores); - - if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { - _leave(" = -ENOBUFS [invalidating]"); - return -ENOBUFS; - } - - op = kzalloc(sizeof(*op), GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY); - if (!op) - goto nomem; - - fscache_operation_init(cookie, &op->op, fscache_write_op, NULL, - fscache_release_write_op); - op->op.flags = FSCACHE_OP_ASYNC | - (1 << FSCACHE_OP_WAITING) | - (1 << FSCACHE_OP_UNUSE_COOKIE); - - ret = radix_tree_maybe_preload(gfp & ~__GFP_HIGHMEM); - if (ret < 0) - goto nomem_free; - - trace_fscache_page_op(cookie, page, &op->op, fscache_page_op_write_one); - - ret = -ENOBUFS; - spin_lock(&cookie->lock); - - if (!fscache_cookie_enabled(cookie) || - hlist_empty(&cookie->backing_objects)) - goto nobufs; - object = hlist_entry(cookie->backing_objects.first, - struct fscache_object, cookie_link); - if (test_bit(FSCACHE_IOERROR, &object->cache->flags)) - goto nobufs; - - trace_fscache_page(cookie, page, fscache_page_write); - - /* add the page to the pending-storage radix tree on the backing - * object */ - spin_lock(&object->lock); - - if (object->store_limit_l != object_size) - fscache_set_store_limit(object, object_size); - - spin_lock(&cookie->stores_lock); - - _debug("store limit %llx", (unsigned long long) object->store_limit); - - ret = radix_tree_insert(&cookie->stores, page->index, page); - if (ret < 0) { - if (ret == -EEXIST) - goto already_queued; - _debug("insert failed %d", ret); - goto nobufs_unlock_obj; - } - - trace_fscache_page(cookie, page, fscache_page_radix_insert); - radix_tree_tag_set(&cookie->stores, page->index, - FSCACHE_COOKIE_PENDING_TAG); - trace_fscache_page(cookie, page, fscache_page_radix_set_pend); - get_page(page); - - /* we only want one writer at a time, but we do need to queue new - * writers after exclusive ops */ - if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags)) - goto already_pending; - - spin_unlock(&cookie->stores_lock); - spin_unlock(&object->lock); - - op->op.debug_id = atomic_inc_return(&fscache_op_debug_id); - op->store_limit = object->store_limit; - - __fscache_use_cookie(cookie); - if (fscache_submit_op(object, &op->op) < 0) - goto submit_failed; - - spin_unlock(&cookie->lock); - radix_tree_preload_end(); - fscache_stat(&fscache_n_store_ops); - fscache_stat(&fscache_n_stores_ok); - - /* the work queue now carries its own ref on the object */ - fscache_put_operation(&op->op); - _leave(" = 0"); - return 0; - -already_queued: - fscache_stat(&fscache_n_stores_again); -already_pending: - spin_unlock(&cookie->stores_lock); - spin_unlock(&object->lock); - spin_unlock(&cookie->lock); - radix_tree_preload_end(); - fscache_put_operation(&op->op); - fscache_stat(&fscache_n_stores_ok); - _leave(" = 0"); - return 0; - -submit_failed: - spin_lock(&cookie->stores_lock); - radix_tree_delete(&cookie->stores, page->index); - trace_fscache_page(cookie, page, fscache_page_radix_delete); - spin_unlock(&cookie->stores_lock); - wake_cookie = __fscache_unuse_cookie(cookie); - 
put_page(page); - ret = -ENOBUFS; - goto nobufs; - -nobufs_unlock_obj: - spin_unlock(&cookie->stores_lock); - spin_unlock(&object->lock); -nobufs: - spin_unlock(&cookie->lock); - radix_tree_preload_end(); - fscache_put_operation(&op->op); - if (wake_cookie) - __fscache_wake_unused_cookie(cookie); - fscache_stat(&fscache_n_stores_nobufs); - _leave(" = -ENOBUFS"); - return -ENOBUFS; - -nomem_free: - fscache_put_operation(&op->op); -nomem: - fscache_stat(&fscache_n_stores_oom); - _leave(" = -ENOMEM"); - return -ENOMEM; -} -EXPORT_SYMBOL(__fscache_write_page); - -/* - * remove a page from the cache - */ -void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page) -{ - struct fscache_object *object; - - _enter(",%p", page); - - ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); - ASSERTCMP(page, !=, NULL); - - fscache_stat(&fscache_n_uncaches); - - /* cache withdrawal may beat us to it */ - if (!PageFsCache(page)) - goto done; - - trace_fscache_page(cookie, page, fscache_page_uncache); - - /* get the object */ - spin_lock(&cookie->lock); - - if (hlist_empty(&cookie->backing_objects)) { - ClearPageFsCache(page); - goto done_unlock; - } - - object = hlist_entry(cookie->backing_objects.first, - struct fscache_object, cookie_link); - - /* there might now be stuff on disk we could read */ - clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); - - /* only invoke the cache backend if we managed to mark the page - * uncached here; this deals with synchronisation vs withdrawal */ - if (TestClearPageFsCache(page) && - object->cache->ops->uncache_page) { - /* the cache backend releases the cookie lock */ - fscache_stat(&fscache_n_cop_uncache_page); - object->cache->ops->uncache_page(object, page); - fscache_stat_d(&fscache_n_cop_uncache_page); - goto done; - } - -done_unlock: - spin_unlock(&cookie->lock); -done: - _leave(""); -} -EXPORT_SYMBOL(__fscache_uncache_page); - -/** - * fscache_mark_page_cached - Mark a page as being cached - * @op: The retrieval op pages are being marked for - * @page: The page to be marked - * - * Mark a netfs page as being cached. After this is called, the netfs - * must call fscache_uncache_page() to remove the mark. - */ -void fscache_mark_page_cached(struct fscache_retrieval *op, struct page *page) -{ - struct fscache_cookie *cookie = op->op.object->cookie; - -#ifdef CONFIG_FSCACHE_STATS - atomic_inc(&fscache_n_marks); -#endif - - trace_fscache_page(cookie, page, fscache_page_cached); - - _debug("- mark %p{%lx}", page, page->index); - if (TestSetPageFsCache(page)) { - static bool once_only; - if (!once_only) { - once_only = true; - pr_warn("Cookie type %s marked page %lx multiple times\n", - cookie->def->name, page->index); - } - } - - if (cookie->def->mark_page_cached) - cookie->def->mark_page_cached(cookie->netfs_data, - op->mapping, page); -} -EXPORT_SYMBOL(fscache_mark_page_cached); - -/** - * fscache_mark_pages_cached - Mark pages as being cached - * @op: The retrieval op pages are being marked for - * @pagevec: The pages to be marked - * - * Mark a bunch of netfs pages as being cached. After this is called, - * the netfs must call fscache_uncache_page() to remove the mark. 
- */ -void fscache_mark_pages_cached(struct fscache_retrieval *op, - struct pagevec *pagevec) -{ - unsigned long loop; - - for (loop = 0; loop < pagevec->nr; loop++) - fscache_mark_page_cached(op, pagevec->pages[loop]); - - pagevec_reinit(pagevec); -} -EXPORT_SYMBOL(fscache_mark_pages_cached); - -/* - * Uncache all the pages in an inode that are marked PG_fscache, assuming them - * to be associated with the given cookie. - */ -void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, - struct inode *inode) -{ - struct address_space *mapping = inode->i_mapping; - struct pagevec pvec; - pgoff_t next; - int i; - - _enter("%p,%p", cookie, inode); - - if (!mapping || mapping->nrpages == 0) { - _leave(" [no pages]"); - return; - } - - pagevec_init(&pvec); - next = 0; - do { - if (!pagevec_lookup(&pvec, mapping, &next)) - break; - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - if (PageFsCache(page)) { - __fscache_wait_on_page_write(cookie, page); - __fscache_uncache_page(cookie, page); - } - } - pagevec_release(&pvec); - cond_resched(); - } while (next); - - _leave(""); -} -EXPORT_SYMBOL(__fscache_uncache_all_inode_pages); diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c index 061df8f61ffc..dc3b0e9c8cce 100644 --- a/fs/fscache/proc.c +++ b/fs/fscache/proc.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* FS-Cache statistics viewing interface * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ -#define FSCACHE_DEBUG_LEVEL OPERATION +#define FSCACHE_DEBUG_LEVEL CACHE #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -16,42 +16,32 @@ */ int __init fscache_proc_init(void) { - _enter(""); - if (!proc_mkdir("fs/fscache", NULL)) goto error_dir; + if (!proc_create_seq("fs/fscache/caches", S_IFREG | 0444, NULL, + &fscache_caches_seq_ops)) + goto error; + + if (!proc_create_seq("fs/fscache/volumes", S_IFREG | 0444, NULL, + &fscache_volumes_seq_ops)) + goto error; + if (!proc_create_seq("fs/fscache/cookies", S_IFREG | 0444, NULL, &fscache_cookies_seq_ops)) - goto error_cookies; + goto error; #ifdef CONFIG_FSCACHE_STATS if (!proc_create_single("fs/fscache/stats", S_IFREG | 0444, NULL, - fscache_stats_show)) - goto error_stats; + fscache_stats_show)) + goto error; #endif -#ifdef CONFIG_FSCACHE_OBJECT_LIST - if (!proc_create("fs/fscache/objects", S_IFREG | 0444, NULL, - &fscache_objlist_proc_ops)) - goto error_objects; -#endif - - _leave(" = 0"); return 0; -#ifdef CONFIG_FSCACHE_OBJECT_LIST -error_objects: -#endif -#ifdef CONFIG_FSCACHE_STATS - remove_proc_entry("fs/fscache/stats", NULL); -error_stats: -#endif - remove_proc_entry("fs/fscache/cookies", NULL); -error_cookies: +error: remove_proc_entry("fs/fscache", NULL); error_dir: - _leave(" = -ENOMEM"); return -ENOMEM; } @@ -60,12 +50,5 @@ error_dir: */ void fscache_proc_cleanup(void) { -#ifdef CONFIG_FSCACHE_OBJECT_LIST - remove_proc_entry("fs/fscache/objects", NULL); -#endif -#ifdef CONFIG_FSCACHE_STATS - remove_proc_entry("fs/fscache/stats", NULL); -#endif - remove_proc_entry("fs/fscache/cookies", NULL); - remove_proc_entry("fs/fscache", NULL); + remove_proc_subtree("fs/fscache", NULL); } diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index a7c3ed89a3e0..fc94e5e79f1c 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -1,12 +1,11 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* FS-Cache statistics * - * Copyright (C) 2007 
Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ -#define FSCACHE_DEBUG_LEVEL THREAD -#include <linux/module.h> +#define FSCACHE_DEBUG_LEVEL CACHE #include <linux/proc_fs.h> #include <linux/seq_file.h> #include "internal.h" @@ -14,122 +13,41 @@ /* * operation counters */ -atomic_t fscache_n_op_pend; -atomic_t fscache_n_op_run; -atomic_t fscache_n_op_enqueue; -atomic_t fscache_n_op_deferred_release; -atomic_t fscache_n_op_initialised; -atomic_t fscache_n_op_release; -atomic_t fscache_n_op_gc; -atomic_t fscache_n_op_cancelled; -atomic_t fscache_n_op_rejected; - -atomic_t fscache_n_attr_changed; -atomic_t fscache_n_attr_changed_ok; -atomic_t fscache_n_attr_changed_nobufs; -atomic_t fscache_n_attr_changed_nomem; -atomic_t fscache_n_attr_changed_calls; - -atomic_t fscache_n_allocs; -atomic_t fscache_n_allocs_ok; -atomic_t fscache_n_allocs_wait; -atomic_t fscache_n_allocs_nobufs; -atomic_t fscache_n_allocs_intr; -atomic_t fscache_n_allocs_object_dead; -atomic_t fscache_n_alloc_ops; -atomic_t fscache_n_alloc_op_waits; - -atomic_t fscache_n_retrievals; -atomic_t fscache_n_retrievals_ok; -atomic_t fscache_n_retrievals_wait; -atomic_t fscache_n_retrievals_nodata; -atomic_t fscache_n_retrievals_nobufs; -atomic_t fscache_n_retrievals_intr; -atomic_t fscache_n_retrievals_nomem; -atomic_t fscache_n_retrievals_object_dead; -atomic_t fscache_n_retrieval_ops; -atomic_t fscache_n_retrieval_op_waits; - -atomic_t fscache_n_stores; -atomic_t fscache_n_stores_ok; -atomic_t fscache_n_stores_again; -atomic_t fscache_n_stores_nobufs; -atomic_t fscache_n_stores_oom; -atomic_t fscache_n_store_ops; -atomic_t fscache_n_store_calls; -atomic_t fscache_n_store_pages; -atomic_t fscache_n_store_radix_deletes; -atomic_t fscache_n_store_pages_over_limit; - -atomic_t fscache_n_store_vmscan_not_storing; -atomic_t fscache_n_store_vmscan_gone; -atomic_t fscache_n_store_vmscan_busy; -atomic_t fscache_n_store_vmscan_cancelled; -atomic_t fscache_n_store_vmscan_wait; - -atomic_t fscache_n_marks; -atomic_t fscache_n_uncaches; +atomic_t fscache_n_volumes; +atomic_t fscache_n_volumes_collision; +atomic_t fscache_n_volumes_nomem; +atomic_t fscache_n_cookies; +atomic_t fscache_n_cookies_lru; +atomic_t fscache_n_cookies_lru_expired; +atomic_t fscache_n_cookies_lru_removed; +atomic_t fscache_n_cookies_lru_dropped; atomic_t fscache_n_acquires; -atomic_t fscache_n_acquires_null; -atomic_t fscache_n_acquires_no_cache; atomic_t fscache_n_acquires_ok; -atomic_t fscache_n_acquires_nobufs; atomic_t fscache_n_acquires_oom; atomic_t fscache_n_invalidates; -atomic_t fscache_n_invalidates_run; atomic_t fscache_n_updates; -atomic_t fscache_n_updates_null; -atomic_t fscache_n_updates_run; +EXPORT_SYMBOL(fscache_n_updates); atomic_t fscache_n_relinquishes; -atomic_t fscache_n_relinquishes_null; -atomic_t fscache_n_relinquishes_waitcrt; atomic_t fscache_n_relinquishes_retire; - -atomic_t fscache_n_cookie_index; -atomic_t fscache_n_cookie_data; -atomic_t fscache_n_cookie_special; - -atomic_t fscache_n_object_alloc; -atomic_t fscache_n_object_no_alloc; -atomic_t fscache_n_object_lookups; -atomic_t fscache_n_object_lookups_negative; -atomic_t fscache_n_object_lookups_positive; -atomic_t fscache_n_object_lookups_timed_out; -atomic_t fscache_n_object_created; -atomic_t fscache_n_object_avail; -atomic_t fscache_n_object_dead; - -atomic_t fscache_n_checkaux_none; -atomic_t fscache_n_checkaux_okay; -atomic_t fscache_n_checkaux_update; -atomic_t 
fscache_n_checkaux_obsolete; - -atomic_t fscache_n_cop_alloc_object; -atomic_t fscache_n_cop_lookup_object; -atomic_t fscache_n_cop_lookup_complete; -atomic_t fscache_n_cop_grab_object; -atomic_t fscache_n_cop_invalidate_object; -atomic_t fscache_n_cop_update_object; -atomic_t fscache_n_cop_drop_object; -atomic_t fscache_n_cop_put_object; -atomic_t fscache_n_cop_sync_cache; -atomic_t fscache_n_cop_attr_changed; -atomic_t fscache_n_cop_read_or_alloc_page; -atomic_t fscache_n_cop_read_or_alloc_pages; -atomic_t fscache_n_cop_allocate_page; -atomic_t fscache_n_cop_allocate_pages; -atomic_t fscache_n_cop_write_page; -atomic_t fscache_n_cop_uncache_page; -atomic_t fscache_n_cop_dissociate_pages; - -atomic_t fscache_n_cache_no_space_reject; -atomic_t fscache_n_cache_stale_objects; -atomic_t fscache_n_cache_retired_objects; -atomic_t fscache_n_cache_culled_objects; +atomic_t fscache_n_relinquishes_dropped; + +atomic_t fscache_n_resizes; +atomic_t fscache_n_resizes_null; + +atomic_t fscache_n_read; +EXPORT_SYMBOL(fscache_n_read); +atomic_t fscache_n_write; +EXPORT_SYMBOL(fscache_n_write); +atomic_t fscache_n_no_write_space; +EXPORT_SYMBOL(fscache_n_no_write_space); +atomic_t fscache_n_no_create_space; +EXPORT_SYMBOL(fscache_n_no_create_space); +atomic_t fscache_n_culled; +EXPORT_SYMBOL(fscache_n_culled); /* * display the general statistics @@ -137,147 +55,48 @@ atomic_t fscache_n_cache_culled_objects; int fscache_stats_show(struct seq_file *m, void *v) { seq_puts(m, "FS-Cache statistics\n"); - - seq_printf(m, "Cookies: idx=%u dat=%u spc=%u\n", - atomic_read(&fscache_n_cookie_index), - atomic_read(&fscache_n_cookie_data), - atomic_read(&fscache_n_cookie_special)); - - seq_printf(m, "Objects: alc=%u nal=%u avl=%u ded=%u\n", - atomic_read(&fscache_n_object_alloc), - atomic_read(&fscache_n_object_no_alloc), - atomic_read(&fscache_n_object_avail), - atomic_read(&fscache_n_object_dead)); - seq_printf(m, "ChkAux : non=%u ok=%u upd=%u obs=%u\n", - atomic_read(&fscache_n_checkaux_none), - atomic_read(&fscache_n_checkaux_okay), - atomic_read(&fscache_n_checkaux_update), - atomic_read(&fscache_n_checkaux_obsolete)); - - seq_printf(m, "Pages : mrk=%u unc=%u\n", - atomic_read(&fscache_n_marks), - atomic_read(&fscache_n_uncaches)); - - seq_printf(m, "Acquire: n=%u nul=%u noc=%u ok=%u nbf=%u" - " oom=%u\n", + seq_printf(m, "Cookies: n=%d v=%d vcol=%u voom=%u\n", + atomic_read(&fscache_n_cookies), + atomic_read(&fscache_n_volumes), + atomic_read(&fscache_n_volumes_collision), + atomic_read(&fscache_n_volumes_nomem) + ); + + seq_printf(m, "Acquire: n=%u ok=%u oom=%u\n", atomic_read(&fscache_n_acquires), - atomic_read(&fscache_n_acquires_null), - atomic_read(&fscache_n_acquires_no_cache), atomic_read(&fscache_n_acquires_ok), - atomic_read(&fscache_n_acquires_nobufs), atomic_read(&fscache_n_acquires_oom)); - seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u tmo=%u\n", - atomic_read(&fscache_n_object_lookups), - atomic_read(&fscache_n_object_lookups_negative), - atomic_read(&fscache_n_object_lookups_positive), - atomic_read(&fscache_n_object_created), - atomic_read(&fscache_n_object_lookups_timed_out)); + seq_printf(m, "LRU : n=%u exp=%u rmv=%u drp=%u at=%ld\n", + atomic_read(&fscache_n_cookies_lru), + atomic_read(&fscache_n_cookies_lru_expired), + atomic_read(&fscache_n_cookies_lru_removed), + atomic_read(&fscache_n_cookies_lru_dropped), + timer_pending(&fscache_cookie_lru_timer) ? 
+ fscache_cookie_lru_timer.expires - jiffies : 0); - seq_printf(m, "Invals : n=%u run=%u\n", - atomic_read(&fscache_n_invalidates), - atomic_read(&fscache_n_invalidates_run)); + seq_printf(m, "Invals : n=%u\n", + atomic_read(&fscache_n_invalidates)); - seq_printf(m, "Updates: n=%u nul=%u run=%u\n", + seq_printf(m, "Updates: n=%u rsz=%u rsn=%u\n", atomic_read(&fscache_n_updates), - atomic_read(&fscache_n_updates_null), - atomic_read(&fscache_n_updates_run)); + atomic_read(&fscache_n_resizes), + atomic_read(&fscache_n_resizes_null)); - seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u rtr=%u\n", + seq_printf(m, "Relinqs: n=%u rtr=%u drop=%u\n", atomic_read(&fscache_n_relinquishes), - atomic_read(&fscache_n_relinquishes_null), - atomic_read(&fscache_n_relinquishes_waitcrt), - atomic_read(&fscache_n_relinquishes_retire)); - - seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n", - atomic_read(&fscache_n_attr_changed), - atomic_read(&fscache_n_attr_changed_ok), - atomic_read(&fscache_n_attr_changed_nobufs), - atomic_read(&fscache_n_attr_changed_nomem), - atomic_read(&fscache_n_attr_changed_calls)); - - seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u int=%u\n", - atomic_read(&fscache_n_allocs), - atomic_read(&fscache_n_allocs_ok), - atomic_read(&fscache_n_allocs_wait), - atomic_read(&fscache_n_allocs_nobufs), - atomic_read(&fscache_n_allocs_intr)); - seq_printf(m, "Allocs : ops=%u owt=%u abt=%u\n", - atomic_read(&fscache_n_alloc_ops), - atomic_read(&fscache_n_alloc_op_waits), - atomic_read(&fscache_n_allocs_object_dead)); - - seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u" - " int=%u oom=%u\n", - atomic_read(&fscache_n_retrievals), - atomic_read(&fscache_n_retrievals_ok), - atomic_read(&fscache_n_retrievals_wait), - atomic_read(&fscache_n_retrievals_nodata), - atomic_read(&fscache_n_retrievals_nobufs), - atomic_read(&fscache_n_retrievals_intr), - atomic_read(&fscache_n_retrievals_nomem)); - seq_printf(m, "Retrvls: ops=%u owt=%u abt=%u\n", - atomic_read(&fscache_n_retrieval_ops), - atomic_read(&fscache_n_retrieval_op_waits), - atomic_read(&fscache_n_retrievals_object_dead)); - - seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n", - atomic_read(&fscache_n_stores), - atomic_read(&fscache_n_stores_ok), - atomic_read(&fscache_n_stores_again), - atomic_read(&fscache_n_stores_nobufs), - atomic_read(&fscache_n_stores_oom)); - seq_printf(m, "Stores : ops=%u run=%u pgs=%u rxd=%u olm=%u\n", - atomic_read(&fscache_n_store_ops), - atomic_read(&fscache_n_store_calls), - atomic_read(&fscache_n_store_pages), - atomic_read(&fscache_n_store_radix_deletes), - atomic_read(&fscache_n_store_pages_over_limit)); + atomic_read(&fscache_n_relinquishes_retire), + atomic_read(&fscache_n_relinquishes_dropped)); - seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u wt=%u\n", - atomic_read(&fscache_n_store_vmscan_not_storing), - atomic_read(&fscache_n_store_vmscan_gone), - atomic_read(&fscache_n_store_vmscan_busy), - atomic_read(&fscache_n_store_vmscan_cancelled), - atomic_read(&fscache_n_store_vmscan_wait)); + seq_printf(m, "NoSpace: nwr=%u ncr=%u cull=%u\n", + atomic_read(&fscache_n_no_write_space), + atomic_read(&fscache_n_no_create_space), + atomic_read(&fscache_n_culled)); - seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u rej=%u\n", - atomic_read(&fscache_n_op_pend), - atomic_read(&fscache_n_op_run), - atomic_read(&fscache_n_op_enqueue), - atomic_read(&fscache_n_op_cancelled), - atomic_read(&fscache_n_op_rejected)); - seq_printf(m, "Ops : ini=%u dfr=%u rel=%u gc=%u\n", - 
atomic_read(&fscache_n_op_initialised), - atomic_read(&fscache_n_op_deferred_release), - atomic_read(&fscache_n_op_release), - atomic_read(&fscache_n_op_gc)); + seq_printf(m, "IO : rd=%u wr=%u\n", + atomic_read(&fscache_n_read), + atomic_read(&fscache_n_write)); - seq_printf(m, "CacheOp: alo=%d luo=%d luc=%d gro=%d\n", - atomic_read(&fscache_n_cop_alloc_object), - atomic_read(&fscache_n_cop_lookup_object), - atomic_read(&fscache_n_cop_lookup_complete), - atomic_read(&fscache_n_cop_grab_object)); - seq_printf(m, "CacheOp: inv=%d upo=%d dro=%d pto=%d atc=%d syn=%d\n", - atomic_read(&fscache_n_cop_invalidate_object), - atomic_read(&fscache_n_cop_update_object), - atomic_read(&fscache_n_cop_drop_object), - atomic_read(&fscache_n_cop_put_object), - atomic_read(&fscache_n_cop_attr_changed), - atomic_read(&fscache_n_cop_sync_cache)); - seq_printf(m, "CacheOp: rap=%d ras=%d alp=%d als=%d wrp=%d ucp=%d dsp=%d\n", - atomic_read(&fscache_n_cop_read_or_alloc_page), - atomic_read(&fscache_n_cop_read_or_alloc_pages), - atomic_read(&fscache_n_cop_allocate_page), - atomic_read(&fscache_n_cop_allocate_pages), - atomic_read(&fscache_n_cop_write_page), - atomic_read(&fscache_n_cop_uncache_page), - atomic_read(&fscache_n_cop_dissociate_pages)); - seq_printf(m, "CacheEv: nsp=%d stl=%d rtr=%d cul=%d\n", - atomic_read(&fscache_n_cache_no_space_reject), - atomic_read(&fscache_n_cache_stale_objects), - atomic_read(&fscache_n_cache_retired_objects), - atomic_read(&fscache_n_cache_culled_objects)); netfs_stats_show(m); return 0; } diff --git a/fs/fscache/volume.c b/fs/fscache/volume.c new file mode 100644 index 000000000000..f2aa7dbad766 --- /dev/null +++ b/fs/fscache/volume.c @@ -0,0 +1,517 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Volume-level cache cookie handling. + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define FSCACHE_DEBUG_LEVEL COOKIE +#include <linux/export.h> +#include <linux/slab.h> +#include "internal.h" + +#define fscache_volume_hash_shift 10 +static struct hlist_bl_head fscache_volume_hash[1 << fscache_volume_hash_shift]; +static atomic_t fscache_volume_debug_id; +static LIST_HEAD(fscache_volumes); + +static void fscache_create_volume_work(struct work_struct *work); + +struct fscache_volume *fscache_get_volume(struct fscache_volume *volume, + enum fscache_volume_trace where) +{ + int ref; + + __refcount_inc(&volume->ref, &ref); + trace_fscache_volume(volume->debug_id, ref + 1, where); + return volume; +} + +static void fscache_see_volume(struct fscache_volume *volume, + enum fscache_volume_trace where) +{ + int ref = refcount_read(&volume->ref); + + trace_fscache_volume(volume->debug_id, ref, where); +} + +/* + * Pin the cache behind a volume so that we can access it. + */ +static void __fscache_begin_volume_access(struct fscache_volume *volume, + struct fscache_cookie *cookie, + enum fscache_access_trace why) +{ + int n_accesses; + + n_accesses = atomic_inc_return(&volume->n_accesses); + smp_mb__after_atomic(); + trace_fscache_access_volume(volume->debug_id, cookie ? 
cookie->debug_id : 0, + refcount_read(&volume->ref), + n_accesses, why); +} + +/** + * fscache_begin_volume_access - Pin a cache so a volume can be accessed + * @volume: The volume cookie + * @cookie: A datafile cookie for a tracing reference (or NULL) + * @why: An indication of the circumstances of the access for tracing + * + * Attempt to pin the cache to prevent it from going away whilst we're + * accessing a volume and returns true if successful. This works as follows: + * + * (1) If the cache tests as not live (state is not FSCACHE_CACHE_IS_ACTIVE), + * then we return false to indicate access was not permitted. + * + * (2) If the cache tests as live, then we increment the volume's n_accesses + * count and then recheck the cache liveness, ending the access if it + * ceased to be live. + * + * (3) When we end the access, we decrement the volume's n_accesses and wake + * up the any waiters if it reaches 0. + * + * (4) Whilst the cache is caching, the volume's n_accesses is kept + * artificially incremented to prevent wakeups from happening. + * + * (5) When the cache is taken offline, the state is changed to prevent new + * accesses, the volume's n_accesses is decremented and we wait for it to + * become 0. + * + * The datafile @cookie and the @why indicator are merely provided for tracing + * purposes. + */ +bool fscache_begin_volume_access(struct fscache_volume *volume, + struct fscache_cookie *cookie, + enum fscache_access_trace why) +{ + if (!fscache_cache_is_live(volume->cache)) + return false; + __fscache_begin_volume_access(volume, cookie, why); + if (!fscache_cache_is_live(volume->cache)) { + fscache_end_volume_access(volume, cookie, fscache_access_unlive); + return false; + } + return true; +} + +/** + * fscache_end_volume_access - Unpin a cache at the end of an access. + * @volume: The volume cookie + * @cookie: A datafile cookie for a tracing reference (or NULL) + * @why: An indication of the circumstances of the access for tracing + * + * Unpin a cache volume after we've accessed it. The datafile @cookie and the + * @why indicator are merely provided for tracing purposes. + */ +void fscache_end_volume_access(struct fscache_volume *volume, + struct fscache_cookie *cookie, + enum fscache_access_trace why) +{ + int n_accesses; + + smp_mb__before_atomic(); + n_accesses = atomic_dec_return(&volume->n_accesses); + trace_fscache_access_volume(volume->debug_id, cookie ? 
cookie->debug_id : 0, + refcount_read(&volume->ref), + n_accesses, why); + if (n_accesses == 0) + wake_up_var(&volume->n_accesses); +} +EXPORT_SYMBOL(fscache_end_volume_access); + +static bool fscache_volume_same(const struct fscache_volume *a, + const struct fscache_volume *b) +{ + size_t klen; + + if (a->key_hash != b->key_hash || + a->cache != b->cache || + a->key[0] != b->key[0]) + return false; + + klen = round_up(a->key[0] + 1, sizeof(__le32)); + return memcmp(a->key, b->key, klen) == 0; +} + +static bool fscache_is_acquire_pending(struct fscache_volume *volume) +{ + return test_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, &volume->flags); +} + +static void fscache_wait_on_volume_collision(struct fscache_volume *candidate, + unsigned int collidee_debug_id) +{ + wait_var_event_timeout(&candidate->flags, + !fscache_is_acquire_pending(candidate), 20 * HZ); + if (!fscache_is_acquire_pending(candidate)) { + pr_notice("Potential volume collision new=%08x old=%08x", + candidate->debug_id, collidee_debug_id); + fscache_stat(&fscache_n_volumes_collision); + wait_var_event(&candidate->flags, !fscache_is_acquire_pending(candidate)); + } +} + +/* + * Attempt to insert the new volume into the hash. If there's a collision, we + * wait for the old volume to complete if it's being relinquished and an error + * otherwise. + */ +static bool fscache_hash_volume(struct fscache_volume *candidate) +{ + struct fscache_volume *cursor; + struct hlist_bl_head *h; + struct hlist_bl_node *p; + unsigned int bucket, collidee_debug_id = 0; + + bucket = candidate->key_hash & (ARRAY_SIZE(fscache_volume_hash) - 1); + h = &fscache_volume_hash[bucket]; + + hlist_bl_lock(h); + hlist_bl_for_each_entry(cursor, p, h, hash_link) { + if (fscache_volume_same(candidate, cursor)) { + if (!test_bit(FSCACHE_VOLUME_RELINQUISHED, &cursor->flags)) + goto collision; + fscache_see_volume(cursor, fscache_volume_get_hash_collision); + set_bit(FSCACHE_VOLUME_COLLIDED_WITH, &cursor->flags); + set_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, &candidate->flags); + collidee_debug_id = cursor->debug_id; + break; + } + } + + hlist_bl_add_head(&candidate->hash_link, h); + hlist_bl_unlock(h); + + if (test_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, &candidate->flags)) + fscache_wait_on_volume_collision(candidate, collidee_debug_id); + return true; + +collision: + fscache_see_volume(cursor, fscache_volume_collision); + hlist_bl_unlock(h); + return false; +} + +/* + * Allocate and initialise a volume representation cookie. + */ +static struct fscache_volume *fscache_alloc_volume(const char *volume_key, + const char *cache_name, + const void *coherency_data, + size_t coherency_len) +{ + struct fscache_volume *volume; + struct fscache_cache *cache; + size_t klen, hlen; + char *key; + + if (!coherency_data) + coherency_len = 0; + + cache = fscache_lookup_cache(cache_name, false); + if (IS_ERR(cache)) + return NULL; + + volume = kzalloc(struct_size(volume, coherency, coherency_len), + GFP_KERNEL); + if (!volume) + goto err_cache; + + volume->cache = cache; + volume->coherency_len = coherency_len; + if (coherency_data) + memcpy(volume->coherency, coherency_data, coherency_len); + INIT_LIST_HEAD(&volume->proc_link); + INIT_WORK(&volume->work, fscache_create_volume_work); + refcount_set(&volume->ref, 1); + spin_lock_init(&volume->lock); + + /* Stick the length on the front of the key and pad it out to make + * hashing easier. 
+ */ + klen = strlen(volume_key); + hlen = round_up(1 + klen + 1, sizeof(__le32)); + key = kzalloc(hlen, GFP_KERNEL); + if (!key) + goto err_vol; + key[0] = klen; + memcpy(key + 1, volume_key, klen); + + volume->key = key; + volume->key_hash = fscache_hash(0, key, hlen); + + volume->debug_id = atomic_inc_return(&fscache_volume_debug_id); + down_write(&fscache_addremove_sem); + atomic_inc(&cache->n_volumes); + list_add_tail(&volume->proc_link, &fscache_volumes); + fscache_see_volume(volume, fscache_volume_new_acquire); + fscache_stat(&fscache_n_volumes); + up_write(&fscache_addremove_sem); + _leave(" = v=%x", volume->debug_id); + return volume; + +err_vol: + kfree(volume); +err_cache: + fscache_put_cache(cache, fscache_cache_put_alloc_volume); + fscache_stat(&fscache_n_volumes_nomem); + return NULL; +} + +/* + * Create a volume's representation on disk. Have a volume ref and a cache + * access we have to release. + */ +static void fscache_create_volume_work(struct work_struct *work) +{ + const struct fscache_cache_ops *ops; + struct fscache_volume *volume = + container_of(work, struct fscache_volume, work); + + fscache_see_volume(volume, fscache_volume_see_create_work); + + ops = volume->cache->ops; + if (ops->acquire_volume) + ops->acquire_volume(volume); + fscache_end_cache_access(volume->cache, + fscache_access_acquire_volume_end); + + clear_bit_unlock(FSCACHE_VOLUME_CREATING, &volume->flags); + wake_up_bit(&volume->flags, FSCACHE_VOLUME_CREATING); + fscache_put_volume(volume, fscache_volume_put_create_work); +} + +/* + * Dispatch a worker thread to create a volume's representation on disk. + */ +void fscache_create_volume(struct fscache_volume *volume, bool wait) +{ + if (test_and_set_bit(FSCACHE_VOLUME_CREATING, &volume->flags)) + goto maybe_wait; + if (volume->cache_priv) + goto no_wait; /* We raced */ + if (!fscache_begin_cache_access(volume->cache, + fscache_access_acquire_volume)) + goto no_wait; + + fscache_get_volume(volume, fscache_volume_get_create_work); + if (!schedule_work(&volume->work)) + fscache_put_volume(volume, fscache_volume_put_create_work); + +maybe_wait: + if (wait) { + fscache_see_volume(volume, fscache_volume_wait_create_work); + wait_on_bit(&volume->flags, FSCACHE_VOLUME_CREATING, + TASK_UNINTERRUPTIBLE); + } + return; +no_wait: + clear_bit_unlock(FSCACHE_VOLUME_CREATING, &volume->flags); + wake_up_bit(&volume->flags, FSCACHE_VOLUME_CREATING); +} + +/* + * Acquire a volume representation cookie and link it to a (proposed) cache. 
+ */ +struct fscache_volume *__fscache_acquire_volume(const char *volume_key, + const char *cache_name, + const void *coherency_data, + size_t coherency_len) +{ + struct fscache_volume *volume; + + volume = fscache_alloc_volume(volume_key, cache_name, + coherency_data, coherency_len); + if (!volume) + return ERR_PTR(-ENOMEM); + + if (!fscache_hash_volume(volume)) { + fscache_put_volume(volume, fscache_volume_put_hash_collision); + return ERR_PTR(-EBUSY); + } + + fscache_create_volume(volume, false); + return volume; +} +EXPORT_SYMBOL(__fscache_acquire_volume); + +static void fscache_wake_pending_volume(struct fscache_volume *volume, + struct hlist_bl_head *h) +{ + struct fscache_volume *cursor; + struct hlist_bl_node *p; + + hlist_bl_for_each_entry(cursor, p, h, hash_link) { + if (fscache_volume_same(cursor, volume)) { + fscache_see_volume(cursor, fscache_volume_see_hash_wake); + clear_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, &cursor->flags); + wake_up_bit(&cursor->flags, FSCACHE_VOLUME_ACQUIRE_PENDING); + return; + } + } +} + +/* + * Remove a volume cookie from the hash table. + */ +static void fscache_unhash_volume(struct fscache_volume *volume) +{ + struct hlist_bl_head *h; + unsigned int bucket; + + bucket = volume->key_hash & (ARRAY_SIZE(fscache_volume_hash) - 1); + h = &fscache_volume_hash[bucket]; + + hlist_bl_lock(h); + hlist_bl_del(&volume->hash_link); + if (test_bit(FSCACHE_VOLUME_COLLIDED_WITH, &volume->flags)) + fscache_wake_pending_volume(volume, h); + hlist_bl_unlock(h); +} + +/* + * Drop a cache's volume attachments. + */ +static void fscache_free_volume(struct fscache_volume *volume) +{ + struct fscache_cache *cache = volume->cache; + + if (volume->cache_priv) { + __fscache_begin_volume_access(volume, NULL, + fscache_access_relinquish_volume); + if (volume->cache_priv) + cache->ops->free_volume(volume); + fscache_end_volume_access(volume, NULL, + fscache_access_relinquish_volume_end); + } + + down_write(&fscache_addremove_sem); + list_del_init(&volume->proc_link); + atomic_dec(&volume->cache->n_volumes); + up_write(&fscache_addremove_sem); + + if (!hlist_bl_unhashed(&volume->hash_link)) + fscache_unhash_volume(volume); + + trace_fscache_volume(volume->debug_id, 0, fscache_volume_free); + kfree(volume->key); + kfree(volume); + fscache_stat_d(&fscache_n_volumes); + fscache_put_cache(cache, fscache_cache_put_volume); +} + +/* + * Drop a reference to a volume cookie. + */ +void fscache_put_volume(struct fscache_volume *volume, + enum fscache_volume_trace where) +{ + if (volume) { + unsigned int debug_id = volume->debug_id; + bool zero; + int ref; + + zero = __refcount_dec_and_test(&volume->ref, &ref); + trace_fscache_volume(debug_id, ref - 1, where); + if (zero) + fscache_free_volume(volume); + } +} + +/* + * Relinquish a volume representation cookie. + */ +void __fscache_relinquish_volume(struct fscache_volume *volume, + const void *coherency_data, + bool invalidate) +{ + if (WARN_ON(test_and_set_bit(FSCACHE_VOLUME_RELINQUISHED, &volume->flags))) + return; + + if (invalidate) { + set_bit(FSCACHE_VOLUME_INVALIDATE, &volume->flags); + } else if (coherency_data) { + memcpy(volume->coherency, coherency_data, volume->coherency_len); + } + + fscache_put_volume(volume, fscache_volume_put_relinquish); +} +EXPORT_SYMBOL(__fscache_relinquish_volume); + +/** + * fscache_withdraw_volume - Withdraw a volume from being cached + * @volume: Volume cookie + * + * Withdraw a cache volume from service, waiting for all accesses to complete + * before returning. 
+ */ +void fscache_withdraw_volume(struct fscache_volume *volume) +{ + int n_accesses; + + _debug("withdraw V=%x", volume->debug_id); + + /* Allow wakeups on dec-to-0 */ + n_accesses = atomic_dec_return(&volume->n_accesses); + trace_fscache_access_volume(volume->debug_id, 0, + refcount_read(&volume->ref), + n_accesses, fscache_access_cache_unpin); + + wait_var_event(&volume->n_accesses, + atomic_read(&volume->n_accesses) == 0); +} +EXPORT_SYMBOL(fscache_withdraw_volume); + +#ifdef CONFIG_PROC_FS +/* + * Generate a list of volumes in /proc/fs/fscache/volumes + */ +static int fscache_volumes_seq_show(struct seq_file *m, void *v) +{ + struct fscache_volume *volume; + + if (v == &fscache_volumes) { + seq_puts(m, + "VOLUME REF nCOOK ACC FL CACHE KEY\n" + "======== ===== ===== === == =============== ================\n"); + return 0; + } + + volume = list_entry(v, struct fscache_volume, proc_link); + seq_printf(m, + "%08x %5d %5d %3d %02lx %-15.15s %s\n", + volume->debug_id, + refcount_read(&volume->ref), + atomic_read(&volume->n_cookies), + atomic_read(&volume->n_accesses), + volume->flags, + volume->cache->name ?: "-", + volume->key + 1); + return 0; +} + +static void *fscache_volumes_seq_start(struct seq_file *m, loff_t *_pos) + __acquires(&fscache_addremove_sem) +{ + down_read(&fscache_addremove_sem); + return seq_list_start_head(&fscache_volumes, *_pos); +} + +static void *fscache_volumes_seq_next(struct seq_file *m, void *v, loff_t *_pos) +{ + return seq_list_next(v, &fscache_volumes, _pos); +} + +static void fscache_volumes_seq_stop(struct seq_file *m, void *v) + __releases(&fscache_addremove_sem) +{ + up_read(&fscache_addremove_sem); +} + +const struct seq_operations fscache_volumes_seq_ops = { + .start = fscache_volumes_seq_start, + .next = fscache_volumes_seq_next, + .stop = fscache_volumes_seq_stop, + .show = fscache_volumes_seq_show, +}; +#endif /* CONFIG_PROC_FS */ diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 40ce9a1c12e5..038ed0b9aaa5 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -45,7 +45,7 @@ config FUSE_DAX select INTERVAL_TREE depends on VIRTIO_FS depends on FS_DAX - depends on DAX_DRIVER + depends on DAX help This allows bypassing guest page cache and allows mapping host page cache directly in guest address space. diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 713818d74de6..182b24a14804 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -1279,11 +1279,14 @@ out_err: return ret; } -int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev) +int fuse_dax_conn_alloc(struct fuse_conn *fc, enum fuse_dax_mode dax_mode, + struct dax_device *dax_dev) { struct fuse_conn_dax *fcd; int err; + fc->dax_mode = dax_mode; + if (!dax_dev) return 0; @@ -1327,17 +1330,46 @@ static const struct address_space_operations fuse_dax_file_aops = { .invalidatepage = noop_invalidatepage, }; -void fuse_dax_inode_init(struct inode *inode) +static bool fuse_should_enable_dax(struct inode *inode, unsigned int flags) { struct fuse_conn *fc = get_fuse_conn(inode); + enum fuse_dax_mode dax_mode = fc->dax_mode; + + if (dax_mode == FUSE_DAX_NEVER) + return false; + /* + * fc->dax may be NULL in 'inode' mode when filesystem device doesn't + * support DAX, in which case it will silently fallback to 'never' mode. 
+ */ if (!fc->dax) + return false; + + if (dax_mode == FUSE_DAX_ALWAYS) + return true; + + /* dax_mode is FUSE_DAX_INODE* */ + return fc->inode_dax && (flags & FUSE_ATTR_DAX); +} + +void fuse_dax_inode_init(struct inode *inode, unsigned int flags) +{ + if (!fuse_should_enable_dax(inode, flags)) return; inode->i_flags |= S_DAX; inode->i_data.a_ops = &fuse_dax_file_aops; } +void fuse_dax_dontcache(struct inode *inode, unsigned int flags) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + if (fuse_is_inode_dax_mode(fc->dax_mode) && + ((bool) IS_DAX(inode) != (bool) (flags & FUSE_ATTR_DAX))) + d_mark_dontcache(inode); +} + bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment) { if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) { diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 0654bfedcbb0..656e921f3506 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -17,6 +17,9 @@ #include <linux/xattr.h> #include <linux/iversion.h> #include <linux/posix_acl.h> +#include <linux/security.h> +#include <linux/types.h> +#include <linux/kernel.h> static void fuse_advise_use_readdirplus(struct inode *dir) { @@ -456,6 +459,62 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, return ERR_PTR(err); } +static int get_security_context(struct dentry *entry, umode_t mode, + void **security_ctx, u32 *security_ctxlen) +{ + struct fuse_secctx *fctx; + struct fuse_secctx_header *header; + void *ctx = NULL, *ptr; + u32 ctxlen, total_len = sizeof(*header); + int err, nr_ctx = 0; + const char *name; + size_t namelen; + + err = security_dentry_init_security(entry, mode, &entry->d_name, + &name, &ctx, &ctxlen); + if (err) { + if (err != -EOPNOTSUPP) + goto out_err; + /* No LSM is supporting this security hook. Ignore error */ + ctxlen = 0; + ctx = NULL; + } + + if (ctxlen) { + nr_ctx = 1; + namelen = strlen(name) + 1; + err = -EIO; + if (WARN_ON(namelen > XATTR_NAME_MAX + 1 || ctxlen > S32_MAX)) + goto out_err; + total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namelen + ctxlen); + } + + err = -ENOMEM; + header = ptr = kzalloc(total_len, GFP_KERNEL); + if (!ptr) + goto out_err; + + header->nr_secctx = nr_ctx; + header->size = total_len; + ptr += sizeof(*header); + if (nr_ctx) { + fctx = ptr; + fctx->size = ctxlen; + ptr += sizeof(*fctx); + + strcpy(ptr, name); + ptr += namelen; + + memcpy(ptr, ctx, ctxlen); + } + *security_ctxlen = total_len; + *security_ctx = header; + err = 0; +out_err: + kfree(ctx); + return err; +} + /* * Atomic create+open operation * @@ -476,6 +535,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, struct fuse_entry_out outentry; struct fuse_inode *fi; struct fuse_file *ff; + void *security_ctx = NULL; + u32 security_ctxlen; /* Userspace expects S_IFREG in create mode */ BUG_ON((mode & S_IFMT) != S_IFREG); @@ -517,7 +578,20 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, args.out_args[0].value = &outentry; args.out_args[1].size = sizeof(outopen); args.out_args[1].value = &outopen; + + if (fm->fc->init_security) { + err = get_security_context(entry, mode, &security_ctx, + &security_ctxlen); + if (err) + goto out_put_forget_req; + + args.in_numargs = 3; + args.in_args[2].size = security_ctxlen; + args.in_args[2].value = security_ctx; + } + err = fuse_simple_request(fm, &args); + kfree(security_ctx); if (err) goto out_free_ff; @@ -620,6 +694,8 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, struct dentry *d; int err; struct fuse_forget_link *forget; + void *security_ctx = NULL; + u32 
security_ctxlen; if (fuse_is_bad(dir)) return -EIO; @@ -633,7 +709,22 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, args->out_numargs = 1; args->out_args[0].size = sizeof(outarg); args->out_args[0].value = &outarg; + + if (fm->fc->init_security && args->opcode != FUSE_LINK) { + err = get_security_context(entry, mode, &security_ctx, + &security_ctxlen); + if (err) + goto out_put_forget_req; + + BUG_ON(args->in_numargs != 2); + + args->in_numargs = 3; + args->in_args[2].size = security_ctxlen; + args->in_args[2].value = security_ctx; + } + err = fuse_simple_request(fm, args); + kfree(security_ctx); if (err) goto out_put_forget_req; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 9d6c5f6361f7..829094451774 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2910,7 +2910,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end) { - int err = filemap_write_and_wait_range(inode->i_mapping, start, -1); + int err = filemap_write_and_wait_range(inode->i_mapping, start, LLONG_MAX); if (!err) fuse_sync_writes(inode); @@ -3169,7 +3169,7 @@ static const struct address_space_operations fuse_file_aops = { .write_end = fuse_write_end, }; -void fuse_init_file_inode(struct inode *inode) +void fuse_init_file_inode(struct inode *inode, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); @@ -3183,5 +3183,5 @@ void fuse_init_file_inode(struct inode *inode) fi->writepages = RB_ROOT; if (IS_ENABLED(CONFIG_FUSE_DAX)) - fuse_dax_inode_init(inode); + fuse_dax_inode_init(inode, flags); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 198637b41e19..e8e59fbdefeb 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -480,6 +480,18 @@ struct fuse_dev { struct list_head entry; }; +enum fuse_dax_mode { + FUSE_DAX_INODE_DEFAULT, /* default */ + FUSE_DAX_ALWAYS, /* "-o dax=always" */ + FUSE_DAX_NEVER, /* "-o dax=never" */ + FUSE_DAX_INODE_USER, /* "-o dax=inode" */ +}; + +static inline bool fuse_is_inode_dax_mode(enum fuse_dax_mode mode) +{ + return mode == FUSE_DAX_INODE_DEFAULT || mode == FUSE_DAX_INODE_USER; +} + struct fuse_fs_context { int fd; struct file *file; @@ -497,7 +509,7 @@ struct fuse_fs_context { bool no_control:1; bool no_force_umount:1; bool legacy_opts_show:1; - bool dax:1; + enum fuse_dax_mode dax_mode; unsigned int max_read; unsigned int blksize; const char *subtype; @@ -765,6 +777,12 @@ struct fuse_conn { /* Propagate syncfs() to server */ unsigned int sync_fs:1; + /* Initialize security xattrs when creating a new inode */ + unsigned int init_security:1; + + /* Does the filesystem support per inode DAX? 
*/ + unsigned int inode_dax:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -802,6 +820,9 @@ struct fuse_conn { struct list_head devices; #ifdef CONFIG_FUSE_DAX + /* Dax mode */ + enum fuse_dax_mode dax_mode; + /* Dax specific conn data, non-NULL if DAX is enabled */ struct fuse_conn_dax *dax; #endif @@ -1007,7 +1028,7 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc, /** * Initialize file operations on a regular file */ -void fuse_init_file_inode(struct inode *inode); +void fuse_init_file_inode(struct inode *inode, unsigned int flags); /** * Initialize inode operations on regular files and special files @@ -1269,11 +1290,13 @@ ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to); ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from); int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma); int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 dmap_end); -int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev); +int fuse_dax_conn_alloc(struct fuse_conn *fc, enum fuse_dax_mode mode, + struct dax_device *dax_dev); void fuse_dax_conn_free(struct fuse_conn *fc); bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi); -void fuse_dax_inode_init(struct inode *inode); +void fuse_dax_inode_init(struct inode *inode, unsigned int flags); void fuse_dax_inode_cleanup(struct inode *inode); +void fuse_dax_dontcache(struct inode *inode, unsigned int flags); bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment); void fuse_dax_cancel_work(struct fuse_conn *fc); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 8b89e3ba7df3..ee846ce371d8 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -301,6 +301,9 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, if (inval) invalidate_inode_pages2(inode->i_mapping); } + + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_dontcache(inode, attr->flags); } static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) @@ -313,7 +316,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) inode->i_ctime.tv_nsec = attr->ctimensec; if (S_ISREG(inode->i_mode)) { fuse_init_common(inode); - fuse_init_file_inode(inode); + fuse_init_file_inode(inode, attr->flags); } else if (S_ISDIR(inode->i_mode)) fuse_init_dir(inode); else if (S_ISLNK(inode->i_mode)) @@ -767,8 +770,12 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",blksize=%lu", sb->s_blocksize); } #ifdef CONFIG_FUSE_DAX - if (fc->dax) - seq_puts(m, ",dax"); + if (fc->dax_mode == FUSE_DAX_ALWAYS) + seq_puts(m, ",dax=always"); + else if (fc->dax_mode == FUSE_DAX_NEVER) + seq_puts(m, ",dax=never"); + else if (fc->dax_mode == FUSE_DAX_INODE_USER) + seq_puts(m, ",dax=inode"); #endif return 0; @@ -1109,73 +1116,80 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, process_init_limits(fc, arg); if (arg->minor >= 6) { + u64 flags = arg->flags | (u64) arg->flags2 << 32; + ra_pages = arg->max_readahead / PAGE_SIZE; - if (arg->flags & FUSE_ASYNC_READ) + if (flags & FUSE_ASYNC_READ) fc->async_read = 1; - if (!(arg->flags & FUSE_POSIX_LOCKS)) + if (!(flags & FUSE_POSIX_LOCKS)) fc->no_lock = 1; if (arg->minor >= 17) { - if (!(arg->flags & FUSE_FLOCK_LOCKS)) + if (!(flags & FUSE_FLOCK_LOCKS)) fc->no_flock = 1; } else { - if (!(arg->flags & FUSE_POSIX_LOCKS)) + if (!(flags & FUSE_POSIX_LOCKS)) fc->no_flock = 1; } - if (arg->flags & FUSE_ATOMIC_O_TRUNC) + if (flags & 
FUSE_ATOMIC_O_TRUNC) fc->atomic_o_trunc = 1; if (arg->minor >= 9) { /* LOOKUP has dependency on proto version */ - if (arg->flags & FUSE_EXPORT_SUPPORT) + if (flags & FUSE_EXPORT_SUPPORT) fc->export_support = 1; } - if (arg->flags & FUSE_BIG_WRITES) + if (flags & FUSE_BIG_WRITES) fc->big_writes = 1; - if (arg->flags & FUSE_DONT_MASK) + if (flags & FUSE_DONT_MASK) fc->dont_mask = 1; - if (arg->flags & FUSE_AUTO_INVAL_DATA) + if (flags & FUSE_AUTO_INVAL_DATA) fc->auto_inval_data = 1; - else if (arg->flags & FUSE_EXPLICIT_INVAL_DATA) + else if (flags & FUSE_EXPLICIT_INVAL_DATA) fc->explicit_inval_data = 1; - if (arg->flags & FUSE_DO_READDIRPLUS) { + if (flags & FUSE_DO_READDIRPLUS) { fc->do_readdirplus = 1; - if (arg->flags & FUSE_READDIRPLUS_AUTO) + if (flags & FUSE_READDIRPLUS_AUTO) fc->readdirplus_auto = 1; } - if (arg->flags & FUSE_ASYNC_DIO) + if (flags & FUSE_ASYNC_DIO) fc->async_dio = 1; - if (arg->flags & FUSE_WRITEBACK_CACHE) + if (flags & FUSE_WRITEBACK_CACHE) fc->writeback_cache = 1; - if (arg->flags & FUSE_PARALLEL_DIROPS) + if (flags & FUSE_PARALLEL_DIROPS) fc->parallel_dirops = 1; - if (arg->flags & FUSE_HANDLE_KILLPRIV) + if (flags & FUSE_HANDLE_KILLPRIV) fc->handle_killpriv = 1; if (arg->time_gran && arg->time_gran <= 1000000000) fm->sb->s_time_gran = arg->time_gran; - if ((arg->flags & FUSE_POSIX_ACL)) { + if ((flags & FUSE_POSIX_ACL)) { fc->default_permissions = 1; fc->posix_acl = 1; fm->sb->s_xattr = fuse_acl_xattr_handlers; } - if (arg->flags & FUSE_CACHE_SYMLINKS) + if (flags & FUSE_CACHE_SYMLINKS) fc->cache_symlinks = 1; - if (arg->flags & FUSE_ABORT_ERROR) + if (flags & FUSE_ABORT_ERROR) fc->abort_err = 1; - if (arg->flags & FUSE_MAX_PAGES) { + if (flags & FUSE_MAX_PAGES) { fc->max_pages = min_t(unsigned int, fc->max_pages_limit, max_t(unsigned int, arg->max_pages, 1)); } - if (IS_ENABLED(CONFIG_FUSE_DAX) && - arg->flags & FUSE_MAP_ALIGNMENT && - !fuse_dax_check_alignment(fc, arg->map_alignment)) { - ok = false; + if (IS_ENABLED(CONFIG_FUSE_DAX)) { + if (flags & FUSE_MAP_ALIGNMENT && + !fuse_dax_check_alignment(fc, arg->map_alignment)) { + ok = false; + } + if (flags & FUSE_HAS_INODE_DAX) + fc->inode_dax = 1; } - if (arg->flags & FUSE_HANDLE_KILLPRIV_V2) { + if (flags & FUSE_HANDLE_KILLPRIV_V2) { fc->handle_killpriv_v2 = 1; fm->sb->s_flags |= SB_NOSEC; } - if (arg->flags & FUSE_SETXATTR_EXT) + if (flags & FUSE_SETXATTR_EXT) fc->setxattr_ext = 1; + if (flags & FUSE_SECURITY_CTX) + fc->init_security = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1203,13 +1217,14 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, void fuse_send_init(struct fuse_mount *fm) { struct fuse_init_args *ia; + u64 flags; ia = kzalloc(sizeof(*ia), GFP_KERNEL | __GFP_NOFAIL); ia->in.major = FUSE_KERNEL_VERSION; ia->in.minor = FUSE_KERNEL_MINOR_VERSION; ia->in.max_readahead = fm->sb->s_bdi->ra_pages * PAGE_SIZE; - ia->in.flags |= + flags = FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | @@ -1219,13 +1234,19 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | - FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT; + FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | + FUSE_SECURITY_CTX; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) - ia->in.flags |= 
FUSE_MAP_ALIGNMENT; + flags |= FUSE_MAP_ALIGNMENT; + if (fuse_is_inode_dax_mode(fm->fc->dax_mode)) + flags |= FUSE_HAS_INODE_DAX; #endif if (fm->fc->auto_submounts) - ia->in.flags |= FUSE_SUBMOUNTS; + flags |= FUSE_SUBMOUNTS; + + ia->in.flags = flags; + ia->in.flags2 = flags >> 32; ia->args.opcode = FUSE_INIT; ia->args.in_numargs = 1; @@ -1514,7 +1535,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) sb->s_subtype = ctx->subtype; ctx->subtype = NULL; if (IS_ENABLED(CONFIG_FUSE_DAX)) { - err = fuse_dax_conn_alloc(fc, ctx->dax_dev); + err = fuse_dax_conn_alloc(fc, ctx->dax_mode, ctx->dax_dev); if (err) goto err; } diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 4cfa4bc1f579..9d737904d07c 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -88,12 +88,21 @@ struct virtio_fs_req_work { static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, struct fuse_req *req, bool in_flight); +static const struct constant_table dax_param_enums[] = { + {"always", FUSE_DAX_ALWAYS }, + {"never", FUSE_DAX_NEVER }, + {"inode", FUSE_DAX_INODE_USER }, + {} +}; + enum { OPT_DAX, + OPT_DAX_ENUM, }; static const struct fs_parameter_spec virtio_fs_parameters[] = { fsparam_flag("dax", OPT_DAX), + fsparam_enum("dax", OPT_DAX_ENUM, dax_param_enums), {} }; @@ -110,7 +119,10 @@ static int virtio_fs_parse_param(struct fs_context *fsc, switch (opt) { case OPT_DAX: - ctx->dax = 1; + ctx->dax_mode = FUSE_DAX_ALWAYS; + break; + case OPT_DAX_ENUM: + ctx->dax_mode = result.uint_32; break; default: return -EINVAL; @@ -753,20 +765,6 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, return nr_pages > max_nr_pages ? max_nr_pages : nr_pages; } -static size_t virtio_fs_copy_from_iter(struct dax_device *dax_dev, - pgoff_t pgoff, void *addr, - size_t bytes, struct iov_iter *i) -{ - return copy_from_iter(addr, bytes, i); -} - -static size_t virtio_fs_copy_to_iter(struct dax_device *dax_dev, - pgoff_t pgoff, void *addr, - size_t bytes, struct iov_iter *i) -{ - return copy_to_iter(addr, bytes, i); -} - static int virtio_fs_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, size_t nr_pages) { @@ -783,8 +781,6 @@ static int virtio_fs_zero_page_range(struct dax_device *dax_dev, static const struct dax_operations virtio_fs_dax_ops = { .direct_access = virtio_fs_direct_access, - .copy_from_iter = virtio_fs_copy_from_iter, - .copy_to_iter = virtio_fs_copy_to_iter, .zero_page_range = virtio_fs_zero_page_range, }; @@ -850,7 +846,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n", __func__, fs->window_kaddr, cache_reg.addr, cache_reg.len); - fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops, 0); + fs->dax_dev = alloc_dax(fs, &virtio_fs_dax_ops); if (IS_ERR(fs->dax_dev)) return PTR_ERR(fs->dax_dev); @@ -895,7 +891,7 @@ static int virtio_fs_probe(struct virtio_device *vdev) return 0; out_vqs: - vdev->config->reset(vdev); + virtio_reset_device(vdev); virtio_fs_cleanup_vqs(vdev, fs); kfree(fs->vqs); @@ -927,7 +923,7 @@ static void virtio_fs_remove(struct virtio_device *vdev) list_del_init(&fs->list); virtio_fs_stop_all_queues(fs); virtio_fs_drain_all_queues_locked(fs); - vdev->config->reset(vdev); + virtio_reset_device(vdev); virtio_fs_cleanup_vqs(vdev, fs); vdev->priv = NULL; @@ -1326,8 +1322,8 @@ static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc) /* virtiofs allocates and installs its own fuse devices */ 
ctx->fudptr = NULL; - if (ctx->dax) { - if (!fs->dax_dev) { + if (ctx->dax_mode != FUSE_DAX_NEVER) { + if (ctx->dax_mode == FUSE_DAX_ALWAYS && !fs->dax_dev) { err = -EINVAL; pr_err("virtio-fs: dax can't be enabled as filesystem" " device does not support it.\n"); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 3e718cfc19a7..8c39a8571b1f 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -704,10 +704,11 @@ static int gfs2_release(struct inode *inode, struct file *file) kfree(file->private_data); file->private_data = NULL; - if (gfs2_rs_active(&ip->i_res)) - gfs2_rs_delete(ip, &inode->i_writecount); - if (file->f_mode & FMODE_WRITE) + if (file->f_mode & FMODE_WRITE) { + if (gfs2_rs_active(&ip->i_res)) + gfs2_rs_delete(ip, &inode->i_writecount); gfs2_qa_put(ip); + } return 0; } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 44a7a4288956..6b23399eaee0 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -301,9 +301,6 @@ void gfs2_glock_queue_put(struct gfs2_glock *gl) void gfs2_glock_put(struct gfs2_glock *gl) { - /* last put could call sleepable dlm api */ - might_sleep(); - if (lockref_put_or_lock(&gl->gl_lockref)) return; @@ -477,7 +474,7 @@ find_first_strong_holder(struct gfs2_glock *gl) /* * gfs2_instantiate - Call the glops instantiate function - * @gl: The glock + * @gh: The glock holder * * Returns: 0 if instantiate was successful, 2 if type specific operation is * underway, or error. @@ -1245,7 +1242,7 @@ out: } /** - * gfs2_holder_init - initialize a struct gfs2_holder in the default way + * __gfs2_holder_init - initialize a struct gfs2_holder in the default way * @gl: the glock * @state: the state we're requesting * @flags: the modifier flags diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 650ad77c4d0b..392800f082a6 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -228,7 +228,6 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) gfs2_rgrp_brelse(rgd); WARN_ON_ONCE(!(flags & DIO_METADATA)); truncate_inode_pages_range(mapping, start, end); - set_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags); } static void gfs2_rgrp_go_dump(struct seq_file *seq, struct gfs2_glock *gl, @@ -764,6 +763,7 @@ const struct gfs2_glock_operations gfs2_freeze_glops = { const struct gfs2_glock_operations gfs2_iopen_glops = { .go_type = LM_TYPE_IOPEN, .go_callback = iopen_go_callback, + .go_dump = inode_go_dump, .go_demote_ok = iopen_go_demote_ok, .go_flags = GLOF_LRU | GLOF_NONDISK, .go_subclass = 1, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 0f93e8beca4d..64c67090f503 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1244,11 +1244,9 @@ static enum dinode_demise evict_should_delete(struct inode *inode, if (ret) return SHOULD_NOT_DELETE_DINODE; - if (test_bit(GLF_INSTANTIATE_NEEDED, &ip->i_gl->gl_flags)) { - ret = gfs2_instantiate(gh); - if (ret) - return SHOULD_NOT_DELETE_DINODE; - } + ret = gfs2_instantiate(gh); + if (ret) + return SHOULD_NOT_DELETE_DINODE; /* * The inode may have been recreated in the meantime. 
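
Editor's note: the fuse_send_init()/process_init_reply() hunks in fs/fuse/inode.c above widen the FUSE_INIT feature negotiation to 64 bits by carrying the upper word in a new flags2 field (combined as `arg->flags | (u64)arg->flags2 << 32`, split back as `flags` / `flags >> 32`). The following is a minimal user-space sketch of that packing, not kernel code; the EXAMPLE_FEATURE_* bit positions are illustrative placeholders and are not the real UAPI bit assignments.

/*
 * Stand-alone sketch of splitting a 64-bit feature mask across the
 * 32-bit flags/flags2 words of FUSE_INIT and reassembling it, mirroring
 * the fuse_send_init()/process_init_reply() hunks above.
 */
#include <stdint.h>
#include <stdio.h>
#include <assert.h>

#define EXAMPLE_FEATURE_LOW   (1ULL << 5)   /* hypothetical bit in the low word  */
#define EXAMPLE_FEATURE_HIGH  (1ULL << 33)  /* hypothetical bit in the high word */

struct example_init_in {
	uint32_t flags;   /* low 32 feature bits  */
	uint32_t flags2;  /* high 32 feature bits */
};

int main(void)
{
	uint64_t want = EXAMPLE_FEATURE_LOW | EXAMPLE_FEATURE_HIGH;
	struct example_init_in in;

	/* Sender side: split the 64-bit mask across the two 32-bit words. */
	in.flags  = (uint32_t)want;
	in.flags2 = (uint32_t)(want >> 32);

	/* Receiver side: reassemble the mask before testing feature bits. */
	uint64_t got = in.flags | (uint64_t)in.flags2 << 32;

	assert(got == want);
	printf("flags=%#x flags2=%#x combined=%#llx\n",
	       in.flags, in.flags2, (unsigned long long)got);
	return 0;
}
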
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index c0a34d9ddee4..a6002b2d146d 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -767,8 +767,7 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp) wait_for_completion(&sdp->sd_kobj_unregister); } -static int gfs2_uevent(struct kset *kset, struct kobject *kobj, - struct kobj_uevent_env *env) +static int gfs2_uevent(struct kobject *kobj, struct kobj_uevent_env *env) { struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); struct super_block *s = sdp->sd_vfs; diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h index 456e87aec7fd..68b4240c6191 100644 --- a/fs/hfsplus/hfsplus_raw.h +++ b/fs/hfsplus/hfsplus_raw.h @@ -260,8 +260,10 @@ struct hfsplus_cat_folder { __be32 access_date; __be32 backup_date; struct hfsplus_perm permissions; - struct DInfo user_info; - struct DXInfo finder_info; + struct_group_attr(info, __packed, + struct DInfo user_info; + struct DXInfo finder_info; + ); __be32 text_encoding; __be32 subfolders; /* Subfolder count in HFSX. Reserved in HFS+. */ } __packed; @@ -294,8 +296,10 @@ struct hfsplus_cat_file { __be32 access_date; __be32 backup_date; struct hfsplus_perm permissions; - struct FInfo user_info; - struct FXInfo finder_info; + struct_group_attr(info, __packed, + struct FInfo user_info; + struct FXInfo finder_info; + ); __be32 text_encoding; u32 reserved2; diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index e2855ceefd39..49891b12c415 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -296,7 +296,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, sizeof(hfsplus_cat_entry)); if (be16_to_cpu(entry.type) == HFSPLUS_FOLDER) { if (size == folder_finderinfo_len) { - memcpy(&entry.folder.user_info, value, + memcpy(&entry.folder.info, value, folder_finderinfo_len); hfs_bnode_write(cat_fd.bnode, &entry, cat_fd.entryoffset, @@ -309,7 +309,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, } } else if (be16_to_cpu(entry.type) == HFSPLUS_FILE) { if (size == file_finderinfo_len) { - memcpy(&entry.file.user_info, value, + memcpy(&entry.file.info, value, file_finderinfo_len); hfs_bnode_write(cat_fd.bnode, &entry, cat_fd.entryoffset, diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index d5c9d886cd9f..ef481c3d9019 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -924,6 +924,9 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) sb->s_op = &hostfs_sbops; sb->s_d_op = &simple_dentry_operations; sb->s_maxbytes = MAX_LFS_FILESIZE; + err = super_setup_bdi(sb); + if (err) + goto out; /* NULL is printed as '(null)' by printf(): avoid that. */ if (req_root == NULL) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 49d2e686be74..a7c6c7498be0 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -409,10 +409,11 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) struct vm_area_struct *vma; /* - * end == 0 indicates that the entire range after - * start should be unmapped. + * end == 0 indicates that the entire range after start should be + * unmapped. Note, end is exclusive, whereas the interval tree takes + * an inclusive "last". */ - vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) { + vma_interval_tree_foreach(vma, root, start, end ? 
end - 1 : ULONG_MAX) { unsigned long v_offset; unsigned long v_end; diff --git a/fs/inode.c b/fs/inode.c index 6b80a51129d5..63324df6fa27 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -67,11 +67,6 @@ const struct address_space_operations empty_aops = { }; EXPORT_SYMBOL(empty_aops); -/* - * Statistics gathering.. - */ -struct inodes_stat_t inodes_stat; - static DEFINE_PER_CPU(unsigned long, nr_inodes); static DEFINE_PER_CPU(unsigned long, nr_unused); @@ -106,13 +101,43 @@ long get_nr_dirty_inodes(void) * Handle nr_inode sysctl */ #ifdef CONFIG_SYSCTL -int proc_nr_inodes(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +/* + * Statistics gathering.. + */ +static struct inodes_stat_t inodes_stat; + +static int proc_nr_inodes(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { inodes_stat.nr_inodes = get_nr_inodes(); inodes_stat.nr_unused = get_nr_inodes_unused(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } + +static struct ctl_table inodes_sysctls[] = { + { + .procname = "inode-nr", + .data = &inodes_stat, + .maxlen = 2*sizeof(long), + .mode = 0444, + .proc_handler = proc_nr_inodes, + }, + { + .procname = "inode-state", + .data = &inodes_stat, + .maxlen = 7*sizeof(long), + .mode = 0444, + .proc_handler = proc_nr_inodes, + }, + { } +}; + +static int __init init_fs_inode_sysctls(void) +{ + register_sysctl_init("fs", inodes_sysctls); + return 0; +} +early_initcall(init_fs_inode_sysctls); #endif static int no_open(struct inode *inode, struct file *file) @@ -526,6 +551,55 @@ void __remove_inode_hash(struct inode *inode) } EXPORT_SYMBOL(__remove_inode_hash); +void dump_mapping(const struct address_space *mapping) +{ + struct inode *host; + const struct address_space_operations *a_ops; + struct hlist_node *dentry_first; + struct dentry *dentry_ptr; + struct dentry dentry; + unsigned long ino; + + /* + * If mapping is an invalid pointer, we don't want to crash + * accessing it, so probe everything depending on it carefully. 
+ */ + if (get_kernel_nofault(host, &mapping->host) || + get_kernel_nofault(a_ops, &mapping->a_ops)) { + pr_warn("invalid mapping:%px\n", mapping); + return; + } + + if (!host) { + pr_warn("aops:%ps\n", a_ops); + return; + } + + if (get_kernel_nofault(dentry_first, &host->i_dentry.first) || + get_kernel_nofault(ino, &host->i_ino)) { + pr_warn("aops:%ps invalid inode:%px\n", a_ops, host); + return; + } + + if (!dentry_first) { + pr_warn("aops:%ps ino:%lx\n", a_ops, ino); + return; + } + + dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); + if (get_kernel_nofault(dentry, dentry_ptr)) { + pr_warn("aops:%ps ino:%lx invalid dentry:%px\n", + a_ops, ino, dentry_ptr); + return; + } + + /* + * if dentry is corrupted, the %pd handler may still crash, + * but it's unlikely that we reach here with a corrupt mapping + */ + pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n", a_ops, ino, &dentry); +} + void clear_inode(struct inode *inode) { /* diff --git a/fs/internal.h b/fs/internal.h index 7979ff8d168c..8590c973c2f4 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -37,7 +37,7 @@ static inline int emergency_thaw_bdev(struct super_block *sb) /* * buffer.c */ -int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, +int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block, const struct iomap *iomap); /* diff --git a/fs/io-wq.c b/fs/io-wq.c index 5c4f582d6549..bb7f161bb19c 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -48,7 +48,8 @@ struct io_worker { struct io_wqe *wqe; struct io_wq_work *cur_work; - spinlock_t lock; + struct io_wq_work *next_work; + raw_spinlock_t lock; struct completion ref_done; @@ -405,8 +406,7 @@ static void io_wqe_dec_running(struct io_worker *worker) * Worker will start processing some work. Move it to the busy list, if * it's currently on the freelist */ -static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker, - struct io_wq_work *work) +static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker) __must_hold(wqe->lock) { if (worker->flags & IO_WORKER_F_FREE) { @@ -529,9 +529,10 @@ static void io_assign_current_work(struct io_worker *worker, cond_resched(); } - spin_lock(&worker->lock); + raw_spin_lock(&worker->lock); worker->cur_work = work; - spin_unlock(&worker->lock); + worker->next_work = NULL; + raw_spin_unlock(&worker->lock); } static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); @@ -546,7 +547,7 @@ static void io_worker_handle_work(struct io_worker *worker) do { struct io_wq_work *work; -get_next: + /* * If we got some work, mark us as busy. If we didn't, but * the list isn't empty, it means we stalled on hashed work. @@ -555,9 +556,20 @@ get_next: * clear the stalled flag. */ work = io_get_next_work(acct, worker); - if (work) - __io_worker_busy(wqe, worker, work); - + if (work) { + __io_worker_busy(wqe, worker); + + /* + * Make sure cancelation can find this, even before + * it becomes the active work. That avoids a window + * where the work has been removed from our general + * work list, but isn't yet discoverable as the + * current work item for this worker. 
+ */ + raw_spin_lock(&worker->lock); + worker->next_work = work; + raw_spin_unlock(&worker->lock); + } raw_spin_unlock(&wqe->lock); if (!work) break; @@ -594,11 +606,6 @@ get_next: spin_unlock_irq(&wq->hash->wait.lock); if (wq_has_sleeper(&wq->hash->wait)) wake_up(&wq->hash->wait); - raw_spin_lock(&wqe->lock); - /* skip unnecessary unlock-lock wqe->lock */ - if (!work) - goto get_next; - raw_spin_unlock(&wqe->lock); } } while (work); @@ -670,7 +677,7 @@ loop: */ void io_wq_worker_running(struct task_struct *tsk) { - struct io_worker *worker = tsk->pf_io_worker; + struct io_worker *worker = tsk->worker_private; if (!worker) return; @@ -688,7 +695,7 @@ void io_wq_worker_running(struct task_struct *tsk) */ void io_wq_worker_sleeping(struct task_struct *tsk) { - struct io_worker *worker = tsk->pf_io_worker; + struct io_worker *worker = tsk->worker_private; if (!worker) return; @@ -707,7 +714,7 @@ void io_wq_worker_sleeping(struct task_struct *tsk) static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker, struct task_struct *tsk) { - tsk->pf_io_worker = worker; + tsk->worker_private = worker; worker->task = tsk; set_cpus_allowed_ptr(tsk, wqe->cpu_mask); tsk->flags |= PF_NO_SETAFFINITY; @@ -815,7 +822,7 @@ fail: refcount_set(&worker->ref, 1); worker->wqe = wqe; - spin_lock_init(&worker->lock); + raw_spin_lock_init(&worker->lock); init_completion(&worker->ref_done); if (index == IO_WQ_ACCT_BOUND) @@ -973,6 +980,19 @@ void io_wq_hash_work(struct io_wq_work *work, void *val) work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); } +static bool __io_wq_worker_cancel(struct io_worker *worker, + struct io_cb_cancel_data *match, + struct io_wq_work *work) +{ + if (work && match->fn(work, match->data)) { + work->flags |= IO_WQ_WORK_CANCEL; + set_notify_signal(worker->task); + return true; + } + + return false; +} + static bool io_wq_worker_cancel(struct io_worker *worker, void *data) { struct io_cb_cancel_data *match = data; @@ -981,13 +1001,11 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) * Hold the lock to avoid ->cur_work going out of scope, caller * may dereference the passed in work. */ - spin_lock(&worker->lock); - if (worker->cur_work && - match->fn(worker->cur_work, match->data)) { - set_notify_signal(worker->task); + raw_spin_lock(&worker->lock); + if (__io_wq_worker_cancel(worker, match, worker->cur_work) || + __io_wq_worker_cancel(worker, match, worker->next_work)) match->nr_running++; - } - spin_unlock(&worker->lock); + raw_spin_unlock(&worker->lock); return match->nr_running && !match->cancel_all; } @@ -1039,17 +1057,16 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe, { int i; retry: - raw_spin_lock(&wqe->lock); for (i = 0; i < IO_WQ_ACCT_NR; i++) { struct io_wqe_acct *acct = io_get_acct(wqe, i == 0); if (io_acct_cancel_pending_work(wqe, acct, match)) { + raw_spin_lock(&wqe->lock); if (match->cancel_all) goto retry; - return; + break; } } - raw_spin_unlock(&wqe->lock); } static void io_wqe_cancel_running_work(struct io_wqe *wqe, @@ -1074,25 +1091,27 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, * First check pending list, if we're lucky we can just remove it * from there. CANCEL_OK means that the work is returned as-new, * no completion will be posted for it. 
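Aside (illustrative, not part of the patch): io_wq_cancel_cb() is driven by a caller-supplied match callback; IO_WQ_CANCEL_OK means the work was still pending and was pulled back without posting a completion, while IO_WQ_CANCEL_RUNNING means the running work was only signalled and will still complete. A hedged sketch of a caller, loosely modelled on io_uring's own use of the API; the callback, the u64 key, and the member name used with container_of() are illustrative assumptions for this kernel version.

static bool match_user_data(struct io_wq_work *work, void *data)
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);

	return req->user_data == *(u64 *)data;
}

static enum io_wq_cancel cancel_by_user_data(struct io_wq *wq, u64 key)
{
	/* cancel_all == false: stop after the first matching work item */
	return io_wq_cancel_cb(wq, match_user_data, &key, false);
}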
- */ - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - - io_wqe_cancel_pending_work(wqe, &match); - if (match.nr_pending && !match.cancel_all) - return IO_WQ_CANCEL_OK; - } - - /* - * Now check if a free (going busy) or busy worker has the work + * + * Then check if a free (going busy) or busy worker has the work * currently running. If we find it there, we'll return CANCEL_RUNNING * as an indication that we attempt to signal cancellation. The * completion will run normally in this case. + * + * Do both of these while holding the wqe->lock, to ensure that + * we'll find a work item regardless of state. */ for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; + raw_spin_lock(&wqe->lock); + io_wqe_cancel_pending_work(wqe, &match); + if (match.nr_pending && !match.cancel_all) { + raw_spin_unlock(&wqe->lock); + return IO_WQ_CANCEL_OK; + } + io_wqe_cancel_running_work(wqe, &match); + raw_spin_unlock(&wqe->lock); if (match.nr_running && !match.cancel_all) return IO_WQ_CANCEL_RUNNING; } @@ -1263,7 +1282,9 @@ static void io_wq_destroy(struct io_wq *wq) .fn = io_wq_work_match_all, .cancel_all = true, }; + raw_spin_lock(&wqe->lock); io_wqe_cancel_pending_work(wqe, &match); + raw_spin_unlock(&wqe->lock); free_cpumask_var(wqe->cpu_mask); kfree(wqe); } diff --git a/fs/io-wq.h b/fs/io-wq.h index 41bf37674a49..dbecd27656c7 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -52,6 +52,28 @@ static inline void wq_list_add_after(struct io_wq_work_node *node, list->last = node; } +/** + * wq_list_merge - merge the second list to the first one. + * @list0: the first list + * @list1: the second list + * Return the first node after mergence. + */ +static inline struct io_wq_work_node *wq_list_merge(struct io_wq_work_list *list0, + struct io_wq_work_list *list1) +{ + struct io_wq_work_node *ret; + + if (!list0->first) { + ret = list1->first; + } else { + ret = list0->first; + list0->last->next = list1->first; + } + INIT_WQ_LIST(list0); + INIT_WQ_LIST(list1); + return ret; +} + static inline void wq_list_add_tail(struct io_wq_work_node *node, struct io_wq_work_list *list) { @@ -200,6 +222,6 @@ static inline void io_wq_worker_running(struct task_struct *tsk) static inline bool io_wq_current_is_worker(void) { return in_task() && (current->flags & PF_IO_WORKER) && - current->pf_io_worker; + current->worker_private; } #endif diff --git a/fs/io_uring.c b/fs/io_uring.c index fb2a0cb4aaf8..4715980e9015 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -57,7 +57,7 @@ #include <linux/mman.h> #include <linux/percpu.h> #include <linux/slab.h> -#include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/bvec.h> #include <linux/net.h> #include <net/sock.h> @@ -108,7 +108,8 @@ #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ IOSQE_IO_HARDLINK | IOSQE_ASYNC) -#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS|IOSQE_BUFFER_SELECT|IOSQE_IO_DRAIN) +#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \ + IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS) #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ @@ -320,6 +321,7 @@ struct io_submit_state { bool plug_started; bool need_plug; + bool flush_cqes; unsigned short submit_nr; struct blk_plug plug; }; @@ -337,6 +339,7 @@ struct io_ring_ctx { unsigned int restricted: 1; unsigned int off_timeout_used: 1; unsigned int drain_active: 1; + unsigned int drain_disabled: 1; } ____cacheline_aligned_in_smp; /* submission data */ @@ -471,6 +474,7 @@ struct io_uring_task { spinlock_t task_lock; 
struct io_wq_work_list task_list; + struct io_wq_work_list prior_task_list; struct callback_head task_work; bool task_running; }; @@ -483,8 +487,6 @@ struct io_poll_iocb { struct file *file; struct wait_queue_head *head; __poll_t events; - bool done; - bool canceled; struct wait_queue_entry wait; }; @@ -721,6 +723,7 @@ enum { REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, + REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT, /* first byte is taken by user flags, shift it to not overlap */ REQ_F_FAIL_BIT = 8, @@ -737,6 +740,7 @@ enum { REQ_F_REFCOUNT_BIT, REQ_F_ARM_LTIMEOUT_BIT, REQ_F_ASYNC_DATA_BIT, + REQ_F_SKIP_LINK_CQES_BIT, /* keep async read/write and isreg together and in order */ REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, @@ -758,6 +762,8 @@ enum { REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), /* IOSQE_BUFFER_SELECT */ REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), + /* IOSQE_CQE_SKIP_SUCCESS */ + REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT), /* fail rest of links */ REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), @@ -791,6 +797,8 @@ enum { REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), /* ->async_data allocated */ REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), + /* don't post CQEs while failing linked requests */ + REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), }; struct async_poll { @@ -882,6 +890,7 @@ struct io_kiocb { const struct cred *creds; /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ struct io_buffer *kbuf; + atomic_t poll_refs; }; struct io_tctx_node { @@ -1108,8 +1117,8 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, bool cancel_all); static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); -static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags); +static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags); + static void io_put_req(struct io_kiocb *req); static void io_put_req_deferred(struct io_kiocb *req); static void io_dismantle_req(struct io_kiocb *req); @@ -1183,12 +1192,6 @@ static inline bool req_ref_put_and_test(struct io_kiocb *req) return atomic_dec_and_test(&req->refs); } -static inline void req_ref_put(struct io_kiocb *req) -{ - WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); - WARN_ON_ONCE(req_ref_put_and_test(req)); -} - static inline void req_ref_get(struct io_kiocb *req) { WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); @@ -1264,6 +1267,26 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, } } +static unsigned int __io_put_kbuf(struct io_kiocb *req) +{ + struct io_buffer *kbuf = req->kbuf; + unsigned int cflags; + + cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; + cflags |= IORING_CQE_F_BUFFER; + req->flags &= ~REQ_F_BUFFER_SELECTED; + kfree(kbuf); + req->kbuf = NULL; + return cflags; +} + +static inline unsigned int io_put_kbuf(struct io_kiocb *req) +{ + if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + return 0; + return __io_put_kbuf(req); +} + static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl) { bool got = percpu_ref_tryget(ref); @@ -1340,6 +1363,10 @@ static inline bool req_has_async_data(struct io_kiocb *req) static inline void req_set_fail(struct io_kiocb *req) { req->flags |= REQ_F_FAIL; + if (req->flags & REQ_F_CQE_SKIP) { + req->flags &= ~REQ_F_CQE_SKIP; + req->flags |= REQ_F_SKIP_LINK_CQES; + } } static inline void req_fail_link_node(struct io_kiocb *req, int res) @@ -1553,8 +1580,11 @@ static void 
io_prep_async_link(struct io_kiocb *req) static inline void io_req_add_compl_list(struct io_kiocb *req) { - struct io_submit_state *state = &req->ctx->submit_state; + struct io_ring_ctx *ctx = req->ctx; + struct io_submit_state *state = &ctx->submit_state; + if (!(req->flags & REQ_F_CQE_SKIP)) + ctx->submit_state.flush_cqes = true; wq_list_add_tail(&req->comp_list, &state->compl_reqs); } @@ -1599,7 +1629,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status) atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); - io_cqring_fill_event(req->ctx, req->user_data, status, 0); + io_fill_cqe_req(req, status, 0); io_put_req_deferred(req); } } @@ -1830,6 +1860,18 @@ static inline void io_get_task_refs(int nr) io_task_refs_refill(tctx); } +static __cold void io_uring_drop_tctx_refs(struct task_struct *task) +{ + struct io_uring_task *tctx = task->io_uring; + unsigned int refs = tctx->cached_refs; + + if (refs) { + tctx->cached_refs = 0; + percpu_counter_sub(&tctx->inflight, refs); + put_task_struct_many(task, refs); + } +} + static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { @@ -1858,8 +1900,8 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, return true; } -static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags) +static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, + s32 res, u32 cflags) { struct io_uring_cqe *cqe; @@ -1880,20 +1922,26 @@ static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data return io_cqring_event_overflow(ctx, user_data, res, cflags); } -/* not as hot to bloat with inlining */ -static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags) +static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) { - return __io_cqring_fill_event(ctx, user_data, res, cflags); + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe(req->ctx, req->user_data, res, cflags); } -static void io_req_complete_post(struct io_kiocb *req, s32 res, - u32 cflags) +static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, + s32 res, u32 cflags) +{ + ctx->cq_extra++; + return __io_fill_cqe(ctx, user_data, res, cflags); +} + +static void __io_req_complete_post(struct io_kiocb *req, s32 res, + u32 cflags) { struct io_ring_ctx *ctx = req->ctx; - spin_lock(&ctx->completion_lock); - __io_cqring_fill_event(ctx, req->user_data, res, cflags); + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe(ctx, req->user_data, res, cflags); /* * If we're the last reference to this request, add to our locked * free_list cache. 
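Aside (illustrative, not part of the patch): the REQ_F_CQE_SKIP handling above is the kernel side of the new IOSQE_CQE_SKIP_SUCCESS SQE flag: a request submitted with it posts no CQE on success, while req_set_fail() clears the flag so an error still produces a CQE. A hedged userspace sketch using stock liburing helpers (SQ-full and error handling omitted):

#include <liburing.h>

/* Link a write to an fsync; only the fsync (or a failed write) posts a CQE. */
static int queue_write_then_fsync(struct io_uring *ring, int fd,
				  const void *buf, unsigned int len)
{
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_write(sqe, fd, buf, len, 0);
	sqe->flags |= IOSQE_IO_LINK | IOSQE_CQE_SKIP_SUCCESS;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_fsync(sqe, fd, 0);

	return io_uring_submit(ring);
}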
@@ -1913,6 +1961,15 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res, wq_list_add_head(&req->comp_list, &ctx->locked_free_list); ctx->locked_free_nr++; } +} + +static void io_req_complete_post(struct io_kiocb *req, s32 res, + u32 cflags) +{ + struct io_ring_ctx *ctx = req->ctx; + + spin_lock(&ctx->completion_lock); + __io_req_complete_post(req, res, cflags); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -2101,8 +2158,8 @@ static bool io_kill_linked_timeout(struct io_kiocb *req) link->timeout.head = NULL; if (hrtimer_try_to_cancel(&io->timer) != -1) { list_del(&link->timeout.list); - io_cqring_fill_event(link->ctx, link->user_data, - -ECANCELED, 0); + /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */ + io_fill_cqe_req(link, -ECANCELED, 0); io_put_req_deferred(link); return true; } @@ -2114,6 +2171,7 @@ static void io_fail_links(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) { struct io_kiocb *nxt, *link = req->link; + bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES; req->link = NULL; while (link) { @@ -2126,7 +2184,10 @@ static void io_fail_links(struct io_kiocb *req) link->link = NULL; trace_io_uring_fail_link(req, link); - io_cqring_fill_event(link->ctx, link->user_data, res, 0); + if (!ignore_cqes) { + link->flags &= ~REQ_F_CQE_SKIP; + io_fill_cqe_req(link, res, 0); + } io_put_req_deferred(link); link = nxt; } @@ -2143,8 +2204,8 @@ static bool io_disarm_next(struct io_kiocb *req) req->flags &= ~REQ_F_ARM_LTIMEOUT; if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { io_remove_next_linked(req); - io_cqring_fill_event(link->ctx, link->user_data, - -ECANCELED, 0); + /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */ + io_fill_cqe_req(link, -ECANCELED, 0); io_put_req_deferred(link); posted = true; } @@ -2171,7 +2232,7 @@ static void __io_req_find_next_prep(struct io_kiocb *req) spin_lock(&ctx->completion_lock); posted = io_disarm_next(req); if (posted) - io_commit_cqring(req->ctx); + io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); if (posted) io_cqring_ev_posted(ctx); @@ -2208,51 +2269,108 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) percpu_ref_put(&ctx->refs); } +static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx) +{ + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + io_cqring_ev_posted(ctx); +} + +static void handle_prev_tw_list(struct io_wq_work_node *node, + struct io_ring_ctx **ctx, bool *uring_locked) +{ + if (*ctx && !*uring_locked) + spin_lock(&(*ctx)->completion_lock); + + do { + struct io_wq_work_node *next = node->next; + struct io_kiocb *req = container_of(node, struct io_kiocb, + io_task_work.node); + + if (req->ctx != *ctx) { + if (unlikely(!*uring_locked && *ctx)) + ctx_commit_and_unlock(*ctx); + + ctx_flush_and_put(*ctx, uring_locked); + *ctx = req->ctx; + /* if not contended, grab and improve batching */ + *uring_locked = mutex_trylock(&(*ctx)->uring_lock); + percpu_ref_get(&(*ctx)->refs); + if (unlikely(!*uring_locked)) + spin_lock(&(*ctx)->completion_lock); + } + if (likely(*uring_locked)) + req->io_task_work.func(req, uring_locked); + else + __io_req_complete_post(req, req->result, io_put_kbuf(req)); + node = next; + } while (node); + + if (unlikely(!*uring_locked)) + ctx_commit_and_unlock(*ctx); +} + +static void handle_tw_list(struct io_wq_work_node *node, + struct io_ring_ctx **ctx, bool *locked) +{ + do { + struct io_wq_work_node *next = node->next; + struct io_kiocb *req = container_of(node, struct io_kiocb, + io_task_work.node); + 
+ if (req->ctx != *ctx) { + ctx_flush_and_put(*ctx, locked); + *ctx = req->ctx; + /* if not contended, grab and improve batching */ + *locked = mutex_trylock(&(*ctx)->uring_lock); + percpu_ref_get(&(*ctx)->refs); + } + req->io_task_work.func(req, locked); + node = next; + } while (node); +} + static void tctx_task_work(struct callback_head *cb) { - bool locked = false; + bool uring_locked = false; struct io_ring_ctx *ctx = NULL; struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); while (1) { - struct io_wq_work_node *node; + struct io_wq_work_node *node1, *node2; - if (!tctx->task_list.first && locked) + if (!tctx->task_list.first && + !tctx->prior_task_list.first && uring_locked) io_submit_flush_completions(ctx); spin_lock_irq(&tctx->task_lock); - node = tctx->task_list.first; + node1 = tctx->prior_task_list.first; + node2 = tctx->task_list.first; INIT_WQ_LIST(&tctx->task_list); - if (!node) + INIT_WQ_LIST(&tctx->prior_task_list); + if (!node2 && !node1) tctx->task_running = false; spin_unlock_irq(&tctx->task_lock); - if (!node) + if (!node2 && !node1) break; - do { - struct io_wq_work_node *next = node->next; - struct io_kiocb *req = container_of(node, struct io_kiocb, - io_task_work.node); - - if (req->ctx != ctx) { - ctx_flush_and_put(ctx, &locked); - ctx = req->ctx; - /* if not contended, grab and improve batching */ - locked = mutex_trylock(&ctx->uring_lock); - percpu_ref_get(&ctx->refs); - } - req->io_task_work.func(req, &locked); - node = next; - } while (node); + if (node1) + handle_prev_tw_list(node1, &ctx, &uring_locked); + if (node2) + handle_tw_list(node2, &ctx, &uring_locked); cond_resched(); } - ctx_flush_and_put(ctx, &locked); + ctx_flush_and_put(ctx, &uring_locked); + + /* relaxed read is enough as only the task itself sets ->in_idle */ + if (unlikely(atomic_read(&tctx->in_idle))) + io_uring_drop_tctx_refs(current); } -static void io_req_task_work_add(struct io_kiocb *req) +static void io_req_task_work_add(struct io_kiocb *req, bool priority) { struct task_struct *tsk = req->task; struct io_uring_task *tctx = tsk->io_uring; @@ -2264,7 +2382,10 @@ static void io_req_task_work_add(struct io_kiocb *req) WARN_ON_ONCE(!tctx); spin_lock_irqsave(&tctx->task_lock, flags); - wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); + if (priority) + wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list); + else + wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); running = tctx->task_running; if (!running) tctx->task_running = true; @@ -2289,8 +2410,7 @@ static void io_req_task_work_add(struct io_kiocb *req) spin_lock_irqsave(&tctx->task_lock, flags); tctx->task_running = false; - node = tctx->task_list.first; - INIT_WQ_LIST(&tctx->task_list); + node = wq_list_merge(&tctx->prior_task_list, &tctx->task_list); spin_unlock_irqrestore(&tctx->task_lock, flags); while (node) { @@ -2327,19 +2447,19 @@ static void io_req_task_queue_fail(struct io_kiocb *req, int ret) { req->result = ret; req->io_task_work.func = io_req_task_cancel; - io_req_task_work_add(req); + io_req_task_work_add(req, false); } static void io_req_task_queue(struct io_kiocb *req) { req->io_task_work.func = io_req_task_submit; - io_req_task_work_add(req); + io_req_task_work_add(req, false); } static void io_req_task_queue_reissue(struct io_kiocb *req) { req->io_task_work.func = io_queue_async_work; - io_req_task_work_add(req); + io_req_task_work_add(req, false); } static inline void io_queue_next(struct io_kiocb *req) @@ -2403,17 +2523,22 @@ static void 
__io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_wq_work_node *node, *prev; struct io_submit_state *state = &ctx->submit_state; - spin_lock(&ctx->completion_lock); - wq_list_for_each(node, prev, &state->compl_reqs) { - struct io_kiocb *req = container_of(node, struct io_kiocb, + if (state->flush_cqes) { + spin_lock(&ctx->completion_lock); + wq_list_for_each(node, prev, &state->compl_reqs) { + struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); - __io_cqring_fill_event(ctx, req->user_data, req->result, - req->cflags); + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe(ctx, req->user_data, req->result, + req->cflags); + } + + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + io_cqring_ev_posted(ctx); + state->flush_cqes = false; } - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); @@ -2444,7 +2569,7 @@ static inline void io_put_req_deferred(struct io_kiocb *req) { if (req_ref_put_and_test(req)) { req->io_task_work.func = io_free_req_work; - io_req_task_work_add(req); + io_req_task_work_add(req, false); } } @@ -2463,24 +2588,6 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; } -static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf) -{ - unsigned int cflags; - - cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; - cflags |= IORING_CQE_F_BUFFER; - req->flags &= ~REQ_F_BUFFER_SELECTED; - kfree(kbuf); - return cflags; -} - -static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) -{ - if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) - return 0; - return io_put_kbuf(req, req->kbuf); -} - static inline bool io_run_task_work(void) { if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) { @@ -2543,8 +2650,10 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) /* order with io_complete_rw_iopoll(), e.g. 
->result updates */ if (!smp_load_acquire(&req->iopoll_completed)) break; - __io_cqring_fill_event(ctx, req->user_data, req->result, - io_put_rw_kbuf(req)); + if (unlikely(req->flags & REQ_F_CQE_SKIP)) + continue; + + __io_fill_cqe(ctx, req->user_data, req->result, io_put_kbuf(req)); nr_events++; } @@ -2718,9 +2827,9 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) return false; } -static void io_req_task_complete(struct io_kiocb *req, bool *locked) +static inline void io_req_task_complete(struct io_kiocb *req, bool *locked) { - unsigned int cflags = io_put_rw_kbuf(req); + unsigned int cflags = io_put_kbuf(req); int res = req->result; if (*locked) { @@ -2731,12 +2840,12 @@ static void io_req_task_complete(struct io_kiocb *req, bool *locked) } } -static void __io_complete_rw(struct io_kiocb *req, long res, long res2, +static void __io_complete_rw(struct io_kiocb *req, long res, unsigned int issue_flags) { if (__io_complete_rw_common(req, res)) return; - __io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req)); + __io_req_complete(req, issue_flags, req->result, io_put_kbuf(req)); } static void io_complete_rw(struct kiocb *kiocb, long res) @@ -2747,7 +2856,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res) return; req->result = res; req->io_task_work.func = io_req_task_complete; - io_req_task_work_add(req); + io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL)); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) @@ -2965,10 +3074,9 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) } } -static void kiocb_done(struct kiocb *kiocb, ssize_t ret, +static void kiocb_done(struct io_kiocb *req, ssize_t ret, unsigned int issue_flags) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); struct io_async_rw *io = req->async_data; /* add previously done IO, if any */ @@ -2980,28 +3088,21 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, } if (req->flags & REQ_F_CUR_POS) - req->file->f_pos = kiocb->ki_pos; - if (ret >= 0 && (kiocb->ki_complete == io_complete_rw)) - __io_complete_rw(req, ret, 0, issue_flags); + req->file->f_pos = req->rw.kiocb.ki_pos; + if (ret >= 0 && (req->rw.kiocb.ki_complete == io_complete_rw)) + __io_complete_rw(req, ret, issue_flags); else - io_rw_done(kiocb, ret); + io_rw_done(&req->rw.kiocb, ret); if (req->flags & REQ_F_REISSUE) { req->flags &= ~REQ_F_REISSUE; if (io_resubmit_prep(req)) { io_req_task_queue_reissue(req); } else { - unsigned int cflags = io_put_rw_kbuf(req); - struct io_ring_ctx *ctx = req->ctx; - req_set_fail(req); - if (issue_flags & IO_URING_F_UNLOCKED) { - mutex_lock(&ctx->uring_lock); - __io_req_complete(req, issue_flags, ret, cflags); - mutex_unlock(&ctx->uring_lock); - } else { - __io_req_complete(req, issue_flags, ret, cflags); - } + req->result = ret; + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req, false); } } } @@ -3229,10 +3330,12 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, size_t sqe_len; ssize_t ret; - BUILD_BUG_ON(ERR_PTR(0) != NULL); - - if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) - return ERR_PTR(io_import_fixed(req, rw, iter)); + if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { + ret = io_import_fixed(req, rw, iter); + if (ret) + return ERR_PTR(ret); + return NULL; + } /* buffer index only valid with fixed read/write, or buffer select */ if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))) @@ -3250,15 
+3353,18 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, } ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter); - return ERR_PTR(ret); + if (ret) + return ERR_PTR(ret); + return NULL; } iovec = s->fast_iov; if (req->flags & REQ_F_BUFFER_SELECT) { ret = io_iov_buffer_select(req, iovec, issue_flags); - if (!ret) - iov_iter_init(iter, rw, iovec, 1, iovec->iov_len); - return ERR_PTR(ret); + if (ret) + return ERR_PTR(ret); + iov_iter_init(iter, rw, iovec, 1, iovec->iov_len); + return NULL; } ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter, @@ -3629,7 +3735,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) iov_iter_restore(&s->iter, &s->iter_state); } while (ret > 0); done: - kiocb_done(kiocb, ret, issue_flags); + kiocb_done(req, ret, issue_flags); out_free: /* it's faster to check here then delegate to kfree */ if (iovec) @@ -3726,7 +3832,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) goto copy_iov; done: - kiocb_done(kiocb, ret2, issue_flags); + kiocb_done(req, ret2, issue_flags); } else { copy_iov: iov_iter_restore(&s->iter, &s->iter_state); @@ -4461,6 +4567,7 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) } else { list_add_tail(&buf->list, &(*head)->list); } + cond_resched(); } return i ? i : -ENOMEM; @@ -4839,17 +4946,18 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) min_ret = iov_iter_count(&kmsg->msg.msg_iter); ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); - if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN) - return io_setup_async_msg(req, kmsg); - if (ret == -ERESTARTSYS) - ret = -EINTR; + if (ret < min_ret) { + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) + return io_setup_async_msg(req, kmsg); + if (ret == -ERESTARTSYS) + ret = -EINTR; + req_set_fail(req); + } /* fast path, check for non-NULL to avoid function call */ if (kmsg->free_iov) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret < min_ret) - req_set_fail(req); __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -4885,13 +4993,13 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) msg.msg_flags = flags; ret = sock_sendmsg(sock, &msg); - if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - - if (ret < min_ret) + if (ret < min_ret) { + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; req_set_fail(req); + } __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -4991,11 +5099,6 @@ static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, return io_buffer_select(req, &sr->len, sr->bgid, issue_flags); } -static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req) -{ - return io_put_kbuf(req, req->kbuf); -} - static int io_recvmsg_prep_async(struct io_kiocb *req) { int ret; @@ -5033,8 +5136,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) struct socket *sock; struct io_buffer *kbuf; unsigned flags; - int min_ret = 0; - int ret, cflags = 0; + int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; sock = sock_from_file(req->file); @@ -5068,20 +5170,21 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, kmsg->uaddr, flags); - if (force_nonblock && ret == 
-EAGAIN) - return io_setup_async_msg(req, kmsg); - if (ret == -ERESTARTSYS) - ret = -EINTR; + if (ret < min_ret) { + if (ret == -EAGAIN && force_nonblock) + return io_setup_async_msg(req, kmsg); + if (ret == -ERESTARTSYS) + ret = -EINTR; + req_set_fail(req); + } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { + req_set_fail(req); + } - if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_recv_kbuf(req); /* fast path, check for non-NULL to avoid function call */ if (kmsg->free_iov) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)))) - req_set_fail(req); - __io_req_complete(req, issue_flags, ret, cflags); + __io_req_complete(req, issue_flags, ret, io_put_kbuf(req)); return 0; } @@ -5094,8 +5197,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) struct socket *sock; struct iovec iov; unsigned flags; - int min_ret = 0; - int ret, cflags = 0; + int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; sock = sock_from_file(req->file); @@ -5127,16 +5229,17 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) min_ret = iov_iter_count(&msg.msg_iter); ret = sock_recvmsg(sock, &msg, flags); - if (force_nonblock && ret == -EAGAIN) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; + if (ret < min_ret) { + if (ret == -EAGAIN && force_nonblock) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + req_set_fail(req); + } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { out_free: - if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_recv_kbuf(req); - if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)))) req_set_fail(req); - __io_req_complete(req, issue_flags, ret, cflags); + } + __io_req_complete(req, issue_flags, ret, io_put_kbuf(req)); return 0; } @@ -5303,52 +5406,23 @@ struct io_poll_table { int error; }; -static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, - __poll_t mask, io_req_tw_func_t func) -{ - /* for instances that support it check for an event match first: */ - if (mask && !(mask & poll->events)) - return 0; - - trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask); - - list_del_init(&poll->wait.entry); +#define IO_POLL_CANCEL_FLAG BIT(31) +#define IO_POLL_REF_MASK ((1u << 20)-1) - req->result = mask; - req->io_task_work.func = func; - - /* - * If this fails, then the task is exiting. When a task exits, the - * work gets canceled, so just cancel this request as well instead - * of executing it. We can't safely execute it anyway, as we may not - * have the needed state needed for it anyway. - */ - io_req_task_work_add(req); - return 1; +/* + * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can + * bump it and acquire ownership. It's disallowed to modify requests while not + * owning it, that prevents from races for enqueueing task_work's and b/w + * arming poll and wakeups. 
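Aside (illustrative, not part of the patch): a condensed, standalone sketch of the ownership scheme this comment describes; the real implementation follows as io_poll_get_ownership() and io_poll_check_events(), and the names here are illustrative. The first context to raise the reference count from zero becomes the owner; the owner keeps reprocessing until it manages to drop every reference it observed.

#include <linux/atomic.h>

#define REF_MASK	((1U << 20) - 1)	/* low bits: reference count */

/* wakeup/cancel side: become the owner, or leave the work to the current owner */
static bool try_own(atomic_t *refs)
{
	return !(atomic_fetch_inc(refs) & REF_MASK);
}

/* owner side: handle events, then drop the references we saw; if more
 * arrived in the meantime, we are still the owner and must loop. */
static void owner_loop(atomic_t *refs)
{
	int v;

	do {
		v = atomic_read(refs);
		/* ... process poll events / cancellation here ... */
	} while (atomic_sub_return(v & REF_MASK, refs));
}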
+ */ +static inline bool io_poll_get_ownership(struct io_kiocb *req) +{ + return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); } -static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) - __acquires(&req->ctx->completion_lock) +static void io_poll_mark_cancelled(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - - /* req->task == current here, checking PF_EXITING is safe */ - if (unlikely(req->task->flags & PF_EXITING)) - WRITE_ONCE(poll->canceled, true); - - if (!req->result && !READ_ONCE(poll->canceled)) { - struct poll_table_struct pt = { ._key = poll->events }; - - req->result = vfs_poll(req->file, &pt) & poll->events; - } - - spin_lock(&ctx->completion_lock); - if (!req->result && !READ_ONCE(poll->canceled)) { - add_wait_queue(poll->head, &poll->wait); - return true; - } - - return false; + atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs); } static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) @@ -5366,133 +5440,241 @@ static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req) return &req->apoll->poll; } -static void io_poll_remove_double(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) +static void io_poll_req_insert(struct io_kiocb *req) { - struct io_poll_iocb *poll = io_poll_get_double(req); + struct io_ring_ctx *ctx = req->ctx; + struct hlist_head *list; - lockdep_assert_held(&req->ctx->completion_lock); + list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; + hlist_add_head(&req->hash_node, list); +} + +static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, + wait_queue_func_t wake_func) +{ + poll->head = NULL; +#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) + /* mask in events that we always want/need */ + poll->events = events | IO_POLL_UNMASK; + INIT_LIST_HEAD(&poll->wait.entry); + init_waitqueue_func_entry(&poll->wait, wake_func); +} - if (poll && poll->head) { - struct wait_queue_head *head = poll->head; +static inline void io_poll_remove_entry(struct io_poll_iocb *poll) +{ + struct wait_queue_head *head = smp_load_acquire(&poll->head); + if (head) { spin_lock_irq(&head->lock); list_del_init(&poll->wait.entry); - if (poll->wait.private) - req_ref_put(req); poll->head = NULL; spin_unlock_irq(&head->lock); } } -static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask) - __must_hold(&req->ctx->completion_lock) +static void io_poll_remove_entries(struct io_kiocb *req) +{ + struct io_poll_iocb *poll = io_poll_get_single(req); + struct io_poll_iocb *poll_double = io_poll_get_double(req); + + /* + * While we hold the waitqueue lock and the waitqueue is nonempty, + * wake_up_pollfree() will wait for us. However, taking the waitqueue + * lock in the first place can race with the waitqueue being freed. + * + * We solve this as eventpoll does: by taking advantage of the fact that + * all users of wake_up_pollfree() will RCU-delay the actual free. If + * we enter rcu_read_lock() and see that the pointer to the queue is + * non-NULL, we can then lock it without the memory being freed out from + * under us. + * + * Keep holding rcu_read_lock() as long as we hold the queue lock, in + * case the caller deletes the entry from the queue, leaving it empty. + * In that case, only RCU prevents the queue memory from being freed. + */ + rcu_read_lock(); + io_poll_remove_entry(poll); + if (poll_double) + io_poll_remove_entry(poll_double); + rcu_read_unlock(); +} + +/* + * All poll tw should go through this. 
Checks for poll events, manages + * references, does rewait, etc. + * + * Returns a negative error on failure. >0 when no action require, which is + * either spurious wakeup or multishot CQE is served. 0 when it's done with + * the request, then the mask is stored in req->result. + */ +static int io_poll_check_events(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - unsigned flags = IORING_CQE_F_MORE; - int error; + struct io_poll_iocb *poll = io_poll_get_single(req); + int v; - if (READ_ONCE(req->poll.canceled)) { - error = -ECANCELED; - req->poll.events |= EPOLLONESHOT; - } else { - error = mangle_poll(mask); - } - if (req->poll.events & EPOLLONESHOT) - flags = 0; - if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) { - req->poll.events |= EPOLLONESHOT; - flags = 0; - } - if (flags & IORING_CQE_F_MORE) - ctx->cq_extra++; + /* req->task == current here, checking PF_EXITING is safe */ + if (unlikely(req->task->flags & PF_EXITING)) + io_poll_mark_cancelled(req); - return !(flags & IORING_CQE_F_MORE); + do { + v = atomic_read(&req->poll_refs); + + /* tw handler should be the owner, and so have some references */ + if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK))) + return 0; + if (v & IO_POLL_CANCEL_FLAG) + return -ECANCELED; + + if (!req->result) { + struct poll_table_struct pt = { ._key = poll->events }; + + req->result = vfs_poll(req->file, &pt) & poll->events; + } + + /* multishot, just fill an CQE and proceed */ + if (req->result && !(poll->events & EPOLLONESHOT)) { + __poll_t mask = mangle_poll(req->result & poll->events); + bool filled; + + spin_lock(&ctx->completion_lock); + filled = io_fill_cqe_aux(ctx, req->user_data, mask, + IORING_CQE_F_MORE); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + if (unlikely(!filled)) + return -ECANCELED; + io_cqring_ev_posted(ctx); + } else if (req->result) { + return 0; + } + + /* + * Release all references, retry if someone tried to restart + * task_work while we were executing it. 
+ */ + } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs)); + + return 1; } static void io_poll_task_func(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *nxt; + int ret; - if (io_poll_rewait(req, &req->poll)) { - spin_unlock(&ctx->completion_lock); + ret = io_poll_check_events(req); + if (ret > 0) + return; + + if (!ret) { + req->result = mangle_poll(req->result & req->poll.events); } else { - bool done; + req->result = ret; + req_set_fail(req); + } - if (req->poll.done) { - spin_unlock(&ctx->completion_lock); - return; - } - done = __io_poll_complete(req, req->result); - if (done) { - io_poll_remove_double(req); - hash_del(&req->hash_node); - req->poll.done = true; - } else { - req->result = 0; - add_wait_queue(req->poll.head, &req->poll.wait); - } - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + io_poll_remove_entries(req); + spin_lock(&ctx->completion_lock); + hash_del(&req->hash_node); + __io_req_complete_post(req, req->result, 0); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + io_cqring_ev_posted(ctx); +} - if (done) { - nxt = io_put_req_find_next(req); - if (nxt) - io_req_task_submit(nxt, locked); - } - } +static void io_apoll_task_func(struct io_kiocb *req, bool *locked) +{ + struct io_ring_ctx *ctx = req->ctx; + int ret; + + ret = io_poll_check_events(req); + if (ret > 0) + return; + + io_poll_remove_entries(req); + spin_lock(&ctx->completion_lock); + hash_del(&req->hash_node); + spin_unlock(&ctx->completion_lock); + + if (!ret) + io_req_task_submit(req, locked); + else + io_req_complete_failed(req, ret); +} + +static void __io_poll_execute(struct io_kiocb *req, int mask) +{ + req->result = mask; + if (req->opcode == IORING_OP_POLL_ADD) + req->io_task_work.func = io_poll_task_func; + else + req->io_task_work.func = io_apoll_task_func; + + trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask); + io_req_task_work_add(req, false); +} + +static inline void io_poll_execute(struct io_kiocb *req, int res) +{ + if (io_poll_get_ownership(req)) + __io_poll_execute(req, res); +} + +static void io_poll_cancel_req(struct io_kiocb *req) +{ + io_poll_mark_cancelled(req); + /* kick tw, which should complete the request */ + io_poll_execute(req, 0); } -static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, - int sync, void *key) +static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, + void *key) { struct io_kiocb *req = wait->private; - struct io_poll_iocb *poll = io_poll_get_single(req); + struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb, + wait); __poll_t mask = key_to_poll(key); - unsigned long flags; - /* for instances that support it check for an event match first: */ - if (mask && !(mask & poll->events)) - return 0; - if (!(poll->events & EPOLLONESHOT)) - return poll->wait.func(&poll->wait, mode, sync, key); + if (unlikely(mask & POLLFREE)) { + io_poll_mark_cancelled(req); + /* we have to kick tw in case it's not already */ + io_poll_execute(req, 0); - list_del_init(&wait->entry); + /* + * If the waitqueue is being freed early but someone is already + * holds ownership over it, we have to tear down the request as + * best we can. That means immediately removing the request from + * its waitqueue and preventing all further accesses to the + * waitqueue via the request. 
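Aside (illustrative, not part of the patch): the POLLFREE branch here, like the RCU read lock taken in io_poll_remove_entries(), leans on the contract of wake_up_pollfree(): an owner that may free a waitqueue while waiters are still queued must wake it with wake_up_pollfree() and defer the actual free past an RCU grace period. A hedged sketch of that owner side; the struct and function names are illustrative.

#include <linux/wait.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct pollable_obj {
	wait_queue_head_t wait;
	struct rcu_head rcu;
};

static void pollable_obj_destroy(struct pollable_obj *obj)
{
	wake_up_pollfree(&obj->wait);	/* hands POLLFREE to every waiter */
	kfree_rcu(obj, rcu);		/* free only after an RCU grace period */
}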
+ */ + list_del_init(&poll->wait.entry); - if (poll->head) { - bool done; + /* + * Careful: this *must* be the last step, since as soon + * as req->head is NULL'ed out, the request can be + * completed and freed, since aio_poll_complete_work() + * will no longer need to take the waitqueue lock. + */ + smp_store_release(&poll->head, NULL); + return 1; + } + + /* for instances that support it check for an event match first */ + if (mask && !(mask & poll->events)) + return 0; - spin_lock_irqsave(&poll->head->lock, flags); - done = list_empty(&poll->wait.entry); - if (!done) + if (io_poll_get_ownership(req)) { + /* optional, saves extra locking for removal in tw handler */ + if (mask && poll->events & EPOLLONESHOT) { list_del_init(&poll->wait.entry); - /* make sure double remove sees this as being gone */ - wait->private = NULL; - spin_unlock_irqrestore(&poll->head->lock, flags); - if (!done) { - /* use wait func handler, so it matches the rq type */ - poll->wait.func(&poll->wait, mode, sync, key); + poll->head = NULL; } + __io_poll_execute(req, mask); } - req_ref_put(req); return 1; } -static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, - wait_queue_func_t wake_func) -{ - poll->head = NULL; - poll->done = false; - poll->canceled = false; -#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) - /* mask in events that we always want/need */ - poll->events = events | IO_POLL_UNMASK; - INIT_LIST_HEAD(&poll->wait.entry); - init_waitqueue_func_entry(&poll->wait, wake_func); -} - static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, struct wait_queue_head *head, struct io_poll_iocb **poll_ptr) @@ -5505,10 +5687,10 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, * if this happens. */ if (unlikely(pt->nr_entries)) { - struct io_poll_iocb *poll_one = poll; + struct io_poll_iocb *first = poll; /* double add on the same waitqueue head, ignore */ - if (poll_one->head == head) + if (first->head == head) return; /* already have a 2nd entry, fail a third attempt */ if (*poll_ptr) { @@ -5517,21 +5699,13 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, pt->error = -EINVAL; return; } - /* - * Can't handle multishot for double wait for now, turn it - * into one-shot mode. 
- */ - if (!(poll_one->events & EPOLLONESHOT)) - poll_one->events |= EPOLLONESHOT; + poll = kmalloc(sizeof(*poll), GFP_ATOMIC); if (!poll) { pt->error = -ENOMEM; return; } - io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake); - req_ref_get(req); - poll->wait.private = req; - + io_init_poll_iocb(poll, first->events, first->wait.func); *poll_ptr = poll; if (req->opcode == IORING_OP_POLL_ADD) req->flags |= REQ_F_ASYNC_DATA; @@ -5539,6 +5713,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, pt->nr_entries++; poll->head = head; + poll->wait.private = req; if (poll->events & EPOLLEXCLUSIVE) add_wait_queue_exclusive(head, &poll->wait); @@ -5546,70 +5721,24 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, add_wait_queue(head, &poll->wait); } -static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, +static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, struct poll_table_struct *p) { struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - struct async_poll *apoll = pt->req->apoll; - - __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); -} - -static void io_async_task_func(struct io_kiocb *req, bool *locked) -{ - struct async_poll *apoll = req->apoll; - struct io_ring_ctx *ctx = req->ctx; - trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data); - - if (io_poll_rewait(req, &apoll->poll)) { - spin_unlock(&ctx->completion_lock); - return; - } - - hash_del(&req->hash_node); - io_poll_remove_double(req); - apoll->poll.done = true; - spin_unlock(&ctx->completion_lock); - - if (!READ_ONCE(apoll->poll.canceled)) - io_req_task_submit(req, locked); - else - io_req_complete_failed(req, -ECANCELED); -} - -static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, - void *key) -{ - struct io_kiocb *req = wait->private; - struct io_poll_iocb *poll = &req->apoll->poll; - - trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data, - key_to_poll(key)); - - return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func); -} - -static void io_poll_req_insert(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - struct hlist_head *list; - - list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; - hlist_add_head(&req->hash_node, list); + __io_queue_proc(&pt->req->poll, pt, head, + (struct io_poll_iocb **) &pt->req->async_data); } -static __poll_t __io_arm_poll_handler(struct io_kiocb *req, - struct io_poll_iocb *poll, - struct io_poll_table *ipt, __poll_t mask, - wait_queue_func_t wake_func) - __acquires(&ctx->completion_lock) +static int __io_arm_poll_handler(struct io_kiocb *req, + struct io_poll_iocb *poll, + struct io_poll_table *ipt, __poll_t mask) { struct io_ring_ctx *ctx = req->ctx; - bool cancel = false; + int v; INIT_HLIST_NODE(&req->hash_node); - io_init_poll_iocb(poll, mask, wake_func); + io_init_poll_iocb(poll, mask, io_poll_wake); poll->file = req->file; poll->wait.private = req; @@ -5618,31 +5747,54 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req, ipt->error = 0; ipt->nr_entries = 0; + /* + * Take the ownership to delay any tw execution up until we're done + * with poll arming. see io_poll_get_ownership(). 
+ */ + atomic_set(&req->poll_refs, 1); mask = vfs_poll(req->file, &ipt->pt) & poll->events; - if (unlikely(!ipt->nr_entries) && !ipt->error) - ipt->error = -EINVAL; + + if (mask && (poll->events & EPOLLONESHOT)) { + io_poll_remove_entries(req); + /* no one else has access to the req, forget about the ref */ + return mask; + } + if (!mask && unlikely(ipt->error || !ipt->nr_entries)) { + io_poll_remove_entries(req); + if (!ipt->error) + ipt->error = -EINVAL; + return 0; + } spin_lock(&ctx->completion_lock); - if (ipt->error || (mask && (poll->events & EPOLLONESHOT))) - io_poll_remove_double(req); - if (likely(poll->head)) { - spin_lock_irq(&poll->head->lock); - if (unlikely(list_empty(&poll->wait.entry))) { - if (ipt->error) - cancel = true; - ipt->error = 0; - mask = 0; - } - if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error) - list_del_init(&poll->wait.entry); - else if (cancel) - WRITE_ONCE(poll->canceled, true); - else if (!poll->done) /* actually waiting for an event */ - io_poll_req_insert(req); - spin_unlock_irq(&poll->head->lock); + io_poll_req_insert(req); + spin_unlock(&ctx->completion_lock); + + if (mask) { + /* can't multishot if failed, just queue the event we've got */ + if (unlikely(ipt->error || !ipt->nr_entries)) + poll->events |= EPOLLONESHOT; + __io_poll_execute(req, mask); + return 0; } - return mask; + /* + * Release ownership. If someone tried to queue a tw while it was + * locked, kick it off for them. + */ + v = atomic_dec_return(&req->poll_refs); + if (unlikely(v & IO_POLL_REF_MASK)) + __io_poll_execute(req, 0); + return 0; +} + +static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, + struct poll_table_struct *p) +{ + struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + struct async_poll *apoll = pt->req->apoll; + + __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); } enum { @@ -5657,7 +5809,8 @@ static int io_arm_poll_handler(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; struct async_poll *apoll; struct io_poll_table ipt; - __poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI; + __poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI; + int ret; if (!def->pollin && !def->pollout) return IO_APOLL_ABORTED; @@ -5682,11 +5835,8 @@ static int io_arm_poll_handler(struct io_kiocb *req) req->apoll = apoll; req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; - io_req_set_refcount(req); - ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, - io_async_wake); - spin_unlock(&ctx->completion_lock); + ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask); if (ret || ipt.error) return ret ? 
IO_APOLL_READY : IO_APOLL_ABORTED; @@ -5695,43 +5845,6 @@ static int io_arm_poll_handler(struct io_kiocb *req) return IO_APOLL_OK; } -static bool __io_poll_remove_one(struct io_kiocb *req, - struct io_poll_iocb *poll, bool do_cancel) - __must_hold(&req->ctx->completion_lock) -{ - bool do_complete = false; - - if (!poll->head) - return false; - spin_lock_irq(&poll->head->lock); - if (do_cancel) - WRITE_ONCE(poll->canceled, true); - if (!list_empty(&poll->wait.entry)) { - list_del_init(&poll->wait.entry); - do_complete = true; - } - spin_unlock_irq(&poll->head->lock); - hash_del(&req->hash_node); - return do_complete; -} - -static bool io_poll_remove_one(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) -{ - bool do_complete; - - io_poll_remove_double(req); - do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true); - - if (do_complete) { - io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0); - io_commit_cqring(req->ctx); - req_set_fail(req); - io_put_req_deferred(req); - } - return do_complete; -} - /* * Returns true if we found and killed one or more poll requests */ @@ -5740,7 +5853,8 @@ static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, { struct hlist_node *tmp; struct io_kiocb *req; - int posted = 0, i; + bool found = false; + int i; spin_lock(&ctx->completion_lock); for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { @@ -5748,16 +5862,14 @@ static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, list = &ctx->cancel_hash[i]; hlist_for_each_entry_safe(req, tmp, list, hash_node) { - if (io_match_task_safe(req, tsk, cancel_all)) - posted += io_poll_remove_one(req); + if (io_match_task_safe(req, tsk, cancel_all)) { + io_poll_cancel_req(req); + found = true; + } } } spin_unlock(&ctx->completion_lock); - - if (posted) - io_cqring_ev_posted(ctx); - - return posted != 0; + return found; } static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr, @@ -5778,19 +5890,26 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr, return NULL; } +static bool io_poll_disarm(struct io_kiocb *req) + __must_hold(&ctx->completion_lock) +{ + if (!io_poll_get_ownership(req)) + return false; + io_poll_remove_entries(req); + hash_del(&req->hash_node); + return true; +} + static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr, bool poll_only) __must_hold(&ctx->completion_lock) { - struct io_kiocb *req; + struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only); - req = io_poll_find(ctx, sqe_addr, poll_only); if (!req) return -ENOENT; - if (io_poll_remove_one(req)) - return 0; - - return -EALREADY; + io_poll_cancel_req(req); + return 0; } static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, @@ -5840,23 +5959,6 @@ static int io_poll_update_prep(struct io_kiocb *req, return 0; } -static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, - void *key) -{ - struct io_kiocb *req = wait->private; - struct io_poll_iocb *poll = &req->poll; - - return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func); -} - -static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, - struct poll_table_struct *p) -{ - struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - - __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data); -} - static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_poll_iocb *poll = &req->poll; @@ -5869,6 +5971,8 @@ static int 
io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe flags = READ_ONCE(sqe->len); if (flags & ~IORING_POLL_ADD_MULTI) return -EINVAL; + if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP)) + return -EINVAL; io_req_set_refcount(req); poll->events = io_poll_parse_events(sqe, flags); @@ -5878,100 +5982,60 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) { struct io_poll_iocb *poll = &req->poll; - struct io_ring_ctx *ctx = req->ctx; struct io_poll_table ipt; - __poll_t mask; - bool done; + int ret; ipt.pt._qproc = io_poll_queue_proc; - mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events, - io_poll_wake); - - if (mask) { /* no async, we'd stolen it */ - ipt.error = 0; - done = __io_poll_complete(req, mask); - io_commit_cqring(req->ctx); - } - spin_unlock(&ctx->completion_lock); - - if (mask) { - io_cqring_ev_posted(ctx); - if (done) - io_put_req(req); - } - return ipt.error; + ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events); + ret = ret ?: ipt.error; + if (ret) + __io_req_complete(req, issue_flags, ret, 0); + return 0; } static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *preq; - bool completing; - int ret; + int ret2, ret = 0; + bool locked; spin_lock(&ctx->completion_lock); preq = io_poll_find(ctx, req->poll_update.old_user_data, true); - if (!preq) { - ret = -ENOENT; - goto err; - } - - if (!req->poll_update.update_events && !req->poll_update.update_user_data) { - completing = true; - ret = io_poll_remove_one(preq) ? 0 : -EALREADY; - goto err; - } - - /* - * Don't allow racy completion with singleshot, as we cannot safely - * update those. For multishot, if we're racing with completion, just - * let completion re-add it. - */ - completing = !__io_poll_remove_one(preq, &preq->poll, false); - if (completing && (preq->poll.events & EPOLLONESHOT)) { - ret = -EALREADY; - goto err; - } - /* we now have a detached poll request. reissue. */ - ret = 0; -err: - if (ret < 0) { + if (!preq || !io_poll_disarm(preq)) { spin_unlock(&ctx->completion_lock); - req_set_fail(req); - io_req_complete(req, ret); - return 0; - } - /* only mask one event flags, keep behavior flags */ - if (req->poll_update.update_events) { - preq->poll.events &= ~0xffff; - preq->poll.events |= req->poll_update.events & 0xffff; - preq->poll.events |= IO_POLL_UNMASK; + ret = preq ? 
-EALREADY : -ENOENT; + goto out; } - if (req->poll_update.update_user_data) - preq->user_data = req->poll_update.new_user_data; spin_unlock(&ctx->completion_lock); - /* complete update request, we're done with it */ - io_req_complete(req, ret); - - if (!completing) { - ret = io_poll_add(preq, issue_flags); - if (ret < 0) { - req_set_fail(preq); - io_req_complete(preq, ret); + if (req->poll_update.update_events || req->poll_update.update_user_data) { + /* only mask one event flags, keep behavior flags */ + if (req->poll_update.update_events) { + preq->poll.events &= ~0xffff; + preq->poll.events |= req->poll_update.events & 0xffff; + preq->poll.events |= IO_POLL_UNMASK; } - } - return 0; -} + if (req->poll_update.update_user_data) + preq->user_data = req->poll_update.new_user_data; -static void io_req_task_timeout(struct io_kiocb *req, bool *locked) -{ - struct io_timeout_data *data = req->async_data; + ret2 = io_poll_add(preq, issue_flags); + /* successfully updated, don't complete poll request */ + if (!ret2) + goto out; + } - if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) + req_set_fail(preq); + preq->result = -ECANCELED; + locked = !(issue_flags & IO_URING_F_UNLOCKED); + io_req_task_complete(preq, &locked); +out: + if (ret < 0) req_set_fail(req); - io_req_complete_post(req, -ETIME, 0); + /* complete update request, we're done with it */ + __io_req_complete(req, issue_flags, ret, 0); + return 0; } static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) @@ -5988,8 +6052,12 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) atomic_read(&req->ctx->cq_timeouts) + 1); spin_unlock_irqrestore(&ctx->timeout_lock, flags); - req->io_task_work.func = io_req_task_timeout; - io_req_task_work_add(req); + if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) + req_set_fail(req); + + req->result = -ETIME; + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req, false); return HRTIMER_NORESTART; } @@ -6026,7 +6094,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) return PTR_ERR(req); req_set_fail(req); - io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0); + io_fill_cqe_req(req, -ECANCELED, 0); io_put_req_deferred(req); return 0; } @@ -6115,6 +6183,8 @@ static int io_timeout_remove_prep(struct io_kiocb *req, return -EINVAL; if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) return -EFAULT; + if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0) + return -EINVAL; } else if (tr->flags) { /* timeout removal doesn't support flags */ return -EINVAL; @@ -6316,16 +6386,21 @@ static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); - if (ret != -ENOENT) - return ret; + /* + * Fall-through even for -EALREADY, as we may have poll armed + * that need unarming. + */ + if (!ret) + return 0; spin_lock(&ctx->completion_lock); + ret = io_poll_cancel(ctx, sqe_addr, false); + if (ret != -ENOENT) + goto out; + spin_lock_irq(&ctx->timeout_lock); ret = io_timeout_cancel(ctx, sqe_addr); spin_unlock_irq(&ctx->timeout_lock); - if (ret != -ENOENT) - goto out; - ret = io_poll_cancel(ctx, sqe_addr, false); out: spin_unlock(&ctx->completion_lock); return ret; @@ -6544,12 +6619,15 @@ static __cold void io_drain_req(struct io_kiocb *req) u32 seq = io_get_sequence(req); /* Still need defer if there is pending req in defer list. 
*/ + spin_lock(&ctx->completion_lock); if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) { + spin_unlock(&ctx->completion_lock); queue: ctx->drain_active = false; io_req_task_queue(req); return; } + spin_unlock(&ctx->completion_lock); ret = io_req_prep_async(req); if (ret) { @@ -6580,10 +6658,8 @@ fail: static void io_clean_op(struct io_kiocb *req) { - if (req->flags & REQ_F_BUFFER_SELECTED) { - kfree(req->kbuf); - req->kbuf = NULL; - } + if (req->flags & REQ_F_BUFFER_SELECTED) + io_put_kbuf(req); if (req->flags & REQ_F_NEED_CLEANUP) { switch (req->opcode) { @@ -6965,7 +7041,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) spin_unlock_irqrestore(&ctx->timeout_lock, flags); req->io_task_work.func = io_req_task_link_timeout; - io_req_task_work_add(req); + io_req_task_work_add(req, false); return HRTIMER_NORESTART; } @@ -7100,10 +7176,10 @@ static void io_init_req_drain(struct io_kiocb *req) * If we need to drain a request in the middle of a link, drain * the head request and the next request/link after the current * link. Considering sequential execution of links, - * IOSQE_IO_DRAIN will be maintained for every request of our + * REQ_F_IO_DRAIN will be maintained for every request of our * link. */ - head->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC; + head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; ctx->drain_next = true; } } @@ -7136,8 +7212,13 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if ((sqe_flags & IOSQE_BUFFER_SELECT) && !io_op_defs[opcode].buffer_select) return -EOPNOTSUPP; - if (sqe_flags & IOSQE_IO_DRAIN) + if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) + ctx->drain_disabled = true; + if (sqe_flags & IOSQE_IO_DRAIN) { + if (ctx->drain_disabled) + return -EOPNOTSUPP; io_init_req_drain(req); + } } if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags)) @@ -7149,7 +7230,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) { ctx->drain_next = false; ctx->drain_active = true; - req->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC; + req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; } } @@ -7613,7 +7694,7 @@ static int io_run_task_work_sig(void) /* when returns >0, the caller should retry */ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, - signed long *timeout) + ktime_t timeout) { int ret; @@ -7625,8 +7706,9 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, if (test_bit(0, &ctx->check_cq_overflow)) return 1; - *timeout = schedule_timeout(*timeout); - return !*timeout ? 
-ETIME : 1; + if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS)) + return -ETIME; + return 1; } /* @@ -7639,7 +7721,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, { struct io_wait_queue iowq; struct io_rings *rings = ctx->rings; - signed long timeout = MAX_SCHEDULE_TIMEOUT; + ktime_t timeout = KTIME_MAX; int ret; do { @@ -7655,7 +7737,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, if (get_timespec64(&ts, uts)) return -EFAULT; - timeout = timespec64_to_jiffies(&ts); + timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); } if (sig) { @@ -7687,7 +7769,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, } prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, TASK_INTERRUPTIBLE); - ret = io_cqring_wait_schedule(ctx, &iowq, &timeout); + ret = io_cqring_wait_schedule(ctx, &iowq, timeout); finish_wait(&ctx->cq_wait, &iowq.wq); cond_resched(); } while (ret > 0); @@ -7741,10 +7823,15 @@ static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref) struct io_ring_ctx *ctx = node->rsrc_data->ctx; unsigned long flags; bool first_add = false; + unsigned long delay = HZ; spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); node->done = true; + /* if we are mid-quiesce then do not delay */ + if (node->rsrc_data->quiesce) + delay = 0; + while (!list_empty(&ctx->rsrc_ref_list)) { node = list_first_entry(&ctx->rsrc_ref_list, struct io_rsrc_node, node); @@ -7757,10 +7844,10 @@ static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref) spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); if (first_add) - mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ); + mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay); } -static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) +static struct io_rsrc_node *io_rsrc_node_alloc(void) { struct io_rsrc_node *ref_node; @@ -7811,7 +7898,7 @@ static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) { if (ctx->rsrc_backup_node) return 0; - ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx); + ctx->rsrc_backup_node = io_rsrc_node_alloc(); return ctx->rsrc_backup_node ? 
0 : -ENOMEM; } @@ -7839,7 +7926,15 @@ static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data, ret = wait_for_completion_interruptible(&data->done); if (!ret) { mutex_lock(&ctx->uring_lock); - break; + if (atomic_read(&data->refs) > 0) { + /* + * it has been revived by another thread while + * we were unlocked + */ + mutex_unlock(&ctx->uring_lock); + } else { + break; + } } atomic_inc(&data->refs); @@ -8263,8 +8358,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) io_ring_submit_lock(ctx, lock_ring); spin_lock(&ctx->completion_lock); - io_cqring_fill_event(ctx, prsrc->tag, 0, 0); - ctx->cq_extra++; + io_fill_cqe_aux(ctx, prsrc->tag, 0, 0); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -8676,6 +8770,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task, task->io_uring = tctx; spin_lock_init(&tctx->task_lock); INIT_WQ_LIST(&tctx->task_list); + INIT_WQ_LIST(&tctx->prior_task_list); init_task_work(&tctx->task_work, tctx_task_work); return 0; } @@ -8847,10 +8942,9 @@ static void io_mem_free(void *ptr) static void *io_mem_alloc(size_t size) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP | - __GFP_NORETRY | __GFP_ACCOUNT; + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; - return (void *) __get_free_pages(gfp_flags, get_order(size)); + return (void *) __get_free_pages(gfp, get_order(size)); } static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, @@ -9814,18 +9908,6 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) return percpu_counter_sum(&tctx->inflight); } -static __cold void io_uring_drop_tctx_refs(struct task_struct *task) -{ - struct io_uring_task *tctx = task->io_uring; - unsigned int refs = tctx->cached_refs; - - if (refs) { - tctx->cached_refs = 0; - percpu_counter_sub(&tctx->inflight, refs); - put_task_struct_many(task, refs); - } -} - /* * Find any io_uring ctx that this task has registered or done IO on, and cancel * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation. @@ -9883,10 +9965,14 @@ static __cold void io_uring_cancel_generic(bool cancel_all, schedule(); finish_wait(&tctx->wait, &wait); } while (1); - atomic_dec(&tctx->in_idle); io_uring_clean_tctx(tctx); if (cancel_all) { + /* + * We shouldn't run task_works after cancel, so just leave + * ->in_idle set for normal exit. + */ + atomic_dec(&tctx->in_idle); /* for exec all current's requests should be gone, kill tctx */ __io_uring_free(current); } @@ -10164,7 +10250,7 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, * and sq_tail and cq_head are changed by userspace. But it's ok since * we usually use these info when it is stuck. 
*/ - seq_printf(m, "SqMask:\t\t0x%x\n", sq_mask); + seq_printf(m, "SqMask:\t0x%x\n", sq_mask); seq_printf(m, "SqHead:\t%u\n", sq_head); seq_printf(m, "SqTail:\t%u\n", sq_tail); seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head); @@ -10473,7 +10559,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | - IORING_FEAT_RSRC_TAGS; + IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; diff --git a/fs/ioctl.c b/fs/ioctl.c index 504e69578112..1ed097e94af2 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -430,7 +430,7 @@ static int ioctl_file_dedupe_range(struct file *file, goto out; } - size = offsetof(struct file_dedupe_range __user, info[count]); + size = offsetof(struct file_dedupe_range, info[count]); if (size > PAGE_SIZE) { ret = -ENOMEM; goto out; diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile index 4143a3ff89db..fc070184b7fa 100644 --- a/fs/iomap/Makefile +++ b/fs/iomap/Makefile @@ -9,9 +9,9 @@ ccflags-y += -I $(srctree)/$(src) # needed for trace events obj-$(CONFIG_FS_IOMAP) += iomap.o iomap-y += trace.o \ - buffered-io.o \ + iter.o +iomap-$(CONFIG_BLOCK) += buffered-io.o \ direct-io.o \ fiemap.o \ - iter.o \ seek.o iomap-$(CONFIG_SWAP) += swapfile.o diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 71a36ae120ee..6c51a75d0be6 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -21,9 +21,11 @@ #include "../internal.h" +#define IOEND_BATCH_SIZE 4096 + /* - * Structure allocated for each page or THP when block size < page size - * to track sub-page uptodate status and I/O completions. + * Structure allocated for each folio when block size < folio size + * to track sub-folio uptodate status and I/O completions. */ struct iomap_page { atomic_t read_bytes_pending; @@ -32,27 +34,20 @@ struct iomap_page { unsigned long uptodate[]; }; -static inline struct iomap_page *to_iomap_page(struct page *page) +static inline struct iomap_page *to_iomap_page(struct folio *folio) { - /* - * per-block data is stored in the head page. Callers should - * not be dealing with tail pages, and if they are, they can - * call thp_head() first. 
- */ - VM_BUG_ON_PGFLAGS(PageTail(page), page); - - if (page_has_private(page)) - return (struct iomap_page *)page_private(page); + if (folio_test_private(folio)) + return folio_get_private(folio); return NULL; } static struct bio_set iomap_ioend_bioset; static struct iomap_page * -iomap_page_create(struct inode *inode, struct page *page) +iomap_page_create(struct inode *inode, struct folio *folio) { - struct iomap_page *iop = to_iomap_page(page); - unsigned int nr_blocks = i_blocks_per_page(inode, page); + struct iomap_page *iop = to_iomap_page(folio); + unsigned int nr_blocks = i_blocks_per_folio(inode, folio); if (iop || nr_blocks <= 1) return iop; @@ -60,40 +55,40 @@ iomap_page_create(struct inode *inode, struct page *page) iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)), GFP_NOFS | __GFP_NOFAIL); spin_lock_init(&iop->uptodate_lock); - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) bitmap_fill(iop->uptodate, nr_blocks); - attach_page_private(page, iop); + folio_attach_private(folio, iop); return iop; } -static void -iomap_page_release(struct page *page) +static void iomap_page_release(struct folio *folio) { - struct iomap_page *iop = detach_page_private(page); - unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page); + struct iomap_page *iop = folio_detach_private(folio); + struct inode *inode = folio->mapping->host; + unsigned int nr_blocks = i_blocks_per_folio(inode, folio); if (!iop) return; WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending)); WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending)); WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) != - PageUptodate(page)); + folio_test_uptodate(folio)); kfree(iop); } /* - * Calculate the range inside the page that we actually need to read. + * Calculate the range inside the folio that we actually need to read. */ -static void -iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, - loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp) +static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, + loff_t *pos, loff_t length, size_t *offp, size_t *lenp) { + struct iomap_page *iop = to_iomap_page(folio); loff_t orig_pos = *pos; loff_t isize = i_size_read(inode); unsigned block_bits = inode->i_blkbits; unsigned block_size = (1 << block_bits); - unsigned poff = offset_in_page(*pos); - unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length); + size_t poff = offset_in_folio(folio, *pos); + size_t plen = min_t(loff_t, folio_size(folio) - poff, length); unsigned first = poff >> block_bits; unsigned last = (poff + plen - 1) >> block_bits; @@ -131,7 +126,7 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, * page cache for blocks that are entirely outside of i_size. 
*/ if (orig_pos <= isize && orig_pos + length > isize) { - unsigned end = offset_in_page(isize - 1) >> block_bits; + unsigned end = offset_in_folio(folio, isize - 1) >> block_bits; if (first <= end && last > end) plen -= (last - end) * block_size; @@ -141,66 +136,62 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, *lenp = plen; } -static void -iomap_iop_set_range_uptodate(struct page *page, unsigned off, unsigned len) +static void iomap_iop_set_range_uptodate(struct folio *folio, + struct iomap_page *iop, size_t off, size_t len) { - struct iomap_page *iop = to_iomap_page(page); - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; unsigned first = off >> inode->i_blkbits; unsigned last = (off + len - 1) >> inode->i_blkbits; unsigned long flags; spin_lock_irqsave(&iop->uptodate_lock, flags); bitmap_set(iop->uptodate, first, last - first + 1); - if (bitmap_full(iop->uptodate, i_blocks_per_page(inode, page))) - SetPageUptodate(page); + if (bitmap_full(iop->uptodate, i_blocks_per_folio(inode, folio))) + folio_mark_uptodate(folio); spin_unlock_irqrestore(&iop->uptodate_lock, flags); } -static void -iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len) +static void iomap_set_range_uptodate(struct folio *folio, + struct iomap_page *iop, size_t off, size_t len) { - if (PageError(page)) + if (folio_test_error(folio)) return; - if (page_has_private(page)) - iomap_iop_set_range_uptodate(page, off, len); + if (iop) + iomap_iop_set_range_uptodate(folio, iop, off, len); else - SetPageUptodate(page); + folio_mark_uptodate(folio); } -static void -iomap_read_page_end_io(struct bio_vec *bvec, int error) +static void iomap_finish_folio_read(struct folio *folio, size_t offset, + size_t len, int error) { - struct page *page = bvec->bv_page; - struct iomap_page *iop = to_iomap_page(page); + struct iomap_page *iop = to_iomap_page(folio); if (unlikely(error)) { - ClearPageUptodate(page); - SetPageError(page); + folio_clear_uptodate(folio); + folio_set_error(folio); } else { - iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len); + iomap_set_range_uptodate(folio, iop, offset, len); } - if (!iop || atomic_sub_and_test(bvec->bv_len, &iop->read_bytes_pending)) - unlock_page(page); + if (!iop || atomic_sub_and_test(len, &iop->read_bytes_pending)) + folio_unlock(folio); } -static void -iomap_read_end_io(struct bio *bio) +static void iomap_read_end_io(struct bio *bio) { int error = blk_status_to_errno(bio->bi_status); - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + struct folio_iter fi; - bio_for_each_segment_all(bvec, bio, iter_all) - iomap_read_page_end_io(bvec, error); + bio_for_each_folio_all(fi, bio) + iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error); bio_put(bio); } struct iomap_readpage_ctx { - struct page *cur_page; - bool cur_page_in_bio; + struct folio *cur_folio; + bool cur_folio_in_bio; struct bio *bio; struct readahead_control *rac; }; @@ -208,21 +199,23 @@ struct iomap_readpage_ctx { /** * iomap_read_inline_data - copy inline data into the page cache * @iter: iteration structure - * @page: page to copy to + * @folio: folio to copy to * - * Copy the inline data in @iter into @page and zero out the rest of the page. + * Copy the inline data in @iter into @folio and zero out the rest of the folio. * Only a single IOMAP_INLINE extent is allowed at the end of each file. * Returns zero for success to complete the read, or the usual negative errno. 
*/ static int iomap_read_inline_data(const struct iomap_iter *iter, - struct page *page) + struct folio *folio) { + struct iomap_page *iop; const struct iomap *iomap = iomap_iter_srcmap(iter); size_t size = i_size_read(iter->inode) - iomap->offset; size_t poff = offset_in_page(iomap->offset); + size_t offset = offset_in_folio(folio, iomap->offset); void *addr; - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) return 0; if (WARN_ON_ONCE(size > PAGE_SIZE - poff)) @@ -232,14 +225,16 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, return -EIO; if (WARN_ON_ONCE(size > iomap->length)) return -EIO; - if (poff > 0) - iomap_page_create(iter->inode, page); + if (offset > 0) + iop = iomap_page_create(iter->inode, folio); + else + iop = to_iomap_page(folio); - addr = kmap_local_page(page) + poff; + addr = kmap_local_folio(folio, offset); memcpy(addr, iomap->inline_data, size); memset(addr + size, 0, PAGE_SIZE - poff - size); kunmap_local(addr); - iomap_set_range_uptodate(page, poff, PAGE_SIZE - poff); + iomap_set_range_uptodate(folio, iop, offset, PAGE_SIZE - poff); return 0; } @@ -259,36 +254,36 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, const struct iomap *iomap = &iter->iomap; loff_t pos = iter->pos + offset; loff_t length = iomap_length(iter) - offset; - struct page *page = ctx->cur_page; + struct folio *folio = ctx->cur_folio; struct iomap_page *iop; loff_t orig_pos = pos; - unsigned poff, plen; + size_t poff, plen; sector_t sector; if (iomap->type == IOMAP_INLINE) - return iomap_read_inline_data(iter, page); + return iomap_read_inline_data(iter, folio); /* zero post-eof blocks as the page may be mapped */ - iop = iomap_page_create(iter->inode, page); - iomap_adjust_read_range(iter->inode, iop, &pos, length, &poff, &plen); + iop = iomap_page_create(iter->inode, folio); + iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen); if (plen == 0) goto done; if (iomap_block_needs_zeroing(iter, pos)) { - zero_user(page, poff, plen); - iomap_set_range_uptodate(page, poff, plen); + folio_zero_range(folio, poff, plen); + iomap_set_range_uptodate(folio, iop, poff, plen); goto done; } - ctx->cur_page_in_bio = true; + ctx->cur_folio_in_bio = true; if (iop) atomic_add(plen, &iop->read_bytes_pending); sector = iomap_sector(iomap, pos); if (!ctx->bio || bio_end_sector(ctx->bio) != sector || - bio_add_page(ctx->bio, page, plen, poff) != plen) { - gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); + !bio_add_folio(ctx->bio, folio, plen, poff)) { + gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); gfp_t orig_gfp = gfp; unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); @@ -311,8 +306,9 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, ctx->bio->bi_iter.bi_sector = sector; bio_set_dev(ctx->bio, iomap->bdev); ctx->bio->bi_end_io = iomap_read_end_io; - __bio_add_page(ctx->bio, page, plen, poff); + bio_add_folio(ctx->bio, folio, plen, poff); } + done: /* * Move the caller beyond our range so that it keeps making progress. 
@@ -326,30 +322,31 @@ done: int iomap_readpage(struct page *page, const struct iomap_ops *ops) { + struct folio *folio = page_folio(page); struct iomap_iter iter = { - .inode = page->mapping->host, - .pos = page_offset(page), - .len = PAGE_SIZE, + .inode = folio->mapping->host, + .pos = folio_pos(folio), + .len = folio_size(folio), }; struct iomap_readpage_ctx ctx = { - .cur_page = page, + .cur_folio = folio, }; int ret; - trace_iomap_readpage(page->mapping->host, 1); + trace_iomap_readpage(iter.inode, 1); while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_readpage_iter(&iter, &ctx, 0); if (ret < 0) - SetPageError(page); + folio_set_error(folio); if (ctx.bio) { submit_bio(ctx.bio); - WARN_ON_ONCE(!ctx.cur_page_in_bio); + WARN_ON_ONCE(!ctx.cur_folio_in_bio); } else { - WARN_ON_ONCE(ctx.cur_page_in_bio); - unlock_page(page); + WARN_ON_ONCE(ctx.cur_folio_in_bio); + folio_unlock(folio); } /* @@ -368,15 +365,15 @@ static loff_t iomap_readahead_iter(const struct iomap_iter *iter, loff_t done, ret; for (done = 0; done < length; done += ret) { - if (ctx->cur_page && offset_in_page(iter->pos + done) == 0) { - if (!ctx->cur_page_in_bio) - unlock_page(ctx->cur_page); - put_page(ctx->cur_page); - ctx->cur_page = NULL; + if (ctx->cur_folio && + offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) { + if (!ctx->cur_folio_in_bio) + folio_unlock(ctx->cur_folio); + ctx->cur_folio = NULL; } - if (!ctx->cur_page) { - ctx->cur_page = readahead_page(ctx->rac); - ctx->cur_page_in_bio = false; + if (!ctx->cur_folio) { + ctx->cur_folio = readahead_folio(ctx->rac); + ctx->cur_folio_in_bio = false; } ret = iomap_readpage_iter(iter, ctx, done); if (ret <= 0) @@ -419,10 +416,9 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) if (ctx.bio) submit_bio(ctx.bio); - if (ctx.cur_page) { - if (!ctx.cur_page_in_bio) - unlock_page(ctx.cur_page); - put_page(ctx.cur_page); + if (ctx.cur_folio) { + if (!ctx.cur_folio_in_bio) + folio_unlock(ctx.cur_folio); } } EXPORT_SYMBOL_GPL(iomap_readahead); @@ -438,7 +434,8 @@ int iomap_is_partially_uptodate(struct page *page, unsigned long from, unsigned long count) { - struct iomap_page *iop = to_iomap_page(page); + struct folio *folio = page_folio(page); + struct iomap_page *iop = to_iomap_page(folio); struct inode *inode = page->mapping->host; unsigned len, first, last; unsigned i; @@ -464,36 +461,49 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); int iomap_releasepage(struct page *page, gfp_t gfp_mask) { - trace_iomap_releasepage(page->mapping->host, page_offset(page), - PAGE_SIZE); + struct folio *folio = page_folio(page); + + trace_iomap_releasepage(folio->mapping->host, folio_pos(folio), + folio_size(folio)); /* * mm accommodates an old ext3 case where clean pages might not have had * the dirty bit cleared. Thus, it can send actual dirty pages to * ->releasepage() via shrink_active_list(); skip those here. 
*/ - if (PageDirty(page) || PageWriteback(page)) + if (folio_test_dirty(folio) || folio_test_writeback(folio)) return 0; - iomap_page_release(page); + iomap_page_release(folio); return 1; } EXPORT_SYMBOL_GPL(iomap_releasepage); -void -iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len) +void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len) { - trace_iomap_invalidatepage(page->mapping->host, offset, len); + trace_iomap_invalidatepage(folio->mapping->host, offset, len); /* - * If we're invalidating the entire page, clear the dirty state from it - * and release it to avoid unnecessary buildup of the LRU. + * If we're invalidating the entire folio, clear the dirty state + * from it and release it to avoid unnecessary buildup of the LRU. */ - if (offset == 0 && len == PAGE_SIZE) { - WARN_ON_ONCE(PageWriteback(page)); - cancel_dirty_page(page); - iomap_page_release(page); + if (offset == 0 && len == folio_size(folio)) { + WARN_ON_ONCE(folio_test_writeback(folio)); + folio_cancel_dirty(folio); + iomap_page_release(folio); + } else if (folio_test_large(folio)) { + /* Must release the iop so the page can be split */ + WARN_ON_ONCE(!folio_test_uptodate(folio) && + folio_test_dirty(folio)); + iomap_page_release(folio); } } +EXPORT_SYMBOL_GPL(iomap_invalidate_folio); + +void iomap_invalidatepage(struct page *page, unsigned int offset, + unsigned int len) +{ + iomap_invalidate_folio(page_folio(page), offset, len); +} EXPORT_SYMBOL_GPL(iomap_invalidatepage); #ifdef CONFIG_MIGRATION @@ -501,19 +511,21 @@ int iomap_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode) { + struct folio *folio = page_folio(page); + struct folio *newfolio = page_folio(newpage); int ret; - ret = migrate_page_move_mapping(mapping, newpage, page, 0); + ret = folio_migrate_mapping(mapping, newfolio, folio, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (page_has_private(page)) - attach_page_private(newpage, detach_page_private(page)); + if (folio_test_private(folio)) + folio_attach_private(newfolio, folio_detach_private(folio)); if (mode != MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); + folio_migrate_copy(newfolio, folio); else - migrate_page_states(newpage, page); + folio_migrate_flags(newfolio, folio); return MIGRATEPAGE_SUCCESS; } EXPORT_SYMBOL_GPL(iomap_migrate_page); @@ -532,9 +544,8 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) truncate_pagecache_range(inode, max(pos, i_size), pos + len); } -static int -iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff, - unsigned plen, const struct iomap *iomap) +static int iomap_read_folio_sync(loff_t block_start, struct folio *folio, + size_t poff, size_t plen, const struct iomap *iomap) { struct bio_vec bvec; struct bio bio; @@ -543,26 +554,27 @@ iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff, bio.bi_opf = REQ_OP_READ; bio.bi_iter.bi_sector = iomap_sector(iomap, block_start); bio_set_dev(&bio, iomap->bdev); - __bio_add_page(&bio, page, plen, poff); + bio_add_folio(&bio, folio, plen, poff); return submit_bio_wait(&bio); } static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, - unsigned len, struct page *page) + size_t len, struct folio *folio) { const struct iomap *srcmap = iomap_iter_srcmap(iter); - struct iomap_page *iop = iomap_page_create(iter->inode, page); + struct iomap_page *iop = iomap_page_create(iter->inode, folio); loff_t block_size = i_blocksize(iter->inode); 
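	/*
	 * Editor's aside -- a hedged sketch, not part of the patch: the two
	 * declarations below round the write range out to i_blocksize()
	 * granularity so that partially covered blocks can be read in (or
	 * zeroed) before the copy. With a hypothetical 1024-byte block size
	 * and a write of len = 1000 bytes at pos = 1536:
	 *
	 *	block_start = round_down(1536, 1024)      = 1024
	 *	block_end   = round_up(1536 + 1000, 1024) = 3072
	 *
	 * so the blocks covering [1024, 3072) are the candidates for a
	 * read-modify-write; the loop that follows then reads in or zeroes
	 * only the sub-ranges that actually need it.
	 */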
loff_t block_start = round_down(pos, block_size); loff_t block_end = round_up(pos + len, block_size); - unsigned from = offset_in_page(pos), to = from + len, poff, plen; + size_t from = offset_in_folio(folio, pos), to = from + len; + size_t poff, plen; - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) return 0; - ClearPageError(page); + folio_clear_error(folio); do { - iomap_adjust_read_range(iter->inode, iop, &block_start, + iomap_adjust_read_range(iter->inode, folio, &block_start, block_end - block_start, &poff, &plen); if (plen == 0) break; @@ -575,34 +587,35 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, if (iomap_block_needs_zeroing(iter, block_start)) { if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE)) return -EIO; - zero_user_segments(page, poff, from, to, poff + plen); + folio_zero_segments(folio, poff, from, to, poff + plen); } else { - int status = iomap_read_page_sync(block_start, page, + int status = iomap_read_folio_sync(block_start, folio, poff, plen, srcmap); if (status) return status; } - iomap_set_range_uptodate(page, poff, plen); + iomap_set_range_uptodate(folio, iop, poff, plen); } while ((block_start += plen) < block_end); return 0; } static int iomap_write_begin_inline(const struct iomap_iter *iter, - struct page *page) + struct folio *folio) { /* needs more work for the tailpacking case; disable for now */ if (WARN_ON_ONCE(iomap_iter_srcmap(iter)->offset != 0)) return -EIO; - return iomap_read_inline_data(iter, page); + return iomap_read_inline_data(iter, folio); } static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, - unsigned len, struct page **pagep) + size_t len, struct folio **foliop) { const struct iomap_page_ops *page_ops = iter->iomap.page_ops; const struct iomap *srcmap = iomap_iter_srcmap(iter); - struct page *page; + struct folio *folio; + unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; int status = 0; BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); @@ -612,35 +625,40 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, if (fatal_signal_pending(current)) return -EINTR; + if (!mapping_large_folio_support(iter->inode->i_mapping)) + len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos)); + if (page_ops && page_ops->page_prepare) { status = page_ops->page_prepare(iter->inode, pos, len); if (status) return status; } - page = grab_cache_page_write_begin(iter->inode->i_mapping, - pos >> PAGE_SHIFT, AOP_FLAG_NOFS); - if (!page) { + folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, + fgp, mapping_gfp_mask(iter->inode->i_mapping)); + if (!folio) { status = -ENOMEM; goto out_no_page; } + if (pos + len > folio_pos(folio) + folio_size(folio)) + len = folio_pos(folio) + folio_size(folio) - pos; if (srcmap->type == IOMAP_INLINE) - status = iomap_write_begin_inline(iter, page); + status = iomap_write_begin_inline(iter, folio); else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) - status = __block_write_begin_int(page, pos, len, NULL, srcmap); + status = __block_write_begin_int(folio, pos, len, NULL, srcmap); else - status = __iomap_write_begin(iter, pos, len, page); + status = __iomap_write_begin(iter, pos, len, folio); if (unlikely(status)) goto out_unlock; - *pagep = page; + *foliop = folio; return 0; out_unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); iomap_write_failed(iter->inode, pos, len); out_no_page: @@ -650,9 +668,10 @@ out_no_page: } static size_t __iomap_write_end(struct inode *inode, 
loff_t pos, size_t len, - size_t copied, struct page *page) + size_t copied, struct folio *folio) { - flush_dcache_page(page); + struct iomap_page *iop = to_iomap_page(folio); + flush_dcache_folio(folio); /* * The blocks that were entirely written will now be uptodate, so we @@ -665,24 +684,24 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, * non-uptodate page as a zero-length write, and force the caller to * redo the whole thing. */ - if (unlikely(copied < len && !PageUptodate(page))) + if (unlikely(copied < len && !folio_test_uptodate(folio))) return 0; - iomap_set_range_uptodate(page, offset_in_page(pos), len); - __set_page_dirty_nobuffers(page); + iomap_set_range_uptodate(folio, iop, offset_in_folio(folio, pos), len); + filemap_dirty_folio(inode->i_mapping, folio); return copied; } static size_t iomap_write_end_inline(const struct iomap_iter *iter, - struct page *page, loff_t pos, size_t copied) + struct folio *folio, loff_t pos, size_t copied) { const struct iomap *iomap = &iter->iomap; void *addr; - WARN_ON_ONCE(!PageUptodate(page)); + WARN_ON_ONCE(!folio_test_uptodate(folio)); BUG_ON(!iomap_inline_data_valid(iomap)); - flush_dcache_page(page); - addr = kmap_local_page(page) + pos; + flush_dcache_folio(folio); + addr = kmap_local_folio(folio, pos); memcpy(iomap_inline_data(iomap, pos), addr, copied); kunmap_local(addr); @@ -692,7 +711,7 @@ static size_t iomap_write_end_inline(const struct iomap_iter *iter, /* Returns the number of bytes copied. May be 0. Cannot be an errno. */ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, - size_t copied, struct page *page) + size_t copied, struct folio *folio) { const struct iomap_page_ops *page_ops = iter->iomap.page_ops; const struct iomap *srcmap = iomap_iter_srcmap(iter); @@ -700,12 +719,12 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, size_t ret; if (srcmap->type == IOMAP_INLINE) { - ret = iomap_write_end_inline(iter, page, pos, copied); + ret = iomap_write_end_inline(iter, folio, pos, copied); } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { ret = block_write_end(NULL, iter->inode->i_mapping, pos, len, - copied, page, NULL); + copied, &folio->page, NULL); } else { - ret = __iomap_write_end(iter->inode, pos, len, copied, page); + ret = __iomap_write_end(iter->inode, pos, len, copied, folio); } /* @@ -717,13 +736,13 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, i_size_write(iter->inode, pos + ret); iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; } - unlock_page(page); + folio_unlock(folio); if (old_size < pos) pagecache_isize_extended(iter->inode, old_size, pos); if (page_ops && page_ops->page_done) - page_ops->page_done(iter->inode, pos, ret, page); - put_page(page); + page_ops->page_done(iter->inode, pos, ret, &folio->page); + folio_put(folio); if (ret < len) iomap_write_failed(iter->inode, pos, len); @@ -738,6 +757,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) long status = 0; do { + struct folio *folio; struct page *page; unsigned long offset; /* Offset into pagecache page */ unsigned long bytes; /* Bytes to write to page */ @@ -761,16 +781,17 @@ again: break; } - status = iomap_write_begin(iter, pos, bytes, &page); + status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) break; + page = folio_file_page(folio, pos >> PAGE_SHIFT); if (mapping_writably_mapped(iter->inode->i_mapping)) flush_dcache_page(page); copied = copy_page_from_iter_atomic(page, offset, 
bytes, i); - status = iomap_write_end(iter, pos, bytes, copied, page); + status = iomap_write_end(iter, pos, bytes, copied, folio); if (unlikely(copied != status)) iov_iter_revert(i, copied - status); @@ -836,13 +857,13 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) do { unsigned long offset = offset_in_page(pos); unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length); - struct page *page; + struct folio *folio; - status = iomap_write_begin(iter, pos, bytes, &page); + status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) return status; - status = iomap_write_end(iter, pos, bytes, bytes, page); + status = iomap_write_end(iter, pos, bytes, bytes, folio); if (WARN_ON_ONCE(status == 0)) return -EIO; @@ -876,26 +897,8 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, } EXPORT_SYMBOL_GPL(iomap_file_unshare); -static s64 __iomap_zero_iter(struct iomap_iter *iter, loff_t pos, u64 length) -{ - struct page *page; - int status; - unsigned offset = offset_in_page(pos); - unsigned bytes = min_t(u64, PAGE_SIZE - offset, length); - - status = iomap_write_begin(iter, pos, bytes, &page); - if (status) - return status; - - zero_user(page, offset, bytes); - mark_page_accessed(page); - - return iomap_write_end(iter, pos, bytes, bytes, page); -} - static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) { - struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t pos = iter->pos; loff_t length = iomap_length(iter); @@ -906,14 +909,25 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) return length; do { - s64 bytes; + struct folio *folio; + int status; + size_t offset; + size_t bytes = min_t(u64, SIZE_MAX, length); - if (IS_DAX(iter->inode)) - bytes = dax_iomap_zero(pos, length, iomap); - else - bytes = __iomap_zero_iter(iter, pos, length); - if (bytes < 0) - return bytes; + status = iomap_write_begin(iter, pos, bytes, &folio); + if (status) + return status; + + offset = offset_in_folio(folio, pos); + if (bytes > folio_size(folio) - offset) + bytes = folio_size(folio) - offset; + + folio_zero_range(folio, offset, bytes); + folio_mark_accessed(folio); + + bytes = iomap_write_end(iter, pos, bytes, bytes, folio); + if (WARN_ON_ONCE(bytes == 0)) + return -EIO; pos += bytes; length -= bytes; @@ -957,21 +971,21 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, } EXPORT_SYMBOL_GPL(iomap_truncate_page); -static loff_t iomap_page_mkwrite_iter(struct iomap_iter *iter, - struct page *page) +static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter, + struct folio *folio) { loff_t length = iomap_length(iter); int ret; if (iter->iomap.flags & IOMAP_F_BUFFER_HEAD) { - ret = __block_write_begin_int(page, iter->pos, length, NULL, + ret = __block_write_begin_int(folio, iter->pos, length, NULL, &iter->iomap); if (ret) return ret; - block_commit_write(page, 0, length); + block_commit_write(&folio->page, 0, length); } else { - WARN_ON_ONCE(!PageUptodate(page)); - set_page_dirty(page); + WARN_ON_ONCE(!folio_test_uptodate(folio)); + folio_mark_dirty(folio); } return length; @@ -983,44 +997,43 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) .inode = file_inode(vmf->vma->vm_file), .flags = IOMAP_WRITE | IOMAP_FAULT, }; - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); ssize_t ret; - lock_page(page); - ret = page_mkwrite_check_truncate(page, iter.inode); + folio_lock(folio); + ret = 
folio_mkwrite_check_truncate(folio, iter.inode); if (ret < 0) goto out_unlock; - iter.pos = page_offset(page); + iter.pos = folio_pos(folio); iter.len = ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_page_mkwrite_iter(&iter, page); + iter.processed = iomap_folio_mkwrite_iter(&iter, folio); if (ret < 0) goto out_unlock; - wait_for_stable_page(page); + folio_wait_stable(folio); return VM_FAULT_LOCKED; out_unlock: - unlock_page(page); + folio_unlock(folio); return block_page_mkwrite_return(ret); } EXPORT_SYMBOL_GPL(iomap_page_mkwrite); -static void -iomap_finish_page_writeback(struct inode *inode, struct page *page, - int error, unsigned int len) +static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, + size_t len, int error) { - struct iomap_page *iop = to_iomap_page(page); + struct iomap_page *iop = to_iomap_page(folio); if (error) { - SetPageError(page); + folio_set_error(folio); mapping_set_error(inode->i_mapping, error); } - WARN_ON_ONCE(i_blocks_per_page(inode, page) > 1 && !iop); + WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !iop); WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) <= 0); if (!iop || atomic_sub_and_test(len, &iop->write_bytes_pending)) - end_page_writeback(page); + folio_end_writeback(folio); } /* @@ -1028,7 +1041,7 @@ iomap_finish_page_writeback(struct inode *inode, struct page *page, * state, release holds on bios, and finally free up memory. Do not use the * ioend after this. */ -static void +static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) { struct inode *inode = ioend->io_inode; @@ -1037,10 +1050,10 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) u64 start = bio->bi_iter.bi_sector; loff_t offset = ioend->io_offset; bool quiet = bio_flagged(bio, BIO_QUIET); + u32 folio_count = 0; for (bio = &ioend->io_inline_bio; bio; bio = next) { - struct bio_vec *bv; - struct bvec_iter_all iter_all; + struct folio_iter fi; /* * For the last bio, bi_private points to the ioend, so we @@ -1051,10 +1064,12 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) else next = bio->bi_private; - /* walk each page on bio, ending page IO on them */ - bio_for_each_segment_all(bv, bio, iter_all) - iomap_finish_page_writeback(inode, bv->bv_page, error, - bv->bv_len); + /* walk all folios in bio, ending page IO on them */ + bio_for_each_folio_all(fi, bio) { + iomap_finish_folio_write(inode, fi.folio, fi.length, + error); + folio_count++; + } bio_put(bio); } /* The ioend has been freed by bio_put() */ @@ -1064,20 +1079,36 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) "%s: writeback error on inode %lu, offset %lld, sector %llu", inode->i_sb->s_id, inode->i_ino, offset, start); } + return folio_count; } +/* + * Ioend completion routine for merged bios. This can only be called from task + * contexts as merged ioends can be of unbound length. Hence we have to break up + * the writeback completions into manageable chunks to avoid long scheduler + * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get + * good batch processing throughput without creating adverse scheduler latency + * conditions. 
+ */ void iomap_finish_ioends(struct iomap_ioend *ioend, int error) { struct list_head tmp; + u32 completions; + + might_sleep(); list_replace_init(&ioend->io_list, &tmp); - iomap_finish_ioend(ioend, error); + completions = iomap_finish_ioend(ioend, error); while (!list_empty(&tmp)) { + if (completions > IOEND_BATCH_SIZE * 8) { + cond_resched(); + completions = 0; + } ioend = list_first_entry(&tmp, struct iomap_ioend, io_list); list_del_init(&ioend->io_list); - iomap_finish_ioend(ioend, error); + completions += iomap_finish_ioend(ioend, error); } } EXPORT_SYMBOL_GPL(iomap_finish_ioends); @@ -1098,6 +1129,18 @@ iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) return false; if (ioend->io_offset + ioend->io_size != next->io_offset) return false; + /* + * Do not merge physically discontiguous ioends. The filesystem + * completion functions will have to iterate the physical + * discontiguities even if we merge the ioends at a logical level, so + * we don't gain anything by merging physical discontiguities here. + * + * We cannot use bio->bi_iter.bi_sector here as it is modified during + * submission so does not point to the start sector of the bio at + * completion. + */ + if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector) + return false; return true; } @@ -1199,8 +1242,10 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, ioend->io_flags = wpc->iomap.flags; ioend->io_inode = inode; ioend->io_size = 0; + ioend->io_folios = 0; ioend->io_offset = offset; ioend->io_bio = bio; + ioend->io_sector = sector; return ioend; } @@ -1241,6 +1286,13 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, return false; if (sector != bio_end_sector(wpc->ioend->io_bio)) return false; + /* + * Limit ioend bio chain lengths to minimise IO completion latency. This + * also prevents long tight loops ending page writeback on all the + * folios in the ioend. + */ + if (wpc->ioend->io_folios >= IOEND_BATCH_SIZE) + return false; return true; } @@ -1249,29 +1301,29 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, * first; otherwise finish off the current ioend and start another. 
*/ static void -iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, +iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, struct iomap_page *iop, struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct list_head *iolist) { - sector_t sector = iomap_sector(&wpc->iomap, offset); + sector_t sector = iomap_sector(&wpc->iomap, pos); unsigned len = i_blocksize(inode); - unsigned poff = offset & (PAGE_SIZE - 1); + size_t poff = offset_in_folio(folio, pos); - if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, offset, sector)) { + if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, sector)) { if (wpc->ioend) list_add(&wpc->ioend->io_list, iolist); - wpc->ioend = iomap_alloc_ioend(inode, wpc, offset, sector, wbc); + wpc->ioend = iomap_alloc_ioend(inode, wpc, pos, sector, wbc); } - if (bio_add_page(wpc->ioend->io_bio, page, len, poff) != len) { + if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) { wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio); - __bio_add_page(wpc->ioend->io_bio, page, len, poff); + bio_add_folio(wpc->ioend->io_bio, folio, len, poff); } if (iop) atomic_add(len, &iop->write_bytes_pending); wpc->ioend->io_size += len; - wbc_account_cgroup_owner(wbc, page, len); + wbc_account_cgroup_owner(wbc, &folio->page, len); } /* @@ -1293,44 +1345,45 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct inode *inode, - struct page *page, u64 end_offset) + struct folio *folio, u64 end_pos) { - struct iomap_page *iop = iomap_page_create(inode, page); + struct iomap_page *iop = iomap_page_create(inode, folio); struct iomap_ioend *ioend, *next; unsigned len = i_blocksize(inode); - u64 file_offset; /* file offset of page */ + unsigned nblocks = i_blocks_per_folio(inode, folio); + u64 pos = folio_pos(folio); int error = 0, count = 0, i; LIST_HEAD(submit_list); WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0); /* - * Walk through the page to find areas to write back. If we run off the - * end of the current map or find the current map invalid, grab a new - * one. + * Walk through the folio to find areas to write back. If we + * run off the end of the current map or find the current map + * invalid, grab a new one. */ - for (i = 0, file_offset = page_offset(page); - i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset; - i++, file_offset += len) { + for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) { if (iop && !test_bit(i, iop->uptodate)) continue; - error = wpc->ops->map_blocks(wpc, inode, file_offset); + error = wpc->ops->map_blocks(wpc, inode, pos); if (error) break; if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE)) continue; if (wpc->iomap.type == IOMAP_HOLE) continue; - iomap_add_to_ioend(inode, file_offset, page, iop, wpc, wbc, + iomap_add_to_ioend(inode, pos, folio, iop, wpc, wbc, &submit_list); count++; } + if (count) + wpc->ioend->io_folios++; WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list)); - WARN_ON_ONCE(!PageLocked(page)); - WARN_ON_ONCE(PageWriteback(page)); - WARN_ON_ONCE(PageDirty(page)); + WARN_ON_ONCE(!folio_test_locked(folio)); + WARN_ON_ONCE(folio_test_writeback(folio)); + WARN_ON_ONCE(folio_test_dirty(folio)); /* * We cannot cancel the ioend directly here on error. We may have @@ -1345,17 +1398,17 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, * won't be affected by I/O completion and we must unlock it * now. 
*/ - if (wpc->ops->discard_page) - wpc->ops->discard_page(page, file_offset); + if (wpc->ops->discard_folio) + wpc->ops->discard_folio(folio, pos); if (!count) { - ClearPageUptodate(page); - unlock_page(page); + folio_clear_uptodate(folio); + folio_unlock(folio); goto done; } } - set_page_writeback(page); - unlock_page(page); + folio_start_writeback(folio); + folio_unlock(folio); /* * Preserve the original error if there was one; catch @@ -1376,9 +1429,9 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, * with a partial page truncate on a sub-page block sized filesystem. */ if (!count) - end_page_writeback(page); + folio_end_writeback(folio); done: - mapping_set_error(page->mapping, error); + mapping_set_error(folio->mapping, error); return error; } @@ -1392,16 +1445,15 @@ done: static int iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) { + struct folio *folio = page_folio(page); struct iomap_writepage_ctx *wpc = data; - struct inode *inode = page->mapping->host; - pgoff_t end_index; - u64 end_offset; - loff_t offset; + struct inode *inode = folio->mapping->host; + u64 end_pos, isize; - trace_iomap_writepage(inode, page_offset(page), PAGE_SIZE); + trace_iomap_writepage(inode, folio_pos(folio), folio_size(folio)); /* - * Refuse to write the page out if we're called from reclaim context. + * Refuse to write the folio out if we're called from reclaim context. * * This avoids stack overflows when called from deeply used stacks in * random callers for direct reclaim or memcg reclaim. We explicitly @@ -1415,10 +1467,10 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) goto redirty; /* - * Is this page beyond the end of the file? + * Is this folio beyond the end of the file? * - * The page index is less than the end_index, adjust the end_offset - * to the highest offset that this page should represent. + * The folio index is less than the end_index, adjust the end_pos + * to the highest offset that this folio should represent. * ----------------------------------------------------- * | file mapping | <EOF> | * ----------------------------------------------------- @@ -1427,11 +1479,9 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) * | desired writeback range | see else | * ---------------------------------^------------------| */ - offset = i_size_read(inode); - end_index = offset >> PAGE_SHIFT; - if (page->index < end_index) - end_offset = (loff_t)(page->index + 1) << PAGE_SHIFT; - else { + isize = i_size_read(inode); + end_pos = folio_pos(folio) + folio_size(folio); + if (end_pos > isize) { /* * Check whether the page to write out is beyond or straddles * i_size or not. @@ -1443,7 +1493,8 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) * | | Straddles | * ---------------------------------^-----------|--------| */ - unsigned offset_into_page = offset & (PAGE_SIZE - 1); + size_t poff = offset_in_folio(folio, isize); + pgoff_t end_index = isize >> PAGE_SHIFT; /* * Skip the page if it's fully outside i_size, e.g. due to a @@ -1462,8 +1513,8 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) * checking if the page is totally beyond i_size or if its * offset is just equal to the EOF. 
*/ - if (page->index > end_index || - (page->index == end_index && offset_into_page == 0)) + if (folio->index > end_index || + (folio->index == end_index && poff == 0)) goto redirty; /* @@ -1474,17 +1525,15 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) * memory is zeroed when mapped, and writes to that region are * not written out to the file." */ - zero_user_segment(page, offset_into_page, PAGE_SIZE); - - /* Adjust the end_offset to the end of file */ - end_offset = offset; + folio_zero_segment(folio, poff, folio_size(folio)); + end_pos = isize; } - return iomap_writepage_map(wpc, wbc, inode, page, end_offset); + return iomap_writepage_map(wpc, wbc, inode, folio, end_pos); redirty: - redirty_page_for_writepage(wbc, page); - unlock_page(page); + folio_redirty_for_writepage(wbc, folio); + folio_unlock(folio); return 0; } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b4dc51063d36..03ea367df19a 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -6,6 +6,7 @@ #include <linux/module.h> #include <linux/compiler.h> #include <linux/fs.h> +#include <linux/pagemap.h> #include <linux/iomap.h> #include <linux/backing-dev.h> #include <linux/uio.h> diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 3cc4ab2ba7f4..5b9408e3b370 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -484,22 +484,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start, stats.run.rs_locked); - spin_lock(&commit_transaction->t_handle_lock); - while (atomic_read(&commit_transaction->t_updates)) { - DEFINE_WAIT(wait); + // waits for any t_updates to finish + jbd2_journal_wait_updates(journal); - prepare_to_wait(&journal->j_wait_updates, &wait, - TASK_UNINTERRUPTIBLE); - if (atomic_read(&commit_transaction->t_updates)) { - spin_unlock(&commit_transaction->t_handle_lock); - write_unlock(&journal->j_state_lock); - schedule(); - write_lock(&journal->j_state_lock); - spin_lock(&commit_transaction->t_handle_lock); - } - finish_wait(&journal->j_wait_updates, &wait); - } - spin_unlock(&commit_transaction->t_handle_lock); commit_transaction->t_state = T_SWITCH; write_unlock(&journal->j_state_lock); @@ -817,7 +804,7 @@ start_journal_io: commit_transaction->t_state = T_COMMIT_DFLUSH; write_unlock(&journal->j_state_lock); - /* + /* * If the journal is not located on the file system device, * then we must flush the file system device before we issue * the commit record @@ -1170,7 +1157,7 @@ restart_loop: if (journal->j_commit_callback) journal->j_commit_callback(journal, commit_transaction); if (journal->j_fc_cleanup_callback) - journal->j_fc_cleanup_callback(journal, 1); + journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid); trace_jbd2_end_commit(journal, commit_transaction); jbd_debug(1, "JBD2: commit %d complete, head %d\n", diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 35302bc192eb..c2cf74b01ddb 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -757,6 +757,7 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid) } journal->j_flags |= JBD2_FAST_COMMIT_ONGOING; write_unlock(&journal->j_state_lock); + jbd2_journal_lock_updates(journal); return 0; } @@ -768,8 +769,9 @@ EXPORT_SYMBOL(jbd2_fc_begin_commit); */ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback) { + jbd2_journal_unlock_updates(journal); if (journal->j_fc_cleanup_callback) - journal->j_fc_cleanup_callback(journal, 0); + journal->j_fc_cleanup_callback(journal, 0, tid); 
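	/*
	 * Editor's aside -- a hedged sketch, not part of the patch: with the
	 * transaction barrier now taken inside jbd2_fc_begin_commit() and
	 * dropped here in __jbd2_fc_end_commit(), a filesystem's fast-commit
	 * path only needs to bracket its work with the begin/end helpers,
	 * roughly:
	 *
	 *	if (jbd2_fc_begin_commit(journal, tid))
	 *		return jbd2_complete_transaction(journal, tid);
	 *	... write fast-commit blocks ...
	 *	return jbd2_fc_end_commit(journal);
	 *
	 * The helper names are existing jbd2 APIs; the control flow is an
	 * illustrative simplification, not copied from any filesystem.
	 */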
write_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING; if (fallback) @@ -1210,7 +1212,7 @@ static const struct seq_operations jbd2_seq_info_ops = { static int jbd2_seq_info_open(struct inode *inode, struct file *file) { - journal_t *journal = PDE_DATA(inode); + journal_t *journal = pde_data(inode); struct jbd2_stats_proc_session *s; int rc, size; @@ -1285,6 +1287,8 @@ static int jbd2_min_tag_size(void) /** * jbd2_journal_shrink_scan() + * @shrink: shrinker to work on + * @sc: reclaim request to process * * Scan the checkpointed buffer on the checkpoint list and release the * journal_head. @@ -1310,6 +1314,8 @@ static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, /** * jbd2_journal_shrink_count() + * @shrink: shrinker to work on + * @sc: reclaim request to process * * Count the number of checkpoint buffers on the checkpoint list. */ @@ -2970,6 +2976,7 @@ struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh) jbd_unlock_bh_journal_head(bh); return jh; } +EXPORT_SYMBOL(jbd2_journal_grab_journal_head); static void __journal_remove_journal_head(struct buffer_head *bh) { @@ -3022,6 +3029,7 @@ void jbd2_journal_put_journal_head(struct journal_head *jh) jbd_unlock_bh_journal_head(bh); } } +EXPORT_SYMBOL(jbd2_journal_put_journal_head); /* * Initialize jbd inode head diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 6a3caedd2285..8e2f8275a253 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -449,7 +449,7 @@ repeat: } /* OK, account for the buffers that this operation expects to - * use and add the handle to the running transaction. + * use and add the handle to the running transaction. */ update_t_max_wait(transaction, ts); handle->h_transaction = transaction; @@ -836,6 +836,35 @@ int jbd2_journal_restart(handle_t *handle, int nblocks) } EXPORT_SYMBOL(jbd2_journal_restart); +/* + * Waits for any outstanding t_updates to finish. + * This is called with write j_state_lock held. + */ +void jbd2_journal_wait_updates(journal_t *journal) +{ + transaction_t *commit_transaction = journal->j_running_transaction; + + if (!commit_transaction) + return; + + spin_lock(&commit_transaction->t_handle_lock); + while (atomic_read(&commit_transaction->t_updates)) { + DEFINE_WAIT(wait); + + prepare_to_wait(&journal->j_wait_updates, &wait, + TASK_UNINTERRUPTIBLE); + if (atomic_read(&commit_transaction->t_updates)) { + spin_unlock(&commit_transaction->t_handle_lock); + write_unlock(&journal->j_state_lock); + schedule(); + write_lock(&journal->j_state_lock); + spin_lock(&commit_transaction->t_handle_lock); + } + finish_wait(&journal->j_wait_updates, &wait); + } + spin_unlock(&commit_transaction->t_handle_lock); +} + /** * jbd2_journal_lock_updates () - establish a transaction barrier. * @journal: Journal to establish a barrier on. 
@@ -863,27 +892,9 @@ void jbd2_journal_lock_updates(journal_t *journal) write_lock(&journal->j_state_lock); } - /* Wait until there are no running updates */ - while (1) { - transaction_t *transaction = journal->j_running_transaction; - - if (!transaction) - break; + /* Wait until there are no running t_updates */ + jbd2_journal_wait_updates(journal); - spin_lock(&transaction->t_handle_lock); - prepare_to_wait(&journal->j_wait_updates, &wait, - TASK_UNINTERRUPTIBLE); - if (!atomic_read(&transaction->t_updates)) { - spin_unlock(&transaction->t_handle_lock); - finish_wait(&journal->j_wait_updates, &wait); - break; - } - spin_unlock(&transaction->t_handle_lock); - write_unlock(&journal->j_state_lock); - schedule(); - finish_wait(&journal->j_wait_updates, &wait); - write_lock(&journal->j_state_lock); - } write_unlock(&journal->j_state_lock); /* diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index 2b4d5013dc5d..6da92ecaf66d 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -161,5 +161,5 @@ static int jffs2_garbage_collect_thread(void *_c) spin_lock(&c->erase_completion_lock); c->gc_task = NULL; spin_unlock(&c->erase_completion_lock); - complete_and_exit(&c->gc_thread_exit, 0); + kthread_complete_and_exit(&c->gc_thread_exit, 0); } diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 4fc8cd698d1a..bd7d58d27bfc 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -136,20 +136,15 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, struct page *pg; struct inode *inode = mapping->host; struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); pgoff_t index = pos >> PAGE_SHIFT; uint32_t pageofs = index << PAGE_SHIFT; int ret = 0; - pg = grab_cache_page_write_begin(mapping, index, flags); - if (!pg) - return -ENOMEM; - *pagep = pg; - jffs2_dbg(1, "%s()\n", __func__); if (pageofs > inode->i_size) { /* Make new hole frag from old EOF to new page */ - struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); struct jffs2_raw_inode ri; struct jffs2_full_dnode *fn; uint32_t alloc_len; @@ -160,7 +155,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); if (ret) - goto out_page; + goto out_err; mutex_lock(&f->sem); memset(&ri, 0, sizeof(ri)); @@ -190,7 +185,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, ret = PTR_ERR(fn); jffs2_complete_reservation(c); mutex_unlock(&f->sem); - goto out_page; + goto out_err; } ret = jffs2_add_full_dnode_to_inode(c, f, fn); if (f->metadata) { @@ -205,7 +200,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, jffs2_free_full_dnode(fn); jffs2_complete_reservation(c); mutex_unlock(&f->sem); - goto out_page; + goto out_err; } jffs2_complete_reservation(c); inode->i_size = pageofs; @@ -213,6 +208,19 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, } /* + * While getting a page and reading data in, lock c->alloc_sem until + * the page is Uptodate. Otherwise GC task may attempt to read the same + * page in read_cache_page(), which causes a deadlock. + */ + mutex_lock(&c->alloc_sem); + pg = grab_cache_page_write_begin(mapping, index, flags); + if (!pg) { + ret = -ENOMEM; + goto release_sem; + } + *pagep = pg; + + /* * Read in the page if it wasn't already present. 
Cannot optimize away * the whole page write case until jffs2_write_end can handle the * case of a short-copy. @@ -221,15 +229,17 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, mutex_lock(&f->sem); ret = jffs2_do_readpage_nolock(inode, pg); mutex_unlock(&f->sem); - if (ret) - goto out_page; + if (ret) { + unlock_page(pg); + put_page(pg); + goto release_sem; + } } jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags); - return ret; -out_page: - unlock_page(pg); - put_page(pg); +release_sem: + mutex_unlock(&c->alloc_sem); +out_err: return ret; } diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 8e0a1378a4b1..e6d9772ddb4c 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -17,7 +17,6 @@ #include "kernfs-internal.h" -DECLARE_RWSEM(kernfs_rwsem); static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */ static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ @@ -26,7 +25,7 @@ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ static bool kernfs_active(struct kernfs_node *kn) { - lockdep_assert_held(&kernfs_rwsem); + lockdep_assert_held(&kernfs_root(kn)->kernfs_rwsem); return atomic_read(&kn->active) >= 0; } @@ -457,14 +456,15 @@ void kernfs_put_active(struct kernfs_node *kn) * return after draining is complete. */ static void kernfs_drain(struct kernfs_node *kn) - __releases(&kernfs_rwsem) __acquires(&kernfs_rwsem) + __releases(&kernfs_root(kn)->kernfs_rwsem) + __acquires(&kernfs_root(kn)->kernfs_rwsem) { struct kernfs_root *root = kernfs_root(kn); - lockdep_assert_held_write(&kernfs_rwsem); + lockdep_assert_held_write(&root->kernfs_rwsem); WARN_ON_ONCE(kernfs_active(kn)); - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); if (kernfs_lockdep(kn)) { rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); @@ -483,7 +483,7 @@ static void kernfs_drain(struct kernfs_node *kn) kernfs_drain_open_files(kn); - down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem); } /** @@ -718,11 +718,12 @@ err_unlock: int kernfs_add_one(struct kernfs_node *kn) { struct kernfs_node *parent = kn->parent; + struct kernfs_root *root = kernfs_root(parent); struct kernfs_iattrs *ps_iattr; bool has_ns; int ret; - down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem); ret = -EINVAL; has_ns = kernfs_ns_enabled(parent); @@ -753,7 +754,7 @@ int kernfs_add_one(struct kernfs_node *kn) ps_iattr->ia_mtime = ps_iattr->ia_ctime; } - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); /* * Activate the new node unless CREATE_DEACTIVATED is requested. 
@@ -767,7 +768,7 @@ int kernfs_add_one(struct kernfs_node *kn) return 0; out_unlock: - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); return ret; } @@ -788,7 +789,7 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, bool has_ns = kernfs_ns_enabled(parent); unsigned int hash; - lockdep_assert_held(&kernfs_rwsem); + lockdep_assert_held(&kernfs_root(parent)->kernfs_rwsem); if (has_ns != (bool)ns) { WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", @@ -820,7 +821,7 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, size_t len; char *p, *name; - lockdep_assert_held_read(&kernfs_rwsem); + lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem); /* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */ spin_lock_irq(&kernfs_rename_lock); @@ -859,11 +860,12 @@ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns) { struct kernfs_node *kn; + struct kernfs_root *root = kernfs_root(parent); - down_read(&kernfs_rwsem); + down_read(&root->kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); kernfs_get(kn); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); return kn; } @@ -883,11 +885,12 @@ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const void *ns) { struct kernfs_node *kn; + struct kernfs_root *root = kernfs_root(parent); - down_read(&kernfs_rwsem); + down_read(&root->kernfs_rwsem); kn = kernfs_walk_ns(parent, path, ns); kernfs_get(kn); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); return kn; } @@ -912,6 +915,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, return ERR_PTR(-ENOMEM); idr_init(&root->ino_idr); + init_rwsem(&root->kernfs_rwsem); INIT_LIST_HEAD(&root->supers); /* @@ -957,7 +961,13 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, */ void kernfs_destroy_root(struct kernfs_root *root) { - kernfs_remove(root->kn); /* will also free @root */ + /* + * kernfs_remove holds kernfs_rwsem from the root so the root + * shouldn't be freed during the operation. + */ + kernfs_get(root->kn); + kernfs_remove(root->kn); + kernfs_put(root->kn); /* will also free @root */ } /** @@ -1035,6 +1045,7 @@ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) { struct kernfs_node *kn; + struct kernfs_root *root; if (flags & LOOKUP_RCU) return -ECHILD; @@ -1046,18 +1057,19 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) /* If the kernfs parent node has changed discard and * proceed to ->lookup. */ - down_read(&kernfs_rwsem); spin_lock(&dentry->d_lock); parent = kernfs_dentry_node(dentry->d_parent); if (parent) { + spin_unlock(&dentry->d_lock); + root = kernfs_root(parent); + down_read(&root->kernfs_rwsem); if (kernfs_dir_changed(parent, dentry)) { - spin_unlock(&dentry->d_lock); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); return 0; } - } - spin_unlock(&dentry->d_lock); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); + } else + spin_unlock(&dentry->d_lock); /* The kernfs parent node hasn't changed, leave the * dentry negative and return success. 
@@ -1066,7 +1078,8 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) } kn = kernfs_dentry_node(dentry); - down_read(&kernfs_rwsem); + root = kernfs_root(kn); + down_read(&root->kernfs_rwsem); /* The kernfs node has been deactivated */ if (!kernfs_active(kn)) @@ -1085,10 +1098,10 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) kernfs_info(dentry->d_sb)->ns != kn->ns) goto out_bad; - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); return 1; out_bad: - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); return 0; } @@ -1102,10 +1115,12 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, { struct kernfs_node *parent = dir->i_private; struct kernfs_node *kn; + struct kernfs_root *root; struct inode *inode = NULL; const void *ns = NULL; - down_read(&kernfs_rwsem); + root = kernfs_root(parent); + down_read(&root->kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dir->i_sb)->ns; @@ -1116,7 +1131,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, * create a negative. */ if (!kernfs_active(kn)) { - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); return NULL; } inode = kernfs_get_inode(dir->i_sb, kn); @@ -1131,7 +1146,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, */ if (!IS_ERR(inode)) kernfs_set_rev(parent, dentry); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); /* instantiate and hash (possibly negative) dentry */ return d_splice_alias(inode, dentry); @@ -1254,7 +1269,7 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, { struct rb_node *rbn; - lockdep_assert_held_write(&kernfs_rwsem); + lockdep_assert_held_write(&kernfs_root(root)->kernfs_rwsem); /* if first iteration, visit leftmost descendant which may be root */ if (!pos) @@ -1289,8 +1304,9 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, void kernfs_activate(struct kernfs_node *kn) { struct kernfs_node *pos; + struct kernfs_root *root = kernfs_root(kn); - down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem); pos = NULL; while ((pos = kernfs_next_descendant_post(pos, kn))) { @@ -1304,14 +1320,14 @@ void kernfs_activate(struct kernfs_node *kn) pos->flags |= KERNFS_ACTIVATED; } - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); } static void __kernfs_remove(struct kernfs_node *kn) { struct kernfs_node *pos; - lockdep_assert_held_write(&kernfs_rwsem); + lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem); /* * Short-circuit if non-root @kn has already finished removal. 
@@ -1381,9 +1397,11 @@ static void __kernfs_remove(struct kernfs_node *kn) */ void kernfs_remove(struct kernfs_node *kn) { - down_write(&kernfs_rwsem); + struct kernfs_root *root = kernfs_root(kn); + + down_write(&root->kernfs_rwsem); __kernfs_remove(kn); - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); } /** @@ -1469,8 +1487,9 @@ void kernfs_unbreak_active_protection(struct kernfs_node *kn) bool kernfs_remove_self(struct kernfs_node *kn) { bool ret; + struct kernfs_root *root = kernfs_root(kn); - down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem); kernfs_break_active_protection(kn); /* @@ -1498,9 +1517,9 @@ bool kernfs_remove_self(struct kernfs_node *kn) atomic_read(&kn->active) == KN_DEACTIVATED_BIAS) break; - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); schedule(); - down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem); } finish_wait(waitq, &wait); WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb)); @@ -1513,7 +1532,7 @@ bool kernfs_remove_self(struct kernfs_node *kn) */ kernfs_unbreak_active_protection(kn); - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); return ret; } @@ -1530,6 +1549,7 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns) { struct kernfs_node *kn; + struct kernfs_root *root; if (!parent) { WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n", @@ -1537,13 +1557,14 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, return -ENOENT; } - down_write(&kernfs_rwsem); + root = kernfs_root(parent); + down_write(&root->kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); if (kn) __kernfs_remove(kn); - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); if (kn) return 0; @@ -1562,6 +1583,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns) { struct kernfs_node *old_parent; + struct kernfs_root *root; const char *old_name = NULL; int error; @@ -1569,7 +1591,8 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, if (!kn->parent) return -EINVAL; - down_write(&kernfs_rwsem); + root = kernfs_root(kn); + down_write(&root->kernfs_rwsem); error = -ENOENT; if (!kernfs_active(kn) || !kernfs_active(new_parent) || @@ -1623,7 +1646,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, error = 0; out: - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); return error; } @@ -1694,11 +1717,14 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) struct dentry *dentry = file->f_path.dentry; struct kernfs_node *parent = kernfs_dentry_node(dentry); struct kernfs_node *pos = file->private_data; + struct kernfs_root *root; const void *ns = NULL; if (!dir_emit_dots(file, ctx)) return 0; - down_read(&kernfs_rwsem); + + root = kernfs_root(parent); + down_read(&root->kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dentry->d_sb)->ns; @@ -1715,12 +1741,12 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) file->private_data = pos; kernfs_get(pos); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); if (!dir_emit(ctx, name, len, ino, type)) return 0; - down_read(&kernfs_rwsem); + down_read(&root->kernfs_rwsem); } - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); file->private_data = NULL; ctx->pos = INT_MAX; return 0; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 60e2a86c535e..9414a7a60a9f 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -847,6 +847,7 @@ 
static void kernfs_notify_workfn(struct work_struct *work) { struct kernfs_node *kn; struct kernfs_super_info *info; + struct kernfs_root *root; repeat: /* pop one off the notify_list */ spin_lock_irq(&kernfs_notify_lock); @@ -859,8 +860,9 @@ repeat: kn->attr.notify_next = NULL; spin_unlock_irq(&kernfs_notify_lock); + root = kernfs_root(kn); /* kick fsnotify */ - down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem); list_for_each_entry(info, &kernfs_root(kn)->supers, node) { struct kernfs_node *parent; @@ -898,7 +900,7 @@ repeat: iput(inode); } - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); kernfs_put(kn); goto repeat; } diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index c0eae1725435..3d783d80f5da 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -99,10 +99,11 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { int ret; + struct kernfs_root *root = kernfs_root(kn); - down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem); ret = __kernfs_setattr(kn, iattr); - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); return ret; } @@ -111,12 +112,14 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, { struct inode *inode = d_inode(dentry); struct kernfs_node *kn = inode->i_private; + struct kernfs_root *root; int error; if (!kn) return -EINVAL; - down_write(&kernfs_rwsem); + root = kernfs_root(kn); + down_write(&root->kernfs_rwsem); error = setattr_prepare(&init_user_ns, dentry, iattr); if (error) goto out; @@ -129,7 +132,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, setattr_copy(&init_user_ns, inode, iattr); out: - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); return error; } @@ -184,13 +187,14 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns, { struct inode *inode = d_inode(path->dentry); struct kernfs_node *kn = inode->i_private; + struct kernfs_root *root = kernfs_root(kn); - down_read(&kernfs_rwsem); + down_read(&root->kernfs_rwsem); spin_lock(&inode->i_lock); kernfs_refresh_inode(kn, inode); generic_fillattr(&init_user_ns, inode, stat); spin_unlock(&inode->i_lock); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); return 0; } @@ -274,19 +278,21 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask) { struct kernfs_node *kn; + struct kernfs_root *root; int ret; if (mask & MAY_NOT_BLOCK) return -ECHILD; kn = inode->i_private; + root = kernfs_root(kn); - down_read(&kernfs_rwsem); + down_read(&root->kernfs_rwsem); spin_lock(&inode->i_lock); kernfs_refresh_inode(kn, inode); ret = generic_permission(&init_user_ns, inode, mask); spin_unlock(&inode->i_lock); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); return ret; } diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index f2f909d09f52..cfa79715fc1a 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -236,6 +236,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc) { struct kernfs_super_info *info = kernfs_info(sb); + struct kernfs_root *kf_root = kfc->root; struct inode *inode; struct dentry *root; @@ -255,9 +256,9 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k sb->s_shrink.seeks = 0; /* get root inode, initialize and unlock it */ - down_read(&kernfs_rwsem); + down_read(&kf_root->kernfs_rwsem); inode = kernfs_get_inode(sb, 
info->root->kn); - up_read(&kernfs_rwsem); + up_read(&kf_root->kernfs_rwsem); if (!inode) { pr_debug("kernfs: could not get root inode\n"); return -ENOMEM; @@ -334,6 +335,7 @@ int kernfs_get_tree(struct fs_context *fc) if (!sb->s_root) { struct kernfs_super_info *info = kernfs_info(sb); + struct kernfs_root *root = kfc->root; kfc->new_sb_created = true; @@ -344,9 +346,9 @@ int kernfs_get_tree(struct fs_context *fc) } sb->s_flags |= SB_ACTIVE; - down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem); list_add(&info->node, &info->root->supers); - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); } fc->root = dget(sb->s_root); @@ -371,10 +373,11 @@ void kernfs_free_fs_context(struct fs_context *fc) void kernfs_kill_sb(struct super_block *sb) { struct kernfs_super_info *info = kernfs_info(sb); + struct kernfs_root *root = info->root; - down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem); list_del(&info->node); - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); /* * Remove the superblock from fs_supers/s_instances diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 19a6c71c6ff5..0ab13824822f 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -113,11 +113,12 @@ static int kernfs_getlink(struct inode *inode, char *path) struct kernfs_node *kn = inode->i_private; struct kernfs_node *parent = kn->parent; struct kernfs_node *target = kn->symlink.target_kn; + struct kernfs_root *root = kernfs_root(parent); int error; - down_read(&kernfs_rwsem); + down_read(&root->kernfs_rwsem); error = kernfs_get_target_path(parent, target, path); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); return error; } diff --git a/fs/ksmbd/asn1.c b/fs/ksmbd/asn1.c index b014f4638610..c03eba090368 100644 --- a/fs/ksmbd/asn1.c +++ b/fs/ksmbd/asn1.c @@ -21,101 +21,11 @@ #include "ksmbd_spnego_negtokeninit.asn1.h" #include "ksmbd_spnego_negtokentarg.asn1.h" -#define SPNEGO_OID_LEN 7 #define NTLMSSP_OID_LEN 10 -#define KRB5_OID_LEN 7 -#define KRB5U2U_OID_LEN 8 -#define MSKRB5_OID_LEN 7 -static unsigned long SPNEGO_OID[7] = { 1, 3, 6, 1, 5, 5, 2 }; -static unsigned long NTLMSSP_OID[10] = { 1, 3, 6, 1, 4, 1, 311, 2, 2, 10 }; -static unsigned long KRB5_OID[7] = { 1, 2, 840, 113554, 1, 2, 2 }; -static unsigned long KRB5U2U_OID[8] = { 1, 2, 840, 113554, 1, 2, 2, 3 }; -static unsigned long MSKRB5_OID[7] = { 1, 2, 840, 48018, 1, 2, 2 }; static char NTLMSSP_OID_STR[NTLMSSP_OID_LEN] = { 0x2b, 0x06, 0x01, 0x04, 0x01, 0x82, 0x37, 0x02, 0x02, 0x0a }; -static bool -asn1_subid_decode(const unsigned char **begin, const unsigned char *end, - unsigned long *subid) -{ - const unsigned char *ptr = *begin; - unsigned char ch; - - *subid = 0; - - do { - if (ptr >= end) - return false; - - ch = *ptr++; - *subid <<= 7; - *subid |= ch & 0x7F; - } while ((ch & 0x80) == 0x80); - - *begin = ptr; - return true; -} - -static bool asn1_oid_decode(const unsigned char *value, size_t vlen, - unsigned long **oid, size_t *oidlen) -{ - const unsigned char *iptr = value, *end = value + vlen; - unsigned long *optr; - unsigned long subid; - - vlen += 1; - if (vlen < 2 || vlen > UINT_MAX / sizeof(unsigned long)) - goto fail_nullify; - - *oid = kmalloc(vlen * sizeof(unsigned long), GFP_KERNEL); - if (!*oid) - return false; - - optr = *oid; - - if (!asn1_subid_decode(&iptr, end, &subid)) - goto fail; - - if (subid < 40) { - optr[0] = 0; - optr[1] = subid; - } else if (subid < 80) { - optr[0] = 1; - optr[1] = subid - 40; - } else { - optr[0] = 2; - optr[1] = subid - 80; - } - - *oidlen = 2; - optr += 2; - - 
while (iptr < end) { - if (++(*oidlen) > vlen) - goto fail; - - if (!asn1_subid_decode(&iptr, end, optr++)) - goto fail; - } - return true; - -fail: - kfree(*oid); -fail_nullify: - *oid = NULL; - return false; -} - -static bool oid_eq(unsigned long *oid1, unsigned int oid1len, - unsigned long *oid2, unsigned int oid2len) -{ - if (oid1len != oid2len) - return false; - - return memcmp(oid1, oid2, oid1len) == 0; -} - int ksmbd_decode_negTokenInit(unsigned char *security_blob, int length, struct ksmbd_conn *conn) @@ -252,26 +162,18 @@ int build_spnego_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen, int ksmbd_gssapi_this_mech(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { - unsigned long *oid; - size_t oidlen; - int err = 0; - - if (!asn1_oid_decode(value, vlen, &oid, &oidlen)) { - err = -EBADMSG; - goto out; - } + enum OID oid; - if (!oid_eq(oid, oidlen, SPNEGO_OID, SPNEGO_OID_LEN)) - err = -EBADMSG; - kfree(oid); -out: - if (err) { + oid = look_up_OID(value, vlen); + if (oid != OID_spnego) { char buf[50]; sprint_oid(value, vlen, buf, sizeof(buf)); ksmbd_debug(AUTH, "Unexpected OID: %s\n", buf); + return -EBADMSG; } - return err; + + return 0; } int ksmbd_neg_token_init_mech_type(void *context, size_t hdrlen, @@ -279,37 +181,31 @@ int ksmbd_neg_token_init_mech_type(void *context, size_t hdrlen, size_t vlen) { struct ksmbd_conn *conn = context; - unsigned long *oid; - size_t oidlen; + enum OID oid; int mech_type; - char buf[50]; - if (!asn1_oid_decode(value, vlen, &oid, &oidlen)) - goto fail; - - if (oid_eq(oid, oidlen, NTLMSSP_OID, NTLMSSP_OID_LEN)) + oid = look_up_OID(value, vlen); + if (oid == OID_ntlmssp) { mech_type = KSMBD_AUTH_NTLMSSP; - else if (oid_eq(oid, oidlen, MSKRB5_OID, MSKRB5_OID_LEN)) + } else if (oid == OID_mskrb5) { mech_type = KSMBD_AUTH_MSKRB5; - else if (oid_eq(oid, oidlen, KRB5_OID, KRB5_OID_LEN)) + } else if (oid == OID_krb5) { mech_type = KSMBD_AUTH_KRB5; - else if (oid_eq(oid, oidlen, KRB5U2U_OID, KRB5U2U_OID_LEN)) + } else if (oid == OID_krb5u2u) { mech_type = KSMBD_AUTH_KRB5U2U; - else - goto fail; + } else { + char buf[50]; + + sprint_oid(value, vlen, buf, sizeof(buf)); + ksmbd_debug(AUTH, "Unexpected OID: %s\n", buf); + return -EBADMSG; + } conn->auth_mechs |= mech_type; if (conn->preferred_auth_mech == 0) conn->preferred_auth_mech = mech_type; - kfree(oid); return 0; - -fail: - kfree(oid); - sprint_oid(value, vlen, buf, sizeof(buf)); - ksmbd_debug(AUTH, "Unexpected OID: %s\n", buf); - return -EBADMSG; } int ksmbd_neg_token_init_mech_token(void *context, size_t hdrlen, diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c index 3503b1c48cb4..911444d21267 100644 --- a/fs/ksmbd/auth.c +++ b/fs/ksmbd/auth.c @@ -29,6 +29,7 @@ #include "mgmt/user_config.h" #include "crypto_ctx.h" #include "transport_ipc.h" +#include "../smbfs_common/arc4.h" /* * Fixed format data defining GSS header and fixed string @@ -215,7 +216,7 @@ out: * Return: 0 on success, error number on error */ int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2, - int blen, char *domain_name) + int blen, char *domain_name, char *cryptkey) { char ntlmv2_hash[CIFS_ENCPWD_SIZE]; char ntlmv2_rsp[CIFS_HMAC_MD5_HASH_SIZE]; @@ -256,7 +257,7 @@ int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2, goto out; } - memcpy(construct, sess->ntlmssp.cryptkey, CIFS_CRYPTO_KEY_SIZE); + memcpy(construct, cryptkey, CIFS_CRYPTO_KEY_SIZE); memcpy(construct + CIFS_CRYPTO_KEY_SIZE, &ntlmv2->blob_signature, blen); rc = 
crypto_shash_update(CRYPTO_HMACMD5(ctx), construct, len); @@ -295,7 +296,8 @@ out: * Return: 0 on success, error number on error */ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, - int blob_len, struct ksmbd_session *sess) + int blob_len, struct ksmbd_conn *conn, + struct ksmbd_session *sess) { char *domain_name; unsigned int nt_off, dn_off; @@ -324,7 +326,7 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, /* TODO : use domain name that imported from configuration file */ domain_name = smb_strndup_from_utf16((const char *)authblob + dn_off, - dn_len, true, sess->conn->local_nls); + dn_len, true, conn->local_nls); if (IS_ERR(domain_name)) return PTR_ERR(domain_name); @@ -333,8 +335,31 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, domain_name); ret = ksmbd_auth_ntlmv2(sess, (struct ntlmv2_resp *)((char *)authblob + nt_off), nt_len - CIFS_ENCPWD_SIZE, - domain_name); + domain_name, conn->ntlmssp.cryptkey); kfree(domain_name); + + /* The recovered secondary session key */ + if (conn->ntlmssp.client_flags & NTLMSSP_NEGOTIATE_KEY_XCH) { + struct arc4_ctx *ctx_arc4; + unsigned int sess_key_off, sess_key_len; + + sess_key_off = le32_to_cpu(authblob->SessionKey.BufferOffset); + sess_key_len = le16_to_cpu(authblob->SessionKey.Length); + + if (blob_len < (u64)sess_key_off + sess_key_len) + return -EINVAL; + + ctx_arc4 = kmalloc(sizeof(*ctx_arc4), GFP_KERNEL); + if (!ctx_arc4) + return -ENOMEM; + + cifs_arc4_setkey(ctx_arc4, sess->sess_key, + SMB2_NTLMV2_SESSKEY_SIZE); + cifs_arc4_crypt(ctx_arc4, sess->sess_key, + (char *)authblob + sess_key_off, sess_key_len); + kfree_sensitive(ctx_arc4); + } + return ret; } @@ -347,7 +372,7 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, * */ int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob, - int blob_len, struct ksmbd_session *sess) + int blob_len, struct ksmbd_conn *conn) { if (blob_len < sizeof(struct negotiate_message)) { ksmbd_debug(AUTH, "negotiate blob len %d too small\n", @@ -361,7 +386,7 @@ int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob, return -EINVAL; } - sess->ntlmssp.client_flags = le32_to_cpu(negblob->NegotiateFlags); + conn->ntlmssp.client_flags = le32_to_cpu(negblob->NegotiateFlags); return 0; } @@ -375,14 +400,14 @@ int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob, */ unsigned int ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, - struct ksmbd_session *sess) + struct ksmbd_conn *conn) { struct target_info *tinfo; wchar_t *name; __u8 *target_name; unsigned int flags, blob_off, blob_len, type, target_info_len = 0; int len, uni_len, conv_len; - int cflags = sess->ntlmssp.client_flags; + int cflags = conn->ntlmssp.client_flags; memcpy(chgblob->Signature, NTLMSSP_SIGNATURE, 8); chgblob->MessageType = NtLmChallenge; @@ -403,10 +428,13 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, if (cflags & NTLMSSP_REQUEST_TARGET) flags |= NTLMSSP_REQUEST_TARGET; - if (sess->conn->use_spnego && + if (conn->use_spnego && (cflags & NTLMSSP_NEGOTIATE_EXTENDED_SEC)) flags |= NTLMSSP_NEGOTIATE_EXTENDED_SEC; + if (cflags & NTLMSSP_NEGOTIATE_KEY_XCH) + flags |= NTLMSSP_NEGOTIATE_KEY_XCH; + chgblob->NegotiateFlags = cpu_to_le32(flags); len = strlen(ksmbd_netbios_name()); name = kmalloc(2 + UNICODE_LEN(len), GFP_KERNEL); @@ -414,7 +442,7 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, return -ENOMEM; conv_len = smb_strtoUTF16((__le16 
*)name, ksmbd_netbios_name(), len, - sess->conn->local_nls); + conn->local_nls); if (conv_len < 0 || conv_len > len) { kfree(name); return -EINVAL; @@ -430,8 +458,8 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, chgblob->TargetName.BufferOffset = cpu_to_le32(blob_off); /* Initialize random conn challenge */ - get_random_bytes(sess->ntlmssp.cryptkey, sizeof(__u64)); - memcpy(chgblob->Challenge, sess->ntlmssp.cryptkey, + get_random_bytes(conn->ntlmssp.cryptkey, sizeof(__u64)); + memcpy(chgblob->Challenge, conn->ntlmssp.cryptkey, CIFS_CRYPTO_KEY_SIZE); /* Add Target Information to security buffer */ diff --git a/fs/ksmbd/auth.h b/fs/ksmbd/auth.h index 9c2d4badd05d..95629651cf26 100644 --- a/fs/ksmbd/auth.h +++ b/fs/ksmbd/auth.h @@ -38,16 +38,16 @@ struct kvec; int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov, unsigned int nvec, int enc); void ksmbd_copy_gss_neg_header(void *buf); -int ksmbd_auth_ntlm(struct ksmbd_session *sess, char *pw_buf); int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2, - int blen, char *domain_name); + int blen, char *domain_name, char *cryptkey); int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, - int blob_len, struct ksmbd_session *sess); + int blob_len, struct ksmbd_conn *conn, + struct ksmbd_session *sess); int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob, - int blob_len, struct ksmbd_session *sess); + int blob_len, struct ksmbd_conn *conn); unsigned int ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, - struct ksmbd_session *sess); + struct ksmbd_conn *conn); int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob, int in_len, char *out_blob, int *out_len); int ksmbd_sign_smb2_pdu(struct ksmbd_conn *conn, char *key, struct kvec *iov, diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c index 83a94d0bb480..208d2cff7bd3 100644 --- a/fs/ksmbd/connection.c +++ b/fs/ksmbd/connection.c @@ -62,6 +62,7 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) atomic_set(&conn->req_running, 0); atomic_set(&conn->r_count, 0); conn->total_credits = 1; + conn->outstanding_credits = 1; init_waitqueue_head(&conn->req_running_q); INIT_LIST_HEAD(&conn->conns_list); @@ -386,17 +387,24 @@ out: static void stop_sessions(void) { struct ksmbd_conn *conn; + struct ksmbd_transport *t; again: read_lock(&conn_list_lock); list_for_each_entry(conn, &conn_list, conns_list) { struct task_struct *task; - task = conn->transport->handler; + t = conn->transport; + task = t->handler; if (task) ksmbd_debug(CONN, "Stop session handler %s/%d\n", task->comm, task_pid_nr(task)); conn->status = KSMBD_SESS_EXITING; + if (t->ops->shutdown) { + read_unlock(&conn_list_lock); + t->ops->shutdown(t); + read_lock(&conn_list_lock); + } } read_unlock(&conn_list_lock); diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h index e5403c587a58..7a59aacb5daa 100644 --- a/fs/ksmbd/connection.h +++ b/fs/ksmbd/connection.h @@ -61,8 +61,8 @@ struct ksmbd_conn { atomic_t req_running; /* References which are made for this Server object*/ atomic_t r_count; - unsigned short total_credits; - unsigned short max_credits; + unsigned int total_credits; + unsigned int outstanding_credits; spinlock_t credits_lock; wait_queue_head_t req_running_q; /* Lock to protect requests list*/ @@ -72,12 +72,7 @@ struct ksmbd_conn { int connection_type; struct ksmbd_stats stats; char ClientGUID[SMB2_CLIENT_GUID_SIZE]; - union { - /* pending trans request table */ - struct trans_state *recent_trans; - /* 
Used by ntlmssp */ - char *ntlmssp_cryptkey; - }; + struct ntlmssp_auth ntlmssp; spinlock_t llist_lock; struct list_head lock_list; @@ -122,6 +117,7 @@ struct ksmbd_conn_ops { struct ksmbd_transport_ops { int (*prepare)(struct ksmbd_transport *t); void (*disconnect)(struct ksmbd_transport *t); + void (*shutdown)(struct ksmbd_transport *t); int (*read)(struct ksmbd_transport *t, char *buf, unsigned int size); int (*writev)(struct ksmbd_transport *t, struct kvec *iovs, int niov, int size, bool need_invalidate_rkey, diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h index c6718a05d347..71bfb7de4472 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -103,6 +103,8 @@ struct ksmbd_startup_request { * we set the SPARSE_FILES bit (0x40). */ __u32 sub_auth[3]; /* Subauth value for Security ID */ + __u32 smb2_max_credits; /* MAX credits */ + __u32 reserved[128]; /* Reserved room */ __u32 ifc_list_sz; /* interfaces list size */ __s8 ____payload[]; }; @@ -113,7 +115,7 @@ struct ksmbd_startup_request { * IPC request to shutdown ksmbd server. */ struct ksmbd_shutdown_request { - __s32 reserved; + __s32 reserved[16]; }; /* @@ -122,6 +124,7 @@ struct ksmbd_shutdown_request { struct ksmbd_login_request { __u32 handle; __s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; /* user account name */ + __u32 reserved[16]; /* Reserved room */ }; /* @@ -135,6 +138,7 @@ struct ksmbd_login_response { __u16 status; __u16 hash_sz; /* hash size */ __s8 hash[KSMBD_REQ_MAX_HASH_SZ]; /* password hash */ + __u32 reserved[16]; /* Reserved room */ }; /* @@ -143,6 +147,7 @@ struct ksmbd_login_response { struct ksmbd_share_config_request { __u32 handle; __s8 share_name[KSMBD_REQ_MAX_SHARE_NAME]; /* share name */ + __u32 reserved[16]; /* Reserved room */ }; /* @@ -157,6 +162,7 @@ struct ksmbd_share_config_response { __u16 force_directory_mode; __u16 force_uid; __u16 force_gid; + __u32 reserved[128]; /* Reserved room */ __u32 veto_list_sz; __s8 ____payload[]; }; @@ -187,6 +193,7 @@ struct ksmbd_tree_connect_request { __s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; __s8 share[KSMBD_REQ_MAX_SHARE_NAME]; __s8 peer_addr[64]; + __u32 reserved[16]; /* Reserved room */ }; /* @@ -196,6 +203,7 @@ struct ksmbd_tree_connect_response { __u32 handle; __u16 status; __u16 connection_flags; + __u32 reserved[16]; /* Reserved room */ }; /* @@ -204,6 +212,7 @@ struct ksmbd_tree_connect_response { struct ksmbd_tree_disconnect_request { __u64 session_id; /* session id */ __u64 connect_id; /* tree connection id */ + __u32 reserved[16]; /* Reserved room */ }; /* @@ -212,6 +221,7 @@ struct ksmbd_tree_disconnect_request { struct ksmbd_logout_request { __s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; /* user account name */ __u32 account_flags; + __u32 reserved[16]; /* Reserved room */ }; /* diff --git a/fs/ksmbd/mgmt/user_config.c b/fs/ksmbd/mgmt/user_config.c index 1019d3677d55..279d00feff21 100644 --- a/fs/ksmbd/mgmt/user_config.c +++ b/fs/ksmbd/mgmt/user_config.c @@ -67,3 +67,13 @@ int ksmbd_anonymous_user(struct ksmbd_user *user) return 1; return 0; } + +bool ksmbd_compare_user(struct ksmbd_user *u1, struct ksmbd_user *u2) +{ + if (strcmp(u1->name, u2->name)) + return false; + if (memcmp(u1->passkey, u2->passkey, u1->passkey_sz)) + return false; + + return true; +} diff --git a/fs/ksmbd/mgmt/user_config.h b/fs/ksmbd/mgmt/user_config.h index aff80b029579..6a44109617f1 100644 --- a/fs/ksmbd/mgmt/user_config.h +++ b/fs/ksmbd/mgmt/user_config.h @@ -64,4 +64,5 @@ struct ksmbd_user *ksmbd_login_user(const char *account); struct 
ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp); void ksmbd_free_user(struct ksmbd_user *user); int ksmbd_anonymous_user(struct ksmbd_user *user); +bool ksmbd_compare_user(struct ksmbd_user *u1, struct ksmbd_user *u2); #endif /* __USER_CONFIG_MANAGEMENT_H__ */ diff --git a/fs/ksmbd/mgmt/user_session.h b/fs/ksmbd/mgmt/user_session.h index 82289c3cbd2b..e241f16a3851 100644 --- a/fs/ksmbd/mgmt/user_session.h +++ b/fs/ksmbd/mgmt/user_session.h @@ -45,7 +45,6 @@ struct ksmbd_session { int state; __u8 *Preauth_HashValue; - struct ntlmssp_auth ntlmssp; char sess_key[CIFS_KEY_SIZE]; struct hlist_node hlist; diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c index 50d0b1022289..4a9460153b59 100644 --- a/fs/ksmbd/smb2misc.c +++ b/fs/ksmbd/smb2misc.c @@ -289,7 +289,7 @@ static int smb2_validate_credit_charge(struct ksmbd_conn *conn, unsigned int req_len = 0, expect_resp_len = 0, calc_credit_num, max_len; unsigned short credit_charge = le16_to_cpu(hdr->CreditCharge); void *__hdr = hdr; - int ret; + int ret = 0; switch (hdr->Command) { case SMB2_QUERY_INFO: @@ -326,21 +326,27 @@ static int smb2_validate_credit_charge(struct ksmbd_conn *conn, ksmbd_debug(SMB, "Insufficient credit charge, given: %d, needed: %d\n", credit_charge, calc_credit_num); return 1; - } else if (credit_charge > conn->max_credits) { + } else if (credit_charge > conn->vals->max_credits) { ksmbd_debug(SMB, "Too large credit charge: %d\n", credit_charge); return 1; } spin_lock(&conn->credits_lock); - if (credit_charge <= conn->total_credits) { - conn->total_credits -= credit_charge; - ret = 0; - } else { + if (credit_charge > conn->total_credits) { ksmbd_debug(SMB, "Insufficient credits granted, given: %u, granted: %u\n", credit_charge, conn->total_credits); ret = 1; } + + if ((u64)conn->outstanding_credits + credit_charge > conn->vals->max_credits) { + ksmbd_debug(SMB, "Limits exceeding the maximum allowable outstanding requests, given : %u, pending : %u\n", + credit_charge, conn->outstanding_credits); + ret = 1; + } else + conn->outstanding_credits += credit_charge; + spin_unlock(&conn->credits_lock); + return ret; } diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c index 02a44d28bdaf..ab23da2120b9 100644 --- a/fs/ksmbd/smb2ops.c +++ b/fs/ksmbd/smb2ops.c @@ -19,6 +19,7 @@ static struct smb_version_values smb21_server_values = { .max_read_size = SMB21_DEFAULT_IOSIZE, .max_write_size = SMB21_DEFAULT_IOSIZE, .max_trans_size = SMB21_DEFAULT_IOSIZE, + .max_credits = SMB2_MAX_CREDITS, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, .shared_lock_type = SMB2_LOCKFLAG_SHARED, @@ -44,6 +45,7 @@ static struct smb_version_values smb30_server_values = { .max_read_size = SMB3_DEFAULT_IOSIZE, .max_write_size = SMB3_DEFAULT_IOSIZE, .max_trans_size = SMB3_DEFAULT_TRANS_SIZE, + .max_credits = SMB2_MAX_CREDITS, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, .shared_lock_type = SMB2_LOCKFLAG_SHARED, @@ -70,6 +72,7 @@ static struct smb_version_values smb302_server_values = { .max_read_size = SMB3_DEFAULT_IOSIZE, .max_write_size = SMB3_DEFAULT_IOSIZE, .max_trans_size = SMB3_DEFAULT_TRANS_SIZE, + .max_credits = SMB2_MAX_CREDITS, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, .shared_lock_type = SMB2_LOCKFLAG_SHARED, @@ -96,6 +99,7 @@ static struct smb_version_values smb311_server_values = { .max_read_size = SMB3_DEFAULT_IOSIZE, .max_write_size = SMB3_DEFAULT_IOSIZE, .max_trans_size = SMB3_DEFAULT_TRANS_SIZE, + .max_credits = SMB2_MAX_CREDITS, .large_lock_type = 0, 
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, .shared_lock_type = SMB2_LOCKFLAG_SHARED, @@ -197,7 +201,6 @@ void init_smb2_1_server(struct ksmbd_conn *conn) conn->ops = &smb2_0_server_ops; conn->cmds = smb2_0_server_cmds; conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds); - conn->max_credits = SMB2_MAX_CREDITS; conn->signing_algorithm = SIGNING_ALG_HMAC_SHA256_LE; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) @@ -215,7 +218,6 @@ void init_smb3_0_server(struct ksmbd_conn *conn) conn->ops = &smb3_0_server_ops; conn->cmds = smb2_0_server_cmds; conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds); - conn->max_credits = SMB2_MAX_CREDITS; conn->signing_algorithm = SIGNING_ALG_AES_CMAC_LE; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) @@ -240,7 +242,6 @@ void init_smb3_02_server(struct ksmbd_conn *conn) conn->ops = &smb3_0_server_ops; conn->cmds = smb2_0_server_cmds; conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds); - conn->max_credits = SMB2_MAX_CREDITS; conn->signing_algorithm = SIGNING_ALG_AES_CMAC_LE; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) @@ -265,7 +266,6 @@ int init_smb3_11_server(struct ksmbd_conn *conn) conn->ops = &smb3_11_server_ops; conn->cmds = smb2_0_server_cmds; conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds); - conn->max_credits = SMB2_MAX_CREDITS; conn->signing_algorithm = SIGNING_ALG_AES_CMAC_LE; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) @@ -304,3 +304,11 @@ void init_smb2_max_trans_size(unsigned int sz) smb302_server_values.max_trans_size = sz; smb311_server_values.max_trans_size = sz; } + +void init_smb2_max_credits(unsigned int sz) +{ + smb21_server_values.max_credits = sz; + smb30_server_values.max_credits = sz; + smb302_server_values.max_credits = sz; + smb311_server_values.max_credits = sz; +} diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index b8b3a4c28b74..67e8e28e3fc3 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -299,16 +299,15 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) struct smb2_hdr *req_hdr = ksmbd_req_buf_next(work); struct smb2_hdr *hdr = ksmbd_resp_buf_next(work); struct ksmbd_conn *conn = work->conn; - unsigned short credits_requested; + unsigned short credits_requested, aux_max; unsigned short credit_charge, credits_granted = 0; - unsigned short aux_max, aux_credits; if (work->send_no_response) return 0; hdr->CreditCharge = req_hdr->CreditCharge; - if (conn->total_credits > conn->max_credits) { + if (conn->total_credits > conn->vals->max_credits) { hdr->CreditRequest = 0; pr_err("Total credits overflow: %d\n", conn->total_credits); return -EINVAL; @@ -316,6 +315,14 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) credit_charge = max_t(unsigned short, le16_to_cpu(req_hdr->CreditCharge), 1); + if (credit_charge > conn->total_credits) { + ksmbd_debug(SMB, "Insufficient credits granted, given: %u, granted: %u\n", + credit_charge, conn->total_credits); + return -EINVAL; + } + + conn->total_credits -= credit_charge; + conn->outstanding_credits -= credit_charge; credits_requested = max_t(unsigned short, le16_to_cpu(req_hdr->CreditRequest), 1); @@ -325,16 +332,14 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) * TODO: Need to adjuct CreditRequest value according to * current cpu load */ - aux_credits = credits_requested - 1; if (hdr->Command == SMB2_NEGOTIATE) - aux_max = 0; + aux_max = 1; else - aux_max = conn->max_credits - credit_charge; - aux_credits = min_t(unsigned short, aux_credits, aux_max); - credits_granted = credit_charge + aux_credits; + aux_max = conn->vals->max_credits - 
credit_charge; + credits_granted = min_t(unsigned short, credits_requested, aux_max); - if (conn->max_credits - conn->total_credits < credits_granted) - credits_granted = conn->max_credits - + if (conn->vals->max_credits - conn->total_credits < credits_granted) + credits_granted = conn->vals->max_credits - conn->total_credits; conn->total_credits += credits_granted; @@ -610,16 +615,14 @@ static void destroy_previous_session(struct ksmbd_user *user, u64 id) /** * smb2_get_name() - get filename string from on the wire smb format - * @share: ksmbd_share_config pointer * @src: source buffer * @maxlen: maxlen of source string - * @nls_table: nls_table pointer + * @local_nls: nls_table pointer * * Return: matching converted filename on success, otherwise error ptr */ static char * -smb2_get_name(struct ksmbd_share_config *share, const char *src, - const int maxlen, struct nls_table *local_nls) +smb2_get_name(const char *src, const int maxlen, struct nls_table *local_nls) { char *name; @@ -1303,7 +1306,7 @@ static int ntlm_negotiate(struct ksmbd_work *work, int sz, rc; ksmbd_debug(SMB, "negotiate phase\n"); - rc = ksmbd_decode_ntlmssp_neg_blob(negblob, negblob_len, work->sess); + rc = ksmbd_decode_ntlmssp_neg_blob(negblob, negblob_len, work->conn); if (rc) return rc; @@ -1313,7 +1316,7 @@ static int ntlm_negotiate(struct ksmbd_work *work, memset(chgblob, 0, sizeof(struct challenge_message)); if (!work->conn->use_spnego) { - sz = ksmbd_build_ntlmssp_challenge_blob(chgblob, work->sess); + sz = ksmbd_build_ntlmssp_challenge_blob(chgblob, work->conn); if (sz < 0) return -ENOMEM; @@ -1329,7 +1332,7 @@ static int ntlm_negotiate(struct ksmbd_work *work, return -ENOMEM; chgblob = (struct challenge_message *)neg_blob; - sz = ksmbd_build_ntlmssp_challenge_blob(chgblob, work->sess); + sz = ksmbd_build_ntlmssp_challenge_blob(chgblob, work->conn); if (sz < 0) { rc = -ENOMEM; goto out; @@ -1450,60 +1453,62 @@ static int ntlm_authenticate(struct ksmbd_work *work) ksmbd_free_user(user); return 0; } - ksmbd_free_user(sess->user); - } - sess->user = user; - if (user_guest(sess->user)) { - if (conn->sign) { - ksmbd_debug(SMB, "Guest login not allowed when signing enabled\n"); + if (!ksmbd_compare_user(sess->user, user)) { + ksmbd_free_user(user); return -EPERM; } + ksmbd_free_user(user); + } else { + sess->user = user; + } + if (user_guest(sess->user)) { rsp->SessionFlags = SMB2_SESSION_FLAG_IS_GUEST_LE; } else { struct authenticate_message *authblob; authblob = user_authblob(conn, req); sz = le16_to_cpu(req->SecurityBufferLength); - rc = ksmbd_decode_ntlmssp_auth_blob(authblob, sz, sess); + rc = ksmbd_decode_ntlmssp_auth_blob(authblob, sz, conn, sess); if (rc) { set_user_flag(sess->user, KSMBD_USER_FLAG_BAD_PASSWORD); ksmbd_debug(SMB, "authentication failed\n"); return -EPERM; } + } - /* - * If session state is SMB2_SESSION_VALID, We can assume - * that it is reauthentication. And the user/password - * has been verified, so return it here. - */ - if (sess->state == SMB2_SESSION_VALID) { - if (conn->binding) - goto binding_session; - return 0; - } + /* + * If session state is SMB2_SESSION_VALID, We can assume + * that it is reauthentication. And the user/password + * has been verified, so return it here. 
+ */ + if (sess->state == SMB2_SESSION_VALID) { + if (conn->binding) + goto binding_session; + return 0; + } - if ((conn->sign || server_conf.enforced_signing) || - (req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED)) - sess->sign = true; + if ((rsp->SessionFlags != SMB2_SESSION_FLAG_IS_GUEST_LE && + (conn->sign || server_conf.enforced_signing)) || + (req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED)) + sess->sign = true; - if (smb3_encryption_negotiated(conn) && - !(req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) { - rc = conn->ops->generate_encryptionkey(sess); - if (rc) { - ksmbd_debug(SMB, - "SMB3 encryption key generation failed\n"); - return -EINVAL; - } - sess->enc = true; - rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; - /* - * signing is disable if encryption is enable - * on this session - */ - sess->sign = false; + if (smb3_encryption_negotiated(conn) && + !(req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) { + rc = conn->ops->generate_encryptionkey(sess); + if (rc) { + ksmbd_debug(SMB, + "SMB3 encryption key generation failed\n"); + return -EINVAL; } + sess->enc = true; + rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; + /* + * signing is disable if encryption is enable + * on this session + */ + sess->sign = false; } binding_session: @@ -2057,9 +2062,6 @@ int smb2_session_logoff(struct ksmbd_work *work) ksmbd_debug(SMB, "request\n"); - /* Got a valid session, set connection state */ - WARN_ON(sess->conn != conn); - /* setting CifsExiting here may race with start_tcp_sess */ ksmbd_conn_set_need_reconnect(work); ksmbd_close_session_fds(work); @@ -2530,8 +2532,7 @@ int smb2_open(struct ksmbd_work *work) goto err_out1; } - name = smb2_get_name(share, - req->Buffer, + name = smb2_get_name(req->Buffer, le16_to_cpu(req->NameLength), work->conn->local_nls); if (IS_ERR(name)) { @@ -2687,7 +2688,7 @@ int smb2_open(struct ksmbd_work *work) (struct create_posix *)context; if (le16_to_cpu(context->DataOffset) + le32_to_cpu(context->DataLength) < - sizeof(struct create_posix)) { + sizeof(struct create_posix) - 4) { rc = -EINVAL; goto err_out1; } @@ -3392,7 +3393,6 @@ static int dentry_name(struct ksmbd_dir_info *d_info, int info_level) * @conn: connection instance * @info_level: smb information level * @d_info: structure included variables for query dir - * @user_ns: user namespace * @ksmbd_kstat: ksmbd wrapper of dirent stat information * * if directory has many entries, find first can't read it fully. 
@@ -3422,9 +3422,9 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level, goto free_conv_name; } - struct_sz = readdir_info_level_struct_sz(info_level); - next_entry_offset = ALIGN(struct_sz - 1 + conv_len, - KSMBD_DIR_INFO_ALIGNMENT); + struct_sz = readdir_info_level_struct_sz(info_level) - 1 + conv_len; + next_entry_offset = ALIGN(struct_sz, KSMBD_DIR_INFO_ALIGNMENT); + d_info->last_entry_off_align = next_entry_offset - struct_sz; if (next_entry_offset > d_info->out_buf_len) { d_info->out_buf_len = 0; @@ -3976,6 +3976,7 @@ int smb2_query_dir(struct ksmbd_work *work) ((struct file_directory_info *) ((char *)rsp->Buffer + d_info.last_entry_offset)) ->NextEntryOffset = 0; + d_info.data_count -= d_info.last_entry_off_align; rsp->StructureSize = cpu_to_le16(9); rsp->OutputBufferOffset = cpu_to_le16(72); @@ -4018,6 +4019,7 @@ err_out2: * buffer_check_err() - helper function to check buffer errors * @reqOutputBufferLength: max buffer length expected in command response * @rsp: query info response buffer contains output buffer length + * @rsp_org: base response buffer pointer in case of chained response * @infoclass_size: query info class response buffer size * * Return: 0 on success, otherwise error @@ -5398,8 +5400,7 @@ static int smb2_rename(struct ksmbd_work *work, goto out; } - new_name = smb2_get_name(share, - file_info->FileName, + new_name = smb2_get_name(file_info->FileName, le32_to_cpu(file_info->FileNameLength), local_nls); if (IS_ERR(new_name)) { @@ -5510,8 +5511,7 @@ static int smb2_create_link(struct ksmbd_work *work, if (!pathname) return -ENOMEM; - link_name = smb2_get_name(share, - file_info->FileName, + link_name = smb2_get_name(file_info->FileName, le32_to_cpu(file_info->FileNameLength), local_nls); if (IS_ERR(link_name) || S_ISDIR(file_inode(filp)->i_mode)) { @@ -5849,7 +5849,7 @@ static int set_file_mode_info(struct ksmbd_file *fp, * smb2_set_info_file() - handler for smb2 set info command * @work: smb work containing set info command buffer * @fp: ksmbd_file pointer - * @info_class: smb2 set info class + * @req: request buffer pointer * @share: ksmbd_share_config pointer * * Return: 0 on success, otherwise error @@ -6121,25 +6121,46 @@ out: return err; } -static ssize_t smb2_read_rdma_channel(struct ksmbd_work *work, - struct smb2_read_req *req, void *data_buf, - size_t length) +static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work, + struct smb2_buffer_desc_v1 *desc, + __le32 Channel, + __le16 ChannelInfoOffset, + __le16 ChannelInfoLength) { - struct smb2_buffer_desc_v1 *desc = - (struct smb2_buffer_desc_v1 *)&req->Buffer[0]; - int err; + unsigned int i, ch_count; if (work->conn->dialect == SMB30_PROT_ID && - req->Channel != SMB2_CHANNEL_RDMA_V1) + Channel != SMB2_CHANNEL_RDMA_V1) return -EINVAL; - if (req->ReadChannelInfoOffset == 0 || - le16_to_cpu(req->ReadChannelInfoLength) < sizeof(*desc)) + ch_count = le16_to_cpu(ChannelInfoLength) / sizeof(*desc); + if (ksmbd_debug_types & KSMBD_DEBUG_RDMA) { + for (i = 0; i < ch_count; i++) { + pr_info("RDMA r/w request %#x: token %#x, length %#x\n", + i, + le32_to_cpu(desc[i].token), + le32_to_cpu(desc[i].length)); + } + } + if (ch_count != 1) { + ksmbd_debug(RDMA, "RDMA multiple buffer descriptors %d are not supported yet\n", + ch_count); return -EINVAL; + } work->need_invalidate_rkey = - (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE); + (Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE); work->remote_key = le32_to_cpu(desc->token); + return 0; +} + +static ssize_t smb2_read_rdma_channel(struct 
ksmbd_work *work, + struct smb2_read_req *req, void *data_buf, + size_t length) +{ + struct smb2_buffer_desc_v1 *desc = + (struct smb2_buffer_desc_v1 *)&req->Buffer[0]; + int err; err = ksmbd_conn_rdma_write(work->conn, data_buf, length, le32_to_cpu(desc->token), @@ -6162,7 +6183,7 @@ int smb2_read(struct ksmbd_work *work) struct ksmbd_conn *conn = work->conn; struct smb2_read_req *req; struct smb2_read_rsp *rsp; - struct ksmbd_file *fp; + struct ksmbd_file *fp = NULL; loff_t offset; size_t length, mincount; ssize_t nbytes = 0, remain_bytes = 0; @@ -6176,6 +6197,24 @@ int smb2_read(struct ksmbd_work *work) return smb2_read_pipe(work); } + if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE || + req->Channel == SMB2_CHANNEL_RDMA_V1) { + unsigned int ch_offset = le16_to_cpu(req->ReadChannelInfoOffset); + + if (ch_offset < offsetof(struct smb2_read_req, Buffer)) { + err = -EINVAL; + goto out; + } + err = smb2_set_remote_key_for_rdma(work, + (struct smb2_buffer_desc_v1 *) + ((char *)req + ch_offset), + req->Channel, + req->ReadChannelInfoOffset, + req->ReadChannelInfoLength); + if (err) + goto out; + } + fp = ksmbd_lookup_fd_slow(work, le64_to_cpu(req->VolatileFileId), le64_to_cpu(req->PersistentFileId)); if (!fp) { @@ -6361,21 +6400,6 @@ static ssize_t smb2_write_rdma_channel(struct ksmbd_work *work, desc = (struct smb2_buffer_desc_v1 *)&req->Buffer[0]; - if (work->conn->dialect == SMB30_PROT_ID && - req->Channel != SMB2_CHANNEL_RDMA_V1) - return -EINVAL; - - if (req->Length != 0 || req->DataOffset != 0) - return -EINVAL; - - if (req->WriteChannelInfoOffset == 0 || - le16_to_cpu(req->WriteChannelInfoLength) < sizeof(*desc)) - return -EINVAL; - - work->need_invalidate_rkey = - (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE); - work->remote_key = le32_to_cpu(desc->token); - data_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO); if (!data_buf) return -ENOMEM; @@ -6422,6 +6446,25 @@ int smb2_write(struct ksmbd_work *work) return smb2_write_pipe(work); } + if (req->Channel == SMB2_CHANNEL_RDMA_V1 || + req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) { + unsigned int ch_offset = le16_to_cpu(req->WriteChannelInfoOffset); + + if (req->Length != 0 || req->DataOffset != 0 || + ch_offset < offsetof(struct smb2_write_req, Buffer)) { + err = -EINVAL; + goto out; + } + err = smb2_set_remote_key_for_rdma(work, + (struct smb2_buffer_desc_v1 *) + ((char *)req + ch_offset), + req->Channel, + req->WriteChannelInfoOffset, + req->WriteChannelInfoLength); + if (err) + goto out; + } + if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { ksmbd_debug(SMB, "User does not have write permission\n"); err = -EACCES; @@ -7243,15 +7286,10 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, struct sockaddr_storage_rsp *sockaddr_storage; unsigned int flags; unsigned long long speed; - struct sockaddr_in6 *csin6 = (struct sockaddr_in6 *)&conn->peer_addr; rtnl_lock(); for_each_netdev(&init_net, netdev) { - if (out_buf_len < - nbytes + sizeof(struct network_interface_info_ioctl_rsp)) { - rtnl_unlock(); - return -ENOSPC; - } + bool ipv4_set = false; if (netdev->type == ARPHRD_LOOPBACK) continue; @@ -7259,12 +7297,20 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, flags = dev_get_flags(netdev); if (!(flags & IFF_RUNNING)) continue; +ipv6_retry: + if (out_buf_len < + nbytes + sizeof(struct network_interface_info_ioctl_rsp)) { + rtnl_unlock(); + return -ENOSPC; + } nii_rsp = (struct network_interface_info_ioctl_rsp *) &rsp->Buffer[nbytes]; nii_rsp->IfIndex = 
cpu_to_le32(netdev->ifindex); nii_rsp->Capability = 0; + if (netdev->real_num_tx_queues > 1) + nii_rsp->Capability |= cpu_to_le32(RSS_CAPABLE); if (ksmbd_rdma_capable_netdev(netdev)) nii_rsp->Capability |= cpu_to_le32(RDMA_CAPABLE); @@ -7289,8 +7335,7 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, nii_rsp->SockAddr_Storage; memset(sockaddr_storage, 0, 128); - if (conn->peer_addr.ss_family == PF_INET || - ipv6_addr_v4mapped(&csin6->sin6_addr)) { + if (!ipv4_set) { struct in_device *idev; sockaddr_storage->Family = cpu_to_le16(INTERNETWORK); @@ -7301,6 +7346,9 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, continue; sockaddr_storage->addr4.IPv4address = idev_ipv4_address(idev); + nbytes += sizeof(struct network_interface_info_ioctl_rsp); + ipv4_set = true; + goto ipv6_retry; } else { struct inet6_dev *idev6; struct inet6_ifaddr *ifa; @@ -7322,9 +7370,8 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, break; } sockaddr_storage->addr6.ScopeId = 0; + nbytes += sizeof(struct network_interface_info_ioctl_rsp); } - - nbytes += sizeof(struct network_interface_info_ioctl_rsp); } rtnl_unlock(); diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index 4a3e4339d4c4..725b800c29c8 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -980,6 +980,7 @@ int init_smb3_11_server(struct ksmbd_conn *conn); void init_smb2_max_read_size(unsigned int sz); void init_smb2_max_write_size(unsigned int sz); void init_smb2_max_trans_size(unsigned int sz); +void init_smb2_max_credits(unsigned int sz); bool is_smb2_neg_cmd(struct ksmbd_work *work); bool is_smb2_rsp(struct ksmbd_work *work); diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c index ef7f42b0290a..9a7e211dbf4f 100644 --- a/fs/ksmbd/smb_common.c +++ b/fs/ksmbd/smb_common.c @@ -308,14 +308,17 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, for (i = 0; i < 2; i++) { struct kstat kstat; struct ksmbd_kstat ksmbd_kstat; + struct dentry *dentry; if (!dir->dot_dotdot[i]) { /* fill dot entry info */ if (i == 0) { d_info->name = "."; d_info->name_len = 1; + dentry = dir->filp->f_path.dentry; } else { d_info->name = ".."; d_info->name_len = 2; + dentry = dir->filp->f_path.dentry->d_parent; } if (!match_pattern(d_info->name, d_info->name_len, @@ -327,7 +330,7 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, ksmbd_kstat.kstat = &kstat; ksmbd_vfs_fill_dentry_attrs(work, user_ns, - dir->filp->f_path.dentry->d_parent, + dentry, &ksmbd_kstat); rc = fn(conn, info_level, d_info, &ksmbd_kstat); if (rc) diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h index 50590842b651..e1369b4345a9 100644 --- a/fs/ksmbd/smb_common.h +++ b/fs/ksmbd/smb_common.h @@ -365,6 +365,7 @@ struct smb_version_values { __u32 max_read_size; __u32 max_write_size; __u32 max_trans_size; + __u32 max_credits; __u32 large_lock_type; __u32 exclusive_lock_type; __u32 shared_lock_type; diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c index bd792db32623..6ecf55ea1fed 100644 --- a/fs/ksmbd/smbacl.c +++ b/fs/ksmbd/smbacl.c @@ -9,6 +9,7 @@ #include <linux/fs.h> #include <linux/slab.h> #include <linux/string.h> +#include <linux/mnt_idmapping.h> #include "smbacl.h" #include "smb_common.h" @@ -274,14 +275,7 @@ static int sid_to_id(struct user_namespace *user_ns, uid_t id; id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]); - /* - * Translate raw sid into kuid in the server's user - * namespace. 
- */ - uid = make_kuid(&init_user_ns, id); - - /* If this is an idmapped mount, apply the idmapping. */ - uid = kuid_from_mnt(user_ns, uid); + uid = mapped_kuid_user(user_ns, &init_user_ns, KUIDT_INIT(id)); if (uid_valid(uid)) { fattr->cf_uid = uid; rc = 0; @@ -291,14 +285,7 @@ static int sid_to_id(struct user_namespace *user_ns, gid_t id; id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]); - /* - * Translate raw sid into kgid in the server's user - * namespace. - */ - gid = make_kgid(&init_user_ns, id); - - /* If this is an idmapped mount, apply the idmapping. */ - gid = kgid_from_mnt(user_ns, gid); + gid = mapped_kgid_user(user_ns, &init_user_ns, KGIDT_INIT(id)); if (gid_valid(gid)) { fattr->cf_gid = gid; rc = 0; diff --git a/fs/ksmbd/smbacl.h b/fs/ksmbd/smbacl.h index 73e08cad412b..811af3309429 100644 --- a/fs/ksmbd/smbacl.h +++ b/fs/ksmbd/smbacl.h @@ -11,6 +11,7 @@ #include <linux/fs.h> #include <linux/namei.h> #include <linux/posix_acl.h> +#include <linux/mnt_idmapping.h> #include "mgmt/tree_connect.h" @@ -216,7 +217,7 @@ static inline uid_t posix_acl_uid_translate(struct user_namespace *mnt_userns, kuid_t kuid; /* If this is an idmapped mount, apply the idmapping. */ - kuid = kuid_into_mnt(mnt_userns, pace->e_uid); + kuid = mapped_kuid_fs(mnt_userns, &init_user_ns, pace->e_uid); /* Translate the kuid into a userspace id ksmbd would see. */ return from_kuid(&init_user_ns, kuid); @@ -228,7 +229,7 @@ static inline gid_t posix_acl_gid_translate(struct user_namespace *mnt_userns, kgid_t kgid; /* If this is an idmapped mount, apply the idmapping. */ - kgid = kgid_into_mnt(mnt_userns, pace->e_gid); + kgid = mapped_kgid_fs(mnt_userns, &init_user_ns, pace->e_gid); /* Translate the kgid into a userspace id ksmbd would see. */ return from_kgid(&init_user_ns, kgid); diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c index 1acf1892a466..3ad6881e0f7e 100644 --- a/fs/ksmbd/transport_ipc.c +++ b/fs/ksmbd/transport_ipc.c @@ -301,6 +301,8 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) init_smb2_max_write_size(req->smb2_max_write); if (req->smb2_max_trans) init_smb2_max_trans_size(req->smb2_max_trans); + if (req->smb2_max_credits) + init_smb2_max_credits(req->smb2_max_credits); ret = ksmbd_set_netbios_name(req->netbios_name); ret |= ksmbd_set_server_string(req->server_string); diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 7e57cbb0bb35..ba5a22bc2e6d 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -34,7 +34,8 @@ #include "smbstatus.h" #include "transport_rdma.h" -#define SMB_DIRECT_PORT 5445 +#define SMB_DIRECT_PORT_IWARP 5445 +#define SMB_DIRECT_PORT_INFINIBAND 445 #define SMB_DIRECT_VERSION_LE cpu_to_le16(0x0100) @@ -60,6 +61,10 @@ * as defined in [MS-SMBD] 3.1.1.1 * Those may change after a SMB_DIRECT negotiation */ + +/* Set 445 port to SMB Direct port by default */ +static int smb_direct_port = SMB_DIRECT_PORT_INFINIBAND; + /* The local peer's maximum number of credits to grant to the peer */ static int smb_direct_receive_credit_max = 255; @@ -75,10 +80,18 @@ static int smb_direct_max_fragmented_recv_size = 1024 * 1024; /* The maximum single-message size which can be received */ static int smb_direct_max_receive_size = 8192; -static int smb_direct_max_read_write_size = 1024 * 1024; +static int smb_direct_max_read_write_size = 524224; static int smb_direct_max_outstanding_rw_ops = 8; +static LIST_HEAD(smb_direct_device_list); +static DEFINE_RWLOCK(smb_direct_device_lock); + +struct smb_direct_device 
{ + struct ib_device *ib_dev; + struct list_head list; +}; + static struct smb_direct_listener { struct rdma_cm_id *cm_id; } smb_direct_listener; @@ -415,6 +428,7 @@ static void free_transport(struct smb_direct_transport *t) if (t->qp) { ib_drain_qp(t->qp); + ib_mr_pool_destroy(t->qp, &t->qp->rdma_mrs); ib_destroy_qp(t->qp); } @@ -555,6 +569,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) } t->negotiation_requested = true; t->full_packet_received = true; + enqueue_reassembly(t, recvmsg, 0); wake_up_interruptible(&t->wait_status); break; case SMB_DIRECT_MSG_DATA_TRANSFER: { @@ -1438,6 +1453,15 @@ static void smb_direct_disconnect(struct ksmbd_transport *t) free_transport(st); } +static void smb_direct_shutdown(struct ksmbd_transport *t) +{ + struct smb_direct_transport *st = smb_trans_direct_transfort(t); + + ksmbd_debug(RDMA, "smb-direct shutdown cm_id=%p\n", st->cm_id); + + smb_direct_disconnect_rdma_work(&st->disconnect_work); +} + static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { @@ -1581,19 +1605,13 @@ static int smb_direct_accept_client(struct smb_direct_transport *t) pr_err("error at rdma_accept: %d\n", ret); return ret; } - - wait_event_interruptible(t->wait_status, - t->status != SMB_DIRECT_CS_NEW); - if (t->status != SMB_DIRECT_CS_CONNECTED) - return -ENOTCONN; return 0; } -static int smb_direct_negotiate(struct smb_direct_transport *t) +static int smb_direct_prepare_negotiation(struct smb_direct_transport *t) { int ret; struct smb_direct_recvmsg *recvmsg; - struct smb_direct_negotiate_req *req; recvmsg = get_free_recvmsg(t); if (!recvmsg) @@ -1603,44 +1621,20 @@ static int smb_direct_negotiate(struct smb_direct_transport *t) ret = smb_direct_post_recv(t, recvmsg); if (ret) { pr_err("Can't post recv: %d\n", ret); - goto out; + goto out_err; } t->negotiation_requested = false; ret = smb_direct_accept_client(t); if (ret) { pr_err("Can't accept client\n"); - goto out; + goto out_err; } smb_direct_post_recv_credits(&t->post_recv_credits_work.work); - - ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n"); - ret = wait_event_interruptible_timeout(t->wait_status, - t->negotiation_requested || - t->status == SMB_DIRECT_CS_DISCONNECTED, - SMB_DIRECT_NEGOTIATE_TIMEOUT * HZ); - if (ret <= 0 || t->status == SMB_DIRECT_CS_DISCONNECTED) { - ret = ret < 0 ? 
ret : -ETIMEDOUT; - goto out; - } - - ret = smb_direct_check_recvmsg(recvmsg); - if (ret == -ECONNABORTED) - goto out; - - req = (struct smb_direct_negotiate_req *)recvmsg->packet; - t->max_recv_size = min_t(int, t->max_recv_size, - le32_to_cpu(req->preferred_send_size)); - t->max_send_size = min_t(int, t->max_send_size, - le32_to_cpu(req->max_receive_size)); - t->max_fragmented_send_size = - le32_to_cpu(req->max_fragmented_size); - - ret = smb_direct_send_negotiate_response(t, ret); -out: - if (recvmsg) - put_recvmsg(t, recvmsg); + return 0; +out_err: + put_recvmsg(t, recvmsg); return ret; } @@ -1724,7 +1718,9 @@ static int smb_direct_init_params(struct smb_direct_transport *t, cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES; cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES; cap->max_inline_data = 0; - cap->max_rdma_ctxs = 0; + cap->max_rdma_ctxs = + rdma_rw_mr_factor(device, t->cm_id->port_num, max_pages) * + smb_direct_max_outstanding_rw_ops; return 0; } @@ -1806,6 +1802,7 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, { int ret; struct ib_qp_init_attr qp_attr; + int pages_per_rw; t->pd = ib_alloc_pd(t->cm_id->device, 0); if (IS_ERR(t->pd)) { @@ -1853,6 +1850,23 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, t->qp = t->cm_id->qp; t->cm_id->event_handler = smb_direct_cm_handler; + pages_per_rw = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1; + if (pages_per_rw > t->cm_id->device->attrs.max_sgl_rd) { + int pages_per_mr, mr_count; + + pages_per_mr = min_t(int, pages_per_rw, + t->cm_id->device->attrs.max_fast_reg_page_list_len); + mr_count = DIV_ROUND_UP(pages_per_rw, pages_per_mr) * + atomic_read(&t->rw_avail_ops); + ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs, mr_count, + IB_MR_TYPE_MEM_REG, pages_per_mr, 0); + if (ret) { + pr_err("failed to init mr pool count %d pages %d\n", + mr_count, pages_per_mr); + goto err; + } + } + return 0; err: if (t->qp) { @@ -1877,6 +1891,49 @@ err: static int smb_direct_prepare(struct ksmbd_transport *t) { struct smb_direct_transport *st = smb_trans_direct_transfort(t); + struct smb_direct_recvmsg *recvmsg; + struct smb_direct_negotiate_req *req; + int ret; + + ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n"); + ret = wait_event_interruptible_timeout(st->wait_status, + st->negotiation_requested || + st->status == SMB_DIRECT_CS_DISCONNECTED, + SMB_DIRECT_NEGOTIATE_TIMEOUT * HZ); + if (ret <= 0 || st->status == SMB_DIRECT_CS_DISCONNECTED) + return ret < 0 ? 
ret : -ETIMEDOUT; + + recvmsg = get_first_reassembly(st); + if (!recvmsg) + return -ECONNABORTED; + + ret = smb_direct_check_recvmsg(recvmsg); + if (ret == -ECONNABORTED) + goto out; + + req = (struct smb_direct_negotiate_req *)recvmsg->packet; + st->max_recv_size = min_t(int, st->max_recv_size, + le32_to_cpu(req->preferred_send_size)); + st->max_send_size = min_t(int, st->max_send_size, + le32_to_cpu(req->max_receive_size)); + st->max_fragmented_send_size = + le32_to_cpu(req->max_fragmented_size); + st->max_fragmented_recv_size = + (st->recv_credit_max * st->max_recv_size) / 2; + + ret = smb_direct_send_negotiate_response(st, ret); +out: + spin_lock_irq(&st->reassembly_queue_lock); + st->reassembly_queue_length--; + list_del(&recvmsg->list); + spin_unlock_irq(&st->reassembly_queue_lock); + put_recvmsg(st, recvmsg); + + return ret; +} + +static int smb_direct_connect(struct smb_direct_transport *st) +{ int ret; struct ib_qp_cap qp_cap; @@ -1898,13 +1955,11 @@ static int smb_direct_prepare(struct ksmbd_transport *t) return ret; } - ret = smb_direct_negotiate(st); + ret = smb_direct_prepare_negotiation(st); if (ret) { pr_err("Can't negotiate: %d\n", ret); return ret; } - - st->status = SMB_DIRECT_CS_CONNECTED; return 0; } @@ -1920,6 +1975,7 @@ static bool rdma_frwr_is_supported(struct ib_device_attr *attrs) static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) { struct smb_direct_transport *t; + int ret; if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) { ksmbd_debug(RDMA, @@ -1932,18 +1988,23 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) if (!t) return -ENOMEM; + ret = smb_direct_connect(t); + if (ret) + goto out_err; + KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop, KSMBD_TRANS(t)->conn, "ksmbd:r%u", - SMB_DIRECT_PORT); + smb_direct_port); if (IS_ERR(KSMBD_TRANS(t)->handler)) { - int ret = PTR_ERR(KSMBD_TRANS(t)->handler); - + ret = PTR_ERR(KSMBD_TRANS(t)->handler); pr_err("Can't start thread\n"); - free_transport(t); - return ret; + goto out_err; } return 0; +out_err: + free_transport(t); + return ret; } static int smb_direct_listen_handler(struct rdma_cm_id *cm_id, @@ -2007,12 +2068,65 @@ err: return ret; } +static int smb_direct_ib_client_add(struct ib_device *ib_dev) +{ + struct smb_direct_device *smb_dev; + + /* Set 5445 port if device type is iWARP(No IB) */ + if (ib_dev->node_type != RDMA_NODE_IB_CA) + smb_direct_port = SMB_DIRECT_PORT_IWARP; + + if (!ib_dev->ops.get_netdev || + !rdma_frwr_is_supported(&ib_dev->attrs)) + return 0; + + smb_dev = kzalloc(sizeof(*smb_dev), GFP_KERNEL); + if (!smb_dev) + return -ENOMEM; + smb_dev->ib_dev = ib_dev; + + write_lock(&smb_direct_device_lock); + list_add(&smb_dev->list, &smb_direct_device_list); + write_unlock(&smb_direct_device_lock); + + ksmbd_debug(RDMA, "ib device added: name %s\n", ib_dev->name); + return 0; +} + +static void smb_direct_ib_client_remove(struct ib_device *ib_dev, + void *client_data) +{ + struct smb_direct_device *smb_dev, *tmp; + + write_lock(&smb_direct_device_lock); + list_for_each_entry_safe(smb_dev, tmp, &smb_direct_device_list, list) { + if (smb_dev->ib_dev == ib_dev) { + list_del(&smb_dev->list); + kfree(smb_dev); + break; + } + } + write_unlock(&smb_direct_device_lock); +} + +static struct ib_client smb_direct_ib_client = { + .name = "ksmbd_smb_direct_ib", + .add = smb_direct_ib_client_add, + .remove = smb_direct_ib_client_remove, +}; + int ksmbd_rdma_init(void) { int ret; smb_direct_listener.cm_id = NULL; + ret = 
ib_register_client(&smb_direct_ib_client); + if (ret) { + pr_err("failed to ib_register_client\n"); + return ret; + } + /* When a client is running out of send credits, the credits are * granted by the server's sending a packet using this queue. * This avoids the situation that a clients cannot send packets @@ -2023,7 +2137,7 @@ int ksmbd_rdma_init(void) if (!smb_direct_wq) return -ENOMEM; - ret = smb_direct_listen(SMB_DIRECT_PORT); + ret = smb_direct_listen(smb_direct_port); if (ret) { destroy_workqueue(smb_direct_wq); smb_direct_wq = NULL; @@ -2036,36 +2150,67 @@ int ksmbd_rdma_init(void) return 0; } -int ksmbd_rdma_destroy(void) +void ksmbd_rdma_destroy(void) { - if (smb_direct_listener.cm_id) - rdma_destroy_id(smb_direct_listener.cm_id); + if (!smb_direct_listener.cm_id) + return; + + ib_unregister_client(&smb_direct_ib_client); + rdma_destroy_id(smb_direct_listener.cm_id); + smb_direct_listener.cm_id = NULL; if (smb_direct_wq) { destroy_workqueue(smb_direct_wq); smb_direct_wq = NULL; } - return 0; } bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { - struct ib_device *ibdev; + struct smb_direct_device *smb_dev; + int i; bool rdma_capable = false; - ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_UNKNOWN); - if (ibdev) { - if (rdma_frwr_is_supported(&ibdev->attrs)) - rdma_capable = true; - ib_device_put(ibdev); + read_lock(&smb_direct_device_lock); + list_for_each_entry(smb_dev, &smb_direct_device_list, list) { + for (i = 0; i < smb_dev->ib_dev->phys_port_cnt; i++) { + struct net_device *ndev; + + ndev = smb_dev->ib_dev->ops.get_netdev(smb_dev->ib_dev, + i + 1); + if (!ndev) + continue; + + if (ndev == netdev) { + dev_put(ndev); + rdma_capable = true; + goto out; + } + dev_put(ndev); + } + } +out: + read_unlock(&smb_direct_device_lock); + + if (rdma_capable == false) { + struct ib_device *ibdev; + + ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_UNKNOWN); + if (ibdev) { + if (rdma_frwr_is_supported(&ibdev->attrs)) + rdma_capable = true; + ib_device_put(ibdev); + } } + return rdma_capable; } static struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops = { .prepare = smb_direct_prepare, .disconnect = smb_direct_disconnect, + .shutdown = smb_direct_shutdown, .writev = smb_direct_writev, .read = smb_direct_read, .rdma_read = smb_direct_rdma_read, diff --git a/fs/ksmbd/transport_rdma.h b/fs/ksmbd/transport_rdma.h index 0fa8adc0776f..5567d93a6f96 100644 --- a/fs/ksmbd/transport_rdma.h +++ b/fs/ksmbd/transport_rdma.h @@ -7,8 +7,6 @@ #ifndef __KSMBD_TRANSPORT_RDMA_H__ #define __KSMBD_TRANSPORT_RDMA_H__ -#define SMB_DIRECT_PORT 5445 - /* SMB DIRECT negotiation request packet [MS-SMBD] 2.2.1 */ struct smb_direct_negotiate_req { __le16 min_version; @@ -52,7 +50,7 @@ struct smb_direct_data_transfer { #ifdef CONFIG_SMB_SERVER_SMBDIRECT int ksmbd_rdma_init(void); -int ksmbd_rdma_destroy(void); +void ksmbd_rdma_destroy(void); bool ksmbd_rdma_capable_netdev(struct net_device *netdev); #else static inline int ksmbd_rdma_init(void) { return 0; } diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c index c14320e03b69..82a1429bbe12 100644 --- a/fs/ksmbd/transport_tcp.c +++ b/fs/ksmbd/transport_tcp.c @@ -404,7 +404,7 @@ static int create_socket(struct interface *iface) &ksmbd_socket); if (ret) { pr_err("Can't create socket for ipv4: %d\n", ret); - goto out_error; + goto out_clear; } sin.sin_family = PF_INET; @@ -462,6 +462,7 @@ static int create_socket(struct interface *iface) out_error: tcp_destroy_socket(ksmbd_socket); +out_clear: iface->ksmbd_socket = NULL; return ret; 
} diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index adf94a4f22fa..8c37aaf936ab 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -47,6 +47,7 @@ struct ksmbd_dir_info { int last_entry_offset; bool hide_dot_file; int flags; + int last_entry_off_align; }; struct ksmbd_readdir_data { diff --git a/fs/ksmbd/vfs_cache.h b/fs/ksmbd/vfs_cache.h index 448576fbe4b7..36239ce31afd 100644 --- a/fs/ksmbd/vfs_cache.h +++ b/fs/ksmbd/vfs_cache.h @@ -96,16 +96,6 @@ struct ksmbd_file { int durable_timeout; - /* for SMB1 */ - int pid; - - /* conflict lock fail count for SMB1 */ - unsigned int cflock_cnt; - /* last lock failure start offset for SMB1 */ - unsigned long long llock_fstart; - - int dirent_offset; - /* if ls is happening on directory, below is valid*/ struct ksmbd_readdir_data readdir_data; int dot_dotdot[2]; diff --git a/fs/libfs.c b/fs/libfs.c index ba7438ab9371..974125270a42 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1379,7 +1379,7 @@ bool is_empty_dir_inode(struct inode *inode) (inode->i_op == &empty_dir_inode_operations); } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* * Determine if the name of a dentry should be casefolded. * @@ -1473,7 +1473,7 @@ static const struct dentry_operations generic_encrypted_dentry_ops = { }; #endif -#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE) +#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE) static const struct dentry_operations generic_encrypted_ci_dentry_ops = { .d_hash = generic_ci_d_hash, .d_compare = generic_ci_d_compare, @@ -1508,10 +1508,10 @@ void generic_set_encrypted_ci_d_ops(struct dentry *dentry) #ifdef CONFIG_FS_ENCRYPTION bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) bool needs_ci_ops = dentry->d_sb->s_encoding; #endif -#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE) +#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE) if (needs_encrypt_ops && needs_ci_ops) { d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops); return; @@ -1523,7 +1523,7 @@ void generic_set_encrypted_ci_d_ops(struct dentry *dentry) return; } #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (needs_ci_ops) { d_set_d_op(dentry, &generic_ci_dentry_ops); return; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index b220e1b91726..0475c5a5d061 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -54,13 +54,9 @@ EXPORT_SYMBOL_GPL(nlmsvc_ops); static DEFINE_MUTEX(nlmsvc_mutex); static unsigned int nlmsvc_users; -static struct task_struct *nlmsvc_task; -static struct svc_rqst *nlmsvc_rqst; +static struct svc_serv *nlmsvc_serv; unsigned long nlmsvc_timeout; -static atomic_t nlm_ntf_refcnt = ATOMIC_INIT(0); -static DECLARE_WAIT_QUEUE_HEAD(nlm_ntf_wq); - unsigned int lockd_net_id; /* @@ -184,7 +180,12 @@ lockd(void *vrqstp) nlm_shutdown_hosts(); cancel_delayed_work_sync(&ln->grace_period_end); locks_end_grace(&ln->lockd_manager); - return 0; + + dprintk("lockd_down: service stopped\n"); + + svc_exit_thread(rqstp); + + module_put_and_kthread_exit(0); } static int create_lockd_listener(struct svc_serv *serv, const char *name, @@ -290,8 +291,8 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) __func__, net->ns.inum); } } else { - pr_err("%s: no users! task=%p, net=%x\n", - __func__, nlmsvc_task, net->ns.inum); + pr_err("%s: no users! 
net=%x\n", + __func__, net->ns.inum); BUG(); } } @@ -302,20 +303,16 @@ static int lockd_inetaddr_event(struct notifier_block *this, struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct sockaddr_in sin; - if ((event != NETDEV_DOWN) || - !atomic_inc_not_zero(&nlm_ntf_refcnt)) + if (event != NETDEV_DOWN) goto out; - if (nlmsvc_rqst) { + if (nlmsvc_serv) { dprintk("lockd_inetaddr_event: removed %pI4\n", &ifa->ifa_local); sin.sin_family = AF_INET; sin.sin_addr.s_addr = ifa->ifa_local; - svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, - (struct sockaddr *)&sin); + svc_age_temp_xprts_now(nlmsvc_serv, (struct sockaddr *)&sin); } - atomic_dec(&nlm_ntf_refcnt); - wake_up(&nlm_ntf_wq); out: return NOTIFY_DONE; @@ -332,21 +329,17 @@ static int lockd_inet6addr_event(struct notifier_block *this, struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; struct sockaddr_in6 sin6; - if ((event != NETDEV_DOWN) || - !atomic_inc_not_zero(&nlm_ntf_refcnt)) + if (event != NETDEV_DOWN) goto out; - if (nlmsvc_rqst) { + if (nlmsvc_serv) { dprintk("lockd_inet6addr_event: removed %pI6\n", &ifa->addr); sin6.sin6_family = AF_INET6; sin6.sin6_addr = ifa->addr; if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL) sin6.sin6_scope_id = ifa->idev->dev->ifindex; - svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, - (struct sockaddr *)&sin6); + svc_age_temp_xprts_now(nlmsvc_serv, (struct sockaddr *)&sin6); } - atomic_dec(&nlm_ntf_refcnt); - wake_up(&nlm_ntf_wq); out: return NOTIFY_DONE; @@ -357,86 +350,22 @@ static struct notifier_block lockd_inet6addr_notifier = { }; #endif -static void lockd_unregister_notifiers(void) -{ - unregister_inetaddr_notifier(&lockd_inetaddr_notifier); -#if IS_ENABLED(CONFIG_IPV6) - unregister_inet6addr_notifier(&lockd_inet6addr_notifier); -#endif - wait_event(nlm_ntf_wq, atomic_read(&nlm_ntf_refcnt) == 0); -} - -static void lockd_svc_exit_thread(void) -{ - atomic_dec(&nlm_ntf_refcnt); - lockd_unregister_notifiers(); - svc_exit_thread(nlmsvc_rqst); -} - -static int lockd_start_svc(struct svc_serv *serv) -{ - int error; - - if (nlmsvc_rqst) - return 0; - - /* - * Create the kernel thread and wait for it to start. - */ - nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); - if (IS_ERR(nlmsvc_rqst)) { - error = PTR_ERR(nlmsvc_rqst); - printk(KERN_WARNING - "lockd_up: svc_rqst allocation failed, error=%d\n", - error); - lockd_unregister_notifiers(); - goto out_rqst; - } - - atomic_inc(&nlm_ntf_refcnt); - svc_sock_update_bufs(serv); - serv->sv_maxconn = nlm_max_connections; - - nlmsvc_task = kthread_create(lockd, nlmsvc_rqst, "%s", serv->sv_name); - if (IS_ERR(nlmsvc_task)) { - error = PTR_ERR(nlmsvc_task); - printk(KERN_WARNING - "lockd_up: kthread_run failed, error=%d\n", error); - goto out_task; - } - nlmsvc_rqst->rq_task = nlmsvc_task; - wake_up_process(nlmsvc_task); - - dprintk("lockd_up: service started\n"); - return 0; - -out_task: - lockd_svc_exit_thread(); - nlmsvc_task = NULL; -out_rqst: - nlmsvc_rqst = NULL; - return error; -} - static const struct svc_serv_ops lockd_sv_ops = { .svo_shutdown = svc_rpcb_cleanup, + .svo_function = lockd, .svo_enqueue_xprt = svc_xprt_do_enqueue, + .svo_module = THIS_MODULE, }; -static struct svc_serv *lockd_create_svc(void) +static int lockd_get(void) { struct svc_serv *serv; + int error; - /* - * Check whether we're already up and running. - */ - if (nlmsvc_rqst) { - /* - * Note: increase service usage, because later in case of error - * svc_destroy() will be called. 
- */ - svc_get(nlmsvc_rqst->rq_server); - return nlmsvc_rqst->rq_server; + if (nlmsvc_serv) { + svc_get(nlmsvc_serv); + nlmsvc_users++; + return 0; } /* @@ -454,14 +383,41 @@ static struct svc_serv *lockd_create_svc(void) serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, &lockd_sv_ops); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } + + serv->sv_maxconn = nlm_max_connections; + error = svc_set_num_threads(serv, NULL, 1); + /* The thread now holds the only reference */ + svc_put(serv); + if (error < 0) + return error; + + nlmsvc_serv = serv; register_inetaddr_notifier(&lockd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) register_inet6addr_notifier(&lockd_inet6addr_notifier); #endif dprintk("lockd_up: service created\n"); - return serv; + nlmsvc_users++; + return 0; +} + +static void lockd_put(void) +{ + if (WARN(nlmsvc_users <= 0, "lockd_down: no users!\n")) + return; + if (--nlmsvc_users) + return; + + unregister_inetaddr_notifier(&lockd_inetaddr_notifier); +#if IS_ENABLED(CONFIG_IPV6) + unregister_inet6addr_notifier(&lockd_inet6addr_notifier); +#endif + + svc_set_num_threads(nlmsvc_serv, NULL, 0); + nlmsvc_serv = NULL; + dprintk("lockd_down: service destroyed\n"); } /* @@ -469,36 +425,21 @@ static struct svc_serv *lockd_create_svc(void) */ int lockd_up(struct net *net, const struct cred *cred) { - struct svc_serv *serv; int error; mutex_lock(&nlmsvc_mutex); - serv = lockd_create_svc(); - if (IS_ERR(serv)) { - error = PTR_ERR(serv); - goto err_create; - } + error = lockd_get(); + if (error) + goto err; - error = lockd_up_net(serv, net, cred); + error = lockd_up_net(nlmsvc_serv, net, cred); if (error < 0) { - lockd_unregister_notifiers(); - goto err_put; + lockd_put(); + goto err; } - error = lockd_start_svc(serv); - if (error < 0) { - lockd_down_net(serv, net); - goto err_put; - } - nlmsvc_users++; - /* - * Note: svc_serv structures have an initial use count of 1, - * so we exit through here on both success and failure. - */ -err_put: - svc_destroy(serv); -err_create: +err: mutex_unlock(&nlmsvc_mutex); return error; } @@ -511,27 +452,8 @@ void lockd_down(struct net *net) { mutex_lock(&nlmsvc_mutex); - lockd_down_net(nlmsvc_rqst->rq_server, net); - if (nlmsvc_users) { - if (--nlmsvc_users) - goto out; - } else { - printk(KERN_ERR "lockd_down: no users! 
task=%p\n", - nlmsvc_task); - BUG(); - } - - if (!nlmsvc_task) { - printk(KERN_ERR "lockd_down: no lockd running.\n"); - BUG(); - } - kthread_stop(nlmsvc_task); - dprintk("lockd_down: service stopped\n"); - lockd_svc_exit_thread(); - dprintk("lockd_down: service destroyed\n"); - nlmsvc_task = NULL; - nlmsvc_rqst = NULL; -out: + lockd_down_net(nlmsvc_serv, net); + lockd_put(); mutex_unlock(&nlmsvc_mutex); } EXPORT_SYMBOL_GPL(lockd_down); diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index e9b85d8fd5fe..cb3658ab9b7a 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -470,8 +470,10 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_host *host, struct nlm_lock *lock, int wait, struct nlm_cookie *cookie, int reclaim) { - struct nlm_block *block = NULL; +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) struct inode *inode = nlmsvc_file_inode(file); +#endif + struct nlm_block *block = NULL; int error; int mode; int async_block = 0; @@ -484,7 +486,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, (long long)lock->fl.fl_end, wait); - if (inode->i_sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS) { + if (nlmsvc_file_file(file)->f_op->lock) { async_block = wait; wait = 0; } diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index cb3a7512c33e..0a22a2faf552 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -179,19 +179,21 @@ nlm_delete_file(struct nlm_file *file) static int nlm_unlock_files(struct nlm_file *file) { struct file_lock lock; - struct file *f; + locks_init_lock(&lock); lock.fl_type = F_UNLCK; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; - for (f = file->f_file[0]; f <= file->f_file[1]; f++) { - if (f && vfs_lock_file(f, F_SETLK, &lock, NULL) < 0) { - pr_warn("lockd: unlock failure in %s:%d\n", - __FILE__, __LINE__); - return 1; - } - } + if (file->f_file[O_RDONLY] && + vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, &lock, NULL)) + goto out_err; + if (file->f_file[O_WRONLY] && + vfs_lock_file(file->f_file[O_WRONLY], F_SETLK, &lock, NULL)) + goto out_err; return 0; +out_err: + pr_warn("lockd: unlock failure in %s:%d\n", __FILE__, __LINE__); + return 1; } /* diff --git a/fs/locks.c b/fs/locks.c index 0fca9d680978..8c6df10cd9ed 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -62,6 +62,7 @@ #include <linux/pid_namespace.h> #include <linux/hashtable.h> #include <linux/percpu.h> +#include <linux/sysctl.h> #define CREATE_TRACE_POINTS #include <trace/events/filelock.h> @@ -88,8 +89,37 @@ static int target_leasetype(struct file_lock *fl) return fl->fl_type; } -int leases_enable = 1; -int lease_break_time = 45; +static int leases_enable = 1; +static int lease_break_time = 45; + +#ifdef CONFIG_SYSCTL +static struct ctl_table locks_sysctls[] = { + { + .procname = "leases-enable", + .data = &leases_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_MMU + { + .procname = "lease-break-time", + .data = &lease_break_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_MMU */ + {} +}; + +static int __init init_fs_locks_sysctls(void) +{ + register_sysctl_init("fs", locks_sysctls); + return 0; +} +early_initcall(init_fs_locks_sysctls); +#endif /* CONFIG_SYSCTL */ /* * The global file_lock_list is only used for displaying /proc/locks, so we diff --git a/fs/mpage.c b/fs/mpage.c index 334e7d09aa65..87f5cfef6caa 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -29,7 +29,6 @@ #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/pagevec.h> -#include 
<linux/cleancache.h> #include "internal.h" /* @@ -284,12 +283,6 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) SetPageMappedToDisk(page); } - if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) && - cleancache_get_page(page) == 0) { - SetPageUptodate(page); - goto confused; - } - /* * This page will go to BIO. Do we need to send this BIO off first? */ diff --git a/fs/namei.c b/fs/namei.c index 1f9d2187c765..3f1829b3ab5b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1020,10 +1020,60 @@ static inline void put_link(struct nameidata *nd) path_put(&last->link); } -int sysctl_protected_symlinks __read_mostly = 0; -int sysctl_protected_hardlinks __read_mostly = 0; -int sysctl_protected_fifos __read_mostly; -int sysctl_protected_regular __read_mostly; +static int sysctl_protected_symlinks __read_mostly; +static int sysctl_protected_hardlinks __read_mostly; +static int sysctl_protected_fifos __read_mostly; +static int sysctl_protected_regular __read_mostly; + +#ifdef CONFIG_SYSCTL +static struct ctl_table namei_sysctls[] = { + { + .procname = "protected_symlinks", + .data = &sysctl_protected_symlinks, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "protected_hardlinks", + .data = &sysctl_protected_hardlinks, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "protected_fifos", + .data = &sysctl_protected_fifos, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "protected_regular", + .data = &sysctl_protected_regular, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { } +}; + +static int __init init_fs_namei_sysctls(void) +{ + register_sysctl_init("fs", namei_sysctls); + return 0; +} +fs_initcall(init_fs_namei_sysctls); + +#endif /* CONFIG_SYSCTL */ /** * may_follow_link - Check symlink following for unsafe situations @@ -3958,7 +4008,8 @@ int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir, inode_lock(dentry->d_inode); error = -EBUSY; - if (is_local_mountpoint(dentry)) + if (is_local_mountpoint(dentry) || + (dentry->d_inode->i_flags & S_KERNEL_FILE)) goto out; error = security_inode_rmdir(dir, dentry); @@ -3973,13 +4024,12 @@ int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir, dentry->d_inode->i_flags |= S_DEAD; dont_mount(dentry); detach_mounts(dentry); - fsnotify_rmdir(dir, dentry); out: inode_unlock(dentry->d_inode); dput(dentry); if (!error) - d_delete(dentry); + d_delete_notify(dir, dentry); return error; } EXPORT_SYMBOL(vfs_rmdir); @@ -4101,7 +4151,6 @@ int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir, if (!error) { dont_mount(dentry); detach_mounts(dentry); - fsnotify_unlink(dir, dentry); } } } @@ -4109,9 +4158,11 @@ out: inode_unlock(target); /* We don't d_delete() NFS sillyrenamed files--they still exist. 
*/ - if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { + if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) { + fsnotify_unlink(dir, dentry); + } else if (!error) { fsnotify_link_count(target); - d_delete(dentry); + d_delete_notify(dir, dentry); } return error; diff --git a/fs/namespace.c b/fs/namespace.c index b696543adab8..de6fae84f1a1 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -31,12 +31,13 @@ #include <uapi/linux/mount.h> #include <linux/fs_context.h> #include <linux/shmem_fs.h> +#include <linux/mnt_idmapping.h> #include "pnode.h" #include "internal.h" /* Maximum number of mounts in a mount namespace */ -unsigned int sysctl_mount_max __read_mostly = 100000; +static unsigned int sysctl_mount_max __read_mostly = 100000; static unsigned int m_hash_mask __read_mostly; static unsigned int m_hash_shift __read_mostly; @@ -468,6 +469,24 @@ void mnt_drop_write_file(struct file *file) } EXPORT_SYMBOL(mnt_drop_write_file); +/** + * mnt_hold_writers - prevent write access to the given mount + * @mnt: mnt to prevent write access to + * + * Prevents write access to @mnt if there are no active writers for @mnt. + * This function needs to be called and return successfully before changing + * properties of @mnt that need to remain stable for callers with write access + * to @mnt. + * + * After this functions has been called successfully callers must pair it with + * a call to mnt_unhold_writers() in order to stop preventing write access to + * @mnt. + * + * Context: This function expects lock_mount_hash() to be held serializing + * setting MNT_WRITE_HOLD. + * Return: On success 0 is returned. + * On error, -EBUSY is returned. + */ static inline int mnt_hold_writers(struct mount *mnt) { mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; @@ -499,6 +518,18 @@ static inline int mnt_hold_writers(struct mount *mnt) return 0; } +/** + * mnt_unhold_writers - stop preventing write access to the given mount + * @mnt: mnt to stop preventing write access to + * + * Stop preventing write access to @mnt allowing callers to gain write access + * to @mnt again. + * + * This function can only be called after a successful call to + * mnt_hold_writers(). + * + * Context: This function expects lock_mount_hash() to be held. 
+ */ static inline void mnt_unhold_writers(struct mount *mnt) { /* @@ -561,7 +592,7 @@ static void free_vfsmnt(struct mount *mnt) struct user_namespace *mnt_userns; mnt_userns = mnt_user_ns(&mnt->mnt); - if (mnt_userns != &init_user_ns) + if (!initial_idmapping(mnt_userns)) put_user_ns(mnt_userns); kfree_const(mnt->mnt_devname); #ifdef CONFIG_SMP @@ -965,6 +996,7 @@ static struct mount *skip_mnt_tree(struct mount *p) struct vfsmount *vfs_create_mount(struct fs_context *fc) { struct mount *mnt; + struct user_namespace *fs_userns; if (!fc->root) return ERR_PTR(-EINVAL); @@ -982,6 +1014,10 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc) mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; + fs_userns = mnt->mnt.mnt_sb->s_user_ns; + if (!initial_idmapping(fs_userns)) + mnt->mnt.mnt_userns = get_user_ns(fs_userns); + lock_mount_hash(); list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); unlock_mount_hash(); @@ -1072,7 +1108,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, atomic_inc(&sb->s_active); mnt->mnt.mnt_userns = mnt_user_ns(&old->mnt); - if (mnt->mnt.mnt_userns != &init_user_ns) + if (!initial_idmapping(mnt->mnt.mnt_userns)) mnt->mnt.mnt_userns = get_user_ns(mnt->mnt.mnt_userns); mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root); @@ -3927,28 +3963,32 @@ static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt) static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { struct vfsmount *m = &mnt->mnt; + struct user_namespace *fs_userns = m->mnt_sb->s_user_ns; if (!kattr->mnt_userns) return 0; /* + * Creating an idmapped mount with the filesystem wide idmapping + * doesn't make sense so block that. We don't allow mushy semantics. + */ + if (kattr->mnt_userns == fs_userns) + return -EINVAL; + + /* * Once a mount has been idmapped we don't allow it to change its * mapping. It makes things simpler and callers can just create * another bind-mount they can idmap if they want to. */ - if (mnt_user_ns(m) != &init_user_ns) + if (is_idmapped_mnt(m)) return -EPERM; /* The underlying filesystem doesn't support idmapped mounts yet. */ if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP)) return -EINVAL; - /* Don't yet support filesystem mountable in user namespaces. */ - if (m->mnt_sb->s_user_ns != &init_user_ns) - return -EINVAL; - /* We're not controlling the superblock. */ - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(fs_userns, CAP_SYS_ADMIN)) return -EPERM; /* Mount has already been visible in the filesystem hierarchy. */ @@ -4002,14 +4042,27 @@ out: static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { - struct user_namespace *mnt_userns; + struct user_namespace *mnt_userns, *old_mnt_userns; if (!kattr->mnt_userns) return; + /* + * We're the only ones able to change the mount's idmapping. So + * mnt->mnt.mnt_userns is stable and we can retrieve it directly. + */ + old_mnt_userns = mnt->mnt.mnt_userns; + mnt_userns = get_user_ns(kattr->mnt_userns); /* Pairs with smp_load_acquire() in mnt_user_ns(). */ smp_store_release(&mnt->mnt.mnt_userns, mnt_userns); + + /* + * If this is an idmapped filesystem drop the reference we've taken + * in vfs_create_mount() before. 
+ */ + if (!initial_idmapping(old_mnt_userns)) + put_user_ns(old_mnt_userns); } static void mount_setattr_commit(struct mount_kattr *kattr, @@ -4133,13 +4186,15 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, } /* - * The init_user_ns is used to indicate that a vfsmount is not idmapped. - * This is simpler than just having to treat NULL as unmapped. Users - * wanting to idmap a mount to init_user_ns can just use a namespace - * with an identity mapping. + * The initial idmapping cannot be used to create an idmapped + * mount. We use the initial idmapping as an indicator of a mount + * that is not idmapped. It can simply be passed into helpers that + * are aware of idmapped mounts as a convenient shortcut. A user + * can just create a dedicated identity mapping to achieve the same + * result. */ mnt_userns = container_of(ns, struct user_namespace, ns); - if (mnt_userns == &init_user_ns) { + if (initial_idmapping(mnt_userns)) { err = -EPERM; goto out_fput; } @@ -4595,3 +4650,25 @@ const struct proc_ns_operations mntns_operations = { .install = mntns_install, .owner = mntns_owner, }; + +#ifdef CONFIG_SYSCTL +static struct ctl_table fs_namespace_sysctls[] = { + { + .procname = "mount-max", + .data = &sysctl_mount_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, + { } +}; + +static int __init init_fs_namespace_sysctls(void) +{ + register_sysctl_init("fs", fs_namespace_sysctls); + return 0; +} +fs_initcall(init_fs_namespace_sysctls); + +#endif /* CONFIG_SYSCTL */ diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c index 75c76cbb27cc..501da990c259 100644 --- a/fs/netfs/read_helper.c +++ b/fs/netfs/read_helper.c @@ -55,7 +55,8 @@ static struct netfs_read_request *netfs_alloc_read_request( INIT_WORK(&rreq->work, netfs_rreq_work); refcount_set(&rreq->usage, 1); __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); - ops->init_rreq(rreq, file); + if (ops->init_rreq) + ops->init_rreq(rreq, file); netfs_stat(&netfs_n_rh_rreq); } @@ -170,7 +171,7 @@ static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error */ static void netfs_read_from_cache(struct netfs_read_request *rreq, struct netfs_read_subrequest *subreq, - bool seek_data) + enum netfs_read_from_hole read_hole) { struct netfs_cache_resources *cres = &rreq->cache_resources; struct iov_iter iter; @@ -180,7 +181,7 @@ static void netfs_read_from_cache(struct netfs_read_request *rreq, subreq->start + subreq->transferred, subreq->len - subreq->transferred); - cres->ops->read(cres, subreq->start, &iter, seek_data, + cres->ops->read(cres, subreq->start, &iter, read_hole, netfs_cache_read_terminated, subreq); } @@ -323,7 +324,7 @@ static void netfs_rreq_do_write_to_cache(struct netfs_read_request *rreq) } ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len, - rreq->i_size); + rreq->i_size, true); if (ret < 0) { trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write); trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip); @@ -461,7 +462,7 @@ static void netfs_rreq_short_read(struct netfs_read_request *rreq, netfs_get_read_subrequest(subreq); atomic_inc(&rreq->nr_rd_ops); if (subreq->source == NETFS_READ_FROM_CACHE) - netfs_read_from_cache(rreq, subreq, true); + netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_CLEAR); else netfs_read_from_server(rreq, subreq); } @@ -789,7 +790,7 @@ static bool netfs_rreq_submit_slice(struct netfs_read_request *rreq, netfs_read_from_server(rreq, subreq); break; case 
NETFS_READ_FROM_CACHE: - netfs_read_from_cache(rreq, subreq, false); + netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_IGNORE); break; default: BUG(); diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 22d11fdc6deb..5f6db37f461e 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -12,7 +12,7 @@ nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ export.o sysfs.o fs_context.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o nfs-$(CONFIG_SYSCTL) += sysctl.o -nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o +nfs-$(CONFIG_NFS_FSCACHE) += fscache.o obj-$(CONFIG_NFS_V2) += nfsv2.o nfsv2-y := nfs2super.o proc.o nfs2xdr.o diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 86d856de1389..054cc1255fac 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -93,7 +93,7 @@ nfs4_callback_svc(void *vrqstp) svc_process(rqstp); } svc_exit_thread(rqstp); - module_put_and_exit(0); + module_put_and_kthread_exit(0); return 0; } @@ -137,7 +137,7 @@ nfs41_callback_svc(void *vrqstp) } } svc_exit_thread(rqstp); - module_put_and_exit(0); + module_put_and_kthread_exit(0); return 0; } @@ -169,12 +169,12 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, if (nrservs < NFS4_MIN_NR_CALLBACK_THREADS) nrservs = NFS4_MIN_NR_CALLBACK_THREADS; - if (serv->sv_nrthreads-1 == nrservs) + if (serv->sv_nrthreads == nrservs) return 0; - ret = serv->sv_ops->svo_setup(serv, NULL, nrservs); + ret = svc_set_num_threads(serv, NULL, nrservs); if (ret) { - serv->sv_ops->svo_setup(serv, NULL, 0); + svc_set_num_threads(serv, NULL, 0); return ret; } dprintk("nfs_callback_up: service started\n"); @@ -235,14 +235,12 @@ err_bind: static const struct svc_serv_ops nfs40_cb_sv_ops = { .svo_function = nfs4_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; #if defined(CONFIG_NFS_V4_1) static const struct svc_serv_ops nfs41_cb_sv_ops = { .svo_function = nfs41_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; @@ -266,14 +264,8 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) /* * Check whether we're already up and running. */ - if (cb_info->serv) { - /* - * Note: increase service usage, because later in case of error - * svc_destroy() will be called. - */ - svc_get(cb_info->serv); - return cb_info->serv; - } + if (cb_info->serv) + return svc_get(cb_info->serv); switch (minorversion) { case 0: @@ -294,7 +286,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n", cb_info->users); - serv = svc_create_pooled(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops); + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops); if (!serv) { printk(KERN_ERR "nfs_callback_create_svc: create service failed\n"); return ERR_PTR(-ENOMEM); @@ -335,16 +327,10 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) goto err_start; cb_info->users++; - /* - * svc_create creates the svc_serv with sv_nrthreads == 1, and then - * svc_prepare_thread increments that. So we need to call svc_destroy - * on both success and failure so that the refcount is 1 when the - * thread exits. 
- */ err_net: if (!cb_info->users) cb_info->serv = NULL; - svc_destroy(serv); + svc_put(serv); err_create: mutex_unlock(&nfs_callback_mutex); return ret; @@ -369,8 +355,8 @@ void nfs_callback_down(int minorversion, struct net *net) cb_info->users--; if (cb_info->users == 0) { svc_get(serv); - serv->sv_ops->svo_setup(serv, NULL, 0); - svc_destroy(serv); + svc_set_num_threads(serv, NULL, 0); + svc_put(serv); dprintk("nfs_callback_down: service destroyed\n"); cb_info->serv = NULL; } diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 6a2033131c06..ccd4f245cae2 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -170,7 +170,7 @@ struct cb_devicenotifyitem { }; struct cb_devicenotifyargs { - int ndevs; + uint32_t ndevs; struct cb_devicenotifyitem *devs; }; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 09c5b1cb3e07..c343666d9a42 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -358,7 +358,7 @@ __be32 nfs4_callback_devicenotify(void *argp, void *resp, struct cb_process_state *cps) { struct cb_devicenotifyargs *args = argp; - int i; + uint32_t i; __be32 res = 0; struct nfs_client *clp = cps->clp; struct nfs_server *server = NULL; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index a67c41ec545f..f90de8043b0f 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -258,11 +258,9 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, void *argp) { struct cb_devicenotifyargs *args = argp; + uint32_t tmp, n, i; __be32 *p; __be32 status = 0; - u32 tmp; - int n, i; - args->ndevs = 0; /* Num of device notifications */ p = xdr_inline_decode(xdr, sizeof(uint32_t)); @@ -271,7 +269,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, goto out; } n = ntohl(*p++); - if (n <= 0) + if (n == 0) goto out; if (n > ULONG_MAX / sizeof(*args->devs)) { status = htonl(NFS4ERR_BADXDR); @@ -330,19 +328,21 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, dev->cbd_immediate = 0; } - args->ndevs++; - dprintk("%s: type %d layout 0x%x immediate %d\n", __func__, dev->cbd_notify_type, dev->cbd_layout_type, dev->cbd_immediate); } + args->ndevs = n; + dprintk("%s: ndevs %d\n", __func__, args->ndevs); + return 0; +err: + kfree(args->devs); out: + args->devs = NULL; + args->ndevs = 0; dprintk("%s: status %d ndevs %d\n", __func__, ntohl(status), args->ndevs); return status; -err: - kfree(args->devs); - goto out; } static __be32 decode_sessionid(struct xdr_stream *xdr, diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 1e4dc1ab9312..d1f34229e11a 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -177,14 +177,13 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) INIT_LIST_HEAD(&clp->cl_superblocks); clp->cl_rpcclient = ERR_PTR(-EINVAL); + clp->cl_flags = cl_init->init_flags; clp->cl_proto = cl_init->proto; clp->cl_nconnect = cl_init->nconnect; clp->cl_max_connect = cl_init->max_connect ? 
cl_init->max_connect : 1; clp->cl_net = get_net(cl_init->net); clp->cl_principal = "*"; - nfs_fscache_get_client_cookie(clp); - return clp; error_cleanup: @@ -238,8 +237,6 @@ static void pnfs_init_server(struct nfs_server *server) */ void nfs_free_client(struct nfs_client *clp) { - nfs_fscache_release_client_cookie(clp); - /* -EIO all pending I/O */ if (!IS_ERR(clp->cl_rpcclient)) rpc_shutdown_client(clp->cl_rpcclient); @@ -427,7 +424,6 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) list_add_tail(&new->cl_share_link, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); - new->cl_flags = cl_init->init_flags; return rpc_ops->init_client(new, cl_init); } @@ -860,6 +856,13 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str server->namelen = pathinfo.max_namelen; } + if (clp->rpc_ops->discover_trunking != NULL && + (server->caps & NFS_CAP_FS_LOCATIONS)) { + error = clp->rpc_ops->discover_trunking(server, mntfh); + if (error < 0) + return error; + } + return 0; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 731d31015b6a..75cb1cbe4cde 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -18,6 +18,7 @@ * 6 Jun 1999 Cache readdir lookups in the page cache. -DaveM */ +#include <linux/compat.h> #include <linux/module.h> #include <linux/time.h> #include <linux/errno.h> @@ -79,6 +80,7 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir ctx->dir_cookie = 0; ctx->dup_cookie = 0; ctx->page_index = 0; + ctx->eof = false; spin_lock(&dir->i_lock); if (list_empty(&nfsi->open_files) && (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) @@ -167,6 +169,7 @@ struct nfs_readdir_descriptor { unsigned int cache_entry_index; signed char duped; bool plus; + bool eob; bool eof; }; @@ -866,7 +869,8 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, status = nfs_readdir_page_filler(desc, entry, pages, pglen, arrays, narrays); - } while (!status && nfs_readdir_page_needs_filling(page)); + } while (!status && nfs_readdir_page_needs_filling(page) && + page_mapping(page)); nfs_readdir_free_pages(pages, array_size); out: @@ -987,7 +991,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, ent = &array->array[i]; if (!dir_emit(desc->ctx, ent->name, ent->name_len, nfs_compat_user_ino64(ent->ino), ent->d_type)) { - desc->eof = true; + desc->eob = true; break; } memcpy(desc->verf, verf, sizeof(desc->verf)); @@ -1003,7 +1007,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, desc->duped = 1; } if (array->page_is_eof) - desc->eof = true; + desc->eof = !desc->eob; kunmap(desc->page); dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %llu\n", @@ -1040,12 +1044,13 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) goto out; desc->page_index = 0; + desc->cache_entry_index = 0; desc->last_cookie = desc->dir_cookie; desc->duped = 0; status = nfs_readdir_xdr_to_array(desc, desc->verf, verf, arrays, sz); - for (i = 0; !desc->eof && i < sz && arrays[i]; i++) { + for (i = 0; !desc->eob && i < sz && arrays[i]; i++) { desc->page = arrays[i]; nfs_do_filldir(desc, verf); } @@ -1104,9 +1109,15 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->duped = dir_ctx->duped; page_index = dir_ctx->page_index; desc->attr_gencount = dir_ctx->attr_gencount; + desc->eof = dir_ctx->eof; memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf)); spin_unlock(&file->f_lock); + if (desc->eof) { + res = 0; + goto out_free; + } + if 
(test_and_clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags) && list_is_singular(&nfsi->open_files)) invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1); @@ -1140,7 +1151,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) nfs_do_filldir(desc, nfsi->cookieverf); nfs_readdir_page_unlock_and_put_cached(desc); - } while (!desc->eof); + } while (!desc->eob && !desc->eof); spin_lock(&file->f_lock); dir_ctx->dir_cookie = desc->dir_cookie; @@ -1148,9 +1159,10 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) dir_ctx->duped = desc->duped; dir_ctx->attr_gencount = desc->attr_gencount; dir_ctx->page_index = desc->page_index; + dir_ctx->eof = desc->eof; memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf)); spin_unlock(&file->f_lock); - +out_free: kfree(desc); out: @@ -1192,6 +1204,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) if (offset == 0) memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf)); dir_ctx->duped = 0; + dir_ctx->eof = false; } spin_unlock(&filp->f_lock); return offset; @@ -1324,6 +1337,14 @@ void nfs_clear_verifier_delegated(struct inode *inode) EXPORT_SYMBOL_GPL(nfs_clear_verifier_delegated); #endif /* IS_ENABLED(CONFIG_NFS_V4) */ +static int nfs_dentry_verify_change(struct inode *dir, struct dentry *dentry) +{ + if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE) && + d_really_is_negative(dentry)) + return dentry->d_time == inode_peek_iversion_raw(dir); + return nfs_verify_change_attribute(dir, dentry->d_time); +} + /* * A check for whether or not the parent directory has changed. * In the case it has, we assume that the dentries are untrustworthy @@ -1337,7 +1358,7 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry, return 1; if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) return 0; - if (!nfs_verify_change_attribute(dir, dentry->d_time)) + if (!nfs_dentry_verify_change(dir, dentry)) return 0; /* Revalidate nfsi->cache_change_attribute before we declare a match */ if (nfs_mapping_need_revalidate_inode(dir)) { @@ -1346,7 +1367,7 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry, if (__nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0) return 0; } - if (!nfs_verify_change_attribute(dir, dentry->d_time)) + if (!nfs_dentry_verify_change(dir, dentry)) return 0; return 1; } @@ -1436,6 +1457,9 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, return 0; if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) return 1; + /* Case insensitive server? Revalidate negative dentries */ + if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) + return 1; return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU); } @@ -1536,7 +1560,7 @@ out: * If the lookup failed despite the dentry change attribute being * a match, then we should revalidate the directory cache. 
*/ - if (!ret && nfs_verify_change_attribute(dir, dentry->d_time)) + if (!ret && nfs_dentry_verify_change(dir, dentry)) nfs_mark_dir_for_revalidate(dir); return nfs_lookup_revalidate_done(dir, dentry, inode, ret); } @@ -1775,8 +1799,11 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in dir_verifier = nfs_save_change_attribute(dir); trace_nfs_lookup_enter(dir, dentry, flags); error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr); - if (error == -ENOENT) + if (error == -ENOENT) { + if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) + dir_verifier = inode_peek_iversion_raw(dir); goto no_entry; + } if (error < 0) { res = ERR_PTR(error); goto out; @@ -1805,6 +1832,14 @@ out: } EXPORT_SYMBOL_GPL(nfs_lookup); +void nfs_d_prune_case_insensitive_aliases(struct inode *inode) +{ + /* Case insensitive server? Revalidate dentries */ + if (inode && nfs_server_capable(inode, NFS_CAP_CASE_INSENSITIVE)) + d_prune_aliases(inode); +} +EXPORT_SYMBOL_GPL(nfs_d_prune_case_insensitive_aliases); + #if IS_ENABLED(CONFIG_NFS_V4) static int nfs4_lookup_revalidate(struct dentry *, unsigned int); @@ -1866,6 +1901,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, struct iattr attr = { .ia_valid = ATTR_OPEN }; struct inode *inode; unsigned int lookup_flags = 0; + unsigned long dir_verifier; bool switched = false; int created = 0; int err; @@ -1939,7 +1975,11 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, switch (err) { case -ENOENT: d_splice_alias(NULL, dentry); - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) + dir_verifier = inode_peek_iversion_raw(dir); + else + dir_verifier = nfs_save_change_attribute(dir); + nfs_set_verifier(dentry, dir_verifier); break; case -EISDIR: case -ENOTDIR: @@ -1967,6 +2007,24 @@ out: no_open: res = nfs_lookup(dir, dentry, lookup_flags); + if (!res) { + inode = d_inode(dentry); + if ((lookup_flags & LOOKUP_DIRECTORY) && inode && + !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) + res = ERR_PTR(-ENOTDIR); + else if (inode && S_ISREG(inode->i_mode)) + res = ERR_PTR(-EOPENSTALE); + } else if (!IS_ERR(res)) { + inode = d_inode(res); + if ((lookup_flags & LOOKUP_DIRECTORY) && inode && + !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) { + dput(res); + res = ERR_PTR(-ENOTDIR); + } else if (inode && S_ISREG(inode->i_mode)) { + dput(res); + res = ERR_PTR(-EOPENSTALE); + } + } if (switched) { d_lookup_done(dentry); if (!res) @@ -2185,8 +2243,10 @@ static void nfs_dentry_remove_handle_error(struct inode *dir, switch (error) { case -ENOENT: d_delete(dentry); - fallthrough; + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + break; case 0: + nfs_d_prune_case_insensitive_aliases(d_inode(dentry)); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); } } @@ -2379,6 +2439,8 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) trace_nfs_link_enter(inode, dir, dentry); d_drop(dentry); + if (S_ISREG(inode->i_mode)) + nfs_sync_inode(inode); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); if (error == 0) { nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); @@ -2468,6 +2530,8 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, } } + if (S_ISREG(old_inode->i_mode)) + nfs_sync_inode(old_inode); task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); if (IS_ERR(task)) { error = PTR_ERR(task); @@ -2528,7 +2592,7 @@ MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS 
access maximum total cache lengt static void nfs_access_free_entry(struct nfs_access_entry *entry) { - put_cred(entry->cred); + put_group_info(entry->group_info); kfree_rcu(entry, rcu_head); smp_mb__before_atomic(); atomic_long_dec(&nfs_access_nr_entries); @@ -2654,6 +2718,43 @@ void nfs_access_zap_cache(struct inode *inode) } EXPORT_SYMBOL_GPL(nfs_access_zap_cache); +static int access_cmp(const struct cred *a, const struct nfs_access_entry *b) +{ + struct group_info *ga, *gb; + int g; + + if (uid_lt(a->fsuid, b->fsuid)) + return -1; + if (uid_gt(a->fsuid, b->fsuid)) + return 1; + + if (gid_lt(a->fsgid, b->fsgid)) + return -1; + if (gid_gt(a->fsgid, b->fsgid)) + return 1; + + ga = a->group_info; + gb = b->group_info; + if (ga == gb) + return 0; + if (ga == NULL) + return -1; + if (gb == NULL) + return 1; + if (ga->ngroups < gb->ngroups) + return -1; + if (ga->ngroups > gb->ngroups) + return 1; + + for (g = 0; g < ga->ngroups; g++) { + if (gid_lt(ga->gid[g], gb->gid[g])) + return -1; + if (gid_gt(ga->gid[g], gb->gid[g])) + return 1; + } + return 0; +} + static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, const struct cred *cred) { struct rb_node *n = NFS_I(inode)->access_cache.rb_node; @@ -2661,7 +2762,7 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, co while (n != NULL) { struct nfs_access_entry *entry = rb_entry(n, struct nfs_access_entry, rb_node); - int cmp = cred_fscmp(cred, entry->cred); + int cmp = access_cmp(cred, entry); if (cmp < 0) n = n->rb_left; @@ -2673,7 +2774,7 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, co return NULL; } -static int nfs_access_get_cached_locked(struct inode *inode, const struct cred *cred, struct nfs_access_entry *res, bool may_block) +static int nfs_access_get_cached_locked(struct inode *inode, const struct cred *cred, u32 *mask, bool may_block) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_access_entry *cache; @@ -2703,8 +2804,7 @@ static int nfs_access_get_cached_locked(struct inode *inode, const struct cred * spin_lock(&inode->i_lock); retry = false; } - res->cred = cache->cred; - res->mask = cache->mask; + *mask = cache->mask; list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru); err = 0; out: @@ -2716,7 +2816,7 @@ out_zap: return -ENOENT; } -static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cred, struct nfs_access_entry *res) +static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cred, u32 *mask) { /* Only check the most recently returned cache entry, * but do it without locking. 
@@ -2732,35 +2832,36 @@ static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cre lh = rcu_dereference(list_tail_rcu(&nfsi->access_cache_entry_lru)); cache = list_entry(lh, struct nfs_access_entry, lru); if (lh == &nfsi->access_cache_entry_lru || - cred_fscmp(cred, cache->cred) != 0) + access_cmp(cred, cache) != 0) cache = NULL; if (cache == NULL) goto out; if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS)) goto out; - res->cred = cache->cred; - res->mask = cache->mask; + *mask = cache->mask; err = 0; out: rcu_read_unlock(); return err; } -int nfs_access_get_cached(struct inode *inode, const struct cred *cred, struct -nfs_access_entry *res, bool may_block) +int nfs_access_get_cached(struct inode *inode, const struct cred *cred, + u32 *mask, bool may_block) { int status; - status = nfs_access_get_cached_rcu(inode, cred, res); + status = nfs_access_get_cached_rcu(inode, cred, mask); if (status != 0) - status = nfs_access_get_cached_locked(inode, cred, res, + status = nfs_access_get_cached_locked(inode, cred, mask, may_block); return status; } EXPORT_SYMBOL_GPL(nfs_access_get_cached); -static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set) +static void nfs_access_add_rbtree(struct inode *inode, + struct nfs_access_entry *set, + const struct cred *cred) { struct nfs_inode *nfsi = NFS_I(inode); struct rb_root *root_node = &nfsi->access_cache; @@ -2773,7 +2874,7 @@ static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry * while (*p != NULL) { parent = *p; entry = rb_entry(parent, struct nfs_access_entry, rb_node); - cmp = cred_fscmp(set->cred, entry->cred); + cmp = access_cmp(cred, entry); if (cmp < 0) p = &parent->rb_left; @@ -2795,13 +2896,16 @@ found: nfs_access_free_entry(entry); } -void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) +void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set, + const struct cred *cred) { struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL); if (cache == NULL) return; RB_CLEAR_NODE(&cache->rb_node); - cache->cred = get_cred(set->cred); + cache->fsuid = cred->fsuid; + cache->fsgid = cred->fsgid; + cache->group_info = get_group_info(cred->group_info); cache->mask = set->mask; /* The above field assignments must be visible @@ -2809,7 +2913,7 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) * use rcu_assign_pointer, so just force the memory barrier. 
*/ smp_wmb(); - nfs_access_add_rbtree(inode, cache); + nfs_access_add_rbtree(inode, cache, cred); /* Update accounting */ smp_mb__before_atomic(); @@ -2874,7 +2978,7 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask) trace_nfs_access_enter(inode); - status = nfs_access_get_cached(inode, cred, &cache, may_block); + status = nfs_access_get_cached(inode, cred, &cache.mask, may_block); if (status == 0) goto out_cached; @@ -2894,8 +2998,7 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask) cache.mask |= NFS_ACCESS_DELETE | NFS_ACCESS_LOOKUP; else cache.mask |= NFS_ACCESS_EXECUTE; - cache.cred = cred; - status = NFS_PROTO(inode)->access(inode, &cache); + status = NFS_PROTO(inode)->access(inode, &cache, cred); if (status != 0) { if (status == -ESTALE) { if (!S_ISDIR(inode->i_mode)) @@ -2905,7 +3008,7 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask) } goto out; } - nfs_access_add_cache(inode, &cache); + nfs_access_add_cache(inode, &cache, cred); out_cached: cache_mask = nfs_access_calc_mask(cache.mask, inode->i_mode); if ((mask & ~cache_mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 9cff8709c80a..eabfdab543c8 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -59,6 +59,7 @@ #include "internal.h" #include "iostat.h" #include "pnfs.h" +#include "fscache.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -959,6 +960,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) } else { result = requested; } + nfs_fscache_invalidate(inode, FSCACHE_INVAL_DIO_WRITE); out_release: nfs_direct_req_release(dreq); out: diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 171c424cb6d5..01596f2d0a1e 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -158,5 +158,5 @@ const struct export_operations nfs_export_ops = { .fetch_iversion = nfs_fetch_iversion, .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK| EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS| - EXPORT_OP_NOATOMIC_ATTR|EXPORT_OP_SYNC_LOCKS, + EXPORT_OP_NOATOMIC_ATTR, }; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 24e7dccce355..76d76acbc594 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -84,6 +84,7 @@ nfs_file_release(struct inode *inode, struct file *filp) nfs_inc_stats(inode, NFSIOS_VFSRELEASE); nfs_file_clear_open_context(filp); + nfs_fscache_release_file(inode, filp); return 0; } EXPORT_SYMBOL_GPL(nfs_file_release); @@ -415,8 +416,7 @@ static void nfs_invalidate_page(struct page *page, unsigned int offset, return; /* Cancel any unstarted writes on this page */ nfs_wb_page_cancel(page_file_mapping(page)->host, page); - - nfs_fscache_invalidate_page(page, page->mapping->host); + wait_on_page_fscache(page); } /* @@ -475,12 +475,11 @@ static void nfs_check_dirty_writeback(struct page *page, static int nfs_launder_page(struct page *page) { struct inode *inode = page_file_mapping(page)->host; - struct nfs_inode *nfsi = NFS_I(inode); dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", inode->i_ino, (long long)page_offset(page)); - nfs_fscache_wait_on_page_write(nfsi, page); + wait_on_page_fscache(page); return nfs_wb_page(inode, page); } @@ -555,7 +554,11 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); /* make sure the cache has finished storing the page */ - nfs_fscache_wait_on_page_write(NFS_I(inode), page); + if (PageFsCache(page) && + wait_on_page_fscache_killable(vmf->page) < 0) { + ret = VM_FAULT_RETRY; + goto out; + } 
wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING, nfs_wait_bit_killable, TASK_KILLABLE); diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h index 79323b5dab0c..aed0748fd6ec 100644 --- a/fs/nfs/filelayout/filelayout.h +++ b/fs/nfs/filelayout/filelayout.h @@ -51,7 +51,7 @@ struct nfs4_file_layout_dsaddr { u32 stripe_count; u8 *stripe_indices; u32 ds_num; - struct nfs4_pnfs_ds *ds_list[1]; + struct nfs4_pnfs_ds *ds_list[]; }; struct nfs4_filelayout_segment { diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 86c3f7e69ec4..acf4b88889dc 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -136,9 +136,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out_err_free_stripe_indices; } - dsaddr = kzalloc(sizeof(*dsaddr) + - (sizeof(struct nfs4_pnfs_ds *) * (num - 1)), - gfp_flags); + dsaddr = kzalloc(struct_size(dsaddr, ds_list, num), gfp_flags); if (!dsaddr) goto out_err_free_stripe_indices; diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 0d444a90f513..ea17fa1f31ec 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -10,6 +10,7 @@ * Split from fs/nfs/super.c by David Howells <dhowells@redhat.com> */ +#include <linux/compat.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/fs_context.h> diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c deleted file mode 100644 index 573b1da9342c..000000000000 --- a/fs/nfs/fscache-index.c +++ /dev/null @@ -1,140 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* NFS FS-Cache index structure definition - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/nfs_fs.h> -#include <linux/nfs_fs_sb.h> -#include <linux/in6.h> -#include <linux/iversion.h> - -#include "internal.h" -#include "fscache.h" - -#define NFSDBG_FACILITY NFSDBG_FSCACHE - -/* - * Define the NFS filesystem for FS-Cache. Upon registration FS-Cache sticks - * the cookie for the top-level index object for NFS into here. The top-level - * index can than have other cache objects inserted into it. - */ -struct fscache_netfs nfs_fscache_netfs = { - .name = "nfs", - .version = 0, -}; - -/* - * Register NFS for caching - */ -int nfs_fscache_register(void) -{ - return fscache_register_netfs(&nfs_fscache_netfs); -} - -/* - * Unregister NFS for caching - */ -void nfs_fscache_unregister(void) -{ - fscache_unregister_netfs(&nfs_fscache_netfs); -} - -/* - * Define the server object for FS-Cache. This is used to describe a server - * object to fscache_acquire_cookie(). It is keyed by the NFS protocol and - * server address parameters. - */ -const struct fscache_cookie_def nfs_fscache_server_index_def = { - .name = "NFS.server", - .type = FSCACHE_COOKIE_TYPE_INDEX, -}; - -/* - * Define the superblock object for FS-Cache. This is used to describe a - * superblock object to fscache_acquire_cookie(). It is keyed by all the NFS - * parameters that might cause a separate superblock. 
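Note: the filelayout hunk above turns ds_list[1] into a C99 flexible array member and sizes the allocation with the kernel's struct_size() helper instead of the open-coded arithmetic on (num - 1). A minimal userspace sketch of the same sizing idea, without struct_size()'s overflow checking and with a hypothetical struct name:

#include <stdio.h>
#include <stdlib.h>

/* Sketch of a struct ending in a flexible array member, as ds_list[] now does. */
struct ds_table {
	unsigned int ds_num;
	void *ds_list[];	/* flexible array member: contributes nothing to sizeof() */
};

int main(void)
{
	unsigned int num = 4;

	/* Equivalent of struct_size(dsaddr, ds_list, num), minus the overflow checks. */
	size_t sz = sizeof(struct ds_table) + num * sizeof(void *);

	struct ds_table *t = calloc(1, sz);
	if (!t)
		return 1;
	t->ds_num = num;

	printf("header %zu bytes, total %zu bytes for %u entries\n",
	       sizeof(struct ds_table), sz, num);
	free(t);
	return 0;
}

Because the flexible array adds nothing to sizeof(), the header size plus num elements is exactly what the single kzalloc() needs.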
- */ -const struct fscache_cookie_def nfs_fscache_super_index_def = { - .name = "NFS.super", - .type = FSCACHE_COOKIE_TYPE_INDEX, -}; - -/* - * Consult the netfs about the state of an object - * - This function can be absent if the index carries no state data - * - The netfs data from the cookie being used as the target is - * presented, as is the auxiliary data - */ -static -enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, - const void *data, - uint16_t datalen, - loff_t object_size) -{ - struct nfs_fscache_inode_auxdata auxdata; - struct nfs_inode *nfsi = cookie_netfs_data; - - if (datalen != sizeof(auxdata)) - return FSCACHE_CHECKAUX_OBSOLETE; - - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec; - auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec; - auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec; - auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec; - - if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) - auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode); - - if (memcmp(data, &auxdata, datalen) != 0) - return FSCACHE_CHECKAUX_OBSOLETE; - - return FSCACHE_CHECKAUX_OKAY; -} - -/* - * Get an extra reference on a read context. - * - This function can be absent if the completion function doesn't require a - * context. - * - The read context is passed back to NFS in the event that a data read on the - * cache fails with EIO - in which case the server must be contacted to - * retrieve the data, which requires the read context for security. - */ -static void nfs_fh_get_context(void *cookie_netfs_data, void *context) -{ - get_nfs_open_context(context); -} - -/* - * Release an extra reference on a read context. - * - This function can be absent if the completion function doesn't require a - * context. - */ -static void nfs_fh_put_context(void *cookie_netfs_data, void *context) -{ - if (context) - put_nfs_open_context(context); -} - -/* - * Define the inode object for FS-Cache. This is used to describe an inode - * object to fscache_acquire_cookie(). It is keyed by the NFS file handle for - * an inode. - * - * Coherency is managed by comparing the copies of i_size, i_mtime and i_ctime - * held in the cache auxiliary data for the data storage object with those in - * the inode struct in memory. - */ -const struct fscache_cookie_def nfs_fscache_inode_object_def = { - .name = "NFS.fh", - .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .check_aux = nfs_fscache_inode_check_aux, - .get_context = nfs_fh_get_context, - .put_context = nfs_fh_put_context, -}; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index d743629e05e1..cfe901650ab0 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -22,24 +22,18 @@ #define NFSDBG_FACILITY NFSDBG_FSCACHE -static struct rb_root nfs_fscache_keys = RB_ROOT; -static DEFINE_SPINLOCK(nfs_fscache_keys_lock); +#define NFS_MAX_KEY_LEN 1000 -/* - * Layout of the key for an NFS server cache object. 
- */ -struct nfs_server_key { - struct { - uint16_t nfsversion; /* NFS protocol version */ - uint32_t minorversion; /* NFSv4 minor version */ - uint16_t family; /* address family */ - __be16 port; /* IP port */ - } hdr; - union { - struct in_addr ipv4_addr; /* IPv4 address */ - struct in6_addr ipv6_addr; /* IPv6 address */ - }; -} __packed; +static bool nfs_append_int(char *key, int *_len, unsigned long long x) +{ + if (*_len > NFS_MAX_KEY_LEN) + return false; + if (x == 0) + key[(*_len)++] = ','; + else + *_len += sprintf(key + *_len, ",%llx", x); + return true; +} /* * Get the per-client index cookie for an NFS client if the appropriate mount @@ -47,160 +41,108 @@ struct nfs_server_key { * - We always try and get an index cookie for the client, but get filehandle * cookies on a per-superblock basis, depending on the mount flags */ -void nfs_fscache_get_client_cookie(struct nfs_client *clp) +static bool nfs_fscache_get_client_key(struct nfs_client *clp, + char *key, int *_len) { const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr; const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr; - struct nfs_server_key key; - uint16_t len = sizeof(key.hdr); - memset(&key, 0, sizeof(key)); - key.hdr.nfsversion = clp->rpc_ops->version; - key.hdr.minorversion = clp->cl_minorversion; - key.hdr.family = clp->cl_addr.ss_family; + *_len += snprintf(key + *_len, NFS_MAX_KEY_LEN - *_len, + ",%u.%u,%x", + clp->rpc_ops->version, + clp->cl_minorversion, + clp->cl_addr.ss_family); switch (clp->cl_addr.ss_family) { case AF_INET: - key.hdr.port = sin->sin_port; - key.ipv4_addr = sin->sin_addr; - len += sizeof(key.ipv4_addr); - break; + if (!nfs_append_int(key, _len, sin->sin_port) || + !nfs_append_int(key, _len, sin->sin_addr.s_addr)) + return false; + return true; case AF_INET6: - key.hdr.port = sin6->sin6_port; - key.ipv6_addr = sin6->sin6_addr; - len += sizeof(key.ipv6_addr); - break; + if (!nfs_append_int(key, _len, sin6->sin6_port) || + !nfs_append_int(key, _len, sin6->sin6_addr.s6_addr32[0]) || + !nfs_append_int(key, _len, sin6->sin6_addr.s6_addr32[1]) || + !nfs_append_int(key, _len, sin6->sin6_addr.s6_addr32[2]) || + !nfs_append_int(key, _len, sin6->sin6_addr.s6_addr32[3])) + return false; + return true; default: printk(KERN_WARNING "NFS: Unknown network family '%d'\n", clp->cl_addr.ss_family); - clp->fscache = NULL; - return; + return false; } - - /* create a cache index for looking up filehandles */ - clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index, - &nfs_fscache_server_index_def, - &key, len, - NULL, 0, - clp, 0, true); - dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n", - clp, clp->fscache); -} - -/* - * Dispose of a per-client cookie - */ -void nfs_fscache_release_client_cookie(struct nfs_client *clp) -{ - dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n", - clp, clp->fscache); - - fscache_relinquish_cookie(clp->fscache, NULL, false); - clp->fscache = NULL; } /* - * Get the cache cookie for an NFS superblock. We have to handle - * uniquification here because the cache doesn't do it for us. + * Get the cache cookie for an NFS superblock. * * The default uniquifier is just an empty string, but it may be overridden * either by the 'fsc=xxx' option to mount, or by inheriting it from the parent * superblock across an automount point of some nature. 
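Note: with fscache volumes the binary nfs_server_key structure above goes away and the client part of the volume key becomes text: nfs_fscache_get_client_key() emits the protocol version, minor version and address family, then nfs_append_int() appends the port and address words as ",<hex>" (a bare comma when the value is zero), bounded by NFS_MAX_KEY_LEN. A standalone C sketch of that append pattern, using illustrative values only:

#include <stdio.h>
#include <stdbool.h>
#include <string.h>

#define MAX_KEY_LEN 1000

/* Append ",<hex>" to key, or just "," when the value is zero, as the kernel
 * helper does; returns false once the buffer budget is exceeded. */
static bool append_int(char *key, int *len, unsigned long long x)
{
	if (*len > MAX_KEY_LEN)
		return false;
	if (x == 0)
		key[(*len)++] = ',';
	else
		*len += sprintf(key + *len, ",%llx", x);
	return true;
}

int main(void)
{
	char key[MAX_KEY_LEN + 24];
	int len = 3;

	memcpy(key, "nfs", 3);
	/* Illustrative values only: NFSv4.1, AF_INET (2), port 2049, addr 0x7f000001. */
	len += snprintf(key + len, MAX_KEY_LEN - len, ",%u.%u,%x", 4, 1, 2);
	if (!append_int(key, &len, 2049) ||
	    !append_int(key, &len, 0x7f000001ULL))
		return 1;
	key[len] = '\0';

	printf("volume key: %s\n", key);	/* e.g. "nfs,4.1,2,801,7f000001" */
	return 0;
}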
*/ -void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int ulen) +int nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int ulen) { - struct nfs_fscache_key *key, *xkey; + struct fscache_volume *vcookie; struct nfs_server *nfss = NFS_SB(sb); - struct rb_node **p, *parent; - int diff; + unsigned int len = 3; + char *key; - nfss->fscache_key = NULL; - nfss->fscache = NULL; - if (!uniq) { - uniq = ""; - ulen = 1; + if (uniq) { + nfss->fscache_uniq = kmemdup_nul(uniq, ulen, GFP_KERNEL); + if (!nfss->fscache_uniq) + return -ENOMEM; } - key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL); + key = kmalloc(NFS_MAX_KEY_LEN + 24, GFP_KERNEL); if (!key) - return; - - key->nfs_client = nfss->nfs_client; - key->key.super.s_flags = sb->s_flags & NFS_SB_MASK; - key->key.nfs_server.flags = nfss->flags; - key->key.nfs_server.rsize = nfss->rsize; - key->key.nfs_server.wsize = nfss->wsize; - key->key.nfs_server.acregmin = nfss->acregmin; - key->key.nfs_server.acregmax = nfss->acregmax; - key->key.nfs_server.acdirmin = nfss->acdirmin; - key->key.nfs_server.acdirmax = nfss->acdirmax; - key->key.nfs_server.fsid = nfss->fsid; - key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor; - - key->key.uniq_len = ulen; - memcpy(key->key.uniquifier, uniq, ulen); - - spin_lock(&nfs_fscache_keys_lock); - p = &nfs_fscache_keys.rb_node; - parent = NULL; - while (*p) { - parent = *p; - xkey = rb_entry(parent, struct nfs_fscache_key, node); - - if (key->nfs_client < xkey->nfs_client) - goto go_left; - if (key->nfs_client > xkey->nfs_client) - goto go_right; - - diff = memcmp(&key->key, &xkey->key, sizeof(key->key)); - if (diff < 0) - goto go_left; - if (diff > 0) - goto go_right; - - if (key->key.uniq_len == 0) - goto non_unique; - diff = memcmp(key->key.uniquifier, - xkey->key.uniquifier, - key->key.uniq_len); - if (diff < 0) - goto go_left; - if (diff > 0) - goto go_right; - goto non_unique; - - go_left: - p = &(*p)->rb_left; - continue; - go_right: - p = &(*p)->rb_right; + return -ENOMEM; + + memcpy(key, "nfs", 3); + if (!nfs_fscache_get_client_key(nfss->nfs_client, key, &len) || + !nfs_append_int(key, &len, nfss->fsid.major) || + !nfs_append_int(key, &len, nfss->fsid.minor) || + !nfs_append_int(key, &len, sb->s_flags & NFS_SB_MASK) || + !nfs_append_int(key, &len, nfss->flags) || + !nfs_append_int(key, &len, nfss->rsize) || + !nfs_append_int(key, &len, nfss->wsize) || + !nfs_append_int(key, &len, nfss->acregmin) || + !nfs_append_int(key, &len, nfss->acregmax) || + !nfs_append_int(key, &len, nfss->acdirmin) || + !nfs_append_int(key, &len, nfss->acdirmax) || + !nfs_append_int(key, &len, nfss->client->cl_auth->au_flavor)) + goto out; + + if (ulen > 0) { + if (ulen > NFS_MAX_KEY_LEN - len) + goto out; + key[len++] = ','; + memcpy(key + len, uniq, ulen); + len += ulen; } - - rb_link_node(&key->node, parent, p); - rb_insert_color(&key->node, &nfs_fscache_keys); - spin_unlock(&nfs_fscache_keys_lock); - nfss->fscache_key = key; + key[len] = 0; /* create a cache index for looking up filehandles */ - nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache, - &nfs_fscache_super_index_def, - &key->key, - sizeof(key->key) + ulen, - NULL, 0, - nfss, 0, true); + vcookie = fscache_acquire_volume(key, + NULL, /* preferred_cache */ + NULL, 0 /* coherency_data */); dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n", - nfss, nfss->fscache); - return; + nfss, vcookie); + if (IS_ERR(vcookie)) { + if (vcookie != ERR_PTR(-EBUSY)) { + kfree(key); + return PTR_ERR(vcookie); + } + 
pr_err("NFS: Cache volume key already in use (%s)\n", key); + vcookie = NULL; + } + nfss->fscache = vcookie; -non_unique: - spin_unlock(&nfs_fscache_keys_lock); +out: kfree(key); - nfss->fscache_key = NULL; - nfss->fscache = NULL; - printk(KERN_WARNING "NFS:" - " Cache request denied due to non-unique superblock keys\n"); + return 0; } /* @@ -213,29 +155,9 @@ void nfs_fscache_release_super_cookie(struct super_block *sb) dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n", nfss, nfss->fscache); - fscache_relinquish_cookie(nfss->fscache, NULL, false); + fscache_relinquish_volume(nfss->fscache, NULL, false); nfss->fscache = NULL; - - if (nfss->fscache_key) { - spin_lock(&nfs_fscache_keys_lock); - rb_erase(&nfss->fscache_key->node, &nfs_fscache_keys); - spin_unlock(&nfs_fscache_keys_lock); - kfree(nfss->fscache_key); - nfss->fscache_key = NULL; - } -} - -static void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *auxdata, - struct nfs_inode *nfsi) -{ - memset(auxdata, 0, sizeof(*auxdata)); - auxdata->mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec; - auxdata->mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec; - auxdata->ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec; - auxdata->ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec; - - if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) - auxdata->change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode); + kfree(nfss->fscache_uniq); } /* @@ -254,10 +176,12 @@ void nfs_fscache_init_inode(struct inode *inode) nfs_fscache_update_auxdata(&auxdata, nfsi); nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache, - &nfs_fscache_inode_object_def, - nfsi->fh.data, nfsi->fh.size, - &auxdata, sizeof(auxdata), - nfsi, nfsi->vfs_inode.i_size, false); + 0, + nfsi->fh.data, /* index_key */ + nfsi->fh.size, + &auxdata, /* aux_data */ + sizeof(auxdata), + i_size_read(&nfsi->vfs_inode)); } /* @@ -265,24 +189,15 @@ void nfs_fscache_init_inode(struct inode *inode) */ void nfs_fscache_clear_inode(struct inode *inode) { - struct nfs_fscache_inode_auxdata auxdata; struct nfs_inode *nfsi = NFS_I(inode); struct fscache_cookie *cookie = nfs_i_fscache(inode); dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie); - nfs_fscache_update_auxdata(&auxdata, nfsi); - fscache_relinquish_cookie(cookie, &auxdata, false); + fscache_relinquish_cookie(cookie, false); nfsi->fscache = NULL; } -static bool nfs_fscache_can_enable(void *data) -{ - struct inode *inode = data; - - return !inode_is_open_for_write(inode); -} - /* * Enable or disable caching for a file that is being opened as appropriate. 
* The cookie is allocated when the inode is initialised, but is not enabled at @@ -307,100 +222,104 @@ void nfs_fscache_open_file(struct inode *inode, struct file *filp) struct nfs_fscache_inode_auxdata auxdata; struct nfs_inode *nfsi = NFS_I(inode); struct fscache_cookie *cookie = nfs_i_fscache(inode); + bool open_for_write = inode_is_open_for_write(inode); if (!fscache_cookie_valid(cookie)) return; - nfs_fscache_update_auxdata(&auxdata, nfsi); - - if (inode_is_open_for_write(inode)) { + fscache_use_cookie(cookie, open_for_write); + if (open_for_write) { dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi); - clear_bit(NFS_INO_FSCACHE, &nfsi->flags); - fscache_disable_cookie(cookie, &auxdata, true); - fscache_uncache_all_inode_pages(cookie, inode); - } else { - dfprintk(FSCACHE, "NFS: nfsi 0x%p enabling cache\n", nfsi); - fscache_enable_cookie(cookie, &auxdata, nfsi->vfs_inode.i_size, - nfs_fscache_can_enable, inode); - if (fscache_cookie_enabled(cookie)) - set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); + nfs_fscache_update_auxdata(&auxdata, nfsi); + fscache_invalidate(cookie, &auxdata, i_size_read(inode), + FSCACHE_INVAL_DIO_WRITE); } } EXPORT_SYMBOL_GPL(nfs_fscache_open_file); -/* - * Release the caching state associated with a page, if the page isn't busy - * interacting with the cache. - * - Returns true (can release page) or false (page busy). - */ -int nfs_fscache_release_page(struct page *page, gfp_t gfp) +void nfs_fscache_release_file(struct inode *inode, struct file *filp) { - if (PageFsCache(page)) { - struct fscache_cookie *cookie = nfs_i_fscache(page->mapping->host); - - BUG_ON(!cookie); - dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n", - cookie, page, NFS_I(page->mapping->host)); - - if (!fscache_maybe_release_page(cookie, page, gfp)) - return 0; + struct nfs_fscache_inode_auxdata auxdata; + struct nfs_inode *nfsi = NFS_I(inode); + struct fscache_cookie *cookie = nfs_i_fscache(inode); - nfs_inc_fscache_stats(page->mapping->host, - NFSIOS_FSCACHE_PAGES_UNCACHED); + if (fscache_cookie_valid(cookie)) { + nfs_fscache_update_auxdata(&auxdata, nfsi); + fscache_unuse_cookie(cookie, &auxdata, NULL); } +} - return 1; +static inline void fscache_end_operation(struct netfs_cache_resources *cres) +{ + const struct netfs_cache_ops *ops = fscache_operation_valid(cres); + + if (ops) + ops->end_operation(cres); } /* - * Release the caching state associated with a page if undergoing complete page - * invalidation. + * Fallback page reading interface. 
*/ -void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode) +static int fscache_fallback_read_page(struct inode *inode, struct page *page) { + struct netfs_cache_resources cres; struct fscache_cookie *cookie = nfs_i_fscache(inode); + struct iov_iter iter; + struct bio_vec bvec[1]; + int ret; - BUG_ON(!cookie); - - dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n", - cookie, page, NFS_I(inode)); + memset(&cres, 0, sizeof(cres)); + bvec[0].bv_page = page; + bvec[0].bv_offset = 0; + bvec[0].bv_len = PAGE_SIZE; + iov_iter_bvec(&iter, READ, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); - fscache_wait_on_page_write(cookie, page); + ret = fscache_begin_read_operation(&cres, cookie); + if (ret < 0) + return ret; - BUG_ON(!PageLocked(page)); - fscache_uncache_page(cookie, page); - nfs_inc_fscache_stats(page->mapping->host, - NFSIOS_FSCACHE_PAGES_UNCACHED); + ret = fscache_read(&cres, page_offset(page), &iter, NETFS_READ_HOLE_FAIL, + NULL, NULL); + fscache_end_operation(&cres); + return ret; } /* - * Handle completion of a page being read from the cache. - * - Called in process (keventd) context. + * Fallback page writing interface. */ -static void nfs_readpage_from_fscache_complete(struct page *page, - void *context, - int error) +static int fscache_fallback_write_page(struct inode *inode, struct page *page, + bool no_space_allocated_yet) { - dfprintk(FSCACHE, - "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n", - page, context, error); - - /* - * If the read completes with an error, mark the page with PG_checked, - * unlock the page, and let the VM reissue the readpage. - */ - if (!error) - SetPageUptodate(page); - else - SetPageChecked(page); - unlock_page(page); + struct netfs_cache_resources cres; + struct fscache_cookie *cookie = nfs_i_fscache(inode); + struct iov_iter iter; + struct bio_vec bvec[1]; + loff_t start = page_offset(page); + size_t len = PAGE_SIZE; + int ret; + + memset(&cres, 0, sizeof(cres)); + bvec[0].bv_page = page; + bvec[0].bv_offset = 0; + bvec[0].bv_len = PAGE_SIZE; + iov_iter_bvec(&iter, WRITE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + + ret = fscache_begin_write_operation(&cres, cookie); + if (ret < 0) + return ret; + + ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode), + no_space_allocated_yet); + if (ret == 0) + ret = fscache_write(&cres, page_offset(page), &iter, NULL, NULL); + fscache_end_operation(&cres); + return ret; } /* * Retrieve a page from fscache */ -int __nfs_readpage_from_fscache(struct nfs_open_context *ctx, - struct inode *inode, struct page *page) +int __nfs_readpage_from_fscache(struct inode *inode, struct page *page) { int ret; @@ -409,112 +328,49 @@ int __nfs_readpage_from_fscache(struct nfs_open_context *ctx, nfs_i_fscache(inode), page, page->index, page->flags, inode); if (PageChecked(page)) { + dfprintk(FSCACHE, "NFS: readpage_from_fscache: PageChecked\n"); ClearPageChecked(page); return 1; } - ret = fscache_read_or_alloc_page(nfs_i_fscache(inode), - page, - nfs_readpage_from_fscache_complete, - ctx, - GFP_KERNEL); - - switch (ret) { - case 0: /* read BIO submitted (page in fscache) */ - dfprintk(FSCACHE, - "NFS: readpage_from_fscache: BIO submitted\n"); - nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK); - return ret; - - case -ENOBUFS: /* inode not in cache */ - case -ENODATA: /* page not in cache */ + ret = fscache_fallback_read_page(inode, page); + if (ret < 0) { nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL); dfprintk(FSCACHE, - "NFS: readpage_from_fscache %d\n", ret); - 
return 1; - - default: - dfprintk(FSCACHE, "NFS: readpage_from_fscache %d\n", ret); - nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL); - } - return ret; -} - -/* - * Retrieve a set of pages from fscache - */ -int __nfs_readpages_from_fscache(struct nfs_open_context *ctx, - struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) -{ - unsigned npages = *nr_pages; - int ret; - - dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", - nfs_i_fscache(inode), npages, inode); - - ret = fscache_read_or_alloc_pages(nfs_i_fscache(inode), - mapping, pages, nr_pages, - nfs_readpage_from_fscache_complete, - ctx, - mapping_gfp_mask(mapping)); - if (*nr_pages < npages) - nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, - npages); - if (*nr_pages > 0) - nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, - *nr_pages); - - switch (ret) { - case 0: /* read submitted to the cache for all pages */ - BUG_ON(!list_empty(pages)); - BUG_ON(*nr_pages != 0); - dfprintk(FSCACHE, - "NFS: nfs_getpages_from_fscache: submitted\n"); - + "NFS: readpage_from_fscache failed %d\n", ret); + SetPageChecked(page); return ret; - - case -ENOBUFS: /* some pages aren't cached and can't be */ - case -ENODATA: /* some pages aren't cached */ - dfprintk(FSCACHE, - "NFS: nfs_getpages_from_fscache: no page: %d\n", ret); - return 1; - - default: - dfprintk(FSCACHE, - "NFS: nfs_getpages_from_fscache: ret %d\n", ret); } - return ret; + /* Read completed synchronously */ + dfprintk(FSCACHE, "NFS: readpage_from_fscache: read successful\n"); + nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK); + SetPageUptodate(page); + return 0; } /* - * Store a newly fetched page in fscache - * - PG_fscache must be set on the page + * Store a newly fetched page in fscache. We can be certain there's no page + * stored in the cache as yet otherwise we would've read it from there. 
*/ -void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync) +void __nfs_readpage_to_fscache(struct inode *inode, struct page *page) { int ret; dfprintk(FSCACHE, - "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n", - nfs_i_fscache(inode), page, page->index, page->flags, sync); + "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx))\n", + nfs_i_fscache(inode), page, page->index, page->flags); + + ret = fscache_fallback_write_page(inode, page, true); - ret = fscache_write_page(nfs_i_fscache(inode), page, - inode->i_size, GFP_KERNEL); dfprintk(FSCACHE, "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n", page, page->index, page->flags, ret); if (ret != 0) { - fscache_uncache_page(nfs_i_fscache(inode), page); - nfs_inc_fscache_stats(inode, - NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL); + nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL); nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED); } else { - nfs_inc_fscache_stats(inode, - NFSIOS_FSCACHE_PAGES_WRITTEN_OK); + nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_OK); } } diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index 6754c8607230..25a5c0f82392 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -8,51 +8,16 @@ #ifndef _NFS_FSCACHE_H #define _NFS_FSCACHE_H +#include <linux/swap.h> #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/nfs4_mount.h> #include <linux/fscache.h> +#include <linux/iversion.h> #ifdef CONFIG_NFS_FSCACHE /* - * set of NFS FS-Cache objects that form a superblock key - */ -struct nfs_fscache_key { - struct rb_node node; - struct nfs_client *nfs_client; /* the server */ - - /* the elements of the unique key - as used by nfs_compare_super() and - * nfs_compare_mount_options() to distinguish superblocks */ - struct { - struct { - unsigned long s_flags; /* various flags - * (& NFS_MS_MASK) */ - } super; - - struct { - struct nfs_fsid fsid; - int flags; - unsigned int rsize; /* read size */ - unsigned int wsize; /* write size */ - unsigned int acregmin; /* attr cache timeouts */ - unsigned int acregmax; - unsigned int acdirmin; - unsigned int acdirmax; - } nfs_server; - - struct { - rpc_authflavor_t au_flavor; - } rpc_auth; - - /* uniquifier - can be used if nfs_server.flags includes - * NFS_MOUNT_UNSHARED */ - u8 uniq_len; - char uniquifier[0]; - } key; -}; - -/* * Definition of the auxiliary data attached to NFS inode storage objects * within the cache. 
* @@ -70,84 +35,42 @@ struct nfs_fscache_inode_auxdata { }; /* - * fscache-index.c - */ -extern struct fscache_netfs nfs_fscache_netfs; -extern const struct fscache_cookie_def nfs_fscache_server_index_def; -extern const struct fscache_cookie_def nfs_fscache_super_index_def; -extern const struct fscache_cookie_def nfs_fscache_inode_object_def; - -extern int nfs_fscache_register(void); -extern void nfs_fscache_unregister(void); - -/* * fscache.c */ -extern void nfs_fscache_get_client_cookie(struct nfs_client *); -extern void nfs_fscache_release_client_cookie(struct nfs_client *); - -extern void nfs_fscache_get_super_cookie(struct super_block *, const char *, int); +extern int nfs_fscache_get_super_cookie(struct super_block *, const char *, int); extern void nfs_fscache_release_super_cookie(struct super_block *); extern void nfs_fscache_init_inode(struct inode *); extern void nfs_fscache_clear_inode(struct inode *); extern void nfs_fscache_open_file(struct inode *, struct file *); +extern void nfs_fscache_release_file(struct inode *, struct file *); -extern void __nfs_fscache_invalidate_page(struct page *, struct inode *); -extern int nfs_fscache_release_page(struct page *, gfp_t); +extern int __nfs_readpage_from_fscache(struct inode *, struct page *); +extern void __nfs_read_completion_to_fscache(struct nfs_pgio_header *hdr, + unsigned long bytes); +extern void __nfs_readpage_to_fscache(struct inode *, struct page *); -extern int __nfs_readpage_from_fscache(struct nfs_open_context *, - struct inode *, struct page *); -extern int __nfs_readpages_from_fscache(struct nfs_open_context *, - struct inode *, struct address_space *, - struct list_head *, unsigned *); -extern void __nfs_readpage_to_fscache(struct inode *, struct page *, int); - -/* - * wait for a page to complete writing to the cache - */ -static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi, - struct page *page) -{ - if (PageFsCache(page)) - fscache_wait_on_page_write(nfsi->fscache, page); -} - -/* - * release the caching state associated with a page if undergoing complete page - * invalidation - */ -static inline void nfs_fscache_invalidate_page(struct page *page, - struct inode *inode) +static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp) { - if (PageFsCache(page)) - __nfs_fscache_invalidate_page(page, inode); + if (PageFsCache(page)) { + if (current_is_kswapd() || !(gfp & __GFP_FS)) + return false; + wait_on_page_fscache(page); + fscache_note_page_release(nfs_i_fscache(page->mapping->host)); + nfs_inc_fscache_stats(page->mapping->host, + NFSIOS_FSCACHE_PAGES_UNCACHED); + } + return true; } /* * Retrieve a page from an inode data storage object. */ -static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx, - struct inode *inode, +static inline int nfs_readpage_from_fscache(struct inode *inode, struct page *page) { if (NFS_I(inode)->fscache) - return __nfs_readpage_from_fscache(ctx, inode, page); - return -ENOBUFS; -} - -/* - * Retrieve a set of pages from an inode data storage object. - */ -static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx, - struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) -{ - if (NFS_I(inode)->fscache) - return __nfs_readpages_from_fscache(ctx, inode, mapping, pages, - nr_pages); + return __nfs_readpage_from_fscache(inode, page); return -ENOBUFS; } @@ -156,27 +79,38 @@ static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx, * in the cache. 
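Note: the nfs_fscache_inode_auxdata blob declared above carries the coherency data kept with each inode cookie: mtime, ctime and, on NFSv4, the raw change attribute (it is filled by the nfs_fscache_update_auxdata() inline that follows). Whether cached contents are still usable comes down to whether a freshly built blob matches the stored one. A self-contained C sketch of that fill-and-compare cycle, with simplified field types and illustrative values:

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>

/* Sketch of the coherency blob: second/nanosecond stamps plus a change attr. */
struct aux {
	uint64_t mtime_sec, mtime_nsec;
	uint64_t ctime_sec, ctime_nsec;
	uint64_t change_attr;	/* only meaningful for NFSv4 */
};

/* Fill the blob from the current inode state (illustrative values here). */
static void update_aux(struct aux *a, uint64_t mtime_s, uint64_t ctime_s,
		       uint64_t change)
{
	memset(a, 0, sizeof(*a));
	a->mtime_sec = mtime_s;
	a->ctime_sec = ctime_s;
	a->change_attr = change;
}

int main(void)
{
	struct aux stored, current_state;

	update_aux(&stored, 1000, 1000, 42);		/* what the cache remembers */
	update_aux(&current_state, 1000, 1000, 43);	/* server bumped change_attr */

	/* Coherency check is a straight byte comparison of the two blobs. */
	bool obsolete = memcmp(&stored, &current_state, sizeof(stored)) != 0;
	printf("cached data is %s\n", obsolete ? "obsolete" : "still valid");
	return 0;
}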
*/ static inline void nfs_readpage_to_fscache(struct inode *inode, - struct page *page, - int sync) + struct page *page) { - if (PageFsCache(page)) - __nfs_readpage_to_fscache(inode, page, sync); + if (NFS_I(inode)->fscache) + __nfs_readpage_to_fscache(inode, page); } -/* - * Invalidate the contents of fscache for this inode. This will not sleep. - */ -static inline void nfs_fscache_invalidate(struct inode *inode) +static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *auxdata, + struct nfs_inode *nfsi) { - fscache_invalidate(NFS_I(inode)->fscache); + memset(auxdata, 0, sizeof(*auxdata)); + auxdata->mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec; + auxdata->mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec; + auxdata->ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec; + auxdata->ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec; + + if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) + auxdata->change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode); } /* - * Wait for an object to finish being invalidated. + * Invalidate the contents of fscache for this inode. This will not sleep. */ -static inline void nfs_fscache_wait_on_invalidate(struct inode *inode) +static inline void nfs_fscache_invalidate(struct inode *inode, int flags) { - fscache_wait_on_invalidate(NFS_I(inode)->fscache); + struct nfs_fscache_inode_auxdata auxdata; + struct nfs_inode *nfsi = NFS_I(inode); + + if (nfsi->fscache) { + nfs_fscache_update_auxdata(&auxdata, nfsi); + fscache_invalidate(nfsi->fscache, &auxdata, + i_size_read(&nfsi->vfs_inode), flags); + } } /* @@ -190,48 +124,28 @@ static inline const char *nfs_server_fscache_state(struct nfs_server *server) } #else /* CONFIG_NFS_FSCACHE */ -static inline int nfs_fscache_register(void) { return 0; } -static inline void nfs_fscache_unregister(void) {} - -static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {} -static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {} - static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} static inline void nfs_fscache_init_inode(struct inode *inode) {} static inline void nfs_fscache_clear_inode(struct inode *inode) {} static inline void nfs_fscache_open_file(struct inode *inode, struct file *filp) {} +static inline void nfs_fscache_release_file(struct inode *inode, struct file *file) {} static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp) { return 1; /* True: may release page */ } -static inline void nfs_fscache_invalidate_page(struct page *page, - struct inode *inode) {} -static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi, - struct page *page) {} - -static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx, - struct inode *inode, +static inline int nfs_readpage_from_fscache(struct inode *inode, struct page *page) { return -ENOBUFS; } -static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx, - struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) -{ - return -ENOBUFS; -} static inline void nfs_readpage_to_fscache(struct inode *inode, - struct page *page, int sync) {} + struct page *page) {} -static inline void nfs_fscache_invalidate(struct inode *inode) {} -static inline void nfs_fscache_wait_on_invalidate(struct inode *inode) {} +static inline void nfs_fscache_invalidate(struct inode *inode, int flags) {} static inline const char *nfs_server_fscache_state(struct nfs_server *server) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c 
index fda530d5e764..d96baa4450e3 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -209,7 +209,7 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) if (!nfs_has_xattr_cache(nfsi)) flags &= ~NFS_INO_INVALID_XATTR; if (flags & NFS_INO_INVALID_DATA) - nfs_fscache_invalidate(inode); + nfs_fscache_invalidate(inode, 0); flags &= ~(NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED); nfsi->cache_validity |= flags; @@ -853,12 +853,9 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, } /* Flush out writes to the server in order to update c/mtime. */ - if ((request_mask & (STATX_CTIME|STATX_MTIME)) && - S_ISREG(inode->i_mode)) { - err = filemap_write_and_wait(inode->i_mapping); - if (err) - goto out; - } + if ((request_mask & (STATX_CTIME | STATX_MTIME)) && + S_ISREG(inode->i_mode)) + filemap_write_and_wait(inode->i_mapping); /* * We may force a getattr if the user cares about atime. @@ -1289,6 +1286,7 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map { int ret; + nfs_fscache_invalidate(inode, 0); if (mapping->nrpages != 0) { if (S_ISREG(inode->i_mode)) { ret = nfs_sync_mapping(mapping); @@ -1300,7 +1298,6 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map return ret; } nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); - nfs_fscache_wait_on_invalidate(inode); dfprintk(PAGECACHE, "NFS: (%s/%Lu) data cache invalidated\n", inode->i_sb->s_id, @@ -2374,10 +2371,6 @@ static int __init init_nfs_fs(void) if (err < 0) goto out9; - err = nfs_fscache_register(); - if (err < 0) - goto out8; - err = nfsiod_start(); if (err) goto out7; @@ -2429,8 +2422,6 @@ out5: out6: nfsiod_stop(); out7: - nfs_fscache_unregister(); -out8: unregister_pernet_subsys(&nfs_net_ops); out9: nfs_sysfs_exit(); @@ -2445,7 +2436,6 @@ static void __exit exit_nfs_fs(void) nfs_destroy_readpagecache(); nfs_destroy_inodecache(); nfs_destroy_nfspagecache(); - nfs_fscache_unregister(); unregister_pernet_subsys(&nfs_net_ops); rpc_proc_unregister(&init_net, "nfs"); unregister_nfs_fs(); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 12f6acb483bb..2de7c56a1fbe 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -373,6 +373,7 @@ extern unsigned long nfs_access_cache_count(struct shrinker *shrink, extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc); struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); +void nfs_d_prune_case_insensitive_aliases(struct inode *inode); int nfs_create(struct user_namespace *, struct inode *, struct dentry *, umode_t, bool); int nfs_mkdir(struct user_namespace *, struct inode *, struct dentry *, diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 7100514d306b..1597eef40d54 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -220,7 +220,8 @@ static int nfs3_proc_lookupp(struct inode *inode, struct nfs_fh *fhandle, task_flags); } -static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) +static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry, + const struct cred *cred) { struct nfs3_accessargs arg = { .fh = NFS_FH(inode), @@ -231,7 +232,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS], .rpc_argp = &arg, .rpc_resp = &res, - .rpc_cred = entry->cred, + .rpc_cred = cred, }; int status = -ENOMEM; diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 8b21ff1be717..32129446beca 100644 --- 
a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -46,7 +46,7 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, { struct inode *inode = file_inode(filep); struct nfs_server *server = NFS_SERVER(inode); - u32 bitmask[3]; + u32 bitmask[NFS_BITMASK_SZ]; struct nfs42_falloc_args args = { .falloc_fh = NFS_FH(inode), .falloc_offset = offset, @@ -69,9 +69,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, return status; } - memcpy(bitmask, server->cache_consistency_bitmask, sizeof(bitmask)); - if (server->attr_bitmask[1] & FATTR4_WORD1_SPACE_USED) - bitmask[1] |= FATTR4_WORD1_SPACE_USED; + nfs4_bitmask_set(bitmask, server->cache_consistency_bitmask, inode, + NFS_INO_INVALID_BLOCKS); res.falloc_fattr = nfs_alloc_fattr(); if (!res.falloc_fattr) @@ -1044,13 +1043,14 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, struct inode *src_inode = file_inode(src_f); struct inode *dst_inode = file_inode(dst_f); struct nfs_server *server = NFS_SERVER(dst_inode); + __u32 dst_bitmask[NFS_BITMASK_SZ]; struct nfs42_clone_args args = { .src_fh = NFS_FH(src_inode), .dst_fh = NFS_FH(dst_inode), .src_offset = src_offset, .dst_offset = dst_offset, .count = count, - .dst_bitmask = server->cache_consistency_bitmask, + .dst_bitmask = dst_bitmask, }; struct nfs42_clone_res res = { .server = server, @@ -1079,6 +1079,9 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, if (!res.dst_fattr) return -ENOMEM; + nfs4_bitmask_set(dst_bitmask, server->cache_consistency_bitmask, + dst_inode, NFS_INO_INVALID_BLOCKS); + status = nfs4_call_sync(server->client, server, msg, &args.seq_args, &res.seq_res, 0); trace_nfs4_clone(src_inode, dst_inode, &args, status); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index ed5eaca6801e..84f39b6f1b1e 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -260,8 +260,8 @@ struct nfs4_state_maintenance_ops { }; struct nfs4_mig_recovery_ops { - int (*get_locations)(struct inode *, struct nfs4_fs_locations *, - struct page *, const struct cred *); + int (*get_locations)(struct nfs_server *, struct nfs_fh *, + struct nfs4_fs_locations *, struct page *, const struct cred *); int (*fsid_present)(struct inode *, const struct cred *); }; @@ -280,7 +280,8 @@ struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *, int nfs4_submount(struct fs_context *, struct nfs_server *); int nfs4_replace_transport(struct nfs_server *server, const struct nfs4_fs_locations *locations); - +size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa, + size_t salen, struct net *net, int port); /* nfs4proc.c */ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *); extern int nfs4_async_handle_error(struct rpc_task *task, @@ -302,8 +303,9 @@ extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait); extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *, struct nfs4_fs_locations *, struct page *); -extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *, - struct page *page, const struct cred *); +extern int nfs4_proc_get_locations(struct nfs_server *, struct nfs_fh *, + struct nfs4_fs_locations *, + struct page *page, const struct cred *); extern int nfs4_proc_fsid_present(struct inode *, const struct cred *); extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct 
dentry *, @@ -315,6 +317,8 @@ extern int nfs4_set_rw_stateid(nfs4_stateid *stateid, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, fmode_t fmode); +extern void nfs4_bitmask_set(__u32 bitmask[], const __u32 src[], + struct inode *inode, unsigned long cache_validity); extern int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct inode *inode); extern int update_open_stateid(struct nfs4_state *state, diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index d8b5a250ca05..47a6cf892c95 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -1343,8 +1343,11 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, } nfs_put_client(clp); - if (server->nfs_client->cl_hostname == NULL) + if (server->nfs_client->cl_hostname == NULL) { server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL); + if (server->nfs_client->cl_hostname == NULL) + return -ENOMEM; + } nfs_server_insert_lists(server); return nfs_probe_server(server, NFS_FH(d_inode(server->super->s_root))); diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 873342308dc0..3680c8da510c 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -164,16 +164,21 @@ static int nfs4_validate_fspath(struct dentry *dentry, return 0; } -static size_t nfs_parse_server_name(char *string, size_t len, - struct sockaddr *sa, size_t salen, struct net *net) +size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa, + size_t salen, struct net *net, int port) { ssize_t ret; ret = rpc_pton(net, string, len, sa, salen); if (ret == 0) { - ret = nfs_dns_resolve_name(net, string, len, sa, salen); - if (ret < 0) - ret = 0; + ret = rpc_uaddr2sockaddr(net, string, len, sa, salen); + if (ret == 0) { + ret = nfs_dns_resolve_name(net, string, len, sa, salen); + if (ret < 0) + ret = 0; + } + } else if (port) { + rpc_set_port(sa, port); } return ret; } @@ -328,7 +333,7 @@ static int try_location(struct fs_context *fc, nfs_parse_server_name(buf->data, buf->len, &ctx->nfs_server.address, sizeof(ctx->nfs_server._address), - fc->net_ns); + fc->net_ns, 0); if (ctx->nfs_server.addrlen == 0) continue; @@ -496,7 +501,7 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server, continue; salen = nfs_parse_server_name(buf->data, buf->len, - sap, addr_bufsize, net); + sap, addr_bufsize, net, 0); if (salen == 0) continue; rpc_set_port(sap, NFS_PORT); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ee3bc79f6ca3..0e0db6c27619 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -108,10 +108,6 @@ static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *, static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *, const struct cred *, bool); #endif -static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ], - const __u32 *src, struct inode *inode, - struct nfs_server *server, - struct nfs4_label *label); #ifdef CONFIG_NFS_V4_SECURITY_LABEL static inline struct nfs4_label * @@ -1233,8 +1229,7 @@ nfs4_update_changeattr_locked(struct inode *inode, NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_INVALID_SIZE | NFS_INO_INVALID_OTHER | NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK | - NFS_INO_INVALID_MODE | NFS_INO_INVALID_XATTR | - NFS_INO_REVAL_PAGECACHE; + NFS_INO_INVALID_MODE | NFS_INO_INVALID_XATTR; nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); } nfsi->attrtimeo_timestamp = jiffies; @@ -2653,9 +2648,8 @@ static int nfs4_opendata_access(const struct cred *cred, } else if ((fmode & 
FMODE_READ) && !opendata->file_created) mask = NFS4_ACCESS_READ; - cache.cred = cred; nfs_access_set_mask(&cache, opendata->o_res.access_result); - nfs_access_add_cache(state->inode, &cache); + nfs_access_add_cache(state->inode, &cache, cred); flags = NFS4_ACCESS_READ | NFS4_ACCESS_EXECUTE | NFS4_ACCESS_LOOKUP; if ((mask & ~cache.mask & flags) == 0) @@ -3670,7 +3664,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) if (!nfs4_have_delegation(inode, FMODE_READ)) { nfs4_bitmask_set(calldata->arg.bitmask_store, server->cache_consistency_bitmask, - inode, server, NULL); + inode, 0); calldata->arg.bitmask = calldata->arg.bitmask_store; } else calldata->arg.bitmask = NULL; @@ -3841,7 +3835,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f FATTR4_WORD0_FH_EXPIRE_TYPE | FATTR4_WORD0_LINK_SUPPORT | FATTR4_WORD0_SYMLINK_SUPPORT | - FATTR4_WORD0_ACLSUPPORT; + FATTR4_WORD0_ACLSUPPORT | + FATTR4_WORD0_CASE_INSENSITIVE | + FATTR4_WORD0_CASE_PRESERVING; if (minorversion) bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT; @@ -3870,10 +3866,16 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->caps |= NFS_CAP_HARDLINKS; if (res.has_symlinks != 0) server->caps |= NFS_CAP_SYMLINKS; + if (res.case_insensitive) + server->caps |= NFS_CAP_CASE_INSENSITIVE; + if (res.case_preserving) + server->caps |= NFS_CAP_CASE_PRESERVING; #ifdef CONFIG_NFS_V4_SECURITY_LABEL if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL) server->caps |= NFS_CAP_SECURITY_LABEL; #endif + if (res.attr_bitmask[0] & FATTR4_WORD0_FS_LOCATIONS) + server->caps |= NFS_CAP_FS_LOCATIONS; if (!(res.attr_bitmask[0] & FATTR4_WORD0_FILEID)) server->fattr_valid &= ~NFS_ATTR_FATTR_FILEID; if (!(res.attr_bitmask[1] & FATTR4_WORD1_MODE)) @@ -3932,6 +3934,114 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) return err; } +static void test_fs_location_for_trunking(struct nfs4_fs_location *location, + struct nfs_client *clp, + struct nfs_server *server) +{ + int i; + + for (i = 0; i < location->nservers; i++) { + struct nfs4_string *srv_loc = &location->servers[i]; + struct sockaddr addr; + size_t addrlen; + struct xprt_create xprt_args = { + .ident = 0, + .net = clp->cl_net, + }; + struct nfs4_add_xprt_data xprtdata = { + .clp = clp, + }; + struct rpc_add_xprt_test rpcdata = { + .add_xprt_test = clp->cl_mvops->session_trunk, + .data = &xprtdata, + }; + char *servername = NULL; + + if (!srv_loc->len) + continue; + + addrlen = nfs_parse_server_name(srv_loc->data, srv_loc->len, + &addr, sizeof(addr), + clp->cl_net, server->port); + if (!addrlen) + return; + xprt_args.dstaddr = &addr; + xprt_args.addrlen = addrlen; + servername = kmalloc(srv_loc->len + 1, GFP_KERNEL); + if (!servername) + return; + memcpy(servername, srv_loc->data, srv_loc->len); + servername[srv_loc->len] = '\0'; + xprt_args.servername = servername; + + xprtdata.cred = nfs4_get_clid_cred(clp); + rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, + rpc_clnt_setup_test_and_add_xprt, + &rpcdata); + if (xprtdata.cred) + put_cred(xprtdata.cred); + kfree(servername); + } +} + +static int _nfs4_discover_trunking(struct nfs_server *server, + struct nfs_fh *fhandle) +{ + struct nfs4_fs_locations *locations = NULL; + struct page *page; + const struct cred *cred; + struct nfs_client *clp = server->nfs_client; + const struct nfs4_state_maintenance_ops *ops = + clp->cl_mvops->state_renewal_ops; + int status = -ENOMEM, i; + + cred = ops->get_state_renewal_cred(clp); + if (cred == 
NULL) { + cred = nfs4_get_clid_cred(clp); + if (cred == NULL) + return -ENOKEY; + } + + page = alloc_page(GFP_KERNEL); + locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); + if (page == NULL || locations == NULL) + goto out; + + status = nfs4_proc_get_locations(server, fhandle, locations, page, + cred); + if (status) + goto out; + + for (i = 0; i < locations->nlocations; i++) + test_fs_location_for_trunking(&locations->locations[i], clp, + server); +out: + if (page) + __free_page(page); + kfree(locations); + return status; +} + +static int nfs4_discover_trunking(struct nfs_server *server, + struct nfs_fh *fhandle) +{ + struct nfs4_exception exception = { + .interruptible = true, + }; + struct nfs_client *clp = server->nfs_client; + int err = 0; + + if (!nfs4_has_session(clp)) + goto out; + do { + err = nfs4_handle_exception(server, + _nfs4_discover_trunking(server, fhandle), + &exception); + } while (exception.retry); +out: + return err; +} + static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { @@ -4441,7 +4551,8 @@ static int nfs4_proc_lookupp(struct inode *inode, struct nfs_fh *fhandle, return err; } -static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) +static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry, + const struct cred *cred) { struct nfs_server *server = NFS_SERVER(inode); struct nfs4_accessargs args = { @@ -4455,7 +4566,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS], .rpc_argp = &args, .rpc_resp = &res, - .rpc_cred = entry->cred, + .rpc_cred = cred, }; int status = 0; @@ -4475,14 +4586,15 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry return status; } -static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) +static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry, + const struct cred *cred) { struct nfs4_exception exception = { .interruptible = true, }; int err; do { - err = _nfs4_proc_access(inode, entry); + err = _nfs4_proc_access(inode, entry, cred); trace_nfs4_access(inode, err); err = nfs4_handle_exception(NFS_SERVER(inode), err, &exception); @@ -4663,8 +4775,10 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, nfs_fattr_init(res->dir_attr); - if (inode) + if (inode) { nfs4_inode_return_delegation(inode); + nfs_d_prune_case_insensitive_aliases(inode); + } } static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) @@ -4730,6 +4844,7 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, return 0; if (task->tk_status == 0) { + nfs_d_prune_case_insensitive_aliases(d_inode(data->old_dentry)); if (new_dir != old_dir) { /* Note: If we moved a directory, nlink will change */ nfs4_update_changeattr(old_dir, &res->old_cinfo, @@ -5422,14 +5537,14 @@ bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr) return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0; } -static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ], const __u32 *src, - struct inode *inode, struct nfs_server *server, - struct nfs4_label *label) +void nfs4_bitmask_set(__u32 bitmask[], const __u32 src[], + struct inode *inode, unsigned long cache_validity) { - unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); + struct nfs_server *server = NFS_SERVER(inode); unsigned int i; memcpy(bitmask, src, 
sizeof(*bitmask) * NFS4_BITMASK_SZ); + cache_validity |= READ_ONCE(NFS_I(inode)->cache_validity); if (cache_validity & NFS_INO_INVALID_CHANGE) bitmask[0] |= FATTR4_WORD0_CHANGE; @@ -5441,8 +5556,6 @@ static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ], const __u32 *src, bitmask[1] |= FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP; if (cache_validity & NFS_INO_INVALID_NLINK) bitmask[1] |= FATTR4_WORD1_NUMLINKS; - if (label && label->len && cache_validity & NFS_INO_INVALID_LABEL) - bitmask[2] |= FATTR4_WORD2_SECURITY_LABEL; if (cache_validity & NFS_INO_INVALID_CTIME) bitmask[1] |= FATTR4_WORD1_TIME_METADATA; if (cache_validity & NFS_INO_INVALID_MTIME) @@ -5469,7 +5582,7 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr, } else { nfs4_bitmask_set(hdr->args.bitmask_store, server->cache_consistency_bitmask, - hdr->inode, server, NULL); + hdr->inode, NFS_INO_INVALID_BLOCKS); hdr->args.bitmask = hdr->args.bitmask_store; } @@ -6507,8 +6620,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, data->args.fhandle = &data->fh; data->args.stateid = &data->stateid; nfs4_bitmask_set(data->args.bitmask_store, - server->cache_consistency_bitmask, inode, server, - NULL); + server->cache_consistency_bitmask, inode, 0); data->args.bitmask = data->args.bitmask_store; nfs_copy_fh(&data->fh, NFS_FH(inode)); nfs4_stateid_copy(&data->stateid, stateid); @@ -7611,7 +7723,7 @@ static int nfs4_xattr_set_nfs4_user(const struct xattr_handler *handler, const char *key, const void *buf, size_t buflen, int flags) { - struct nfs_access_entry cache; + u32 mask; int ret; if (!nfs_server_capable(inode, NFS_CAP_XATTR)) @@ -7626,8 +7738,8 @@ static int nfs4_xattr_set_nfs4_user(const struct xattr_handler *handler, * do a cached access check for the XA* flags to possibly avoid * doing an RPC and getting EACCES back. */ - if (!nfs_access_get_cached(inode, current_cred(), &cache, true)) { - if (!(cache.mask & NFS_ACCESS_XAWRITE)) + if (!nfs_access_get_cached(inode, current_cred(), &mask, true)) { + if (!(mask & NFS_ACCESS_XAWRITE)) return -EACCES; } @@ -7648,14 +7760,14 @@ static int nfs4_xattr_get_nfs4_user(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *key, void *buf, size_t buflen) { - struct nfs_access_entry cache; + u32 mask; ssize_t ret; if (!nfs_server_capable(inode, NFS_CAP_XATTR)) return -EOPNOTSUPP; - if (!nfs_access_get_cached(inode, current_cred(), &cache, true)) { - if (!(cache.mask & NFS_ACCESS_XAREAD)) + if (!nfs_access_get_cached(inode, current_cred(), &mask, true)) { + if (!(mask & NFS_ACCESS_XAREAD)) return -EACCES; } @@ -7680,13 +7792,13 @@ nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len) ssize_t ret, size; char *buf; size_t buflen; - struct nfs_access_entry cache; + u32 mask; if (!nfs_server_capable(inode, NFS_CAP_XATTR)) return 0; - if (!nfs_access_get_cached(inode, current_cred(), &cache, true)) { - if (!(cache.mask & NFS_ACCESS_XALIST)) + if (!nfs_access_get_cached(inode, current_cred(), &mask, true)) { + if (!(mask & NFS_ACCESS_XALIST)) return 0; } @@ -7818,18 +7930,18 @@ int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, * appended to this compound to identify the client ID which is * performing recovery. 
*/ -static int _nfs40_proc_get_locations(struct inode *inode, +static int _nfs40_proc_get_locations(struct nfs_server *server, + struct nfs_fh *fhandle, struct nfs4_fs_locations *locations, struct page *page, const struct cred *cred) { - struct nfs_server *server = NFS_SERVER(inode); struct rpc_clnt *clnt = server->client; u32 bitmask[2] = { [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, }; struct nfs4_fs_locations_arg args = { .clientid = server->nfs_client->cl_clientid, - .fh = NFS_FH(inode), + .fh = fhandle, .page = page, .bitmask = bitmask, .migration = 1, /* skip LOOKUP */ @@ -7875,17 +7987,17 @@ static int _nfs40_proc_get_locations(struct inode *inode, * When the client supports GETATTR(fs_locations_info), it can * be plumbed in here. */ -static int _nfs41_proc_get_locations(struct inode *inode, +static int _nfs41_proc_get_locations(struct nfs_server *server, + struct nfs_fh *fhandle, struct nfs4_fs_locations *locations, struct page *page, const struct cred *cred) { - struct nfs_server *server = NFS_SERVER(inode); struct rpc_clnt *clnt = server->client; u32 bitmask[2] = { [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, }; struct nfs4_fs_locations_arg args = { - .fh = NFS_FH(inode), + .fh = fhandle, .page = page, .bitmask = bitmask, .migration = 1, /* skip LOOKUP */ @@ -7919,7 +8031,8 @@ static int _nfs41_proc_get_locations(struct inode *inode, /** * nfs4_proc_get_locations - discover locations for a migrated FSID - * @inode: inode on FSID that is migrating + * @server: pointer to nfs_server to process + * @fhandle: pointer to the kernel NFS client file handle * @locations: result of query * @page: buffer * @cred: credential to use for this operation @@ -7934,11 +8047,11 @@ static int _nfs41_proc_get_locations(struct inode *inode, * -NFS4ERR_LEASE_MOVED is returned if the server still has leases * from this client that require migration recovery. 
*/ -int nfs4_proc_get_locations(struct inode *inode, +int nfs4_proc_get_locations(struct nfs_server *server, + struct nfs_fh *fhandle, struct nfs4_fs_locations *locations, struct page *page, const struct cred *cred) { - struct nfs_server *server = NFS_SERVER(inode); struct nfs_client *clp = server->nfs_client; const struct nfs4_mig_recovery_ops *ops = clp->cl_mvops->mig_recovery_ops; @@ -7951,10 +8064,11 @@ int nfs4_proc_get_locations(struct inode *inode, (unsigned long long)server->fsid.major, (unsigned long long)server->fsid.minor, clp->cl_hostname); - nfs_display_fhandle(NFS_FH(inode), __func__); + nfs_display_fhandle(fhandle, __func__); do { - status = ops->get_locations(inode, locations, page, cred); + status = ops->get_locations(server, fhandle, locations, page, + cred); if (status != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, status, &exception); @@ -10423,6 +10537,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .free_client = nfs4_free_client, .create_server = nfs4_create_server, .clone_server = nfs_clone_server, + .discover_trunking = nfs4_discover_trunking, }; static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index f63dfa01001c..f5a62c0d999b 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2098,7 +2098,8 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred } inode = d_inode(server->super->s_root); - result = nfs4_proc_get_locations(inode, locations, page, cred); + result = nfs4_proc_get_locations(server, NFS_FH(inode), locations, + page, cred); if (result) { dprintk("<-- %s: failed to retrieve fs_locations: %d\n", __func__, result); @@ -2106,6 +2107,9 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred } result = -NFS4ERR_NXIO; + if (!locations->nlocations) + goto out; + if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) { dprintk("<-- %s: No fs_locations data, migration skipped\n", __func__); @@ -2693,6 +2697,6 @@ static int nfs4_run_state_manager(void *ptr) allow_signal(SIGKILL); nfs4_state_manager(clp); nfs_put_client(clp); - module_put_and_exit(0); + module_put_and_kthread_exit(0); return 0; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 69862bf6db00..8e70b92df4cc 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3533,6 +3533,42 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint return 0; } +static int decode_attr_case_insensitive(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_CASE_INSENSITIVE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_CASE_INSENSITIVE)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + *res = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_CASE_INSENSITIVE; + } + dprintk("%s: case_insensitive=%s\n", __func__, *res == 0 ? "false" : "true"); + return 0; +} + +static int decode_attr_case_preserving(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_CASE_PRESERVING - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_CASE_PRESERVING)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + *res = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_CASE_PRESERVING; + } + dprintk("%s: case_preserving=%s\n", __func__, *res == 0 ? 
"false" : "true"); + return 0; +} + static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) { __be32 *p; @@ -3696,8 +3732,6 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st if (unlikely(!p)) goto out_eio; n = be32_to_cpup(p); - if (n <= 0) - goto out_eio; for (res->nlocations = 0; res->nlocations < n; res->nlocations++) { u32 m; struct nfs4_fs_location *loc; @@ -4200,10 +4234,11 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, } else printk(KERN_WARNING "%s: label too long (%u)!\n", __func__, len); + if (label && label->label) + dprintk("%s: label=%.*s, len=%d, PI=%d, LFS=%d\n", + __func__, label->len, (char *)label->label, + label->len, label->pi, label->lfs); } - if (label && label->label) - dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__, - (char *)label->label, label->len, label->pi, label->lfs); return status; } @@ -4412,6 +4447,10 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re goto xdr_error; if ((status = decode_attr_aclsupport(xdr, bitmap, &res->acl_bitmask)) != 0) goto xdr_error; + if ((status = decode_attr_case_insensitive(xdr, bitmap, &res->case_insensitive)) != 0) + goto xdr_error; + if ((status = decode_attr_case_preserving(xdr, bitmap, &res->case_preserving)) != 0) + goto xdr_error; if ((status = decode_attr_exclcreat_supported(xdr, bitmap, res->exclcreat_bitmask)) != 0) goto xdr_error; diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index b3aee261801e..317ce27bdc4b 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -42,7 +42,6 @@ { BIT(NFS_INO_ACL_LRU_SET), "ACL_LRU_SET" }, \ { BIT(NFS_INO_INVALIDATING), "INVALIDATING" }, \ { BIT(NFS_INO_FSCACHE), "FSCACHE" }, \ - { BIT(NFS_INO_FSCACHE_LOCK), "FSCACHE_LOCK" }, \ { BIT(NFS_INO_LAYOUTCOMMIT), "NEED_LAYOUTCOMMIT" }, \ { BIT(NFS_INO_LAYOUTCOMMITTING), "LAYOUTCOMMIT" }, \ { BIT(NFS_INO_LAYOUTSTATS), "LAYOUTSTATS" }, \ diff --git a/fs/nfs/read.c b/fs/nfs/read.c index d11af2a9299c..eb00229c1a50 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -123,7 +123,7 @@ static void nfs_readpage_release(struct nfs_page *req, int error) struct address_space *mapping = page_file_mapping(page); if (PageUptodate(page)) - nfs_readpage_to_fscache(inode, page, 0); + nfs_readpage_to_fscache(inode, page); else if (!PageError(page) && !PagePrivate(page)) generic_error_remove_page(mapping, page); unlock_page(page); @@ -305,6 +305,12 @@ readpage_async_filler(void *data, struct page *page) aligned_len = min_t(unsigned int, ALIGN(len, rsize), PAGE_SIZE); + if (!IS_SYNC(page->mapping->host)) { + error = nfs_readpage_from_fscache(page->mapping->host, page); + if (error == 0) + goto out_unlock; + } + new = nfs_create_request(desc->ctx, page, 0, aligned_len); if (IS_ERR(new)) goto out_error; @@ -320,6 +326,7 @@ readpage_async_filler(void *data, struct page *page) return 0; out_error: error = PTR_ERR(new); +out_unlock: unlock_page(page); out: return error; @@ -366,12 +373,6 @@ int nfs_readpage(struct file *file, struct page *page) desc.ctx = get_nfs_open_context(nfs_file_open_context(file)); xchg(&desc.ctx->error, 0); - if (!IS_SYNC(inode)) { - ret = nfs_readpage_from_fscache(desc.ctx, inode, page); - if (ret == 0) - goto out_wait; - } - nfs_pageio_init_read(&desc.pgio, inode, false, &nfs_async_read_completion_ops); @@ -381,7 +382,6 @@ int nfs_readpage(struct file *file, struct page *page) nfs_pageio_complete_read(&desc.pgio); ret = desc.pgio.pg_error < 0 ? 
desc.pgio.pg_error : 0; -out_wait: if (!ret) { ret = wait_on_page_locked_killable(page); if (!PageUptodate(page) && !ret) @@ -419,14 +419,6 @@ int nfs_readpages(struct file *file, struct address_space *mapping, } else desc.ctx = get_nfs_open_context(nfs_file_open_context(file)); - /* attempt to read as many of the pages as possible from the cache - * - this returns -ENOBUFS immediately if the cookie is negative - */ - ret = nfs_readpages_from_fscache(desc.ctx, inode, mapping, - pages, &nr_pages); - if (ret == 0) - goto read_complete; /* all pages were read */ - nfs_pageio_init_read(&desc.pgio, inode, false, &nfs_async_read_completion_ops); @@ -434,7 +426,6 @@ int nfs_readpages(struct file *file, struct address_space *mapping, nfs_pageio_complete_read(&desc.pgio); -read_complete: put_nfs_open_context(desc.ctx); out: trace_nfs_aop_readahead_done(inode, nr_pages, ret); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 3aced401735c..6ab5eeb000dc 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1204,42 +1204,42 @@ static int nfs_compare_super(struct super_block *sb, struct fs_context *fc) } #ifdef CONFIG_NFS_FSCACHE -static void nfs_get_cache_cookie(struct super_block *sb, - struct nfs_fs_context *ctx) +static int nfs_get_cache_cookie(struct super_block *sb, + struct nfs_fs_context *ctx) { struct nfs_server *nfss = NFS_SB(sb); char *uniq = NULL; int ulen = 0; - nfss->fscache_key = NULL; nfss->fscache = NULL; if (!ctx) - return; + return 0; if (ctx->clone_data.sb) { struct nfs_server *mnt_s = NFS_SB(ctx->clone_data.sb); if (!(mnt_s->options & NFS_OPTION_FSCACHE)) - return; - if (mnt_s->fscache_key) { - uniq = mnt_s->fscache_key->key.uniquifier; - ulen = mnt_s->fscache_key->key.uniq_len; + return 0; + if (mnt_s->fscache_uniq) { + uniq = mnt_s->fscache_uniq; + ulen = strlen(uniq); } } else { if (!(ctx->options & NFS_OPTION_FSCACHE)) - return; + return 0; if (ctx->fscache_uniq) { uniq = ctx->fscache_uniq; ulen = strlen(ctx->fscache_uniq); } } - nfs_fscache_get_super_cookie(sb, uniq, ulen); + return nfs_fscache_get_super_cookie(sb, uniq, ulen); } #else -static void nfs_get_cache_cookie(struct super_block *sb, - struct nfs_fs_context *ctx) +static int nfs_get_cache_cookie(struct super_block *sb, + struct nfs_fs_context *ctx) { + return 0; } #endif @@ -1299,7 +1299,9 @@ int nfs_get_tree_common(struct fs_context *fc) s->s_blocksize_bits = bsize; s->s_blocksize = 1U << bsize; } - nfs_get_cache_cookie(s, ctx); + error = nfs_get_cache_cookie(s, ctx); + if (error < 0) + goto error_splat_super; } error = nfs_get_root(s, fc); diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index 8cb70755e3c9..a6f740366963 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -142,10 +142,11 @@ static struct attribute *nfs_netns_client_attrs[] = { &nfs_netns_client_id.attr, NULL, }; +ATTRIBUTE_GROUPS(nfs_netns_client); static struct kobj_type nfs_netns_client_type = { .release = nfs_netns_client_release, - .default_attrs = nfs_netns_client_attrs, + .default_groups = nfs_netns_client_groups, .sysfs_ops = &kobj_sysfs_ops, .namespace = nfs_netns_client_namespace, }; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 9b7619ce17a7..987a187bd39a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -294,6 +294,7 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); out: spin_unlock(&inode->i_lock); + nfs_fscache_invalidate(inode, 0); } /* A writeback failed: mark the page as bad, and invalidate the page cache */ @@ -2125,8 +2126,11 @@ int nfs_migrate_page(struct 
address_space *mapping, struct page *newpage, if (PagePrivate(page)) return -EBUSY; - if (!nfs_fscache_release_page(page, GFP_KERNEL)) - return -EBUSY; + if (PageFsCache(page)) { + if (mode == MIGRATE_ASYNC) + return -EBUSY; + wait_on_page_fscache(page); + } return migrate_page(mapping, newpage, page, mode); } diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 9421dae22737..668c7527b17e 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -427,7 +427,7 @@ static int check_export(struct path *path, int *flags, unsigned char *uuid) return -EINVAL; } - if (mnt_user_ns(path->mnt) != &init_user_ns) { + if (is_idmapped_mnt(path->mnt)) { dprintk("exp_export: export of idmapped mounts not yet supported.\n"); return -EINVAL; } diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index fdf89fcf1a0c..8bc807c5fea4 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -44,12 +44,9 @@ struct nfsd_fcache_bucket { static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); struct nfsd_fcache_disposal { - struct list_head list; struct work_struct work; - struct net *net; spinlock_t lock; struct list_head freeme; - struct rcu_head rcu; }; static struct workqueue_struct *nfsd_filecache_wq __read_mostly; @@ -62,8 +59,6 @@ static long nfsd_file_lru_flags; static struct fsnotify_group *nfsd_file_fsnotify_group; static atomic_long_t nfsd_filecache_count; static struct delayed_work nfsd_filecache_laundrette; -static DEFINE_SPINLOCK(laundrette_lock); -static LIST_HEAD(laundrettes); static void nfsd_file_gc(void); @@ -194,7 +189,6 @@ nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval, __set_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags); } nf->nf_mark = NULL; - init_rwsem(&nf->nf_rwsem); trace_nfsd_file_alloc(nf); } return nf; @@ -249,7 +243,7 @@ nfsd_file_do_unhash(struct nfsd_file *nf) trace_nfsd_file_unhash(nf); if (nfsd_file_check_write_error(nf)) - nfsd_reset_boot_verifier(net_generic(nf->nf_net, nfsd_net_id)); + nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); --nfsd_file_hashtbl[nf->nf_hashval].nfb_count; hlist_del_rcu(&nf->nf_node); atomic_long_dec(&nfsd_filecache_count); @@ -367,19 +361,13 @@ nfsd_file_list_remove_disposal(struct list_head *dst, static void nfsd_file_list_add_disposal(struct list_head *files, struct net *net) { - struct nfsd_fcache_disposal *l; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd_fcache_disposal *l = nn->fcache_disposal; - rcu_read_lock(); - list_for_each_entry_rcu(l, &laundrettes, list) { - if (l->net == net) { - spin_lock(&l->lock); - list_splice_tail_init(files, &l->freeme); - spin_unlock(&l->lock); - queue_work(nfsd_filecache_wq, &l->work); - break; - } - } - rcu_read_unlock(); + spin_lock(&l->lock); + list_splice_tail_init(files, &l->freeme); + spin_unlock(&l->lock); + queue_work(nfsd_filecache_wq, &l->work); } static void @@ -755,7 +743,7 @@ nfsd_file_cache_purge(struct net *net) } static struct nfsd_fcache_disposal * -nfsd_alloc_fcache_disposal(struct net *net) +nfsd_alloc_fcache_disposal(void) { struct nfsd_fcache_disposal *l; @@ -763,7 +751,6 @@ nfsd_alloc_fcache_disposal(struct net *net) if (!l) return NULL; INIT_WORK(&l->work, nfsd_file_delayed_close); - l->net = net; spin_lock_init(&l->lock); INIT_LIST_HEAD(&l->freeme); return l; @@ -772,61 +759,27 @@ nfsd_alloc_fcache_disposal(struct net *net) static void nfsd_free_fcache_disposal(struct nfsd_fcache_disposal *l) { - rcu_assign_pointer(l->net, NULL); cancel_work_sync(&l->work); nfsd_file_dispose_list(&l->freeme); - kfree_rcu(l, rcu); -} - 
-static void -nfsd_add_fcache_disposal(struct nfsd_fcache_disposal *l) -{ - spin_lock(&laundrette_lock); - list_add_tail_rcu(&l->list, &laundrettes); - spin_unlock(&laundrette_lock); -} - -static void -nfsd_del_fcache_disposal(struct nfsd_fcache_disposal *l) -{ - spin_lock(&laundrette_lock); - list_del_rcu(&l->list); - spin_unlock(&laundrette_lock); -} - -static int -nfsd_alloc_fcache_disposal_net(struct net *net) -{ - struct nfsd_fcache_disposal *l; - - l = nfsd_alloc_fcache_disposal(net); - if (!l) - return -ENOMEM; - nfsd_add_fcache_disposal(l); - return 0; + kfree(l); } static void nfsd_free_fcache_disposal_net(struct net *net) { - struct nfsd_fcache_disposal *l; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd_fcache_disposal *l = nn->fcache_disposal; - rcu_read_lock(); - list_for_each_entry_rcu(l, &laundrettes, list) { - if (l->net != net) - continue; - nfsd_del_fcache_disposal(l); - rcu_read_unlock(); - nfsd_free_fcache_disposal(l); - return; - } - rcu_read_unlock(); + nfsd_free_fcache_disposal(l); } int nfsd_file_cache_start_net(struct net *net) { - return nfsd_alloc_fcache_disposal_net(net); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + nn->fcache_disposal = nfsd_alloc_fcache_disposal(); + return nn->fcache_disposal ? 0 : -ENOMEM; } void diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 7872df5a0fe3..435ceab27897 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -46,7 +46,6 @@ struct nfsd_file { refcount_t nf_ref; unsigned char nf_may; struct nfsd_file_mark *nf_mark; - struct rw_semaphore nf_rwsem; }; int nfsd_file_cache_init(void); diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 935c1028c217..1b1a962a1804 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -11,6 +11,7 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/percpu_counter.h> +#include <linux/siphash.h> /* Hash tables for nfs4_clientid state */ #define CLIENT_HASH_BITS 4 @@ -108,9 +109,8 @@ struct nfsd_net { bool nfsd_net_up; bool lockd_up; - /* Time of server startup */ - struct timespec64 nfssvc_boot; - seqlock_t boot_lock; + seqlock_t writeverf_lock; + unsigned char writeverf[8]; /* * Max number of connections this nfsd container will allow. Defaults @@ -123,12 +123,13 @@ struct nfsd_net { u32 clverifier_counter; struct svc_serv *nfsd_serv; - - wait_queue_head_t ntf_wq; - atomic_t ntf_refcnt; - - /* Allow umount to wait for nfsd state cleanup */ - struct completion nfsd_shutdown_complete; + /* When a listening socket is added to nfsd, keep_active is set + * and this justifies a reference on nfsd_serv. This stops + * nfsd_serv from being freed. When the number of threads is + * set, keep_active is cleared and the reference is dropped. So + * when the last thread exits, the service will be destroyed. 
+ */ + int keep_active; /* * clientid and stateid data for construction of net unique COPY @@ -184,6 +185,10 @@ struct nfsd_net { /* utsname taken from the process that starts the server */ char nfsd_name[UNX_MAXNODENAME+1]; + + struct nfsd_fcache_disposal *fcache_disposal; + + siphash_key_t siphash_key; }; /* Simple check to find out if a given net was properly initialized */ @@ -193,6 +198,6 @@ extern void nfsd_netns_free_versions(struct nfsd_net *nn); extern unsigned int nfsd_net_id; -void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn); -void nfsd_reset_boot_verifier(struct nfsd_net *nn); +void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn); +void nfsd_reset_write_verifier(struct nfsd_net *nn); #endif /* __NFSD_NETNS_H__ */ diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 15dac36ca852..936eebd4c56d 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -150,13 +150,17 @@ nfsd3_proc_read(struct svc_rqst *rqstp) unsigned int len; int v; - argp->count = min_t(u32, argp->count, max_blocksize); - dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n", SVCFH_fmt(&argp->fh), (unsigned long) argp->count, (unsigned long long) argp->offset); + argp->count = min_t(u32, argp->count, max_blocksize); + if (argp->offset > (u64)OFFSET_MAX) + argp->offset = (u64)OFFSET_MAX; + if (argp->offset + argp->count > (u64)OFFSET_MAX) + argp->count = (u64)OFFSET_MAX - argp->offset; + v = 0; len = argp->count; resp->pages = rqstp->rq_next_page; @@ -199,18 +203,19 @@ nfsd3_proc_write(struct svc_rqst *rqstp) (unsigned long long) argp->offset, argp->stable? " stable" : ""); + resp->status = nfserr_fbig; + if (argp->offset > (u64)OFFSET_MAX || + argp->offset + argp->len > (u64)OFFSET_MAX) + return rpc_success; + fh_copy(&resp->fh, &argp->fh); resp->committed = argp->stable; nvecs = svc_fill_write_vector(rqstp, &argp->payload); - if (!nvecs) { - resp->status = nfserr_io; - goto out; - } + resp->status = nfsd_write(rqstp, &resp->fh, argp->offset, rqstp->rq_vec, nvecs, &cnt, resp->committed, resp->verf); resp->count = cnt; -out: return rpc_success; } @@ -655,15 +660,9 @@ nfsd3_proc_commit(struct svc_rqst *rqstp) argp->count, (unsigned long long) argp->offset); - if (argp->offset > NFS_OFFSET_MAX) { - resp->status = nfserr_inval; - goto out; - } - fh_copy(&resp->fh, &argp->fh); resp->status = nfsd_commit(rqstp, &resp->fh, argp->offset, argp->count, resp->verf); -out: return rpc_success; } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index c3ac1b6aa3aa..0293b8d65f10 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -254,7 +254,7 @@ svcxdr_decode_sattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr, if (xdr_stream_decode_u64(xdr, &newsize) < 0) return false; iap->ia_valid |= ATTR_SIZE; - iap->ia_size = min_t(u64, newsize, NFS_OFFSET_MAX); + iap->ia_size = newsize; } if (xdr_stream_decode_u32(xdr, &set_it) < 0) return false; @@ -487,71 +487,6 @@ neither: return true; } -static bool fs_supports_change_attribute(struct super_block *sb) -{ - return sb->s_flags & SB_I_VERSION || sb->s_export_op->fetch_iversion; -} - -/* - * Fill in the pre_op attr for the wcc data - */ -void fill_pre_wcc(struct svc_fh *fhp) -{ - struct inode *inode; - struct kstat stat; - bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); - - if (fhp->fh_no_wcc || fhp->fh_pre_saved) - return; - inode = d_inode(fhp->fh_dentry); - if (fs_supports_change_attribute(inode->i_sb) || !v4) { - __be32 err = fh_getattr(fhp, &stat); - - if (err) { - /* Grab the times from inode anyway */ - stat.mtime = inode->i_mtime; - stat.ctime 
= inode->i_ctime; - stat.size = inode->i_size; - } - fhp->fh_pre_mtime = stat.mtime; - fhp->fh_pre_ctime = stat.ctime; - fhp->fh_pre_size = stat.size; - } - if (v4) - fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); - - fhp->fh_pre_saved = true; -} - -/* - * Fill in the post_op attr for the wcc data - */ -void fill_post_wcc(struct svc_fh *fhp) -{ - bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); - struct inode *inode = d_inode(fhp->fh_dentry); - - if (fhp->fh_no_wcc) - return; - - if (fhp->fh_post_saved) - printk("nfsd: inode locked twice during operation.\n"); - - fhp->fh_post_saved = true; - - if (fs_supports_change_attribute(inode->i_sb) || !v4) { - __be32 err = fh_getattr(fhp, &fhp->fh_post_attr); - - if (err) { - fhp->fh_post_saved = false; - fhp->fh_post_attr.ctime = inode->i_ctime; - } - } - if (v4) - fhp->fh_post_change = - nfsd4_change_attribute(&fhp->fh_post_attr, inode); -} - /* * XDR decode functions */ @@ -1125,7 +1060,7 @@ svcxdr_encode_entry3_common(struct nfsd3_readdirres *resp, const char *name, return false; /* cookie */ resp->cookie_offset = dirlist->len; - if (xdr_stream_encode_u64(xdr, NFS_OFFSET_MAX) < 0) + if (xdr_stream_encode_u64(xdr, OFFSET_MAX) < 0) return false; return true; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index a36261f89bdf..b207c76a873f 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -598,7 +598,7 @@ static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net) BUILD_BUG_ON(2*sizeof(*verf) != sizeof(verifier->data)); - nfsd_copy_boot_verifier(verf, net_generic(net, nfsd_net_id)); + nfsd_copy_write_verifier(verf, net_generic(net, nfsd_net_id)); } static __be32 @@ -782,12 +782,16 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status; read->rd_nf = NULL; - if (read->rd_offset >= OFFSET_MAX) - return nfserr_inval; trace_nfsd_read_start(rqstp, &cstate->current_fh, read->rd_offset, read->rd_length); + read->rd_length = min_t(u32, read->rd_length, svc_max_payload(rqstp)); + if (read->rd_offset > (u64)OFFSET_MAX) + read->rd_offset = (u64)OFFSET_MAX; + if (read->rd_offset + read->rd_length > (u64)OFFSET_MAX) + read->rd_length = (u64)OFFSET_MAX - read->rd_offset; + /* * If we do a zero copy read, then a client will see read data * that reflects the state of the file *after* performing the @@ -1018,8 +1022,9 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, unsigned long cnt; int nvecs; - if (write->wr_offset >= OFFSET_MAX) - return nfserr_inval; + if (write->wr_offset > (u64)OFFSET_MAX || + write->wr_offset + write->wr_buflen > (u64)OFFSET_MAX) + return nfserr_fbig; cnt = write->wr_buflen; trace_nfsd_write_start(rqstp, &cstate->current_fh, @@ -1101,7 +1106,7 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; - status = nfsd4_clone_file_range(src, clone->cl_src_pos, + status = nfsd4_clone_file_range(rqstp, src, clone->cl_src_pos, dst, clone->cl_dst_pos, clone->cl_count, EX_ISSYNC(cstate->current_fh.fh_export)); @@ -1510,11 +1515,14 @@ static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync) static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy) { + struct file *dst = copy->nf_dst->nf_file; + struct file *src = copy->nf_src->nf_file; + errseq_t since; ssize_t bytes_copied = 0; u64 bytes_total = copy->cp_count; u64 src_pos = copy->cp_src_pos; u64 dst_pos = copy->cp_dst_pos; - __be32 status; + int status; /* See RFC 7862 p.67: */ if (bytes_total == 0) @@ -1522,9 +1530,8 @@ static ssize_t 
_nfsd_copy_file_range(struct nfsd4_copy *copy) do { if (kthread_should_stop()) break; - bytes_copied = nfsd_copy_file_range(copy->nf_src->nf_file, - src_pos, copy->nf_dst->nf_file, dst_pos, - bytes_total); + bytes_copied = nfsd_copy_file_range(src, src_pos, dst, dst_pos, + bytes_total); if (bytes_copied <= 0) break; bytes_total -= bytes_copied; @@ -1534,11 +1541,11 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy) } while (bytes_total > 0 && !copy->cp_synchronous); /* for a non-zero asynchronous copy do a commit of data */ if (!copy->cp_synchronous && copy->cp_res.wr_bytes_written > 0) { - down_write(&copy->nf_dst->nf_rwsem); - status = vfs_fsync_range(copy->nf_dst->nf_file, - copy->cp_dst_pos, + since = READ_ONCE(dst->f_wb_err); + status = vfs_fsync_range(dst, copy->cp_dst_pos, copy->cp_res.wr_bytes_written, 0); - up_write(&copy->nf_dst->nf_rwsem); + if (!status) + status = filemap_check_wb_err(dst->f_mapping, since); if (!status) copy->committed = true; } @@ -2528,7 +2535,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) goto encode_op; } - fh_clear_wcc(current_fh); + fh_clear_pre_post_attrs(current_fh); /* If op is non-idempotent */ if (op->opdesc->op_flags & OP_MODIFIES_SOMETHING) { diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1956d377d1a6..32063733443d 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -246,6 +246,7 @@ find_blocked_lock(struct nfs4_lockowner *lo, struct knfsd_fh *fh, list_for_each_entry(cur, &lo->lo_blocked, nbl_list) { if (fh_match(fh, &cur->nbl_fh)) { list_del_init(&cur->nbl_list); + WARN_ON(list_empty(&cur->nbl_lru)); list_del_init(&cur->nbl_lru); found = cur; break; @@ -271,6 +272,7 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh, INIT_LIST_HEAD(&nbl->nbl_lru); fh_copy_shallow(&nbl->nbl_fh, fh); locks_init_lock(&nbl->nbl_lock); + kref_init(&nbl->nbl_kref); nfsd4_init_cb(&nbl->nbl_cb, lo->lo_owner.so_client, &nfsd4_cb_notify_lock_ops, NFSPROC4_CLNT_CB_NOTIFY_LOCK); @@ -280,11 +282,20 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh, } static void +free_nbl(struct kref *kref) +{ + struct nfsd4_blocked_lock *nbl; + + nbl = container_of(kref, struct nfsd4_blocked_lock, nbl_kref); + kfree(nbl); +} + +static void free_blocked_lock(struct nfsd4_blocked_lock *nbl) { locks_delete_block(&nbl->nbl_lock); locks_release_private(&nbl->nbl_lock); - kfree(nbl); + kref_put(&nbl->nbl_kref, free_nbl); } static void @@ -302,6 +313,7 @@ remove_blocked_locks(struct nfs4_lockowner *lo) struct nfsd4_blocked_lock, nbl_list); list_del_init(&nbl->nbl_list); + WARN_ON(list_empty(&nbl->nbl_lru)); list_move(&nbl->nbl_lru, &reaplist); } spin_unlock(&nn->blocked_locks_lock); @@ -360,11 +372,13 @@ static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = { * st_{access,deny}_bmap field of the stateid, in order to track not * only what share bits are currently in force, but also what * combinations of share bits previous opens have used. This allows us - * to enforce the recommendation of rfc 3530 14.2.19 that the server - * return an error if the client attempt to downgrade to a combination - * of share bits not explicable by closing some of its previous opens. + * to enforce the recommendation in + * https://datatracker.ietf.org/doc/html/rfc7530#section-16.19.4 that + * the server return an error if the client attempt to downgrade to a + * combination of share bits not explicable by closing some of its + * previous opens. 
* - * XXX: This enforcement is actually incomplete, since we don't keep + * This enforcement is arguably incomplete, since we don't keep * track of access/deny bit combinations; so, e.g., we allow: * * OPEN allow read, deny write @@ -372,6 +386,10 @@ static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = { * DOWNGRADE allow read, deny none * * which we should reject. + * + * But you could also argue that our current code is already overkill, + * since it only exists to return NFS4ERR_INVAL on incorrect client + * behavior. */ static unsigned int bmap_to_share_mode(unsigned long bmap) @@ -4112,8 +4130,10 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, status = nfserr_clid_inuse; if (client_has_state(old) && !same_creds(&unconf->cl_cred, - &old->cl_cred)) + &old->cl_cred)) { + old = NULL; goto out; + } status = mark_client_expired_locked(old); if (status) { old = NULL; @@ -6040,7 +6060,11 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, *nfp = NULL; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { - status = check_special_stateids(net, fhp, stateid, flags); + if (cstid) + status = nfserr_bad_stateid; + else + status = check_special_stateids(net, fhp, stateid, + flags); goto done; } @@ -6836,7 +6860,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_blocked_lock *nbl = NULL; struct file_lock *file_lock = NULL; struct file_lock *conflock = NULL; - struct super_block *sb; __be32 status = 0; int lkflg; int err; @@ -6858,7 +6881,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("NFSD: nfsd4_lock: permission denied!\n"); return status; } - sb = cstate->current_fh.fh_dentry->d_sb; if (lock->lk_is_new) { if (nfsd4_has_session(cstate)) @@ -6910,8 +6932,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fp = lock_stp->st_stid.sc_file; switch (lock->lk_type) { case NFS4_READW_LT: - if (nfsd4_has_session(cstate) && - !(sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS)) + if (nfsd4_has_session(cstate)) fl_flags |= FL_SLEEP; fallthrough; case NFS4_READ_LT: @@ -6923,8 +6944,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fl_type = F_RDLCK; break; case NFS4_WRITEW_LT: - if (nfsd4_has_session(cstate) && - !(sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS)) + if (nfsd4_has_session(cstate)) fl_flags |= FL_SLEEP; fallthrough; case NFS4_WRITE_LT: @@ -6945,6 +6965,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } + /* + * Most filesystems with their own ->lock operations will block + * the nfsd thread waiting to acquire the lock. 
That leads to + * deadlocks (we don't want every nfsd thread tied up waiting + * for file locks), so don't attempt blocking lock notifications + * on those filesystems: + */ + if (nf->nf_file->f_op->lock) + fl_flags &= ~FL_SLEEP; + nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn); if (!nbl) { dprintk("NFSD: %s: unable to allocate block!\n", __func__); @@ -6975,6 +7005,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, spin_lock(&nn->blocked_locks_lock); list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked); list_add_tail(&nbl->nbl_lru, &nn->blocked_locks_lru); + kref_get(&nbl->nbl_kref); spin_unlock(&nn->blocked_locks_lock); } @@ -6987,6 +7018,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nn->somebody_reclaimed = true; break; case FILE_LOCK_DEFERRED: + kref_put(&nbl->nbl_kref, free_nbl); nbl = NULL; fallthrough; case -EAGAIN: /* conflock holds conflicting lock */ @@ -7007,8 +7039,13 @@ out: /* dequeue it if we queued it before */ if (fl_flags & FL_SLEEP) { spin_lock(&nn->blocked_locks_lock); - list_del_init(&nbl->nbl_list); - list_del_init(&nbl->nbl_lru); + if (!list_empty(&nbl->nbl_list) && + !list_empty(&nbl->nbl_lru)) { + list_del_init(&nbl->nbl_list); + list_del_init(&nbl->nbl_lru); + kref_put(&nbl->nbl_kref, free_nbl); + } + /* nbl can use one of lists to be linked to reaplist */ spin_unlock(&nn->blocked_locks_lock); } free_blocked_lock(nbl); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 5a93a5db4fb0..714a3a3bd50c 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -277,21 +277,10 @@ nfsd4_decode_verifier4(struct nfsd4_compoundargs *argp, nfs4_verifier *verf) static __be32 nfsd4_decode_bitmap4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen) { - u32 i, count; - __be32 *p; - - if (xdr_stream_decode_u32(argp->xdr, &count) < 0) - return nfserr_bad_xdr; - /* request sanity */ - if (count > 1000) - return nfserr_bad_xdr; - p = xdr_inline_decode(argp->xdr, count << 2); - if (!p) - return nfserr_bad_xdr; - for (i = 0; i < bmlen; i++) - bmval[i] = (i < count) ? be32_to_cpup(p++) : 0; + ssize_t status; - return nfs_ok; + status = xdr_stream_decode_uint32_array(argp->xdr, bmval, bmlen); + return status == -EBADMSG ? 
nfserr_bad_xdr : nfs_ok; } static __be32 @@ -3506,7 +3495,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, p = xdr_reserve_space(xdr, 3*4 + namlen); if (!p) goto fail; - p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */ + p = xdr_encode_hyper(p, OFFSET_MAX); /* offset of next entry */ p = xdr_encode_array(p, name, namlen); /* name length & name */ nfserr = nfsd4_encode_dirent_fattr(xdr, cd, name, namlen); @@ -3997,10 +3986,8 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, } xdr_commit_encode(xdr); - maxcount = svc_max_payload(resp->rqstp); - maxcount = min_t(unsigned long, maxcount, + maxcount = min_t(unsigned long, read->rd_length, (xdr->buf->buflen - xdr->buf->len)); - maxcount = min_t(unsigned long, maxcount, read->rd_length); if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) @@ -4804,8 +4791,8 @@ nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, return nfserr_resource; *p++ = htonl(NFS4_CONTENT_HOLE); - p = xdr_encode_hyper(p, read->rd_offset); - p = xdr_encode_hyper(p, count); + p = xdr_encode_hyper(p, read->rd_offset); + p = xdr_encode_hyper(p, count); *eof = (read->rd_offset + count) >= f_size; *maxcount = min_t(unsigned long, count, *maxcount); @@ -4837,10 +4824,8 @@ nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr, return nfserr_resource; xdr_commit_encode(xdr); - maxcount = svc_max_payload(resp->rqstp); - maxcount = min_t(unsigned long, maxcount, + maxcount = min_t(unsigned long, read->rd_length, (xdr->buf->buflen - xdr->buf->len)); - maxcount = min_t(unsigned long, maxcount, read->rd_length); count = maxcount; eof = read->rd_offset >= i_size_read(file_inode(file)); diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 6e0b6f3148dc..a4a69ab6ab28 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -87,7 +87,7 @@ nfsd_hashsize(unsigned int limit) static u32 nfsd_cache_hash(__be32 xid, struct nfsd_net *nn) { - return hash_32(be32_to_cpu(xid), nn->maskbits); + return hash_32((__force u32)xid, nn->maskbits); } static struct svc_cacherep * diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 51a49e0cfe37..68b020f2002b 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -742,13 +742,12 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred return err; err = svc_addsock(nn->nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT, cred); - if (err < 0) { - nfsd_destroy(net); - return err; - } - /* Decrease the count, but don't shut down the service */ - nn->nfsd_serv->sv_nrthreads--; + if (err >= 0 && + !nn->nfsd_serv->sv_nrthreads && !xchg(&nn->keep_active, 1)) + svc_get(nn->nfsd_serv); + + nfsd_put(net); return err; } @@ -783,8 +782,10 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr if (err < 0 && err != -EAFNOSUPPORT) goto out_close; - /* Decrease the count, but don't shut down the service */ - nn->nfsd_serv->sv_nrthreads--; + if (!nn->nfsd_serv->sv_nrthreads && !xchg(&nn->keep_active, 1)) + svc_get(nn->nfsd_serv); + + nfsd_put(net); return 0; out_close: xprt = svc_find_xprt(nn->nfsd_serv, transport, net, PF_INET, port); @@ -793,10 +794,7 @@ out_close: svc_xprt_put(xprt); } out_err: - if (!list_empty(&nn->nfsd_serv->sv_permsocks)) - nn->nfsd_serv->sv_nrthreads--; - else - nfsd_destroy(net); + nfsd_put(net); return err; } @@ -1249,7 +1247,8 @@ static void nfsdfs_remove_file(struct inode *dir, struct dentry *dentry) clear_ncl(d_inode(dentry)); dget(dentry); ret = simple_unlink(dir, dentry); - 
d_delete(dentry); + d_drop(dentry); + fsnotify_unlink(dir, dentry); dput(dentry); WARN_ON_ONCE(ret); } @@ -1340,8 +1339,8 @@ void nfsd_client_rmdir(struct dentry *dentry) dget(dentry); ret = simple_rmdir(dir, dentry); WARN_ON_ONCE(ret); + d_drop(dentry); fsnotify_rmdir(dir, dentry); - d_delete(dentry); dput(dentry); inode_unlock(dir); } @@ -1485,9 +1484,8 @@ static __net_init int nfsd_init_net(struct net *net) nn->clientid_counter = nn->clientid_base + 1; nn->s2s_cp_cl_id = nn->clientid_counter++; - atomic_set(&nn->ntf_refcnt, 0); - init_waitqueue_head(&nn->ntf_wq); - seqlock_init(&nn->boot_lock); + get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); + seqlock_init(&nn->writeverf_lock); return 0; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 498e5a489826..3e5008b475ff 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -97,7 +97,7 @@ int nfsd_pool_stats_open(struct inode *, struct file *); int nfsd_pool_stats_release(struct inode *, struct file *); void nfsd_shutdown_threads(struct net *net); -void nfsd_destroy(struct net *net); +void nfsd_put(struct net *net); bool i_am_nfsd(void); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index f3779fa72c89..145208bcb9bd 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -611,6 +611,70 @@ out_negative: return nfserr_serverfault; } +#ifdef CONFIG_NFSD_V3 + +/** + * fh_fill_pre_attrs - Fill in pre-op attributes + * @fhp: file handle to be updated + * + */ +void fh_fill_pre_attrs(struct svc_fh *fhp) +{ + bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); + struct inode *inode; + struct kstat stat; + __be32 err; + + if (fhp->fh_no_wcc || fhp->fh_pre_saved) + return; + + inode = d_inode(fhp->fh_dentry); + err = fh_getattr(fhp, &stat); + if (err) { + /* Grab the times from inode anyway */ + stat.mtime = inode->i_mtime; + stat.ctime = inode->i_ctime; + stat.size = inode->i_size; + } + if (v4) + fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); + + fhp->fh_pre_mtime = stat.mtime; + fhp->fh_pre_ctime = stat.ctime; + fhp->fh_pre_size = stat.size; + fhp->fh_pre_saved = true; +} + +/** + * fh_fill_post_attrs - Fill in post-op attributes + * @fhp: file handle to be updated + * + */ +void fh_fill_post_attrs(struct svc_fh *fhp) +{ + bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); + struct inode *inode = d_inode(fhp->fh_dentry); + __be32 err; + + if (fhp->fh_no_wcc) + return; + + if (fhp->fh_post_saved) + printk("nfsd: inode locked twice during operation.\n"); + + err = fh_getattr(fhp, &fhp->fh_post_attr); + if (err) { + fhp->fh_post_saved = false; + fhp->fh_post_attr.ctime = inode->i_ctime; + } else + fhp->fh_post_saved = true; + if (v4) + fhp->fh_post_change = + nfsd4_change_attribute(&fhp->fh_post_attr, inode); +} + +#endif /* CONFIG_NFSD_V3 */ + /* * Release a file handle. */ @@ -623,7 +687,7 @@ fh_put(struct svc_fh *fhp) fh_unlock(fhp); fhp->fh_dentry = NULL; dput(dentry); - fh_clear_wcc(fhp); + fh_clear_pre_post_attrs(fhp); } fh_drop_write(fhp); if (exp) { diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index d11e4b6870d6..434930d8a946 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -284,12 +284,13 @@ static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh) #endif #ifdef CONFIG_NFSD_V3 -/* - * The wcc data stored in current_fh should be cleared - * between compound ops. 
+ +/** + * fh_clear_pre_post_attrs - Reset pre/post attributes + * @fhp: file handle to be updated + * */ -static inline void -fh_clear_wcc(struct svc_fh *fhp) +static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp) { fhp->fh_post_saved = false; fhp->fh_pre_saved = false; @@ -323,13 +324,24 @@ static inline u64 nfsd4_change_attribute(struct kstat *stat, return time_to_chattr(&stat->ctime); } -extern void fill_pre_wcc(struct svc_fh *fhp); -extern void fill_post_wcc(struct svc_fh *fhp); -#else -#define fh_clear_wcc(ignored) -#define fill_pre_wcc(ignored) -#define fill_post_wcc(notused) -#endif /* CONFIG_NFSD_V3 */ +extern void fh_fill_pre_attrs(struct svc_fh *fhp); +extern void fh_fill_post_attrs(struct svc_fh *fhp); + +#else /* !CONFIG_NFSD_V3 */ + +static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp) +{ +} + +static inline void fh_fill_pre_attrs(struct svc_fh *fhp) +{ +} + +static inline void fh_fill_post_attrs(struct svc_fh *fhp) +{ +} + +#endif /* !CONFIG_NFSD_V3 */ /* @@ -355,7 +367,7 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) inode = d_inode(dentry); inode_lock_nested(inode, subclass); - fill_pre_wcc(fhp); + fh_fill_pre_attrs(fhp); fhp->fh_locked = true; } @@ -372,7 +384,7 @@ static inline void fh_unlock(struct svc_fh *fhp) { if (fhp->fh_locked) { - fill_post_wcc(fhp); + fh_fill_post_attrs(fhp); inode_unlock(d_inode(fhp->fh_dentry)); fhp->fh_locked = false; } diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index de282f3273c5..18b8eb43a19b 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -235,10 +235,6 @@ nfsd_proc_write(struct svc_rqst *rqstp) argp->len, argp->offset); nvecs = svc_fill_write_vector(rqstp, &argp->payload); - if (!nvecs) { - resp->status = nfserr_io; - goto out; - } resp->status = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset, rqstp->rq_vec, nvecs, @@ -247,7 +243,6 @@ nfsd_proc_write(struct svc_rqst *rqstp) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) return rpc_drop_reply; -out: return rpc_success; } @@ -850,6 +845,7 @@ nfserrno (int errno) { nfserr_io, -EIO }, { nfserr_nxio, -ENXIO }, { nfserr_fbig, -E2BIG }, + { nfserr_stale, -EBADF }, { nfserr_acces, -EACCES }, { nfserr_exist, -EEXIST }, { nfserr_xdev, -EXDEV }, @@ -878,6 +874,8 @@ nfserrno (int errno) { nfserr_toosmall, -ETOOSMALL }, { nfserr_serverfault, -ESERVERFAULT }, { nfserr_serverfault, -ENFILE }, + { nfserr_io, -EREMOTEIO }, + { nfserr_stale, -EOPENSTALE }, { nfserr_io, -EUCLEAN }, { nfserr_perm, -ENOKEY }, { nfserr_no_grace, -ENOGRACE}, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 80431921e5d7..b8c682b62d29 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <linux/fs_struct.h> #include <linux/swap.h> +#include <linux/siphash.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/svcsock.h> @@ -55,18 +56,17 @@ static __be32 nfsd_init_request(struct svc_rqst *, struct svc_process_info *); /* - * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and the members - * of the svc_serv struct. In particular, ->sv_nrthreads but also to some - * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt + * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and some members + * of the svc_serv struct such as ->sv_temp_socks and ->sv_permsocks. * * If (out side the lock) nn->nfsd_serv is non-NULL, then it must point to a - * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. 
That number - * of nfsd threads must exist and each must listed in ->sp_all_threads in each - * entry of ->sv_pools[]. + * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0 (unless + * nn->keep_active is set). That number of nfsd threads must + * exist and each must be listed in ->sp_all_threads in some entry of + * ->sv_pools[]. * - * Transitions of the thread count between zero and non-zero are of particular - * interest since the svc_serv needs to be created and initialized at that - * point, or freed. + * Each active thread holds a counted reference on nn->nfsd_serv, as does + * the nn->keep_active flag and various transient calls to svc_get(). * * Finally, the nfsd_mutex also protects some of the global variables that are * accessed when nfsd starts and that are settable via the write_* routines in @@ -345,33 +345,57 @@ static bool nfsd_needs_lockd(struct nfsd_net *nn) return nfsd_vers(nn, 2, NFSD_TEST) || nfsd_vers(nn, 3, NFSD_TEST); } -void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn) +/** + * nfsd_copy_write_verifier - Atomically copy a write verifier + * @verf: buffer in which to receive the verifier cookie + * @nn: NFS net namespace + * + * This function provides a wait-free mechanism for copying the + * namespace's write verifier without tearing it. + */ +void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn) { int seq = 0; do { - read_seqbegin_or_lock(&nn->boot_lock, &seq); - /* - * This is opaque to client, so no need to byte-swap. Use - * __force to keep sparse happy. y2038 time_t overflow is - * irrelevant in this usage - */ - verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec; - verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec; - } while (need_seqretry(&nn->boot_lock, seq)); - done_seqretry(&nn->boot_lock, seq); + read_seqbegin_or_lock(&nn->writeverf_lock, &seq); + memcpy(verf, nn->writeverf, sizeof(*verf)); + } while (need_seqretry(&nn->writeverf_lock, seq)); + done_seqretry(&nn->writeverf_lock, seq); } -static void nfsd_reset_boot_verifier_locked(struct nfsd_net *nn) +static void nfsd_reset_write_verifier_locked(struct nfsd_net *nn) { - ktime_get_real_ts64(&nn->nfssvc_boot); + struct timespec64 now; + u64 verf; + + /* + * Because the time value is hashed, y2038 time_t overflow + * is irrelevant in this usage. + */ + ktime_get_raw_ts64(&now); + verf = siphash_2u64(now.tv_sec, now.tv_nsec, &nn->siphash_key); + memcpy(nn->writeverf, &verf, sizeof(nn->writeverf)); } -void nfsd_reset_boot_verifier(struct nfsd_net *nn) +/** + * nfsd_reset_write_verifier - Generate a new write verifier + * @nn: NFS net namespace + * + * This function updates the ->writeverf field of @nn. This field + * contains an opaque cookie that, according to Section 18.32.3 of + * RFC 8881, "the client can use to determine whether a server has + * changed instance state (e.g., server restart) between a call to + * WRITE and a subsequent call to either WRITE or COMMIT. This + * cookie MUST be unchanged during a single instance of the NFSv4.1 + * server and MUST be unique between instances of the NFSv4.1 + * server." 
+ */ +void nfsd_reset_write_verifier(struct nfsd_net *nn) { - write_seqlock(&nn->boot_lock); - nfsd_reset_boot_verifier_locked(nn); - write_sequnlock(&nn->boot_lock); + write_seqlock(&nn->writeverf_lock); + nfsd_reset_write_verifier_locked(nn); + write_sequnlock(&nn->writeverf_lock); } static int nfsd_startup_net(struct net *net, const struct cred *cred) @@ -435,6 +459,7 @@ static void nfsd_shutdown_net(struct net *net) nfsd_shutdown_generic(); } +static DEFINE_SPINLOCK(nfsd_notifier_lock); static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -444,18 +469,17 @@ static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in sin; - if ((event != NETDEV_DOWN) || - !atomic_inc_not_zero(&nn->ntf_refcnt)) + if (event != NETDEV_DOWN || !nn->nfsd_serv) goto out; + spin_lock(&nfsd_notifier_lock); if (nn->nfsd_serv) { dprintk("nfsd_inetaddr_event: removed %pI4\n", &ifa->ifa_local); sin.sin_family = AF_INET; sin.sin_addr.s_addr = ifa->ifa_local; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin); } - atomic_dec(&nn->ntf_refcnt); - wake_up(&nn->ntf_wq); + spin_unlock(&nfsd_notifier_lock); out: return NOTIFY_DONE; @@ -475,10 +499,10 @@ static int nfsd_inet6addr_event(struct notifier_block *this, struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in6 sin6; - if ((event != NETDEV_DOWN) || - !atomic_inc_not_zero(&nn->ntf_refcnt)) + if (event != NETDEV_DOWN || !nn->nfsd_serv) goto out; + spin_lock(&nfsd_notifier_lock); if (nn->nfsd_serv) { dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr); sin6.sin6_family = AF_INET6; @@ -487,8 +511,8 @@ static int nfsd_inet6addr_event(struct notifier_block *this, sin6.sin6_scope_id = ifa->idev->dev->ifindex; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6); } - atomic_dec(&nn->ntf_refcnt); - wake_up(&nn->ntf_wq); + spin_unlock(&nfsd_notifier_lock); + out: return NOTIFY_DONE; } @@ -505,7 +529,6 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - atomic_dec(&nn->ntf_refcnt); /* check if the notifier still has clients */ if (atomic_dec_return(&nfsd_notifier_refcount) == 0) { unregister_inetaddr_notifier(&nfsd_inetaddr_notifier); @@ -513,7 +536,6 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net) unregister_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } - wait_event(nn->ntf_wq, atomic_read(&nn->ntf_refcnt) == 0); /* * write_ports can create the server without actually starting @@ -594,20 +616,9 @@ static const struct svc_serv_ops nfsd_thread_sv_ops = { .svo_shutdown = nfsd_last_thread, .svo_function = nfsd, .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads, .svo_module = THIS_MODULE, }; -static void nfsd_complete_shutdown(struct net *net) -{ - struct nfsd_net *nn = net_generic(net, nfsd_net_id); - - WARN_ON(!mutex_is_locked(&nfsd_mutex)); - - nn->nfsd_serv = NULL; - complete(&nn->nfsd_shutdown_complete); -} - void nfsd_shutdown_threads(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -622,11 +633,9 @@ void nfsd_shutdown_threads(struct net *net) svc_get(serv); /* Kill outstanding nfsd threads */ - serv->sv_ops->svo_setup(serv, NULL, 0); - nfsd_destroy(net); + svc_set_num_threads(serv, NULL, 0); + nfsd_put(net); mutex_unlock(&nfsd_mutex); - /* Wait for shutdown of nfsd_serv to complete */ - 
wait_for_completion(&nn->nfsd_shutdown_complete); } bool i_am_nfsd(void) @@ -638,6 +647,7 @@ int nfsd_create_serv(struct net *net) { int error; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct svc_serv *serv; WARN_ON(!mutex_is_locked(&nfsd_mutex)); if (nn->nfsd_serv) { @@ -647,19 +657,23 @@ int nfsd_create_serv(struct net *net) if (nfsd_max_blksize == 0) nfsd_max_blksize = nfsd_get_default_max_blksize(); nfsd_reset_versions(nn); - nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, - &nfsd_thread_sv_ops); - if (nn->nfsd_serv == NULL) + serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, + &nfsd_thread_sv_ops); + if (serv == NULL) return -ENOMEM; - init_completion(&nn->nfsd_shutdown_complete); - nn->nfsd_serv->sv_maxconn = nn->max_connections; - error = svc_bind(nn->nfsd_serv, net); + serv->sv_maxconn = nn->max_connections; + error = svc_bind(serv, net); if (error < 0) { - svc_destroy(nn->nfsd_serv); - nfsd_complete_shutdown(net); + /* NOT nfsd_put() as notifiers (see below) haven't + * been set up yet. + */ + svc_put(serv); return error; } + spin_lock(&nfsd_notifier_lock); + nn->nfsd_serv = serv; + spin_unlock(&nfsd_notifier_lock); set_max_drc(); /* check if the notifier is already set */ @@ -669,8 +683,7 @@ int nfsd_create_serv(struct net *net) register_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } - atomic_inc(&nn->ntf_refcnt); - nfsd_reset_boot_verifier(nn); + nfsd_reset_write_verifier(nn); return 0; } @@ -697,16 +710,26 @@ int nfsd_get_nrthreads(int n, int *nthreads, struct net *net) return 0; } -void nfsd_destroy(struct net *net) +/* This is the callback for kref_put() below. + * There is no code here as the first thing to be done is + * call svc_shutdown_net(), but we cannot get the 'net' from + * the kref. So do all the work when kref_put returns true. 
+ */ +static void nfsd_noop(struct kref *ref) +{ +} + +void nfsd_put(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - int destroy = (nn->nfsd_serv->sv_nrthreads == 1); - if (destroy) + if (kref_put(&nn->nfsd_serv->sv_refcnt, nfsd_noop)) { svc_shutdown_net(nn->nfsd_serv, net); - svc_destroy(nn->nfsd_serv); - if (destroy) - nfsd_complete_shutdown(net); + svc_destroy(&nn->nfsd_serv->sv_refcnt); + spin_lock(&nfsd_notifier_lock); + nn->nfsd_serv = NULL; + spin_unlock(&nfsd_notifier_lock); + } } int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) @@ -733,7 +756,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) if (tot > NFSD_MAXSERVS) { /* total too large: scale down requested numbers */ for (i = 0; i < n && tot > 0; i++) { - int new = nthreads[i] * NFSD_MAXSERVS / tot; + int new = nthreads[i] * NFSD_MAXSERVS / tot; tot -= (nthreads[i] - new); nthreads[i] = new; } @@ -753,12 +776,13 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) /* apply the new numbers */ svc_get(nn->nfsd_serv); for (i = 0; i < n; i++) { - err = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv, - &nn->nfsd_serv->sv_pools[i], nthreads[i]); + err = svc_set_num_threads(nn->nfsd_serv, + &nn->nfsd_serv->sv_pools[i], + nthreads[i]); if (err) break; } - nfsd_destroy(net); + nfsd_put(net); return err; } @@ -795,21 +819,19 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred) error = nfsd_startup_net(net, cred); if (error) - goto out_destroy; - error = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv, - NULL, nrservs); + goto out_put; + error = svc_set_num_threads(nn->nfsd_serv, NULL, nrservs); if (error) goto out_shutdown; - /* We are holding a reference to nn->nfsd_serv which - * we don't want to count in the return value, - * so subtract 1 - */ - error = nn->nfsd_serv->sv_nrthreads - 1; + error = nn->nfsd_serv->sv_nrthreads; out_shutdown: if (error < 0 && !nfsd_up_before) nfsd_shutdown_net(net); -out_destroy: - nfsd_destroy(net); /* Release server */ +out_put: + /* Threads now hold service active */ + if (xchg(&nn->keep_active, 0)) + nfsd_put(net); + nfsd_put(net); out: mutex_unlock(&nfsd_mutex); return error; @@ -923,9 +945,6 @@ nfsd(void *vrqstp) struct nfsd_net *nn = net_generic(net, nfsd_net_id); int err; - /* Lock module and set up kernel thread */ - mutex_lock(&nfsd_mutex); - /* At this point, the thread shares current->fs * with the init process. We need to create files with the * umask as defined by the client instead of init's umask. */ @@ -945,8 +964,7 @@ nfsd(void *vrqstp) allow_signal(SIGINT); allow_signal(SIGQUIT); - nfsdstats.th_cnt++; - mutex_unlock(&nfsd_mutex); + atomic_inc(&nfsdstats.th_cnt); set_freezable(); @@ -973,20 +991,36 @@ nfsd(void *vrqstp) /* Clear signals before calling svc_exit_thread() */ flush_signals(current); - mutex_lock(&nfsd_mutex); - nfsdstats.th_cnt --; + atomic_dec(&nfsdstats.th_cnt); out: - rqstp->rq_server = NULL; + /* Take an extra ref so that the svc_put in svc_exit_thread() + * doesn't call svc_destroy() + */ + svc_get(nn->nfsd_serv); /* Release the thread */ svc_exit_thread(rqstp); - nfsd_destroy(net); + /* We need to drop a ref, but may not drop the last reference + * without holding nfsd_mutex, and we cannot wait for nfsd_mutex as that + * could deadlock with nfsd_shutdown_threads() waiting for us. 
+ * So three options are: + * - drop a non-final reference, + * - get the mutex without waiting + * - sleep briefly andd try the above again + */ + while (!svc_put_not_last(nn->nfsd_serv)) { + if (mutex_trylock(&nfsd_mutex)) { + nfsd_put(net); + mutex_unlock(&nfsd_mutex); + break; + } + msleep(20); + } /* Release module */ - mutex_unlock(&nfsd_mutex); - module_put_and_exit(0); + module_put_and_kthread_exit(0); return 0; } @@ -1096,7 +1130,6 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file) mutex_unlock(&nfsd_mutex); return -ENODEV; } - /* bump up the psudo refcount while traversing */ svc_get(nn->nfsd_serv); ret = svc_pool_stats_open(nn->nfsd_serv, file); mutex_unlock(&nfsd_mutex); @@ -1109,8 +1142,7 @@ int nfsd_pool_stats_release(struct inode *inode, struct file *file) struct net *net = inode->i_sb->s_fs_info; mutex_lock(&nfsd_mutex); - /* this function really, really should have been called svc_put() */ - nfsd_destroy(net); + nfsd_put(net); mutex_unlock(&nfsd_mutex); return ret; } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index e73bdbb1634a..95457cfd37fc 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -568,6 +568,10 @@ struct nfs4_ol_stateid { struct list_head st_locks; struct nfs4_stateowner *st_stateowner; struct nfs4_clnt_odstate *st_clnt_odstate; +/* + * These bitmasks use 3 separate bits for READ, ALLOW, and BOTH; see the + * comment above bmap_to_share_mode() for explanation: + */ unsigned char st_access_bmap; unsigned char st_deny_bmap; struct nfs4_ol_stateid *st_openstp; @@ -629,6 +633,7 @@ struct nfsd4_blocked_lock { struct file_lock nbl_lock; struct knfsd_fh nbl_fh; struct nfsd4_callback nbl_cb; + struct kref nbl_kref; }; struct nfsd4_compound_state; diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 1d3b881e7382..a8c5a02a84f0 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -45,7 +45,7 @@ static int nfsd_proc_show(struct seq_file *seq, void *v) percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_WRITE])); /* thread usage: */ - seq_printf(seq, "th %u 0", nfsdstats.th_cnt); + seq_printf(seq, "th %u 0", atomic_read(&nfsdstats.th_cnt)); /* deprecated thread usage histogram stats */ for (i = 0; i < 10; i++) diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index 51ecda852e23..9b43dc3d9991 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -29,11 +29,9 @@ enum { struct nfsd_stats { struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM]; - /* Protected by nfsd_mutex */ - unsigned int th_cnt; /* number of available threads */ + atomic_t th_cnt; /* number of available threads */ }; - extern struct nfsd_stats nfsdstats; extern struct svc_stat nfsd_svcstats; diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index f1e0d3c51bc2..5889db66409d 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -47,7 +47,7 @@ rqstp->rq_xprt->xpt_remotelen); \ } while (0); -TRACE_EVENT(nfsd_garbage_args_err, +DECLARE_EVENT_CLASS(nfsd_xdr_err_class, TP_PROTO( const struct svc_rqst *rqstp ), @@ -69,27 +69,13 @@ TRACE_EVENT(nfsd_garbage_args_err, ) ); -TRACE_EVENT(nfsd_cant_encode_err, - TP_PROTO( - const struct svc_rqst *rqstp - ), - TP_ARGS(rqstp), - TP_STRUCT__entry( - NFSD_TRACE_PROC_ARG_FIELDS +#define DEFINE_NFSD_XDR_ERR_EVENT(name) \ +DEFINE_EVENT(nfsd_xdr_err_class, nfsd_##name##_err, \ + TP_PROTO(const struct svc_rqst *rqstp), \ + TP_ARGS(rqstp)) - __field(u32, vers) - __field(u32, proc) - ), - TP_fast_assign( - NFSD_TRACE_PROC_ARG_ASSIGNMENTS - - __entry->vers = rqstp->rq_vers; - __entry->proc = rqstp->rq_proc; - ), - TP_printk("xid=0x%08x vers=%u proc=%u", - 
__entry->xid, __entry->vers, __entry->proc - ) -); +DEFINE_NFSD_XDR_ERR_EVENT(garbage_args); +DEFINE_NFSD_XDR_ERR_EVENT(cant_encode); #define show_nfsd_may_flags(x) \ __print_flags(x, "|", \ @@ -320,14 +306,14 @@ TRACE_EVENT(nfsd_export_update, DECLARE_EVENT_CLASS(nfsd_io_class, TP_PROTO(struct svc_rqst *rqstp, struct svc_fh *fhp, - loff_t offset, - unsigned long len), + u64 offset, + u32 len), TP_ARGS(rqstp, fhp, offset, len), TP_STRUCT__entry( __field(u32, xid) __field(u32, fh_hash) - __field(loff_t, offset) - __field(unsigned long, len) + __field(u64, offset) + __field(u32, len) ), TP_fast_assign( __entry->xid = be32_to_cpu(rqstp->rq_xid); @@ -335,7 +321,7 @@ DECLARE_EVENT_CLASS(nfsd_io_class, __entry->offset = offset; __entry->len = len; ), - TP_printk("xid=0x%08x fh_hash=0x%08x offset=%lld len=%lu", + TP_printk("xid=0x%08x fh_hash=0x%08x offset=%llu len=%u", __entry->xid, __entry->fh_hash, __entry->offset, __entry->len) ) @@ -344,8 +330,8 @@ DECLARE_EVENT_CLASS(nfsd_io_class, DEFINE_EVENT(nfsd_io_class, nfsd_##name, \ TP_PROTO(struct svc_rqst *rqstp, \ struct svc_fh *fhp, \ - loff_t offset, \ - unsigned long len), \ + u64 offset, \ + u32 len), \ TP_ARGS(rqstp, fhp, offset, len)) DEFINE_NFSD_IO_EVENT(read_start); @@ -413,6 +399,56 @@ TRACE_EVENT(nfsd_dirent, ) ) +DECLARE_EVENT_CLASS(nfsd_copy_err_class, + TP_PROTO(struct svc_rqst *rqstp, + struct svc_fh *src_fhp, + loff_t src_offset, + struct svc_fh *dst_fhp, + loff_t dst_offset, + u64 count, + int status), + TP_ARGS(rqstp, src_fhp, src_offset, dst_fhp, dst_offset, count, status), + TP_STRUCT__entry( + __field(u32, xid) + __field(u32, src_fh_hash) + __field(loff_t, src_offset) + __field(u32, dst_fh_hash) + __field(loff_t, dst_offset) + __field(u64, count) + __field(int, status) + ), + TP_fast_assign( + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->src_fh_hash = knfsd_fh_hash(&src_fhp->fh_handle); + __entry->src_offset = src_offset; + __entry->dst_fh_hash = knfsd_fh_hash(&dst_fhp->fh_handle); + __entry->dst_offset = dst_offset; + __entry->count = count; + __entry->status = status; + ), + TP_printk("xid=0x%08x src_fh_hash=0x%08x src_offset=%lld " + "dst_fh_hash=0x%08x dst_offset=%lld " + "count=%llu status=%d", + __entry->xid, __entry->src_fh_hash, __entry->src_offset, + __entry->dst_fh_hash, __entry->dst_offset, + (unsigned long long)__entry->count, + __entry->status) +) + +#define DEFINE_NFSD_COPY_ERR_EVENT(name) \ +DEFINE_EVENT(nfsd_copy_err_class, nfsd_##name, \ + TP_PROTO(struct svc_rqst *rqstp, \ + struct svc_fh *src_fhp, \ + loff_t src_offset, \ + struct svc_fh *dst_fhp, \ + loff_t dst_offset, \ + u64 count, \ + int status), \ + TP_ARGS(rqstp, src_fhp, src_offset, dst_fhp, dst_offset, \ + count, status)) + +DEFINE_NFSD_COPY_ERR_EVENT(clone_file_range_err); + #include "state.h" #include "filecache.h" #include "vfs.h" @@ -538,6 +574,34 @@ DEFINE_EVENT(nfsd_net_class, nfsd_##name, \ DEFINE_NET_EVENT(grace_start); DEFINE_NET_EVENT(grace_complete); +TRACE_EVENT(nfsd_writeverf_reset, + TP_PROTO( + const struct nfsd_net *nn, + const struct svc_rqst *rqstp, + int error + ), + TP_ARGS(nn, rqstp, error), + TP_STRUCT__entry( + __field(unsigned long long, boot_time) + __field(u32, xid) + __field(int, error) + __array(unsigned char, verifier, NFS4_VERIFIER_SIZE) + ), + TP_fast_assign( + __entry->boot_time = nn->boot_time; + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->error = error; + + /* avoid seqlock inside TP_fast_assign */ + memcpy(__entry->verifier, nn->writeverf, + NFS4_VERIFIER_SIZE); + ), + TP_printk("boot_time=%16llx 
xid=0x%08x error=%d new verifier=0x%s", + __entry->boot_time, __entry->xid, __entry->error, + __print_hex_str(__entry->verifier, NFS4_VERIFIER_SIZE) + ) +); + TRACE_EVENT(nfsd_clid_cred_mismatch, TP_PROTO( const struct nfs4_client *clp, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c99857689e2c..91600e71be19 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -40,6 +40,7 @@ #include "../internal.h" #include "acl.h" #include "idmap.h" +#include "xdr4.h" #endif /* CONFIG_NFSD_V4 */ #include "nfsd.h" @@ -434,6 +435,10 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, .ia_size = iap->ia_size, }; + host_err = -EFBIG; + if (iap->ia_size < 0) + goto out_unlock; + host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL); if (host_err) goto out_unlock; @@ -517,15 +522,23 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, } #endif -__be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos, - struct nfsd_file *nf_dst, u64 dst_pos, u64 count, bool sync) +static struct nfsd4_compound_state *nfsd4_get_cstate(struct svc_rqst *rqstp) +{ + return &((struct nfsd4_compoundres *)rqstp->rq_resp)->cstate; +} + +__be32 nfsd4_clone_file_range(struct svc_rqst *rqstp, + struct nfsd_file *nf_src, u64 src_pos, + struct nfsd_file *nf_dst, u64 dst_pos, + u64 count, bool sync) { struct file *src = nf_src->nf_file; struct file *dst = nf_dst->nf_file; + errseq_t since; loff_t cloned; __be32 ret = 0; - down_write(&nf_dst->nf_rwsem); + since = READ_ONCE(dst->f_wb_err); cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0); if (cloned < 0) { ret = nfserrno(cloned); @@ -540,15 +553,25 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos, int status = vfs_fsync_range(dst, dst_pos, dst_end, 0); if (!status) + status = filemap_check_wb_err(dst->f_mapping, since); + if (!status) status = commit_inode_metadata(file_inode(src)); if (status < 0) { - nfsd_reset_boot_verifier(net_generic(nf_dst->nf_net, - nfsd_net_id)); + struct nfsd_net *nn = net_generic(nf_dst->nf_net, + nfsd_net_id); + + trace_nfsd_clone_file_range_err(rqstp, + &nfsd4_get_cstate(rqstp)->save_fh, + src_pos, + &nfsd4_get_cstate(rqstp)->current_fh, + dst_pos, + count, status); + nfsd_reset_write_verifier(nn); + trace_nfsd_writeverf_reset(nn, rqstp, status); ret = nfserrno(status); } } out_err: - up_write(&nf_dst->nf_rwsem); return ret; } @@ -777,6 +800,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int may_flags, struct file **filp) { __be32 err; + bool retried = false; validate_process_creds(); /* @@ -792,9 +816,16 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, */ if (type == S_IFREG) may_flags |= NFSD_MAY_OWNER_OVERRIDE; +retry: err = fh_verify(rqstp, fhp, type, may_flags); - if (!err) + if (!err) { err = __nfsd_open(rqstp, fhp, type, may_flags, filp); + if (err == nfserr_stale && !retried) { + retried = true; + fh_put(fhp); + goto retry; + } + } validate_process_creds(); return err; } @@ -944,10 +975,12 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, unsigned long *cnt, int stable, __be32 *verf) { + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct file *file = nf->nf_file; struct super_block *sb = file_inode(file)->i_sb; struct svc_export *exp; struct iov_iter iter; + errseq_t since; __be32 nfserr; int host_err; int use_wgather; @@ -985,36 +1018,28 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, flags |= RWF_SYNC; 
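In the nfsd_vfs_write() hunk that continues below, the nf_rwsem serialization around the write verifier is replaced by the writeback-error cursor: the thread samples file->f_wb_err with READ_ONCE() before issuing the write and later calls filemap_check_wb_err() to learn whether a writeback error was raised since that point. A small self-contained sketch of the sample-then-check idea, using an illustrative error-sequence counter rather than the kernel's errseq_t implementation:

#include <stdatomic.h>
#include <stdio.h>

/* Toy error cursor: error paths bump it, callers sample it before an
 * operation and compare afterwards. */
static atomic_uint wb_err;

static unsigned int wb_err_sample(void)
{
	return atomic_load(&wb_err);
}

static int wb_err_check(unsigned int since)
{
	/* nonzero means an error was recorded after 'since' was taken */
	return atomic_load(&wb_err) != since;
}

static void writeback_failed(void)
{
	atomic_fetch_add(&wb_err, 1);
}

int main(void)
{
	unsigned int since = wb_err_sample();	/* before the write */

	writeback_failed();			/* async error races in */
	if (wb_err_check(since))		/* after the write */
		puts("write verifier must be reset");
	return 0;
}

The real errseq_t additionally records a "seen" marker so unreported errors are not lost; the sketch only captures the sample-and-compare flow that lets nfsd drop the exclusive nf_rwsem around synchronous writes.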
iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt); - if (flags & RWF_SYNC) { - down_write(&nf->nf_rwsem); - host_err = vfs_iter_write(file, &iter, &pos, flags); - if (host_err < 0) - nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp), - nfsd_net_id)); - up_write(&nf->nf_rwsem); - } else { - down_read(&nf->nf_rwsem); - if (verf) - nfsd_copy_boot_verifier(verf, - net_generic(SVC_NET(rqstp), - nfsd_net_id)); - host_err = vfs_iter_write(file, &iter, &pos, flags); - up_read(&nf->nf_rwsem); - } + since = READ_ONCE(file->f_wb_err); + if (verf) + nfsd_copy_write_verifier(verf, nn); + host_err = vfs_iter_write(file, &iter, &pos, flags); if (host_err < 0) { - nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp), - nfsd_net_id)); + nfsd_reset_write_verifier(nn); + trace_nfsd_writeverf_reset(nn, rqstp, host_err); goto out_nfserr; } *cnt = host_err; nfsd_stats_io_write_add(exp, *cnt); fsnotify_modify(file); + host_err = filemap_check_wb_err(file->f_mapping, since); + if (host_err < 0) + goto out_nfserr; if (stable && use_wgather) { host_err = wait_for_concurrent_writes(file); - if (host_err < 0) - nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp), - nfsd_net_id)); + if (host_err < 0) { + nfsd_reset_write_verifier(nn); + trace_nfsd_writeverf_reset(nn, rqstp, host_err); + } } out_nfserr: @@ -1089,71 +1114,77 @@ out: } #ifdef CONFIG_NFSD_V3 -static int -nfsd_filemap_write_and_wait_range(struct nfsd_file *nf, loff_t offset, - loff_t end) -{ - struct address_space *mapping = nf->nf_file->f_mapping; - int ret = filemap_fdatawrite_range(mapping, offset, end); - - if (ret) - return ret; - filemap_fdatawait_range_keep_errors(mapping, offset, end); - return 0; -} - -/* - * Commit all pending writes to stable storage. +/** + * nfsd_commit - Commit pending writes to stable storage + * @rqstp: RPC request being processed + * @fhp: NFS filehandle + * @offset: raw offset from beginning of file + * @count: raw count of bytes to sync + * @verf: filled in with the server's current write verifier * - * Note: we only guarantee that data that lies within the range specified - * by the 'offset' and 'count' parameters will be synced. + * Note: we guarantee that data that lies within the range specified + * by the 'offset' and 'count' parameters will be synced. The server + * is permitted to sync data that lies outside this range at the + * same time. * * Unfortunately we cannot lock the file to make sure we return full WCC * data to the client, as locking happens lower down in the filesystem. + * + * Return values: + * An nfsstat value in network byte order. */ __be32 -nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, - loff_t offset, unsigned long count, __be32 *verf) +nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset, + u32 count, __be32 *verf) { + u64 maxbytes; + loff_t start, end; + struct nfsd_net *nn; struct nfsd_file *nf; - loff_t end = LLONG_MAX; - __be32 err = nfserr_inval; - - if (offset < 0) - goto out; - if (count != 0) { - end = offset + (loff_t)count - 1; - if (end < offset) - goto out; - } + __be32 err; err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &nf); if (err) goto out; + + /* + * Convert the client-provided (offset, count) range to a + * (start, end) range. If the client-provided range falls + * outside the maximum file size of the underlying FS, + * clamp the sync range appropriately. 
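The nfsd_commit() conversion that continues below turns the client's raw (offset, count) pair into a (start, end) byte range and clamps it against the filesystem's s_maxbytes before calling vfs_fsync_range(); a count of zero still means "sync to the end of the file", and an offset past s_maxbytes falls back to syncing the whole file. A standalone sketch of that conversion with a few edge cases; the MAXBYTES value is an illustrative stand-in for s_maxbytes:

#include <assert.h>
#include <limits.h>
#include <stdint.h>

#define MAXBYTES ((uint64_t)1 << 53)	/* stand-in for sb->s_maxbytes */

/* Mirror of the clamping in the hunk below: widen a client-supplied
 * (offset, count) into a safe (start, end) sync range. */
static void commit_range(uint64_t offset, uint32_t count,
			 long long *start, long long *end)
{
	*start = 0;
	*end = LLONG_MAX;
	if (offset < MAXBYTES) {
		*start = offset;
		if (count && offset + count - 1 < MAXBYTES)
			*end = offset + count - 1;
	}
}

int main(void)
{
	long long s, e;

	commit_range(4096, 8192, &s, &e);	/* ordinary range */
	assert(s == 4096 && e == 4096 + 8192 - 1);

	commit_range(4096, 0, &s, &e);		/* count 0: sync to EOF */
	assert(s == 4096 && e == LLONG_MAX);

	commit_range(MAXBYTES + 1, 10, &s, &e);	/* offset past s_maxbytes */
	assert(s == 0 && e == LLONG_MAX);
	return 0;
}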
+ */ + start = 0; + end = LLONG_MAX; + maxbytes = (u64)fhp->fh_dentry->d_sb->s_maxbytes; + if (offset < maxbytes) { + start = offset; + if (count && (offset + count - 1 < maxbytes)) + end = offset + count - 1; + } + + nn = net_generic(nf->nf_net, nfsd_net_id); if (EX_ISSYNC(fhp->fh_export)) { - int err2 = nfsd_filemap_write_and_wait_range(nf, offset, end); + errseq_t since = READ_ONCE(nf->nf_file->f_wb_err); + int err2; - down_write(&nf->nf_rwsem); - if (!err2) - err2 = vfs_fsync_range(nf->nf_file, offset, end, 0); + err2 = vfs_fsync_range(nf->nf_file, start, end, 0); switch (err2) { case 0: - nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net, - nfsd_net_id)); + nfsd_copy_write_verifier(verf, nn); + err2 = filemap_check_wb_err(nf->nf_file->f_mapping, + since); break; case -EINVAL: err = nfserr_notsupp; break; default: - err = nfserrno(err2); - nfsd_reset_boot_verifier(net_generic(nf->nf_net, - nfsd_net_id)); + nfsd_reset_write_verifier(nn); + trace_nfsd_writeverf_reset(nn, rqstp, err2); } - up_write(&nf->nf_rwsem); + err = nfserrno(err2); } else - nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net, - nfsd_net_id)); + nfsd_copy_write_verifier(verf, nn); nfsd_file_put(nf); out: @@ -1747,8 +1778,8 @@ retry: * so do it by hand */ trap = lock_rename(tdentry, fdentry); ffhp->fh_locked = tfhp->fh_locked = true; - fill_pre_wcc(ffhp); - fill_pre_wcc(tfhp); + fh_fill_pre_attrs(ffhp); + fh_fill_pre_attrs(tfhp); odentry = lookup_one_len(fname, fdentry, flen); host_err = PTR_ERR(odentry); @@ -1808,8 +1839,8 @@ retry: * were the same, so again we do it by hand. */ if (!close_cached) { - fill_post_wcc(ffhp); - fill_post_wcc(tfhp); + fh_fill_post_attrs(ffhp); + fh_fill_post_attrs(tfhp); } unlock_rename(tdentry, fdentry); ffhp->fh_locked = tfhp->fh_locked = false; diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index b21b76e6b9a8..2c43d10e3cab 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -57,7 +57,8 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *, struct xdr_netobj *); __be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *, struct file *, loff_t, loff_t, int); -__be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos, +__be32 nfsd4_clone_file_range(struct svc_rqst *rqstp, + struct nfsd_file *nf_src, u64 src_pos, struct nfsd_file *nf_dst, u64 dst_pos, u64 count, bool sync); #endif /* CONFIG_NFSD_V4 */ @@ -73,8 +74,8 @@ __be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, struct svc_fh *res, int createmode, u32 *verifier, bool *truncp, bool *created); -__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, - loff_t, unsigned long, __be32 *verf); +__be32 nfsd_commit(struct svc_rqst *rqst, struct svc_fh *fhp, + u64 offset, u32 count, __be32 *verf); #endif /* CONFIG_NFSD_V3 */ #ifdef CONFIG_NFSD_V4 __be32 nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index bc3e2cd4117f..063dd16d75b5 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -195,12 +195,12 @@ void nilfs_page_bug(struct page *page) */ static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty) { - struct buffer_head *dbh, *dbufs, *sbh, *sbufs; + struct buffer_head *dbh, *dbufs, *sbh; unsigned long mask = NILFS_BUFFER_INHERENT_BITS; BUG_ON(PageWriteback(dst)); - sbh = sbufs = page_buffers(src); + sbh = page_buffers(src); if (!page_has_buffers(dst)) create_empty_buffers(dst, sbh->b_size, 0); diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 81f35c5b5a40..379d22e28ed6 100644 --- 
a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -57,7 +57,7 @@ static void nilfs_##name##_attr_release(struct kobject *kobj) \ complete(&subgroups->sg_##name##_kobj_unregister); \ } \ static struct kobj_type nilfs_##name##_ktype = { \ - .default_attrs = nilfs_##name##_attrs, \ + .default_groups = nilfs_##name##_groups, \ .sysfs_ops = &nilfs_##name##_attr_ops, \ .release = nilfs_##name##_attr_release, \ } @@ -129,6 +129,7 @@ static struct attribute *nilfs_snapshot_attrs[] = { NILFS_SNAPSHOT_ATTR_LIST(README), NULL, }; +ATTRIBUTE_GROUPS(nilfs_snapshot); static ssize_t nilfs_snapshot_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -166,7 +167,7 @@ static const struct sysfs_ops nilfs_snapshot_attr_ops = { }; static struct kobj_type nilfs_snapshot_ktype = { - .default_attrs = nilfs_snapshot_attrs, + .default_groups = nilfs_snapshot_groups, .sysfs_ops = &nilfs_snapshot_attr_ops, .release = nilfs_snapshot_attr_release, }; @@ -226,6 +227,7 @@ static struct attribute *nilfs_mounted_snapshots_attrs[] = { NILFS_MOUNTED_SNAPSHOTS_ATTR_LIST(README), NULL, }; +ATTRIBUTE_GROUPS(nilfs_mounted_snapshots); NILFS_DEV_INT_GROUP_OPS(mounted_snapshots, dev); NILFS_DEV_INT_GROUP_TYPE(mounted_snapshots, dev); @@ -339,6 +341,7 @@ static struct attribute *nilfs_checkpoints_attrs[] = { NILFS_CHECKPOINTS_ATTR_LIST(README), NULL, }; +ATTRIBUTE_GROUPS(nilfs_checkpoints); NILFS_DEV_INT_GROUP_OPS(checkpoints, dev); NILFS_DEV_INT_GROUP_TYPE(checkpoints, dev); @@ -428,6 +431,7 @@ static struct attribute *nilfs_segments_attrs[] = { NILFS_SEGMENTS_ATTR_LIST(README), NULL, }; +ATTRIBUTE_GROUPS(nilfs_segments); NILFS_DEV_INT_GROUP_OPS(segments, dev); NILFS_DEV_INT_GROUP_TYPE(segments, dev); @@ -689,6 +693,7 @@ static struct attribute *nilfs_segctor_attrs[] = { NILFS_SEGCTOR_ATTR_LIST(README), NULL, }; +ATTRIBUTE_GROUPS(nilfs_segctor); NILFS_DEV_INT_GROUP_OPS(segctor, dev); NILFS_DEV_INT_GROUP_TYPE(segctor, dev); @@ -816,6 +821,7 @@ static struct attribute *nilfs_superblock_attrs[] = { NILFS_SUPERBLOCK_ATTR_LIST(README), NULL, }; +ATTRIBUTE_GROUPS(nilfs_superblock); NILFS_DEV_INT_GROUP_OPS(superblock, dev); NILFS_DEV_INT_GROUP_TYPE(superblock, dev); @@ -924,6 +930,7 @@ static struct attribute *nilfs_dev_attrs[] = { NILFS_DEV_ATTR_LIST(README), NULL, }; +ATTRIBUTE_GROUPS(nilfs_dev); static ssize_t nilfs_dev_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -961,7 +968,7 @@ static const struct sysfs_ops nilfs_dev_attr_ops = { }; static struct kobj_type nilfs_dev_ktype = { - .default_attrs = nilfs_dev_attrs, + .default_groups = nilfs_dev_groups, .sysfs_ops = &nilfs_dev_attr_ops, .release = nilfs_dev_attr_release, }; diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index e85e13c50d6d..829dd4a61b66 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -19,7 +19,25 @@ #include <linux/fdtable.h> #include <linux/fsnotify_backend.h> -int dir_notify_enable __read_mostly = 1; +static int dir_notify_enable __read_mostly = 1; +#ifdef CONFIG_SYSCTL +static struct ctl_table dnotify_sysctls[] = { + { + .procname = "dir-notify-enable", + .data = &dir_notify_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {} +}; +static void __init dnotify_sysctl_init(void) +{ + register_sysctl_init("fs", dnotify_sysctls); +} +#else +#define dnotify_sysctl_init() do { } while (0) +#endif static struct kmem_cache *dnotify_struct_cache __read_mostly; static struct kmem_cache *dnotify_mark_cache __read_mostly; @@ -196,7 +214,7 @@ static 
__u32 convert_arg(unsigned long arg) if (arg & DN_ATTRIB) new_mask |= FS_ATTRIB; if (arg & DN_RENAME) - new_mask |= FS_DN_RENAME; + new_mask |= FS_RENAME; if (arg & DN_CREATE) new_mask |= (FS_CREATE | FS_MOVED_TO); @@ -386,6 +404,7 @@ static int __init dnotify_init(void) dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops); if (IS_ERR(dnotify_group)) panic("unable to allocate fsnotify group for dnotify\n"); + dnotify_sysctl_init(); return 0; } diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index b6091775aa6e..985e995d2a39 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -76,8 +76,10 @@ static bool fanotify_info_equal(struct fanotify_info *info1, struct fanotify_info *info2) { if (info1->dir_fh_totlen != info2->dir_fh_totlen || + info1->dir2_fh_totlen != info2->dir2_fh_totlen || info1->file_fh_totlen != info2->file_fh_totlen || - info1->name_len != info2->name_len) + info1->name_len != info2->name_len || + info1->name2_len != info2->name2_len) return false; if (info1->dir_fh_totlen && @@ -85,14 +87,24 @@ static bool fanotify_info_equal(struct fanotify_info *info1, fanotify_info_dir_fh(info2))) return false; + if (info1->dir2_fh_totlen && + !fanotify_fh_equal(fanotify_info_dir2_fh(info1), + fanotify_info_dir2_fh(info2))) + return false; + if (info1->file_fh_totlen && !fanotify_fh_equal(fanotify_info_file_fh(info1), fanotify_info_file_fh(info2))) return false; - return !info1->name_len || - !memcmp(fanotify_info_name(info1), fanotify_info_name(info2), - info1->name_len); + if (info1->name_len && + memcmp(fanotify_info_name(info1), fanotify_info_name(info2), + info1->name_len)) + return false; + + return !info1->name2_len || + !memcmp(fanotify_info_name2(info1), fanotify_info_name2(info2), + info1->name2_len); } static bool fanotify_name_event_equal(struct fanotify_name_event *fne1, @@ -141,6 +153,13 @@ static bool fanotify_should_merge(struct fanotify_event *old, if ((old->mask & FS_ISDIR) != (new->mask & FS_ISDIR)) return false; + /* + * FAN_RENAME event is reported with special info record types, + * so we cannot merge it with other events. + */ + if ((old->mask & FAN_RENAME) != (new->mask & FAN_RENAME)) + return false; + switch (old->type) { case FANOTIFY_EVENT_TYPE_PATH: return fanotify_path_equal(fanotify_event_path(old), @@ -272,8 +291,9 @@ out: */ static u32 fanotify_group_event_mask(struct fsnotify_group *group, struct fsnotify_iter_info *iter_info, - u32 event_mask, const void *data, - int data_type, struct inode *dir) + u32 *match_mask, u32 event_mask, + const void *data, int data_type, + struct inode *dir) { __u32 marks_mask = 0, marks_ignored_mask = 0; __u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS | @@ -299,7 +319,7 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, return 0; } - fsnotify_foreach_obj_type(type) { + fsnotify_foreach_iter_type(type) { if (!fsnotify_iter_should_report_type(iter_info, type)) continue; mark = iter_info->marks[type]; @@ -318,11 +338,14 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, * If the event is on a child and this mark is on a parent not * watching children, don't send it! 
*/ - if (type == FSNOTIFY_OBJ_TYPE_PARENT && + if (type == FSNOTIFY_ITER_TYPE_PARENT && !(mark->mask & FS_EVENT_ON_CHILD)) continue; marks_mask |= mark->mask; + + /* Record the mark types of this group that matched the event */ + *match_mask |= 1U << type; } test_mask = event_mask & marks_mask & ~marks_ignored_mask; @@ -411,7 +434,7 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode, * be zero in that case if encoding fh len failed. */ err = -ENOENT; - if (fh_len < 4 || WARN_ON_ONCE(fh_len % 4)) + if (fh_len < 4 || WARN_ON_ONCE(fh_len % 4) || fh_len > MAX_HANDLE_SZ) goto out_err; /* No external buffer in a variable size allocated fh */ @@ -458,17 +481,41 @@ out_err: } /* - * The inode to use as identifier when reporting fid depends on the event. - * Report the modified directory inode on dirent modification events. - * Report the "victim" inode otherwise. + * FAN_REPORT_FID is ambiguous in that it reports the fid of the child for + * some events and the fid of the parent for create/delete/move events. + * + * With the FAN_REPORT_TARGET_FID flag, the fid of the child is reported + * also in create/delete/move events in addition to the fid of the parent + * and the name of the child. + */ +static inline bool fanotify_report_child_fid(unsigned int fid_mode, u32 mask) +{ + if (mask & ALL_FSNOTIFY_DIRENT_EVENTS) + return (fid_mode & FAN_REPORT_TARGET_FID); + + return (fid_mode & FAN_REPORT_FID) && !(mask & FAN_ONDIR); +} + +/* + * The inode to use as identifier when reporting fid depends on the event + * and the group flags. + * + * With the group flag FAN_REPORT_TARGET_FID, always report the child fid. + * + * Without the group flag FAN_REPORT_TARGET_FID, report the modified directory + * fid on dirent events and the child fid otherwise. + * * For example: - * FS_ATTRIB reports the child inode even if reported on a watched parent. - * FS_CREATE reports the modified dir inode and not the created inode. + * FS_ATTRIB reports the child fid even if reported on a watched parent. + * FS_CREATE reports the modified dir fid without FAN_REPORT_TARGET_FID. + * and reports the created child fid with FAN_REPORT_TARGET_FID. */ static struct inode *fanotify_fid_inode(u32 event_mask, const void *data, - int data_type, struct inode *dir) + int data_type, struct inode *dir, + unsigned int fid_mode) { - if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS) + if ((event_mask & ALL_FSNOTIFY_DIRENT_EVENTS) && + !(fid_mode & FAN_REPORT_TARGET_FID)) return dir; return fsnotify_data_inode(data, data_type); @@ -552,25 +599,34 @@ static struct fanotify_event *fanotify_alloc_fid_event(struct inode *id, return &ffe->fae; } -static struct fanotify_event *fanotify_alloc_name_event(struct inode *id, +static struct fanotify_event *fanotify_alloc_name_event(struct inode *dir, __kernel_fsid_t *fsid, const struct qstr *name, struct inode *child, + struct dentry *moved, unsigned int *hash, gfp_t gfp) { struct fanotify_name_event *fne; struct fanotify_info *info; struct fanotify_fh *dfh, *ffh; - unsigned int dir_fh_len = fanotify_encode_fh_len(id); + struct inode *dir2 = moved ? d_inode(moved->d_parent) : NULL; + const struct qstr *name2 = moved ? &moved->d_name : NULL; + unsigned int dir_fh_len = fanotify_encode_fh_len(dir); + unsigned int dir2_fh_len = fanotify_encode_fh_len(dir2); unsigned int child_fh_len = fanotify_encode_fh_len(child); - unsigned int size; - - size = sizeof(*fne) + FANOTIFY_FH_HDR_LEN + dir_fh_len; + unsigned long name_len = name ? name->len : 0; + unsigned long name2_len = name2 ? 
name2->len : 0; + unsigned int len, size; + + /* Reserve terminating null byte even for empty name */ + size = sizeof(*fne) + name_len + name2_len + 2; + if (dir_fh_len) + size += FANOTIFY_FH_HDR_LEN + dir_fh_len; + if (dir2_fh_len) + size += FANOTIFY_FH_HDR_LEN + dir2_fh_len; if (child_fh_len) size += FANOTIFY_FH_HDR_LEN + child_fh_len; - if (name) - size += name->len + 1; fne = kmalloc(size, gfp); if (!fne) return NULL; @@ -580,24 +636,41 @@ static struct fanotify_event *fanotify_alloc_name_event(struct inode *id, *hash ^= fanotify_hash_fsid(fsid); info = &fne->info; fanotify_info_init(info); - dfh = fanotify_info_dir_fh(info); - info->dir_fh_totlen = fanotify_encode_fh(dfh, id, dir_fh_len, hash, 0); + if (dir_fh_len) { + dfh = fanotify_info_dir_fh(info); + len = fanotify_encode_fh(dfh, dir, dir_fh_len, hash, 0); + fanotify_info_set_dir_fh(info, len); + } + if (dir2_fh_len) { + dfh = fanotify_info_dir2_fh(info); + len = fanotify_encode_fh(dfh, dir2, dir2_fh_len, hash, 0); + fanotify_info_set_dir2_fh(info, len); + } if (child_fh_len) { ffh = fanotify_info_file_fh(info); - info->file_fh_totlen = fanotify_encode_fh(ffh, child, - child_fh_len, hash, 0); + len = fanotify_encode_fh(ffh, child, child_fh_len, hash, 0); + fanotify_info_set_file_fh(info, len); } - if (name) { - long salt = name->len; - + if (name_len) { fanotify_info_copy_name(info, name); - *hash ^= full_name_hash((void *)salt, name->name, name->len); + *hash ^= full_name_hash((void *)name_len, name->name, name_len); + } + if (name2_len) { + fanotify_info_copy_name2(info, name2); + *hash ^= full_name_hash((void *)name2_len, name2->name, + name2_len); } - pr_debug("%s: ino=%lu size=%u dir_fh_len=%u child_fh_len=%u name_len=%u name='%.*s'\n", - __func__, id->i_ino, size, dir_fh_len, child_fh_len, + pr_debug("%s: size=%u dir_fh_len=%u child_fh_len=%u name_len=%u name='%.*s'\n", + __func__, size, dir_fh_len, child_fh_len, info->name_len, info->name_len, fanotify_info_name(info)); + if (dir2_fh_len) { + pr_debug("%s: dir2_fh_len=%u name2_len=%u name2='%.*s'\n", + __func__, dir2_fh_len, info->name2_len, + info->name2_len, fanotify_info_name2(info)); + } + return &fne->fae; } @@ -639,19 +712,21 @@ static struct fanotify_event *fanotify_alloc_error_event( return &fee->fae; } -static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, - u32 mask, const void *data, - int data_type, struct inode *dir, - const struct qstr *file_name, - __kernel_fsid_t *fsid) +static struct fanotify_event *fanotify_alloc_event( + struct fsnotify_group *group, + u32 mask, const void *data, int data_type, + struct inode *dir, const struct qstr *file_name, + __kernel_fsid_t *fsid, u32 match_mask) { struct fanotify_event *event = NULL; gfp_t gfp = GFP_KERNEL_ACCOUNT; - struct inode *id = fanotify_fid_inode(mask, data, data_type, dir); + unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); + struct inode *id = fanotify_fid_inode(mask, data, data_type, dir, + fid_mode); struct inode *dirid = fanotify_dfid_inode(mask, data, data_type, dir); const struct path *path = fsnotify_data_path(data, data_type); - unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); struct mem_cgroup *old_memcg; + struct dentry *moved = NULL; struct inode *child = NULL; bool name_event = false; unsigned int hash = 0; @@ -660,11 +735,10 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, if ((fid_mode & FAN_REPORT_DIR_FID) && dirid) { /* - * With both flags FAN_REPORT_DIR_FID and FAN_REPORT_FID, we - * report the 
child fid for events reported on a non-dir child + * For certain events and group flags, report the child fid * in addition to reporting the parent fid and maybe child name. */ - if ((fid_mode & FAN_REPORT_FID) && id != dirid && !ondir) + if (fanotify_report_child_fid(fid_mode, mask) && id != dirid) child = id; id = dirid; @@ -688,6 +762,38 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, } else if ((mask & ALL_FSNOTIFY_DIRENT_EVENTS) || !ondir) { name_event = true; } + + /* + * In the special case of FAN_RENAME event, use the match_mask + * to determine if we need to report only the old parent+name, + * only the new parent+name or both. + * 'dirid' and 'file_name' are the old parent+name and + * 'moved' has the new parent+name. + */ + if (mask & FAN_RENAME) { + bool report_old, report_new; + + if (WARN_ON_ONCE(!match_mask)) + return NULL; + + /* Report both old and new parent+name if sb watching */ + report_old = report_new = + match_mask & (1U << FSNOTIFY_ITER_TYPE_SB); + report_old |= + match_mask & (1U << FSNOTIFY_ITER_TYPE_INODE); + report_new |= + match_mask & (1U << FSNOTIFY_ITER_TYPE_INODE2); + + if (!report_old) { + /* Do not report old parent+name */ + dirid = NULL; + file_name = NULL; + } + if (report_new) { + /* Report new parent+name */ + moved = fsnotify_data_dentry(data, data_type); + } + } } /* @@ -709,9 +815,9 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, } else if (fanotify_is_error_event(mask)) { event = fanotify_alloc_error_event(group, fsid, data, data_type, &hash); - } else if (name_event && (file_name || child)) { - event = fanotify_alloc_name_event(id, fsid, file_name, child, - &hash, gfp); + } else if (name_event && (file_name || moved || child)) { + event = fanotify_alloc_name_event(dirid, fsid, file_name, child, + moved, &hash, gfp); } else if (fid_mode) { event = fanotify_alloc_fid_event(id, fsid, &hash, gfp); } else { @@ -746,7 +852,7 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info) int type; __kernel_fsid_t fsid = {}; - fsnotify_foreach_obj_type(type) { + fsnotify_foreach_iter_type(type) { struct fsnotify_mark_connector *conn; if (!fsnotify_iter_should_report_type(iter_info, type)) @@ -800,6 +906,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, struct fanotify_event *event; struct fsnotify_event *fsn_event; __kernel_fsid_t fsid = {}; + u32 match_mask = 0; BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); @@ -821,15 +928,17 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC); BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM); BUILD_BUG_ON(FAN_FS_ERROR != FS_ERROR); + BUILD_BUG_ON(FAN_RENAME != FS_RENAME); - BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 20); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 21); - mask = fanotify_group_event_mask(group, iter_info, mask, data, - data_type, dir); + mask = fanotify_group_event_mask(group, iter_info, &match_mask, + mask, data, data_type, dir); if (!mask) return 0; - pr_debug("%s: group=%p mask=%x\n", __func__, group, mask); + pr_debug("%s: group=%p mask=%x report_mask=%x\n", __func__, + group, mask, match_mask); if (fanotify_is_perm_event(mask)) { /* @@ -848,7 +957,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, } event = fanotify_alloc_event(group, mask, data, data_type, dir, - file_name, &fsid); + file_name, &fsid, match_mask); ret = -ENOMEM; if 
(unlikely(!event)) { /* diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index d25f500bf7e7..a3d5b751cac5 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -40,15 +40,45 @@ struct fanotify_fh { struct fanotify_info { /* size of dir_fh/file_fh including fanotify_fh hdr size */ u8 dir_fh_totlen; + u8 dir2_fh_totlen; u8 file_fh_totlen; u8 name_len; - u8 pad; + u8 name2_len; + u8 pad[3]; unsigned char buf[]; /* * (struct fanotify_fh) dir_fh starts at buf[0] - * (optional) file_fh starts at buf[dir_fh_totlen] - * name starts at buf[dir_fh_totlen + file_fh_totlen] + * (optional) dir2_fh starts at buf[dir_fh_totlen] + * (optional) file_fh starts at buf[dir_fh_totlen + dir2_fh_totlen] + * name starts at buf[dir_fh_totlen + dir2_fh_totlen + file_fh_totlen] + * ... */ +#define FANOTIFY_DIR_FH_SIZE(info) ((info)->dir_fh_totlen) +#define FANOTIFY_DIR2_FH_SIZE(info) ((info)->dir2_fh_totlen) +#define FANOTIFY_FILE_FH_SIZE(info) ((info)->file_fh_totlen) +#define FANOTIFY_NAME_SIZE(info) ((info)->name_len + 1) +#define FANOTIFY_NAME2_SIZE(info) ((info)->name2_len + 1) + +#define FANOTIFY_DIR_FH_OFFSET(info) 0 +#define FANOTIFY_DIR2_FH_OFFSET(info) \ + (FANOTIFY_DIR_FH_OFFSET(info) + FANOTIFY_DIR_FH_SIZE(info)) +#define FANOTIFY_FILE_FH_OFFSET(info) \ + (FANOTIFY_DIR2_FH_OFFSET(info) + FANOTIFY_DIR2_FH_SIZE(info)) +#define FANOTIFY_NAME_OFFSET(info) \ + (FANOTIFY_FILE_FH_OFFSET(info) + FANOTIFY_FILE_FH_SIZE(info)) +#define FANOTIFY_NAME2_OFFSET(info) \ + (FANOTIFY_NAME_OFFSET(info) + FANOTIFY_NAME_SIZE(info)) + +#define FANOTIFY_DIR_FH_BUF(info) \ + ((info)->buf + FANOTIFY_DIR_FH_OFFSET(info)) +#define FANOTIFY_DIR2_FH_BUF(info) \ + ((info)->buf + FANOTIFY_DIR2_FH_OFFSET(info)) +#define FANOTIFY_FILE_FH_BUF(info) \ + ((info)->buf + FANOTIFY_FILE_FH_OFFSET(info)) +#define FANOTIFY_NAME_BUF(info) \ + ((info)->buf + FANOTIFY_NAME_OFFSET(info)) +#define FANOTIFY_NAME2_BUF(info) \ + ((info)->buf + FANOTIFY_NAME2_OFFSET(info)) } __aligned(4); static inline bool fanotify_fh_has_ext_buf(struct fanotify_fh *fh) @@ -87,7 +117,21 @@ static inline struct fanotify_fh *fanotify_info_dir_fh(struct fanotify_info *inf { BUILD_BUG_ON(offsetof(struct fanotify_info, buf) % 4); - return (struct fanotify_fh *)info->buf; + return (struct fanotify_fh *)FANOTIFY_DIR_FH_BUF(info); +} + +static inline int fanotify_info_dir2_fh_len(struct fanotify_info *info) +{ + if (!info->dir2_fh_totlen || + WARN_ON_ONCE(info->dir2_fh_totlen < FANOTIFY_FH_HDR_LEN)) + return 0; + + return info->dir2_fh_totlen - FANOTIFY_FH_HDR_LEN; +} + +static inline struct fanotify_fh *fanotify_info_dir2_fh(struct fanotify_info *info) +{ + return (struct fanotify_fh *)FANOTIFY_DIR2_FH_BUF(info); } static inline int fanotify_info_file_fh_len(struct fanotify_info *info) @@ -101,32 +145,90 @@ static inline int fanotify_info_file_fh_len(struct fanotify_info *info) static inline struct fanotify_fh *fanotify_info_file_fh(struct fanotify_info *info) { - return (struct fanotify_fh *)(info->buf + info->dir_fh_totlen); + return (struct fanotify_fh *)FANOTIFY_FILE_FH_BUF(info); } -static inline const char *fanotify_info_name(struct fanotify_info *info) +static inline char *fanotify_info_name(struct fanotify_info *info) { - return info->buf + info->dir_fh_totlen + info->file_fh_totlen; + if (!info->name_len) + return NULL; + + return FANOTIFY_NAME_BUF(info); +} + +static inline char *fanotify_info_name2(struct fanotify_info *info) +{ + if (!info->name2_len) + return NULL; + + return FANOTIFY_NAME2_BUF(info); } static 
inline void fanotify_info_init(struct fanotify_info *info) { + BUILD_BUG_ON(FANOTIFY_FH_HDR_LEN + MAX_HANDLE_SZ > U8_MAX); + BUILD_BUG_ON(NAME_MAX > U8_MAX); + info->dir_fh_totlen = 0; + info->dir2_fh_totlen = 0; info->file_fh_totlen = 0; info->name_len = 0; + info->name2_len = 0; +} + +/* These set/copy helpers MUST be called by order */ +static inline void fanotify_info_set_dir_fh(struct fanotify_info *info, + unsigned int totlen) +{ + if (WARN_ON_ONCE(info->dir2_fh_totlen > 0) || + WARN_ON_ONCE(info->file_fh_totlen > 0) || + WARN_ON_ONCE(info->name_len > 0) || + WARN_ON_ONCE(info->name2_len > 0)) + return; + + info->dir_fh_totlen = totlen; } -static inline unsigned int fanotify_info_len(struct fanotify_info *info) +static inline void fanotify_info_set_dir2_fh(struct fanotify_info *info, + unsigned int totlen) { - return info->dir_fh_totlen + info->file_fh_totlen + info->name_len; + if (WARN_ON_ONCE(info->file_fh_totlen > 0) || + WARN_ON_ONCE(info->name_len > 0) || + WARN_ON_ONCE(info->name2_len > 0)) + return; + + info->dir2_fh_totlen = totlen; +} + +static inline void fanotify_info_set_file_fh(struct fanotify_info *info, + unsigned int totlen) +{ + if (WARN_ON_ONCE(info->name_len > 0) || + WARN_ON_ONCE(info->name2_len > 0)) + return; + + info->file_fh_totlen = totlen; } static inline void fanotify_info_copy_name(struct fanotify_info *info, const struct qstr *name) { + if (WARN_ON_ONCE(name->len > NAME_MAX) || + WARN_ON_ONCE(info->name2_len > 0)) + return; + info->name_len = name->len; - strcpy(info->buf + info->dir_fh_totlen + info->file_fh_totlen, - name->name); + strcpy(fanotify_info_name(info), name->name); +} + +static inline void fanotify_info_copy_name2(struct fanotify_info *info, + const struct qstr *name) +{ + if (WARN_ON_ONCE(name->len > NAME_MAX)) + return; + + info->name2_len = name->len; + strcpy(fanotify_info_name2(info), name->name); } /* @@ -271,6 +373,13 @@ static inline int fanotify_event_dir_fh_len(struct fanotify_event *event) return info ? fanotify_info_dir_fh_len(info) : 0; } +static inline int fanotify_event_dir2_fh_len(struct fanotify_event *event) +{ + struct fanotify_info *info = fanotify_event_info(event); + + return info ? fanotify_info_dir2_fh_len(info) : 0; +} + static inline bool fanotify_event_has_object_fh(struct fanotify_event *event) { /* For error events, even zeroed fh are reported. 
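The fanotify_info layout described above packs up to five variable-length records behind the fixed header in a fixed order, dir_fh, dir2_fh, file_fh, name, name2, with each offset derived from the preceding *_totlen/name*_len fields; that ordering is why the set/copy helpers must be called in sequence. A small sketch of the offset arithmetic for one made-up event (all lengths are example values, not taken from a real record):

#include <stdio.h>

int main(void)
{
	unsigned int dir_fh_totlen = 20;	/* fh header + encoded handle */
	unsigned int dir2_fh_totlen = 20;
	unsigned int file_fh_totlen = 0;	/* no child fh in this event */
	unsigned int name_len = 7;		/* e.g. "old.txt" */

	unsigned int dir_fh_off  = 0;
	unsigned int dir2_fh_off = dir_fh_off + dir_fh_totlen;
	unsigned int file_fh_off = dir2_fh_off + dir2_fh_totlen;
	unsigned int name_off    = file_fh_off + file_fh_totlen;
	unsigned int name2_off   = name_off + name_len + 1;	/* + NUL */

	printf("dir_fh@%u dir2_fh@%u file_fh@%u name@%u name2@%u\n",
	       dir_fh_off, dir2_fh_off, file_fh_off, name_off, name2_off);
	return 0;
}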
*/ @@ -284,6 +393,17 @@ static inline bool fanotify_event_has_dir_fh(struct fanotify_event *event) return fanotify_event_dir_fh_len(event) > 0; } +static inline bool fanotify_event_has_dir2_fh(struct fanotify_event *event) +{ + return fanotify_event_dir2_fh_len(event) > 0; +} + +static inline bool fanotify_event_has_any_dir_fh(struct fanotify_event *event) +{ + return fanotify_event_has_dir_fh(event) || + fanotify_event_has_dir2_fh(event); +} + struct fanotify_path_event { struct fanotify_event fae; struct path path; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 559bc1e9926d..2ff6bd85ba8f 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -59,7 +59,7 @@ static int fanotify_max_queued_events __read_mostly; static long ft_zero = 0; static long ft_int_max = INT_MAX; -struct ctl_table fanotify_table[] = { +static struct ctl_table fanotify_table[] = { { .procname = "max_user_groups", .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS], @@ -88,6 +88,13 @@ struct ctl_table fanotify_table[] = { }, { } }; + +static void __init fanotify_sysctls_init(void) +{ + register_sysctl("fs/fanotify", fanotify_table); +} +#else +#define fanotify_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ /* @@ -129,12 +136,28 @@ static int fanotify_fid_info_len(int fh_len, int name_len) FANOTIFY_EVENT_ALIGN); } +/* FAN_RENAME may have one or two dir+name info records */ +static int fanotify_dir_name_info_len(struct fanotify_event *event) +{ + struct fanotify_info *info = fanotify_event_info(event); + int dir_fh_len = fanotify_event_dir_fh_len(event); + int dir2_fh_len = fanotify_event_dir2_fh_len(event); + int info_len = 0; + + if (dir_fh_len) + info_len += fanotify_fid_info_len(dir_fh_len, + info->name_len); + if (dir2_fh_len) + info_len += fanotify_fid_info_len(dir2_fh_len, + info->name2_len); + + return info_len; +} + static size_t fanotify_event_len(unsigned int info_mode, struct fanotify_event *event) { size_t event_len = FAN_EVENT_METADATA_LEN; - struct fanotify_info *info; - int dir_fh_len; int fh_len; int dot_len = 0; @@ -144,11 +167,8 @@ static size_t fanotify_event_len(unsigned int info_mode, if (fanotify_is_error_event(event->mask)) event_len += FANOTIFY_ERROR_INFO_LEN; - info = fanotify_event_info(event); - - if (fanotify_event_has_dir_fh(event)) { - dir_fh_len = fanotify_event_dir_fh_len(event); - event_len += fanotify_fid_info_len(dir_fh_len, info->name_len); + if (fanotify_event_has_any_dir_fh(event)) { + event_len += fanotify_dir_name_info_len(event); } else if ((info_mode & FAN_REPORT_NAME) && (event->mask & FAN_ONDIR)) { /* @@ -332,11 +352,10 @@ static int process_access_response(struct fsnotify_group *group, static size_t copy_error_info_to_user(struct fanotify_event *event, char __user *buf, int count) { - struct fanotify_event_info_error info; + struct fanotify_event_info_error info = { }; struct fanotify_error_event *fee = FANOTIFY_EE(event); info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR; - info.hdr.pad = 0; info.hdr.len = FANOTIFY_ERROR_INFO_LEN; if (WARN_ON(count < info.hdr.len)) @@ -380,6 +399,8 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, return -EFAULT; break; case FAN_EVENT_INFO_TYPE_DFID_NAME: + case FAN_EVENT_INFO_TYPE_OLD_DFID_NAME: + case FAN_EVENT_INFO_TYPE_NEW_DFID_NAME: if (WARN_ON_ONCE(!name || !name_len)) return -EFAULT; break; @@ -479,11 +500,19 @@ static int copy_info_records_to_user(struct fanotify_event *event, unsigned int pidfd_mode = 
info_mode & FAN_REPORT_PIDFD; /* - * Event info records order is as follows: dir fid + name, child fid. + * Event info records order is as follows: + * 1. dir fid + name + * 2. (optional) new dir fid + new name + * 3. (optional) child fid */ if (fanotify_event_has_dir_fh(event)) { info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME : FAN_EVENT_INFO_TYPE_DFID; + + /* FAN_RENAME uses special info types */ + if (event->mask & FAN_RENAME) + info_type = FAN_EVENT_INFO_TYPE_OLD_DFID_NAME; + ret = copy_fid_info_to_user(fanotify_event_fsid(event), fanotify_info_dir_fh(info), info_type, @@ -497,6 +526,22 @@ static int copy_info_records_to_user(struct fanotify_event *event, total_bytes += ret; } + /* New dir fid+name may be reported in addition to old dir fid+name */ + if (fanotify_event_has_dir2_fh(event)) { + info_type = FAN_EVENT_INFO_TYPE_NEW_DFID_NAME; + ret = copy_fid_info_to_user(fanotify_event_fsid(event), + fanotify_info_dir2_fh(info), + info_type, + fanotify_info_name2(info), + info->name2_len, buf, count); + if (ret < 0) + return ret; + + buf += ret; + count -= ret; + total_bytes += ret; + } + if (fanotify_event_has_object_fh(event)) { const char *dot = NULL; int dot_len = 0; @@ -656,9 +701,6 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, if (fanotify_is_perm_event(event->mask)) FANOTIFY_PERM(event)->fd = fd; - if (f) - fd_install(fd, f); - if (info_mode) { ret = copy_info_records_to_user(event, info, info_mode, pidfd, buf, count); @@ -666,6 +708,9 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, goto out_close_fd; } + if (f) + fd_install(fd, f); + return metadata.event_len; out_close_fd: @@ -1057,7 +1102,7 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, - unsigned int type, + unsigned int obj_type, __kernel_fsid_t *fsid) { struct ucounts *ucounts = group->fanotify_data.ucounts; @@ -1080,7 +1125,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, } fsnotify_init_mark(mark, group); - ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid); + ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0, fsid); if (ret) { fsnotify_put_mark(mark); goto out_dec_ucounts; @@ -1105,7 +1150,7 @@ static int fanotify_group_init_error_pool(struct fsnotify_group *group) } static int fanotify_add_mark(struct fsnotify_group *group, - fsnotify_connp_t *connp, unsigned int type, + fsnotify_connp_t *connp, unsigned int obj_type, __u32 mask, unsigned int flags, __kernel_fsid_t *fsid) { @@ -1116,7 +1161,7 @@ static int fanotify_add_mark(struct fsnotify_group *group, mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_mark(connp, group); if (!fsn_mark) { - fsn_mark = fanotify_add_new_mark(group, connp, type, fsid); + fsn_mark = fanotify_add_new_mark(group, connp, obj_type, fsid); if (IS_ERR(fsn_mark)) { mutex_unlock(&group->mark_mutex); return PTR_ERR(fsn_mark); @@ -1275,6 +1320,15 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID)) return -EINVAL; + /* + * FAN_REPORT_TARGET_FID requires FAN_REPORT_NAME and FAN_REPORT_FID + * and is used as an indication to report both dir and child fid on all + * dirent events. 
+ */ + if ((fid_mode & FAN_REPORT_TARGET_FID) && + (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID))) + return -EINVAL; + f_flags = O_RDWR | FMODE_NONOTIFY; if (flags & FAN_CLOEXEC) f_flags |= O_CLOEXEC; @@ -1536,6 +1590,14 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, (!fid_mode || mark_type == FAN_MARK_MOUNT)) goto fput_and_out; + /* + * FAN_RENAME uses special info type records to report the old and + * new parent+name. Reporting only old and new parent id is less + * useful and was not implemented. + */ + if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME)) + goto fput_and_out; + if (flags & FAN_MARK_FLUSH) { ret = 0; if (mark_type == FAN_MARK_MOUNT) @@ -1667,7 +1729,7 @@ static int __init fanotify_user_setup(void) FANOTIFY_DEFAULT_MAX_USER_MARKS); BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 11); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, @@ -1685,6 +1747,7 @@ static int __init fanotify_user_setup(void) init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] = FANOTIFY_DEFAULT_MAX_GROUPS; init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks; + fanotify_sysctls_init(); return 0; } diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 4034ca566f95..ab81a0776ece 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -279,6 +279,18 @@ static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info))) return 0; + /* + * For FS_RENAME, 'dir' is old dir and 'data' is new dentry. + * The only ->handle_inode_event() backend that supports FS_RENAME is + * dnotify, where it means file was renamed within same parent. 
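Taken together, the fanotify changes above let a listener subscribe to FAN_RENAME and receive the old and new directory fid+name as FAN_EVENT_INFO_TYPE_OLD_DFID_NAME and FAN_EVENT_INFO_TYPE_NEW_DFID_NAME records. The following is a hedged sketch of how a userspace consumer might request and walk those records; it assumes uapi headers that already carry the new constants (kernel 5.17 era), and the mount point and buffer size are placeholders:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/fanotify.h>
#include <unistd.h>

int main(void)
{
	/* FAN_RENAME needs dir fid + name reporting; FAN_REPORT_FID would
	 * also be required if FAN_REPORT_TARGET_FID were requested. */
	int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID |
			       FAN_REPORT_DIR_FID | FAN_REPORT_NAME, 0);
	if (fd < 0 || fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_FILESYSTEM,
				    FAN_RENAME, AT_FDCWD, "/mnt") < 0) {
		perror("fanotify");
		return 1;
	}

	char buf[8192];
	ssize_t len = read(fd, buf, sizeof(buf));

	for (struct fanotify_event_metadata *md = (void *)buf;
	     len > 0 && FAN_EVENT_OK(md, len); md = FAN_EVENT_NEXT(md, len)) {
		char *info = (char *)(md + 1);
		char *end = (char *)md + md->event_len;

		while (info < end) {
			struct fanotify_event_info_header *hdr = (void *)info;
			struct fanotify_event_info_fid *fid = (void *)info;

			if (hdr->info_type == FAN_EVENT_INFO_TYPE_OLD_DFID_NAME ||
			    hdr->info_type == FAN_EVENT_INFO_TYPE_NEW_DFID_NAME) {
				struct file_handle *fh = (void *)fid->handle;
				/* name is NUL-terminated after the handle */
				char *name = (char *)fh->f_handle +
					     fh->handle_bytes;

				printf("%s parent name: %s\n",
				       hdr->info_type ==
					   FAN_EVENT_INFO_TYPE_OLD_DFID_NAME ?
					       "old" : "new", name);
			}
			info += hdr->len;
		}
	}
	close(fd);
	return 0;
}

A group created with FAN_REPORT_TARGET_FID additionally receives the renamed object's own fid record, subject to the fanotify_init() validation shown above.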
+ */ + if (mask & FS_RENAME) { + struct dentry *moved = fsnotify_data_dentry(data, data_type); + + if (dir != moved->d_parent->d_inode) + return 0; + } + if (parent_mark) { /* * parent_mark indicates that the parent inode is watching @@ -330,7 +342,7 @@ static int send_to_group(__u32 mask, const void *data, int data_type, /* clear ignored on inode modification */ if (mask & FS_MODIFY) { - fsnotify_foreach_obj_type(type) { + fsnotify_foreach_iter_type(type) { if (!fsnotify_iter_should_report_type(iter_info, type)) continue; mark = iter_info->marks[type]; @@ -340,7 +352,7 @@ static int send_to_group(__u32 mask, const void *data, int data_type, } } - fsnotify_foreach_obj_type(type) { + fsnotify_foreach_iter_type(type) { if (!fsnotify_iter_should_report_type(iter_info, type)) continue; mark = iter_info->marks[type]; @@ -405,7 +417,7 @@ static unsigned int fsnotify_iter_select_report_types( int type; /* Choose max prio group among groups of all queue heads */ - fsnotify_foreach_obj_type(type) { + fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && fsnotify_compare_groups(max_prio_group, mark->group) > 0) @@ -417,7 +429,7 @@ static unsigned int fsnotify_iter_select_report_types( /* Set the report mask for marks from same group as max prio group */ iter_info->report_mask = 0; - fsnotify_foreach_obj_type(type) { + fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && fsnotify_compare_groups(max_prio_group, mark->group) == 0) @@ -435,7 +447,7 @@ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info) { int type; - fsnotify_foreach_obj_type(type) { + fsnotify_foreach_iter_type(type) { if (fsnotify_iter_should_report_type(iter_info, type)) iter_info->marks[type] = fsnotify_next_mark(iter_info->marks[type]); @@ -469,7 +481,9 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, struct super_block *sb = fsnotify_data_sb(data, data_type); struct fsnotify_iter_info iter_info = {}; struct mount *mnt = NULL; - struct inode *parent = NULL; + struct inode *inode2 = NULL; + struct dentry *moved; + int inode2_type; int ret = 0; __u32 test_mask, marks_mask; @@ -479,12 +493,19 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, if (!inode) { /* Dirent event - report on TYPE_INODE to dir */ inode = dir; + /* For FS_RENAME, inode is old_dir and inode2 is new_dir */ + if (mask & FS_RENAME) { + moved = fsnotify_data_dentry(data, data_type); + inode2 = moved->d_parent->d_inode; + inode2_type = FSNOTIFY_ITER_TYPE_INODE2; + } } else if (mask & FS_EVENT_ON_CHILD) { /* * Event on child - report on TYPE_PARENT to dir if it is * watching children and on TYPE_INODE to child. 
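Putting the match_mask plumbing and the FS_RENAME iteration together: fsnotify() walks marks on the old directory as ITER_TYPE_INODE and on the new directory as ITER_TYPE_INODE2, and fanotify then uses the recorded match bits to decide which parent+name records to emit. A compact sketch of that decision, mirroring the FAN_RENAME branch in fanotify_alloc_event(); the bit positions are illustrative, taken from the iterator-type order, not copied from a header:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative iterator-type bit positions; the kernel derives these
 * from enum fsnotify_iter_type. */
enum { ITER_INODE = 0, ITER_VFSMOUNT = 1, ITER_SB = 2,
       ITER_PARENT = 3, ITER_INODE2 = 4 };

static void rename_report(unsigned int match_mask,
			  bool *report_old, bool *report_new)
{
	/* A filesystem-wide (sb) mark always sees both directories. */
	*report_old = *report_new = match_mask & (1U << ITER_SB);
	/* A mark on the old dir adds the old record, new dir the new. */
	*report_old |= match_mask & (1U << ITER_INODE);
	*report_new |= match_mask & (1U << ITER_INODE2);
}

int main(void)
{
	bool old_rec, new_rec;

	rename_report(1U << ITER_INODE2, &old_rec, &new_rec);
	printf("watching new dir only: old=%d new=%d\n", old_rec, new_rec);

	rename_report(1U << ITER_SB, &old_rec, &new_rec);
	printf("filesystem mark: old=%d new=%d\n", old_rec, new_rec);
	return 0;
}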
*/ - parent = dir; + inode2 = dir; + inode2_type = FSNOTIFY_ITER_TYPE_PARENT; } /* @@ -497,7 +518,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, if (!sb->s_fsnotify_marks && (!mnt || !mnt->mnt_fsnotify_marks) && (!inode || !inode->i_fsnotify_marks) && - (!parent || !parent->i_fsnotify_marks)) + (!inode2 || !inode2->i_fsnotify_marks)) return 0; marks_mask = sb->s_fsnotify_mask; @@ -505,8 +526,8 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, marks_mask |= mnt->mnt_fsnotify_mask; if (inode) marks_mask |= inode->i_fsnotify_mask; - if (parent) - marks_mask |= parent->i_fsnotify_mask; + if (inode2) + marks_mask |= inode2->i_fsnotify_mask; /* @@ -519,19 +540,19 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); - iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] = + iter_info.marks[FSNOTIFY_ITER_TYPE_SB] = fsnotify_first_mark(&sb->s_fsnotify_marks); if (mnt) { - iter_info.marks[FSNOTIFY_OBJ_TYPE_VFSMOUNT] = + iter_info.marks[FSNOTIFY_ITER_TYPE_VFSMOUNT] = fsnotify_first_mark(&mnt->mnt_fsnotify_marks); } if (inode) { - iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] = + iter_info.marks[FSNOTIFY_ITER_TYPE_INODE] = fsnotify_first_mark(&inode->i_fsnotify_marks); } - if (parent) { - iter_info.marks[FSNOTIFY_OBJ_TYPE_PARENT] = - fsnotify_first_mark(&parent->i_fsnotify_marks); + if (inode2) { + iter_info.marks[inode2_type] = + fsnotify_first_mark(&inode2->i_fsnotify_marks); } /* diff --git a/fs/notify/group.c b/fs/notify/group.c index 6a297efc4788..b7d4d64f87c2 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -58,7 +58,7 @@ void fsnotify_destroy_group(struct fsnotify_group *group) fsnotify_group_stop_queueing(group); /* Clear all marks for this group and queue them for destruction */ - fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_ALL_TYPES_MASK); + fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_ANY); /* * Some marks can still be pinned when waiting for response from diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 29fca3284bb5..54583f62dc44 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -58,7 +58,7 @@ struct kmem_cache *inotify_inode_mark_cachep __read_mostly; static long it_zero = 0; static long it_int_max = INT_MAX; -struct ctl_table inotify_table[] = { +static struct ctl_table inotify_table[] = { { .procname = "max_user_instances", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES], @@ -87,6 +87,14 @@ struct ctl_table inotify_table[] = { }, { } }; + +static void __init inotify_sysctls_init(void) +{ + register_sysctl("fs/inotify", inotify_table); +} + +#else +#define inotify_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg) @@ -849,6 +857,7 @@ static int __init inotify_user_setup(void) inotify_max_queued_events = 16384; init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128; init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = watches_max; + inotify_sysctls_init(); return 0; } diff --git a/fs/notify/mark.c b/fs/notify/mark.c index fa1d99101f89..9007d6affff3 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -353,7 +353,7 @@ bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info) { int type; - fsnotify_foreach_obj_type(type) { + fsnotify_foreach_iter_type(type) { /* This can fail if mark is being removed */ if (!fsnotify_get_mark_safe(iter_info->marks[type])) { 
__release(&fsnotify_mark_srcu); @@ -382,7 +382,7 @@ void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info) int type; iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); - fsnotify_foreach_obj_type(type) + fsnotify_foreach_iter_type(type) fsnotify_put_mark_wake(iter_info->marks[type]); } @@ -496,7 +496,7 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) } static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, - unsigned int type, + unsigned int obj_type, __kernel_fsid_t *fsid) { struct inode *inode = NULL; @@ -507,7 +507,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, return -ENOMEM; spin_lock_init(&conn->lock); INIT_HLIST_HEAD(&conn->list); - conn->type = type; + conn->type = obj_type; conn->obj = connp; /* Cache fsid of filesystem containing the object */ if (fsid) { @@ -572,7 +572,8 @@ out: * priority, highest number first, and then by the group's location in memory. */ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, - fsnotify_connp_t *connp, unsigned int type, + fsnotify_connp_t *connp, + unsigned int obj_type, int allow_dups, __kernel_fsid_t *fsid) { struct fsnotify_mark *lmark, *last = NULL; @@ -580,7 +581,7 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, int cmp; int err = 0; - if (WARN_ON(!fsnotify_valid_obj_type(type))) + if (WARN_ON(!fsnotify_valid_obj_type(obj_type))) return -EINVAL; /* Backend is expected to check for zero fsid (e.g. tmpfs) */ @@ -592,7 +593,8 @@ restart: conn = fsnotify_grab_connector(connp); if (!conn) { spin_unlock(&mark->lock); - err = fsnotify_attach_connector_to_object(connp, type, fsid); + err = fsnotify_attach_connector_to_object(connp, obj_type, + fsid); if (err) return err; goto restart; @@ -665,7 +667,7 @@ out_err: * event types should be delivered to which group. */ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, - fsnotify_connp_t *connp, unsigned int type, + fsnotify_connp_t *connp, unsigned int obj_type, int allow_dups, __kernel_fsid_t *fsid) { struct fsnotify_group *group = mark->group; @@ -686,7 +688,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); - ret = fsnotify_add_mark_list(mark, connp, type, allow_dups, fsid); + ret = fsnotify_add_mark_list(mark, connp, obj_type, allow_dups, fsid); if (ret) goto err; @@ -706,13 +708,14 @@ err: } int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, - unsigned int type, int allow_dups, __kernel_fsid_t *fsid) + unsigned int obj_type, int allow_dups, + __kernel_fsid_t *fsid) { int ret; struct fsnotify_group *group = mark->group; mutex_lock(&group->mark_mutex); - ret = fsnotify_add_mark_locked(mark, connp, type, allow_dups, fsid); + ret = fsnotify_add_mark_locked(mark, connp, obj_type, allow_dups, fsid); mutex_unlock(&group->mark_mutex); return ret; } @@ -747,14 +750,14 @@ EXPORT_SYMBOL_GPL(fsnotify_find_mark); /* Clear any marks in a group with given type mask */ void fsnotify_clear_marks_by_group(struct fsnotify_group *group, - unsigned int type_mask) + unsigned int obj_type) { struct fsnotify_mark *lmark, *mark; LIST_HEAD(to_free); struct list_head *head = &to_free; /* Skip selection step if we want to clear all marks. 
*/ - if (type_mask == FSNOTIFY_OBJ_ALL_TYPES_MASK) { + if (obj_type == FSNOTIFY_OBJ_TYPE_ANY) { head = &group->marks_list; goto clear; } @@ -769,7 +772,7 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group, */ mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { - if ((1U << mark->connector->type) & type_mask) + if (mark->connector->type == obj_type) list_move(&mark->g_list, &to_free); } mutex_unlock(&group->mark_mutex); diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index d563abc3e136..2911c04a33e0 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project. * * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 8aaec7e0804e..fb825059d488 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -11,7 +11,6 @@ #include <linux/blkdev.h> #include <linux/buffer_head.h> -#include <linux/cleancache.h> #include <linux/fs.h> #include <linux/highmem.h> #include <linux/kernel.h> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index bb247bc349e4..bf9357123bc5 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -2040,7 +2040,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle, int i, idx; struct ocfs2_extent_list *el, *left_el, *right_el; struct ocfs2_extent_rec *left_rec, *right_rec; - struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; + struct buffer_head *root_bh; /* * Update the counts and position values within all the diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 68d11c295dd3..498da317580a 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1799,20 +1799,20 @@ try_again: */ ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, cluster_of_pages, mmap_page); - if (ret && ret != -EAGAIN) { - mlog_errno(ret); - goto out_quota; - } + if (ret) { + /* + * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock + * the target page. In this case, we exit with no error and no target + * page. This will trigger the caller, page_mkwrite(), to re-try + * the operation. + */ + if (type == OCFS2_WRITE_MMAP && ret == -EAGAIN) { + BUG_ON(wc->w_target_page); + ret = 0; + goto out_quota; + } - /* - * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock - * the target page. In this case, we exit with no error and no target - * page. This will trigger the caller, page_mkwrite(), to re-try - * the operation. - */ - if (ret == -EAGAIN) { - BUG_ON(wc->w_target_page); - ret = 0; + mlog_errno(ret); goto out_quota; } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f89ffcbd585f..a17be1618bf7 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -379,7 +379,7 @@ static void o2hb_nego_timeout(struct work_struct *work) o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); /* lowest node as master node to make negotiate decision. 
*/ - master_node = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, 0); + master_node = find_first_bit(live_node_bitmap, O2NM_MAX_NODES); if (master_node == o2nm_this_node()) { if (!test_bit(master_node, reg->hr_nego_node_bitmap)) { diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index 810d32815593..563881ddbf00 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -120,7 +120,8 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { define_mask(KTHREAD), }; -static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; +static struct attribute *mlog_default_attrs[MLOG_MAX_BITS] = {NULL, }; +ATTRIBUTE_GROUPS(mlog_default); static ssize_t mlog_show(struct kobject *obj, struct attribute *attr, char *buf) @@ -144,8 +145,8 @@ static const struct sysfs_ops mlog_attr_ops = { }; static struct kobj_type mlog_ktype = { - .default_attrs = mlog_attr_ptrs, - .sysfs_ops = &mlog_attr_ops, + .default_groups = mlog_default_groups, + .sysfs_ops = &mlog_attr_ops, }; static struct kset mlog_kset = { @@ -157,10 +158,10 @@ int mlog_sys_init(struct kset *o2cb_kset) int i = 0; while (mlog_attrs[i].attr.mode) { - mlog_attr_ptrs[i] = &mlog_attrs[i].attr; + mlog_default_attrs[i] = &mlog_attrs[i].attr; i++; } - mlog_attr_ptrs[i] = NULL; + mlog_default_attrs[i] = NULL; kobject_set_name(&mlog_kset.kobj, "logmask"); mlog_kset.kobj.kset = o2cb_kset; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index bd8d534f11cb..f2cc1ff29e6d 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -3343,7 +3343,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, struct ocfs2_dir_entry *de, *last_de = NULL; char *de_buf, *limit; unsigned long offset = 0; - unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize; + unsigned int rec_len, new_rec_len, free_space; /* * This calculates how many free bytes we'd have in block zero, should diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 9f90fc9551e1..c4eccd499db8 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1045,7 +1045,7 @@ static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map) int status, ret = 0, i; char *p; - if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES) + if (find_first_bit(node_map, O2NM_MAX_NODES) >= O2NM_MAX_NODES) goto bail; qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL); @@ -1217,7 +1217,7 @@ static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map) struct o2nm_node *node; int ret = 0, status, count, i; - if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES) + if (find_first_bit(node_map, O2NM_MAX_NODES) >= O2NM_MAX_NODES) goto bail; qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 9b88219febb5..227da5b1b6ab 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -861,7 +861,7 @@ lookup: * to see if there are any nodes that still need to be * considered. these will not appear in the mle nodemap * but they might own this lockres. wait on them. 
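/*
 * Aside (illustrative, not part of the patch): the ocfs2/dlm hunks around
 * here replace find_next_bit(map, size, 0) with find_first_bit(map, size);
 * the two are equivalent when the search starts at bit 0.  A made-up helper
 * showing the idiom (assumes <linux/find.h> and the o2nm headers):
 */
static bool example_any_node_live(const unsigned long *live_map)
{
	/* find_first_bit() returns O2NM_MAX_NODES when no bit is set */
	return find_first_bit(live_map, O2NM_MAX_NODES) < O2NM_MAX_NODES;
}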
*/ - bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(dlm->recovery_map, O2NM_MAX_NODES); if (bit < O2NM_MAX_NODES) { mlog(0, "%s: res %.*s, At least one node (%d) " "to recover before lock mastery can begin\n", @@ -912,7 +912,7 @@ redo_request: dlm_wait_for_recovery(dlm); spin_lock(&dlm->spinlock); - bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(dlm->recovery_map, O2NM_MAX_NODES); if (bit < O2NM_MAX_NODES) { mlog(0, "%s: res %.*s, At least one node (%d) " "to recover before lock mastery can begin\n", @@ -1079,7 +1079,7 @@ recheck: sleep = 1; /* have all nodes responded? */ if (voting_done && !*blocked) { - bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(mle->maybe_map, O2NM_MAX_NODES); if (dlm->node_num <= bit) { /* my node number is lowest. * now tell other nodes that I am @@ -1234,8 +1234,8 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, } else { mlog(ML_ERROR, "node down! %d\n", node); if (blocked) { - int lowest = find_next_bit(mle->maybe_map, - O2NM_MAX_NODES, 0); + int lowest = find_first_bit(mle->maybe_map, + O2NM_MAX_NODES); /* act like it was never there */ clear_bit(node, mle->maybe_map); @@ -1795,7 +1795,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, "MLE for it! (%.*s)\n", assert->node_idx, namelen, name); } else { - int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0); + int bit = find_first_bit(mle->maybe_map, O2NM_MAX_NODES); if (bit >= O2NM_MAX_NODES) { /* not necessarily an error, though less likely. * could be master just re-asserting. */ @@ -2521,7 +2521,7 @@ static int dlm_is_lockres_migratable(struct dlm_ctxt *dlm, } if (!nonlocal) { - node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + node_ref = find_first_bit(res->refmap, O2NM_MAX_NODES); if (node_ref >= O2NM_MAX_NODES) return 0; } @@ -3303,7 +3303,7 @@ static void dlm_clean_block_mle(struct dlm_ctxt *dlm, BUG_ON(mle->type != DLM_MLE_BLOCK); spin_lock(&mle->spinlock); - bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(mle->maybe_map, O2NM_MAX_NODES); if (bit != dead_node) { mlog(0, "mle found, but dead node %u would not have been " "master\n", dead_node); @@ -3542,7 +3542,7 @@ void dlm_force_free_mles(struct dlm_ctxt *dlm) spin_lock(&dlm->master_lock); BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING); - BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES)); + BUG_ON((find_first_bit(dlm->domain_map, O2NM_MAX_NODES) < O2NM_MAX_NODES)); for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_master_hash(dlm, i); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 5cd5f7511dac..52ad342fec3e 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -451,7 +451,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { int bit; - bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(dlm->recovery_map, O2NM_MAX_NODES); if (bit >= O2NM_MAX_NODES || bit < 0) dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); else diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index c350bd4df770..eedf07ca23ca 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -92,7 +92,7 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res) return 0; /* Another node has this resource with this node as the master */ - bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + bit = 
find_first_bit(res->refmap, O2NM_MAX_NODES); if (bit < O2NM_MAX_NODES) return 0; diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index de56e6231af8..1ad7106741f8 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -94,6 +94,7 @@ static struct attribute *ocfs2_filecheck_attrs[] = { &ocfs2_filecheck_attr_set.attr, NULL }; +ATTRIBUTE_GROUPS(ocfs2_filecheck); static void ocfs2_filecheck_release(struct kobject *kobj) { @@ -138,7 +139,7 @@ static const struct sysfs_ops ocfs2_filecheck_ops = { }; static struct kobj_type ocfs2_ktype_filecheck = { - .default_attrs = ocfs2_filecheck_attrs, + .default_groups = ocfs2_filecheck_groups, .sysfs_ops = &ocfs2_filecheck_ops, .release = ocfs2_filecheck_release, }; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index dbf9b9e97d74..1887a2708709 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1669,8 +1669,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, status = jbd2_journal_load(journal); if (status < 0) { mlog_errno(status); - if (!igrab(inode)) - BUG(); + BUG_ON(!igrab(inode)); jbd2_journal_destroy(journal); goto done; } @@ -1699,8 +1698,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, if (status < 0) mlog_errno(status); - if (!igrab(inode)) - BUG(); + BUG_ON(!igrab(inode)); jbd2_journal_destroy(journal); diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 16f1bfc407f2..dd77b7aaabf5 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -661,42 +661,8 @@ static struct ctl_table ocfs2_nm_table[] = { { } }; -static struct ctl_table ocfs2_mod_table[] = { - { - .procname = "nm", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = ocfs2_nm_table - }, - { } -}; - -static struct ctl_table ocfs2_kern_table[] = { - { - .procname = "ocfs2", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = ocfs2_mod_table - }, - { } -}; - -static struct ctl_table ocfs2_root_table[] = { - { - .procname = "fs", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = ocfs2_kern_table - }, - { } -}; - static struct ctl_table_header *ocfs2_table_header; - /* * Initialization */ @@ -705,7 +671,7 @@ static int __init ocfs2_stack_glue_init(void) { strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB); - ocfs2_table_header = register_sysctl_table(ocfs2_root_table); + ocfs2_table_header = register_sysctl("fs/ocfs2/nm", ocfs2_nm_table); if (!ocfs2_table_header) { printk(KERN_ERR "ocfs2 stack glue: unable to register sysctl\n"); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 481017e1dac5..166c8918c825 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1251,26 +1251,23 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, { struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; struct journal_head *jh; - int ret = 1; + int ret; if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) return 0; - if (!buffer_jbd(bg_bh)) + jh = jbd2_journal_grab_journal_head(bg_bh); + if (!jh) return 1; - jbd_lock_bh_journal_head(bg_bh); - if (buffer_jbd(bg_bh)) { - jh = bh2jh(bg_bh); - spin_lock(&jh->b_state_lock); - bg = (struct ocfs2_group_desc *) jh->b_committed_data; - if (bg) - ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); - else - ret = 1; - spin_unlock(&jh->b_state_lock); - } - jbd_unlock_bh_journal_head(bg_bh); + spin_lock(&jh->b_state_lock); + bg = (struct ocfs2_group_desc *) jh->b_committed_data; + if (bg) + ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); + else + ret = 1; + spin_unlock(&jh->b_state_lock); + 
jbd2_journal_put_journal_head(jh); return ret; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 1286b88b6fa1..2772dec9dcea 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -25,7 +25,6 @@ #include <linux/mount.h> #include <linux/seq_file.h> #include <linux/quotaops.h> -#include <linux/cleancache.h> #include <linux/signal.h> #define CREATE_TRACE_POINTS @@ -2283,7 +2282,6 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog_errno(status); goto bail; } - cleancache_init_shared_fs(sb); osb->ocfs2_wq = alloc_ordered_workqueue("ocfs2_wq", WQ_MEM_RECLAIM); if (!osb->ocfs2_wq) { diff --git a/fs/open.c b/fs/open.c index f732fb94600c..9ff2f621b760 100644 --- a/fs/open.c +++ b/fs/open.c @@ -32,6 +32,7 @@ #include <linux/ima.h> #include <linux/dnotify.h> #include <linux/compat.h> +#include <linux/mnt_idmapping.h> #include "internal.h" @@ -640,7 +641,7 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) int chown_common(const struct path *path, uid_t user, gid_t group) { - struct user_namespace *mnt_userns; + struct user_namespace *mnt_userns, *fs_userns; struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; int error; @@ -652,8 +653,9 @@ int chown_common(const struct path *path, uid_t user, gid_t group) gid = make_kgid(current_user_ns(), group); mnt_userns = mnt_user_ns(path->mnt); - uid = kuid_from_mnt(mnt_userns, uid); - gid = kgid_from_mnt(mnt_userns, gid); + fs_userns = i_user_ns(inode); + uid = mapped_kuid_user(mnt_userns, fs_userns, uid); + gid = mapped_kgid_user(mnt_userns, fs_userns, gid); retry_deleg: newattrs.ia_valid = ATTR_CTIME; diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index 538e839590ef..b501dc07f922 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -176,7 +176,7 @@ orangefs_bufmap_free(struct orangefs_bufmap *bufmap) { kfree(bufmap->page_array); kfree(bufmap->desc_array); - kfree(bufmap->buffer_index_array); + bitmap_free(bufmap->buffer_index_array); kfree(bufmap); } @@ -226,8 +226,7 @@ orangefs_bufmap_alloc(struct ORANGEFS_dev_map_desc *user_desc) bufmap->desc_size = user_desc->size; bufmap->desc_shift = ilog2(bufmap->desc_size); - bufmap->buffer_index_array = - kzalloc(DIV_ROUND_UP(bufmap->desc_count, BITS_PER_LONG), GFP_KERNEL); + bufmap->buffer_index_array = bitmap_zalloc(bufmap->desc_count, GFP_KERNEL); if (!bufmap->buffer_index_array) goto out_free_bufmap; @@ -250,7 +249,7 @@ orangefs_bufmap_alloc(struct ORANGEFS_dev_map_desc *user_desc) out_free_desc_array: kfree(bufmap->desc_array); out_free_index_array: - kfree(bufmap->buffer_index_array); + bitmap_free(bufmap->buffer_index_array); out_free_bufmap: kfree(bufmap); out: diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c index 3627ea946402..de80b62553bb 100644 --- a/fs/orangefs/orangefs-sysfs.c +++ b/fs/orangefs/orangefs-sysfs.c @@ -894,10 +894,11 @@ static struct attribute *orangefs_default_attrs[] = { &perf_time_interval_secs_attribute.attr, NULL, }; +ATTRIBUTE_GROUPS(orangefs_default); static struct kobj_type orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, - .default_attrs = orangefs_default_attrs, + .default_groups = orangefs_default_groups, }; static struct orangefs_attribute acache_hard_limit_attribute = @@ -931,10 +932,11 @@ static struct attribute *acache_orangefs_default_attrs[] = { &acache_timeout_msecs_attribute.attr, NULL, }; +ATTRIBUTE_GROUPS(acache_orangefs_default); static struct kobj_type acache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, - 
.default_attrs = acache_orangefs_default_attrs, + .default_groups = acache_orangefs_default_groups, }; static struct orangefs_attribute capcache_hard_limit_attribute = @@ -968,10 +970,11 @@ static struct attribute *capcache_orangefs_default_attrs[] = { &capcache_timeout_secs_attribute.attr, NULL, }; +ATTRIBUTE_GROUPS(capcache_orangefs_default); static struct kobj_type capcache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, - .default_attrs = capcache_orangefs_default_attrs, + .default_groups = capcache_orangefs_default_groups, }; static struct orangefs_attribute ccache_hard_limit_attribute = @@ -1005,10 +1008,11 @@ static struct attribute *ccache_orangefs_default_attrs[] = { &ccache_timeout_secs_attribute.attr, NULL, }; +ATTRIBUTE_GROUPS(ccache_orangefs_default); static struct kobj_type ccache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, - .default_attrs = ccache_orangefs_default_attrs, + .default_groups = ccache_orangefs_default_groups, }; static struct orangefs_attribute ncache_hard_limit_attribute = @@ -1042,10 +1046,11 @@ static struct attribute *ncache_orangefs_default_attrs[] = { &ncache_timeout_msecs_attribute.attr, NULL, }; +ATTRIBUTE_GROUPS(ncache_orangefs_default); static struct kobj_type ncache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, - .default_attrs = ncache_orangefs_default_attrs, + .default_groups = ncache_orangefs_default_groups, }; static struct orangefs_attribute pc_acache_attribute = @@ -1072,10 +1077,11 @@ static struct attribute *pc_orangefs_default_attrs[] = { &pc_ncache_attribute.attr, NULL, }; +ATTRIBUTE_GROUPS(pc_orangefs_default); static struct kobj_type pc_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, - .default_attrs = pc_orangefs_default_attrs, + .default_groups = pc_orangefs_default_groups, }; static struct orangefs_attribute stats_reads_attribute = @@ -1095,10 +1101,11 @@ static struct attribute *stats_orangefs_default_attrs[] = { &stats_writes_attribute.attr, NULL, }; +ATTRIBUTE_GROUPS(stats_orangefs_default); static struct kobj_type stats_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, - .default_attrs = stats_orangefs_default_attrs, + .default_groups = stats_orangefs_default_groups, }; static struct kobject *orangefs_obj; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index b193d08a3dc3..e040970408d4 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -145,7 +145,7 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old, if (err == -ENOTTY || err == -EINVAL) return 0; pr_warn("failed to retrieve lower fileattr (%pd2, err=%i)\n", - old, err); + old->dentry, err); return err; } @@ -157,7 +157,9 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old, */ if (oldfa.flags & OVL_PROT_FS_FLAGS_MASK) { err = ovl_set_protattr(inode, new->dentry, &oldfa); - if (err) + if (err == -EPERM) + pr_warn_once("copying fileattr: no xattr on upper\n"); + else if (err) return err; } @@ -167,8 +169,16 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old, err = ovl_real_fileattr_get(new, &newfa); if (err) { + /* + * Returning an error if upper doesn't support fileattr will + * result in a regression, so revert to the old behavior. 
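/*
 * Aside (illustrative, not from the patch): the masklog, filecheck and
 * orangefs-sysfs hunks above all convert kobj_type from .default_attrs to
 * .default_groups.  ATTRIBUTE_GROUPS(example) generates example_group and
 * example_groups from example_attrs[], so the conversion is mechanical.
 * All "example" names below are made up.
 */
static ssize_t example_show(struct kobject *kobj, struct kobj_attribute *attr,
			    char *buf)
{
	return sysfs_emit(buf, "example\n");
}
static struct kobj_attribute example_attr = __ATTR_RO(example);

static struct attribute *example_attrs[] = {
	&example_attr.attr,
	NULL,
};
ATTRIBUTE_GROUPS(example);

static struct kobj_type example_ktype = {
	.sysfs_ops	= &kobj_sysfs_ops,
	.default_groups	= example_groups,	/* was: .default_attrs = example_attrs */
};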
+ */ + if (err == -ENOTTY || err == -EINVAL) { + pr_warn_once("copying fileattr: no support on upper\n"); + return 0; + } pr_warn("failed to retrieve upper fileattr (%pd2, err=%i)\n", - new, err); + new->dentry, err); return err; } diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 265181c110ae..7bb0a47cb615 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -873,7 +873,7 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path) pr_err("filesystem on '%s' not supported\n", name); goto out_put; } - if (mnt_user_ns(path->mnt) != &init_user_ns) { + if (is_idmapped_mnt(path->mnt)) { pr_err("idmapped layers are currently not supported\n"); goto out_put; } diff --git a/fs/pipe.c b/fs/pipe.c index 6d4342bad9f1..cc28623a67b6 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -25,6 +25,7 @@ #include <linux/fcntl.h> #include <linux/memcontrol.h> #include <linux/watch_queue.h> +#include <linux/sysctl.h> #include <linux/uaccess.h> #include <asm/ioctls.h> @@ -50,13 +51,13 @@ * The max size that a non-root user is allowed to grow the pipe. Can * be set by root in /proc/sys/fs/pipe-max-size */ -unsigned int pipe_max_size = 1048576; +static unsigned int pipe_max_size = 1048576; /* Maximum allocatable pages per user. Hard limit is unset by default, soft * matches default values. */ -unsigned long pipe_user_pages_hard; -unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; +static unsigned long pipe_user_pages_hard; +static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; /* * We use head and tail indices that aren't masked off, except at the point of @@ -1428,6 +1429,60 @@ static struct file_system_type pipe_fs_type = { .kill_sb = kill_anon_super, }; +#ifdef CONFIG_SYSCTL +static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) +{ + if (write) { + unsigned int val; + + val = round_pipe_size(*lvalp); + if (val == 0) + return -EINVAL; + + *valp = val; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long) val; + } + + return 0; +} + +static int proc_dopipe_max_size(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_dopipe_max_size_conv, NULL); +} + +static struct ctl_table fs_pipe_sysctls[] = { + { + .procname = "pipe-max-size", + .data = &pipe_max_size, + .maxlen = sizeof(pipe_max_size), + .mode = 0644, + .proc_handler = proc_dopipe_max_size, + }, + { + .procname = "pipe-user-pages-hard", + .data = &pipe_user_pages_hard, + .maxlen = sizeof(pipe_user_pages_hard), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "pipe-user-pages-soft", + .data = &pipe_user_pages_soft, + .maxlen = sizeof(pipe_user_pages_soft), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { } +}; +#endif + static int __init init_pipe_fs(void) { int err = register_filesystem(&pipe_fs_type); @@ -1439,6 +1494,9 @@ static int __init init_pipe_fs(void) unregister_filesystem(&pipe_fs_type); } } +#ifdef CONFIG_SYSCTL + register_sysctl_init("fs", fs_pipe_sysctls); +#endif return err; } diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 9323a854a60a..80acb6885cf9 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -23,6 +23,7 @@ #include <linux/export.h> #include <linux/user_namespace.h> #include <linux/namei.h> +#include <linux/mnt_idmapping.h> static struct posix_acl **acl_by_type(struct inode *inode, int type) { @@ -374,7 +375,9 @@ posix_acl_permission(struct 
user_namespace *mnt_userns, struct inode *inode, goto check_perm; break; case ACL_USER: - uid = kuid_into_mnt(mnt_userns, pa->e_uid); + uid = mapped_kuid_fs(mnt_userns, + i_user_ns(inode), + pa->e_uid); if (uid_eq(uid, current_fsuid())) goto mask; break; @@ -387,7 +390,9 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, } break; case ACL_GROUP: - gid = kgid_into_mnt(mnt_userns, pa->e_gid); + gid = mapped_kgid_fs(mnt_userns, + i_user_ns(inode), + pa->e_gid); if (in_group_p(gid)) { found = 1; if ((pa->e_perm & want) == want) @@ -734,17 +739,17 @@ static void posix_acl_fix_xattr_userns( case ACL_USER: uid = make_kuid(from, le32_to_cpu(entry->e_id)); if (from_user) - uid = kuid_from_mnt(mnt_userns, uid); + uid = mapped_kuid_user(mnt_userns, &init_user_ns, uid); else - uid = kuid_into_mnt(mnt_userns, uid); + uid = mapped_kuid_fs(mnt_userns, &init_user_ns, uid); entry->e_id = cpu_to_le32(from_kuid(to, uid)); break; case ACL_GROUP: gid = make_kgid(from, le32_to_cpu(entry->e_id)); if (from_user) - gid = kgid_from_mnt(mnt_userns, gid); + gid = mapped_kgid_user(mnt_userns, &init_user_ns, gid); else - gid = kgid_into_mnt(mnt_userns, gid); + gid = mapped_kgid_fs(mnt_userns, &init_user_ns, gid); entry->e_id = cpu_to_le32(from_kgid(to, gid)); break; default: diff --git a/fs/proc/array.c b/fs/proc/array.c index ff869a66b34e..fd8b0c12b2cb 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -92,6 +92,7 @@ #include <linux/string_helpers.h> #include <linux/user_namespace.h> #include <linux/fs_struct.h> +#include <linux/kthread.h> #include <asm/processor.h> #include "internal.h" @@ -102,6 +103,8 @@ void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape) if (p->flags & PF_WQ_WORKER) wq_worker_comm(tcomm, sizeof(tcomm), p); + else if (p->flags & PF_KTHREAD) + get_kthread_comm(tcomm, sizeof(tcomm), p); else __get_task_comm(tcomm, sizeof(tcomm), p); @@ -468,6 +471,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, u64 cgtime, gtime; unsigned long rsslim = 0; unsigned long flags; + int exit_code = task->exit_code; state = *get_task_state(task); vsize = eip = esp = 0; @@ -531,6 +535,9 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, maj_flt += sig->maj_flt; thread_group_cputime_adjusted(task, &utime, &stime); gtime += sig->gtime; + + if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED)) + exit_code = sig->group_exit_code; } sid = task_session_nr_ns(task, ns); @@ -630,7 +637,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_puts(m, " 0 0 0 0 0 0 0"); if (permitted) - seq_put_decimal_ll(m, " ", task->exit_code); + seq_put_decimal_ll(m, " ", exit_code); else seq_puts(m, " 0"); diff --git a/fs/proc/base.c b/fs/proc/base.c index 13eda8de2998..d654ce7150fd 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -670,10 +670,10 @@ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, /************************************************************************/ /* permission checks */ -static int proc_fd_access_allowed(struct inode *inode) +static bool proc_fd_access_allowed(struct inode *inode) { struct task_struct *task; - int allowed = 0; + bool allowed = false; /* Allow access to a task's file descriptors if it is us or we * may use ptrace attach to the process and find out that * information. 
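For context, a minimal sketch of the two-namespace mapping helpers adopted in the fs/open.c and fs/posix_acl.c hunks above: mapped_kuid_fs() takes both the mount's and the filesystem's user namespace, as the ACL_USER case now does. The wrapper name below is made up for illustration and is not part of the patch.

static bool example_uid_matches_caller(struct user_namespace *mnt_userns,
				       struct inode *inode, kuid_t raw_uid)
{
	/* map the filesystem kuid through the (possibly idmapped) mount */
	kuid_t uid = mapped_kuid_fs(mnt_userns, i_user_ns(inode), raw_uid);

	return uid_eq(uid, current_fsuid());
}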
diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 5b78739e60e4..f2132407e133 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -791,12 +791,6 @@ void proc_remove(struct proc_dir_entry *de) } EXPORT_SYMBOL(proc_remove); -void *PDE_DATA(const struct inode *inode) -{ - return __PDE_DATA(inode); -} -EXPORT_SYMBOL(PDE_DATA); - /* * Pull a user buffer into memory and pass it to the file's write handler if * one is supplied. The ->write() method is permitted to modify the diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 599eb724ff2d..f84355c5a36d 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -650,6 +650,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) return NULL; } + inode->i_private = de->data; inode->i_ino = de->low_ino; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); PROC_I(inode)->pde = de; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 03415f3fb3a8..06a80f78433d 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -115,11 +115,6 @@ static inline struct proc_dir_entry *PDE(const struct inode *inode) return PROC_I(inode)->pde; } -static inline void *__PDE_DATA(const struct inode *inode) -{ - return PDE(inode)->data; -} - static inline struct pid *proc_pid(const struct inode *inode) { return PROC_I(inode)->pid; diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 15c2e55d2ed2..e1cfeda397f3 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -61,15 +61,27 @@ static int seq_open_net(struct inode *inode, struct file *file) } #ifdef CONFIG_NET_NS p->net = net; + netns_tracker_alloc(net, &p->ns_tracker, GFP_KERNEL); #endif return 0; } +static void seq_file_net_put_net(struct seq_file *seq) +{ +#ifdef CONFIG_NET_NS + struct seq_net_private *priv = seq->private; + + put_net_track(priv->net, &priv->ns_tracker); +#else + put_net(&init_net); +#endif +} + static int seq_release_net(struct inode *ino, struct file *f) { struct seq_file *seq = f->private_data; - put_net(seq_file_net(seq)); + seq_file_net_put_net(seq); seq_release_private(ino, f); return 0; } @@ -87,7 +99,8 @@ int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux) #ifdef CONFIG_NET_NS struct seq_net_private *p = priv_data; - p->net = get_net(current->nsproxy->net_ns); + p->net = get_net_track(current->nsproxy->net_ns, &p->ns_tracker, + GFP_KERNEL); #endif return 0; } @@ -97,7 +110,7 @@ void bpf_iter_fini_seq_net(void *priv_data) #ifdef CONFIG_NET_NS struct seq_net_private *p = priv_data; - put_net(p->net); + put_net_track(p->net, &p->ns_tracker); #endif } @@ -125,7 +138,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_data); * @parent: The parent directory in which to create. * @ops: The seq_file ops with which to read the file. * @write: The write method with which to 'modify' the file. - * @data: Data for retrieval by PDE_DATA(). + * @data: Data for retrieval by pde_data(). * * Create a network namespaced proc file in the @parent directory with the * specified @name and @mode that allows reading of a file that displays a @@ -140,7 +153,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_data); * modified by the @write function. @write should return 0 on success. * * The @data value is accessible from the @show and @write functions by calling - * PDE_DATA() on the file inode. The network namespace must be accessed by + * pde_data() on the file inode. The network namespace must be accessed by * calling seq_file_net() on the seq_file struct. 
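/*
 * Aside (illustrative): with the PDE_DATA() export removed in the
 * fs/proc/generic.c hunk above, readers use the pde_data() inline, which
 * simply returns the inode's ->i_private set up in proc_get_inode().
 * The structure and show function below are made-up examples.
 */
struct example_counter { unsigned long hits; };

static int example_proc_show(struct seq_file *m, void *v)
{
	struct example_counter *c = pde_data(file_inode(m->file));

	seq_printf(m, "%lu\n", c->hits);
	return 0;
}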
*/ struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode, @@ -217,7 +230,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_single); * @parent: The parent directory in which to create. * @show: The seqfile show method with which to read the file. * @write: The write method with which to 'modify' the file. - * @data: Data for retrieval by PDE_DATA(). + * @data: Data for retrieval by pde_data(). * * Create a network-namespaced proc file in the @parent directory with the * specified @name and @mode that allows reading of a file that displays a @@ -232,7 +245,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_single); * modified by the @write function. @write should return 0 on success. * * The @data value is accessible from the @show and @write functions by calling - * PDE_DATA() on the file inode. The network namespace must be accessed by + * pde_data() on the file inode. The network namespace must be accessed by * calling seq_file_single_net() on the seq_file struct. */ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mode, diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 5d66faecd4ef..7d9cfc730bd4 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -16,6 +16,7 @@ #include <linux/module.h> #include <linux/bpf-cgroup.h> #include <linux/mount.h> +#include <linux/kmemleak.h> #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -25,15 +26,32 @@ static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; /* shared constants to be used in various sysctls */ -const int sysctl_vals[] = { 0, 1, INT_MAX }; +const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 200, 1000, 3000, INT_MAX, 65535 }; EXPORT_SYMBOL(sysctl_vals); +const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX }; +EXPORT_SYMBOL_GPL(sysctl_long_vals); + /* Support for permanently empty directories */ struct ctl_table sysctl_mount_point[] = { { } }; +/** + * register_sysctl_mount_point() - registers a sysctl mount point + * @path: path for the mount point + * + * Used to create a permanently empty directory to serve as mount point. + * There are some subtle but important permission checks this allows in the + * case of unprivileged mounts. 
+ */ +struct ctl_table_header *register_sysctl_mount_point(const char *path) +{ + return register_sysctl(path, sysctl_mount_point); +} +EXPORT_SYMBOL(register_sysctl_mount_point); + static bool is_empty_dir(struct ctl_table_header *head) { return head->ctl_table[0].child == sysctl_mount_point; @@ -163,7 +181,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) else { pr_err("sysctl duplicate entry: "); sysctl_print_dir(head->parent); - pr_cont("/%s\n", entry->procname); + pr_cont("%s\n", entry->procname); return -EEXIST; } } @@ -1020,8 +1038,8 @@ failed: if (IS_ERR(subdir)) { pr_err("sysctl could not get directory: "); sysctl_print_dir(dir); - pr_cont("/%*.*s %ld\n", - namelen, namelen, name, PTR_ERR(subdir)); + pr_cont("%*.*s %ld\n", namelen, namelen, name, + PTR_ERR(subdir)); } drop_sysctl_table(&dir->header); if (new) @@ -1053,7 +1071,6 @@ static int sysctl_follow_link(struct ctl_table_header **phead, struct ctl_dir *dir; int ret; - ret = 0; spin_lock(&sysctl_lock); root = (*pentry)->data; set = lookup_header_set(root); @@ -1384,6 +1401,38 @@ struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *tab } EXPORT_SYMBOL(register_sysctl); +/** + * __register_sysctl_init() - register sysctl table to path + * @path: path name for sysctl base + * @table: This is the sysctl table that needs to be registered to the path + * @table_name: The name of sysctl table, only used for log printing when + * registration fails + * + * The sysctl interface is used by userspace to query or modify at runtime + * a predefined value set on a variable. These variables however have default + * values pre-set. Code which depends on these variables will always work even + * if register_sysctl() fails. If register_sysctl() fails you'd just loose the + * ability to query or modify the sysctls dynamically at run time. Chances of + * register_sysctl() failing on init are extremely low, and so for both reasons + * this function does not return any error as it is used by initialization code. + * + * Context: Can only be called after your respective sysctl base path has been + * registered. So for instance, most base directories are registered early on + * init before init levels are processed through proc_sys_init() and + * sysctl_init_bases(). 
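/*
 * Aside (illustrative, not from the patch): the pipe and inotify hunks above
 * move their ctl_tables out of the shared kernel table and register them at
 * init time with register_sysctl()/register_sysctl_init().  A made-up
 * example of that pattern:
 */
static unsigned int example_max = 1024;

static struct ctl_table fs_example_sysctls[] = {
	{
		.procname	= "example-max",
		.data		= &example_max,
		.maxlen		= sizeof(example_max),
		.mode		= 0644,
		.proc_handler	= proc_douintvec,
	},
	{ }
};

static int __init example_sysctl_init(void)
{
	/* registers under /proc/sys/fs; does not fail the caller */
	register_sysctl_init("fs", fs_example_sysctls);
	return 0;
}
early_initcall(example_sysctl_init);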
+ */ +void __init __register_sysctl_init(const char *path, struct ctl_table *table, + const char *table_name) +{ + struct ctl_table_header *hdr = register_sysctl(path, table); + + if (unlikely(!hdr)) { + pr_err("failed when register_sysctl %s to %s\n", table_name, path); + return; + } + kmemleak_not_leak(hdr); +} + static char *append_path(const char *path, char *pos, const char *name) { int namelen; @@ -1597,6 +1646,15 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table *table) } EXPORT_SYMBOL(register_sysctl_table); +int __register_sysctl_base(struct ctl_table *base_table) +{ + struct ctl_table_header *hdr; + + hdr = register_sysctl_table(base_table); + kmemleak_not_leak(hdr); + return 0; +} + static void put_links(struct ctl_table_header *header) { struct ctl_table_set *root_set = &sysctl_table_root.default_set; @@ -1626,7 +1684,7 @@ static void put_links(struct ctl_table_header *header) else { pr_err("sysctl link missing during unregister: "); sysctl_print_dir(parent); - pr_cont("/%s\n", name); + pr_cont("%s\n", name); } } } @@ -1710,7 +1768,7 @@ int __init proc_sys_init(void) proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations; proc_sys_root->nlink = 0; - return sysctl_init(); + return sysctl_init_bases(); } struct sysctl_alias { diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ad667dbc96f5..6e97ed775074 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/pagewalk.h> #include <linux/vmacache.h> +#include <linux/mm_inline.h> #include <linux/hugetlb.h> #include <linux/huge_mm.h> #include <linux/mount.h> @@ -308,6 +309,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) name = arch_vma_name(vma); if (!name) { + const char *anon_name; + if (!mm) { name = "[vdso]"; goto done; @@ -319,8 +322,16 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) goto done; } - if (is_stack(vma)) + if (is_stack(vma)) { name = "[stack]"; + goto done; + } + + anon_name = vma_anon_name(vma); + if (anon_name) { + seq_pad(m, ' '); + seq_printf(m, "[anon:%s]", anon_name); + } } done: @@ -429,7 +440,8 @@ static void smaps_page_accumulate(struct mem_size_stats *mss, } static void smaps_account(struct mem_size_stats *mss, struct page *page, - bool compound, bool young, bool dirty, bool locked) + bool compound, bool young, bool dirty, bool locked, + bool migration) { int i, nr = compound ? compound_nr(page) : 1; unsigned long size = nr * PAGE_SIZE; @@ -456,8 +468,15 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, * page_count(page) == 1 guarantees the page is mapped exactly once. * If any subpage of the compound page mapped with PTE it would elevate * page_count(). + * + * The page_mapcount() is called to get a snapshot of the mapcount. + * Without holding the page lock this snapshot can be slightly wrong as + * we cannot always read the mapcount atomically. It is not safe to + * call page_mapcount() even with PTL held if the page is not mapped, + * especially for migration entries. Treat regular migration entries + * as mapcount == 1. 
*/ - if (page_count(page) == 1) { + if ((page_count(page) == 1) || migration) { smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty, locked, true); return; @@ -506,6 +525,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; + bool migration = false; if (pte_present(*pte)) { page = vm_normal_page(vma, addr, *pte); @@ -525,8 +545,11 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } else { mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; } - } else if (is_pfn_swap_entry(swpent)) + } else if (is_pfn_swap_entry(swpent)) { + if (is_migration_entry(swpent)) + migration = true; page = pfn_swap_entry_to_page(swpent); + } } else { smaps_pte_hole_lookup(addr, walk); return; @@ -535,7 +558,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, if (!page) return; - smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked); + smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), + locked, migration); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -546,6 +570,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; + bool migration = false; if (pmd_present(*pmd)) { /* FOLL_DUMP will return -EFAULT on huge zero page */ @@ -553,8 +578,10 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { swp_entry_t entry = pmd_to_swp_entry(*pmd); - if (is_migration_entry(entry)) + if (is_migration_entry(entry)) { + migration = true; page = pfn_swap_entry_to_page(entry); + } } if (IS_ERR_OR_NULL(page)) return; @@ -566,7 +593,9 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, /* pass */; else mss->file_thp += HPAGE_PMD_SIZE; - smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked); + + smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), + locked, migration); } #else static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, @@ -1367,6 +1396,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, { u64 frame = 0, flags = 0; struct page *page = NULL; + bool migration = false; if (pte_present(pte)) { if (pm->show_pfn) @@ -1388,13 +1418,14 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, frame = swp_type(entry) | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); flags |= PM_SWAP; + migration = is_migration_entry(entry); if (is_pfn_swap_entry(entry)) page = pfn_swap_entry_to_page(entry); } if (page && !PageAnon(page)) flags |= PM_FILE; - if (page && page_mapcount(page) == 1) + if (page && !migration && page_mapcount(page) == 1) flags |= PM_MMAP_EXCLUSIVE; if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; @@ -1410,8 +1441,9 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, spinlock_t *ptl; pte_t *pte, *orig_pte; int err = 0; - #ifdef CONFIG_TRANSPARENT_HUGEPAGE + bool migration = false; + ptl = pmd_trans_huge_lock(pmdp, vma); if (ptl) { u64 flags = 0, frame = 0; @@ -1450,11 +1482,12 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, if (pmd_swp_uffd_wp(pmd)) flags |= PM_UFFD_WP; VM_BUG_ON(!is_pmd_migration_entry(pmd)); + migration = is_migration_entry(entry); page = pfn_swap_entry_to_page(entry); } #endif - if (page && page_mapcount(page) == 1) + if (page && !migration && page_mapcount(page) == 1) flags |= 
PM_MMAP_EXCLUSIVE; for (; addr != end; addr += PAGE_SIZE) { diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 509f85148fee..702754dd1daf 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -65,8 +65,6 @@ static size_t vmcoredd_orig_sz; static DECLARE_RWSEM(vmcore_cb_rwsem); /* List of registered vmcore callbacks. */ static LIST_HEAD(vmcore_cb_list); -/* Whether we had a surprise unregistration of a callback. */ -static bool vmcore_cb_unstable; /* Whether the vmcore has been opened once. */ static bool vmcore_opened; @@ -94,10 +92,8 @@ void unregister_vmcore_cb(struct vmcore_cb *cb) * very unusual (e.g., forced driver removal), but we cannot stop * unregistering. */ - if (vmcore_opened) { + if (vmcore_opened) pr_warn_once("Unexpected vmcore callback unregistration\n"); - vmcore_cb_unstable = true; - } up_write(&vmcore_cb_rwsem); } EXPORT_SYMBOL_GPL(unregister_vmcore_cb); @@ -108,8 +104,6 @@ static bool pfn_is_ram(unsigned long pfn) bool ret = true; lockdep_assert_held_read(&vmcore_cb_rwsem); - if (unlikely(vmcore_cb_unstable)) - return false; list_for_each_entry(cb, &vmcore_cb_list, next) { if (unlikely(!cb->pfn_is_ram)) @@ -581,7 +575,7 @@ static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma, * looping over all pages without a reason. */ down_read(&vmcore_cb_rwsem); - if (!list_empty(&vmcore_cb_list) || vmcore_cb_unstable) + if (!list_empty(&vmcore_cb_list)) ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot); else ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot); diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 392ef5162655..49650e54d2f8 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -80,7 +80,7 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) seq_puts(m, fs_infop->str); } - if (mnt_user_ns(mnt) != &init_user_ns) + if (is_idmapped_mnt(mnt)) seq_puts(m, ",idmapped"); } diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index 5939595f0115..776cae20af4e 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -64,20 +64,12 @@ static struct ftrace_ops pstore_ftrace_ops __read_mostly = { static DEFINE_MUTEX(pstore_ftrace_lock); static bool pstore_ftrace_enabled; -static ssize_t pstore_ftrace_knob_write(struct file *f, const char __user *buf, - size_t count, loff_t *ppos) +static int pstore_set_ftrace_enabled(bool on) { - u8 on; ssize_t ret; - ret = kstrtou8_from_user(buf, count, 2, &on); - if (ret) - return ret; - - mutex_lock(&pstore_ftrace_lock); - - if (!on ^ pstore_ftrace_enabled) - goto out; + if (on == pstore_ftrace_enabled) + return 0; if (on) { ftrace_ops_set_global_filter(&pstore_ftrace_ops); @@ -89,15 +81,30 @@ static ssize_t pstore_ftrace_knob_write(struct file *f, const char __user *buf, if (ret) { pr_err("%s: unable to %sregister ftrace ops: %zd\n", __func__, on ? 
"" : "un", ret); - goto err; + } else { + pstore_ftrace_enabled = on; } - pstore_ftrace_enabled = on; -out: - ret = count; -err: + return ret; +} + +static ssize_t pstore_ftrace_knob_write(struct file *f, const char __user *buf, + size_t count, loff_t *ppos) +{ + u8 on; + ssize_t ret; + + ret = kstrtou8_from_user(buf, count, 2, &on); + if (ret) + return ret; + + mutex_lock(&pstore_ftrace_lock); + ret = pstore_set_ftrace_enabled(on); mutex_unlock(&pstore_ftrace_lock); + if (ret == 0) + ret = count; + return ret; } @@ -117,6 +124,11 @@ static const struct file_operations pstore_knob_fops = { static struct dentry *pstore_ftrace_dir; +static bool record_ftrace; +module_param(record_ftrace, bool, 0400); +MODULE_PARM_DESC(record_ftrace, + "enable ftrace recording immediately (default: off)"); + void pstore_register_ftrace(void) { if (!psinfo->write) @@ -124,6 +136,8 @@ void pstore_register_ftrace(void) pstore_ftrace_dir = debugfs_create_dir("pstore", NULL); + pstore_set_ftrace_enabled(record_ftrace); + debugfs_create_file("record_ftrace", 0600, pstore_ftrace_dir, NULL, &pstore_knob_fops); } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 22d904bde6ab..a74aef99bd3d 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -690,9 +690,14 @@ int dquot_quota_sync(struct super_block *sb, int type) /* This is not very clever (and fast) but currently I don't know about * any other simple way of getting quota data to disk and we must get * them there for userspace to be visible... */ - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, 1); - sync_blockdev(sb->s_bdev); + if (sb->s_op->sync_fs) { + ret = sb->s_op->sync_fs(sb, 1); + if (ret) + return ret; + } + ret = sync_blockdev(sb->s_bdev); + if (ret) + return ret; /* * Now when everything is written we can discard the pagecache so diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 0834b101c316..a3e21160b634 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -951,7 +951,9 @@ static int reiserfs_async_progress_wait(struct super_block *s) int depth; depth = reiserfs_write_unlock_nested(s); - congestion_wait(BLK_RW_ASYNC, HZ / 10); + wait_var_event_timeout(&j->j_async_throttle, + atomic_read(&j->j_async_throttle) == 0, + HZ / 10); reiserfs_write_lock_nested(s, depth); } @@ -1058,7 +1060,8 @@ static int flush_commit_list(struct super_block *s, put_bh(tbh) ; } } - atomic_dec(&journal->j_async_throttle); + if (atomic_dec_and_test(&journal->j_async_throttle)) + wake_up_var(&journal->j_async_throttle); for (i = 0; i < (jl->j_len + 1); i++) { bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + diff --git a/fs/remap_range.c b/fs/remap_range.c index 6d4a9beaa097..231159682907 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -146,41 +146,41 @@ static int generic_remap_check_len(struct inode *inode_in, } /* Read a page's worth of file data into the page cache. */ -static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) +static struct folio *vfs_dedupe_get_folio(struct inode *inode, loff_t pos) { - struct page *page; + struct folio *folio; - page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL); - if (IS_ERR(page)) - return page; - if (!PageUptodate(page)) { - put_page(page); + folio = read_mapping_folio(inode->i_mapping, pos >> PAGE_SHIFT, NULL); + if (IS_ERR(folio)) + return folio; + if (!folio_test_uptodate(folio)) { + folio_put(folio); return ERR_PTR(-EIO); } - return page; + return folio; } /* - * Lock two pages, ensuring that we lock in offset order if the pages are from - * the same file. 
+ * Lock two folios, ensuring that we lock in offset order if the folios + * are from the same file. */ -static void vfs_lock_two_pages(struct page *page1, struct page *page2) +static void vfs_lock_two_folios(struct folio *folio1, struct folio *folio2) { /* Always lock in order of increasing index. */ - if (page1->index > page2->index) - swap(page1, page2); + if (folio1->index > folio2->index) + swap(folio1, folio2); - lock_page(page1); - if (page1 != page2) - lock_page(page2); + folio_lock(folio1); + if (folio1 != folio2) + folio_lock(folio2); } -/* Unlock two pages, being careful not to unlock the same page twice. */ -static void vfs_unlock_two_pages(struct page *page1, struct page *page2) +/* Unlock two folios, being careful not to unlock the same folio twice. */ +static void vfs_unlock_two_folios(struct folio *folio1, struct folio *folio2) { - unlock_page(page1); - if (page1 != page2) - unlock_page(page2); + folio_unlock(folio1); + if (folio1 != folio2) + folio_unlock(folio2); } /* @@ -188,77 +188,71 @@ static void vfs_unlock_two_pages(struct page *page1, struct page *page2) * Caller must have locked both inodes to prevent write races. */ static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, - struct inode *dest, loff_t destoff, + struct inode *dest, loff_t dstoff, loff_t len, bool *is_same) { - loff_t src_poff; - loff_t dest_poff; - void *src_addr; - void *dest_addr; - struct page *src_page; - struct page *dest_page; - loff_t cmp_len; - bool same; - int error; - - error = -EINVAL; - same = true; + bool same = true; + int error = -EINVAL; + while (len) { - src_poff = srcoff & (PAGE_SIZE - 1); - dest_poff = destoff & (PAGE_SIZE - 1); - cmp_len = min(PAGE_SIZE - src_poff, - PAGE_SIZE - dest_poff); + struct folio *src_folio, *dst_folio; + void *src_addr, *dst_addr; + loff_t cmp_len = min(PAGE_SIZE - offset_in_page(srcoff), + PAGE_SIZE - offset_in_page(dstoff)); + cmp_len = min(cmp_len, len); if (cmp_len <= 0) goto out_error; - src_page = vfs_dedupe_get_page(src, srcoff); - if (IS_ERR(src_page)) { - error = PTR_ERR(src_page); + src_folio = vfs_dedupe_get_folio(src, srcoff); + if (IS_ERR(src_folio)) { + error = PTR_ERR(src_folio); goto out_error; } - dest_page = vfs_dedupe_get_page(dest, destoff); - if (IS_ERR(dest_page)) { - error = PTR_ERR(dest_page); - put_page(src_page); + dst_folio = vfs_dedupe_get_folio(dest, dstoff); + if (IS_ERR(dst_folio)) { + error = PTR_ERR(dst_folio); + folio_put(src_folio); goto out_error; } - vfs_lock_two_pages(src_page, dest_page); + vfs_lock_two_folios(src_folio, dst_folio); /* - * Now that we've locked both pages, make sure they're still + * Now that we've locked both folios, make sure they're still * mapped to the file data we're interested in. If not, * someone is invalidating pages on us and we lose. 
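/*
 * Aside (illustrative): the compare loop continuing below maps each folio
 * with kmap_local_folio() at offset_in_folio() instead of kmap_atomic() on
 * a page.  A stripped-down, made-up version of that idea, assuming the
 * range does not cross a page boundary:
 */
static bool example_folio_bytes_equal(struct folio *a, loff_t apos,
				      struct folio *b, loff_t bpos, size_t len)
{
	void *pa = kmap_local_folio(a, offset_in_folio(a, apos));
	void *pb = kmap_local_folio(b, offset_in_folio(b, bpos));
	bool same = !memcmp(pa, pb, len);

	kunmap_local(pb);
	kunmap_local(pa);
	return same;
}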
*/ - if (!PageUptodate(src_page) || !PageUptodate(dest_page) || - src_page->mapping != src->i_mapping || - dest_page->mapping != dest->i_mapping) { + if (!folio_test_uptodate(src_folio) || !folio_test_uptodate(dst_folio) || + src_folio->mapping != src->i_mapping || + dst_folio->mapping != dest->i_mapping) { same = false; goto unlock; } - src_addr = kmap_atomic(src_page); - dest_addr = kmap_atomic(dest_page); + src_addr = kmap_local_folio(src_folio, + offset_in_folio(src_folio, srcoff)); + dst_addr = kmap_local_folio(dst_folio, + offset_in_folio(dst_folio, dstoff)); - flush_dcache_page(src_page); - flush_dcache_page(dest_page); + flush_dcache_folio(src_folio); + flush_dcache_folio(dst_folio); - if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) + if (memcmp(src_addr, dst_addr, cmp_len)) same = false; - kunmap_atomic(dest_addr); - kunmap_atomic(src_addr); + kunmap_local(dst_addr); + kunmap_local(src_addr); unlock: - vfs_unlock_two_pages(src_page, dest_page); - put_page(dest_page); - put_page(src_page); + vfs_unlock_two_folios(src_folio, dst_folio); + folio_put(dst_folio); + folio_put(src_folio); if (!same) break; srcoff += cmp_len; - destoff += cmp_len; + dstoff += cmp_len; len -= cmp_len; } diff --git a/fs/select.c b/fs/select.c index 945896d0ac9e..0ee55af1a55c 100644 --- a/fs/select.c +++ b/fs/select.c @@ -15,6 +15,7 @@ * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). */ +#include <linux/compat.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/sched/rt.h> @@ -458,9 +459,11 @@ get_max: return max; } -#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR) -#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR) -#define POLLEX_SET (EPOLLPRI) +#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR |\ + EPOLLNVAL) +#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR |\ + EPOLLNVAL) +#define POLLEX_SET (EPOLLPRI | EPOLLNVAL) static inline void wait_key_set(poll_table *wait, unsigned long in, unsigned long out, unsigned long bit, @@ -527,6 +530,7 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) break; if (!(bit & all_bits)) continue; + mask = EPOLLNVAL; f = fdget(i); if (f.file) { wait_key_set(wait, in, out, bit, @@ -534,34 +538,34 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) mask = vfs_poll(f.file, wait); fdput(f); - if ((mask & POLLIN_SET) && (in & bit)) { - res_in |= bit; - retval++; - wait->_qproc = NULL; - } - if ((mask & POLLOUT_SET) && (out & bit)) { - res_out |= bit; - retval++; - wait->_qproc = NULL; - } - if ((mask & POLLEX_SET) && (ex & bit)) { - res_ex |= bit; - retval++; - wait->_qproc = NULL; - } - /* got something, stop busy polling */ - if (retval) { - can_busy_loop = false; - busy_flag = 0; - - /* - * only remember a returned - * POLL_BUSY_LOOP if we asked for it - */ - } else if (busy_flag & mask) - can_busy_loop = true; - } + if ((mask & POLLIN_SET) && (in & bit)) { + res_in |= bit; + retval++; + wait->_qproc = NULL; + } + if ((mask & POLLOUT_SET) && (out & bit)) { + res_out |= bit; + retval++; + wait->_qproc = NULL; + } + if ((mask & POLLEX_SET) && (ex & bit)) { + res_ex |= bit; + retval++; + wait->_qproc = NULL; + } + /* got something, stop busy polling */ + if (retval) { + can_busy_loop = false; + busy_flag = 0; + + /* + * only remember a returned + * POLL_BUSY_LOOP if we asked for it + */ + } else if (busy_flag & mask) + can_busy_loop = true; + } if (res_in) *rinp = res_in; 
diff --git a/fs/signalfd.c b/fs/signalfd.c index 65ce0e72e7b9..e20d1484c663 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -155,11 +155,12 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info, int nonblock) { + enum pid_type type; ssize_t ret; DECLARE_WAITQUEUE(wait, current); spin_lock_irq(¤t->sighand->siglock); - ret = dequeue_signal(current, &ctx->sigmask, info); + ret = dequeue_signal(current, &ctx->sigmask, info, &type); switch (ret) { case 0: if (!nonblock) @@ -174,7 +175,7 @@ static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info add_wait_queue(¤t->sighand->signalfd_wqh, &wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); - ret = dequeue_signal(current, &ctx->sigmask, info); + ret = dequeue_signal(current, &ctx->sigmask, info, &type); if (ret != 0) break; if (signal_pending(current)) { diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h index 7ccadcbe684b..38b8fc514860 100644 --- a/fs/smbfs_common/smb2pdu.h +++ b/fs/smbfs_common/smb2pdu.h @@ -449,7 +449,7 @@ struct smb2_netname_neg_context { */ /* Flags */ -#define SMB2_ACCEPT_TRANSFORM_LEVEL_SECURITY 0x00000001 +#define SMB2_ACCEPT_TRANSPORT_LEVEL_SECURITY 0x00000001 struct smb2_transport_capabilities_context { __le16 ContextType; /* 6 */ diff --git a/fs/smbfs_common/smbfsctl.h b/fs/smbfs_common/smbfsctl.h index 926f87cd6af0..d51939c43ad7 100644 --- a/fs/smbfs_common/smbfsctl.h +++ b/fs/smbfs_common/smbfsctl.h @@ -95,8 +95,10 @@ #define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */ #define FSCTL_GET_INTEGRITY_INFORMATION 0x0009027C #define FSCTL_GET_REFS_VOLUME_DATA 0x000902D8 /* See MS-FSCC 2.3.24 */ +#define FSCTL_SET_INTEGRITY_INFORMATION_EXT 0x00090380 #define FSCTL_GET_RETRIEVAL_POINTERS_AND_REFCOUNT 0x000903d3 #define FSCTL_GET_RETRIEVAL_POINTER_COUNT 0x0009042b +#define FSCTL_REFS_STREAM_SNAPSHOT_MANAGEMENT 0x00090440 #define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF #define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */ #define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */ diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index bb44ff4c5cc6..b1b556dbce12 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -29,6 +29,7 @@ #include <linux/module.h> #include <linux/magic.h> #include <linux/xattr.h> +#include <linux/backing-dev.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -112,6 +113,24 @@ static const struct squashfs_decompressor *supported_squashfs_filesystem( return decompressor; } +static int squashfs_bdi_init(struct super_block *sb) +{ + int err; + unsigned int major = MAJOR(sb->s_dev); + unsigned int minor = MINOR(sb->s_dev); + + bdi_put(sb->s_bdi); + sb->s_bdi = &noop_backing_dev_info; + + err = super_setup_bdi_name(sb, "squashfs_%u_%u", major, minor); + if (err) + return err; + + sb->s_bdi->ra_pages = 0; + sb->s_bdi->io_pages = 0; + + return 0; +} static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) { @@ -127,6 +146,20 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) TRACE("Entered squashfs_fill_superblock\n"); + /* + * squashfs provides 'backing_dev_info' in order to disable read-ahead. For + * squashfs, I/O is not deferred, it is done immediately in readpage, + * which means the user would always have to wait their own I/O. So the effect + * of readahead is very weak for squashfs. 
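/*
 * Aside (illustrative): the squashfs hunk here gives the superblock its own
 * BDI just so read-ahead can be switched off.  Reduced to its core, with a
 * made-up fs name (the real hunk first drops the block device's bdi
 * reference before calling super_setup_bdi_name()):
 */
static int example_setup_noreadahead_bdi(struct super_block *sb)
{
	int err = super_setup_bdi_name(sb, "examplefs_%u_%u",
				       MAJOR(sb->s_dev), MINOR(sb->s_dev));
	if (err)
		return err;

	sb->s_bdi->ra_pages = 0;	/* no read-ahead */
	sb->s_bdi->io_pages = 0;
	return 0;
}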
squashfs_bdi_init will set + * sb->s_bdi->ra_pages and sb->s_bdi->io_pages to 0 and close readahead for + * squashfs. + */ + err = squashfs_bdi_init(sb); + if (err) { + errorf(fc, "squashfs init bdi failed"); + return err; + } + sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL); if (sb->s_fs_info == NULL) { ERROR("Failed to allocate squashfs_sb_info\n"); diff --git a/fs/super.c b/fs/super.c index 3bfc0f8fbd5b..f1d4a193602d 100644 --- a/fs/super.c +++ b/fs/super.c @@ -31,7 +31,6 @@ #include <linux/mutex.h> #include <linux/backing-dev.h> #include <linux/rculist_bl.h> -#include <linux/cleancache.h> #include <linux/fscrypt.h> #include <linux/fsnotify.h> #include <linux/lockdep.h> @@ -260,7 +259,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, s->s_time_gran = 1000000000; s->s_time_min = TIME64_MIN; s->s_time_max = TIME64_MAX; - s->cleancache_poolid = CLEANCACHE_NO_POOL; s->s_shrink.seeks = DEFAULT_SEEKS; s->s_shrink.scan_objects = super_cache_scan; @@ -330,7 +328,6 @@ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { - cleancache_invalidate_fs(s); unregister_shrinker(&s->s_shrink); fs->kill_sb(s); @@ -1423,8 +1420,8 @@ struct dentry *mount_nodev(struct file_system_type *fs_type, } EXPORT_SYMBOL(mount_nodev); -static int reconfigure_single(struct super_block *s, - int flags, void *data) +int reconfigure_single(struct super_block *s, + int flags, void *data) { struct fs_context *fc; int ret; @@ -1619,11 +1616,9 @@ static void lockdep_sb_freeze_acquire(struct super_block *sb) percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); } -static void sb_freeze_unlock(struct super_block *sb) +static void sb_freeze_unlock(struct super_block *sb, int level) { - int level; - - for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) + for (level--; level >= 0; level--) percpu_up_write(sb->s_writers.rw_sem + level); } @@ -1694,7 +1689,14 @@ int freeze_super(struct super_block *sb) sb_wait_write(sb, SB_FREEZE_PAGEFAULT); /* All writers are done so after syncing there won't be dirty data */ - sync_filesystem(sb); + ret = sync_filesystem(sb); + if (ret) { + sb->s_writers.frozen = SB_UNFROZEN; + sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT); + wake_up(&sb->s_writers.wait_unfrozen); + deactivate_locked_super(sb); + return ret; + } /* Now wait for internal filesystem counter */ sb->s_writers.frozen = SB_FREEZE_FS; @@ -1706,7 +1708,7 @@ int freeze_super(struct super_block *sb) printk(KERN_ERR "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; - sb_freeze_unlock(sb); + sb_freeze_unlock(sb, SB_FREEZE_FS); wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return ret; @@ -1751,7 +1753,7 @@ static int thaw_super_locked(struct super_block *sb) } sb->s_writers.frozen = SB_UNFROZEN; - sb_freeze_unlock(sb); + sb_freeze_unlock(sb, SB_FREEZE_FS); out: wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); diff --git a/fs/sync.c b/fs/sync.c index 3ce8e2137f31..c7690016453e 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -29,7 +29,7 @@ */ int sync_filesystem(struct super_block *sb) { - int ret; + int ret = 0; /* * We need to be protected against the filesystem going from @@ -52,15 +52,21 @@ int sync_filesystem(struct super_block *sb) * at a time. 
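squashfs_bdi_init() above exists only to give squashfs a private backing_dev_info whose ra_pages/io_pages can be zeroed, switching readahead off for a filesystem that completes its reads synchronously in readpage. A hedged sketch of the same mount-time pattern for a hypothetical filesystem — the examplefs name is invented, and the calls simply mirror the hunk above:

/*
 * Sketch only: register a per-superblock BDI and disable readahead,
 * mirroring squashfs_bdi_init().  "examplefs" is a placeholder name.
 */
#include <linux/fs.h>
#include <linux/backing-dev.h>

static int examplefs_setup_bdi(struct super_block *sb)
{
        int err;

        /* Drop the BDI inherited from the backing block device... */
        bdi_put(sb->s_bdi);
        sb->s_bdi = &noop_backing_dev_info;

        /* ...and set up a private one this filesystem may tune. */
        err = super_setup_bdi_name(sb, "examplefs_%u_%u",
                                   MAJOR(sb->s_dev), MINOR(sb->s_dev));
        if (err)
                return err;

        sb->s_bdi->ra_pages = 0;        /* no readahead */
        sb->s_bdi->io_pages = 0;
        return 0;
}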
*/ writeback_inodes_sb(sb, WB_REASON_SYNC); - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, 0); + if (sb->s_op->sync_fs) { + ret = sb->s_op->sync_fs(sb, 0); + if (ret) + return ret; + } ret = sync_blockdev_nowait(sb->s_bdev); - if (ret < 0) + if (ret) return ret; sync_inodes_sb(sb); - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, 1); + if (sb->s_op->sync_fs) { + ret = sb->s_op->sync_fs(sb, 1); + if (ret) + return ret; + } return sync_blockdev(sb->s_bdev); } EXPORT_SYMBOL(sync_filesystem); diff --git a/fs/sysctls.c b/fs/sysctls.c new file mode 100644 index 000000000000..c701273c9432 --- /dev/null +++ b/fs/sysctls.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * /proc/sys/fs shared sysctls + * + * These sysctls are shared between different filesystems. + */ +#include <linux/init.h> +#include <linux/sysctl.h> + +static struct ctl_table fs_shared_sysctls[] = { + { + .procname = "overflowuid", + .data = &fs_overflowuid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_MAXOLDUID, + }, + { + .procname = "overflowgid", + .data = &fs_overflowgid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_MAXOLDUID, + }, + { } +}; + +DECLARE_SYSCTL_BASE(fs, fs_shared_sysctls); + +static int __init init_fs_sysctls(void) +{ + return register_sysctl_base(fs); +} + +early_initcall(init_fs_sysctls); diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 3616839c5c4b..bafc02bf8220 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -109,12 +109,12 @@ static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry) * also the directory that is being deleted. */ inode_unlock(inode); - inode_unlock(dentry->d_inode); + inode_unlock(d_inode(dentry)); ret = tracefs_ops.rmdir(name); inode_lock_nested(inode, I_MUTEX_PARENT); - inode_lock(dentry->d_inode); + inode_lock(d_inode(dentry)); kfree(name); @@ -284,7 +284,7 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts) static int tracefs_apply_options(struct super_block *sb) { struct tracefs_fs_info *fsi = sb->s_fs_info; - struct inode *inode = sb->s_root->d_inode; + struct inode *inode = d_inode(sb->s_root); struct tracefs_mount_opts *opts = &fsi->mount_opts; inode->i_mode &= ~S_IALLUGO; @@ -403,18 +403,18 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) if (!parent) parent = tracefs_mount->mnt_root; - inode_lock(parent->d_inode); - if (unlikely(IS_DEADDIR(parent->d_inode))) + inode_lock(d_inode(parent)); + if (unlikely(IS_DEADDIR(d_inode(parent)))) dentry = ERR_PTR(-ENOENT); else dentry = lookup_one_len(name, parent, strlen(name)); - if (!IS_ERR(dentry) && dentry->d_inode) { + if (!IS_ERR(dentry) && d_inode(dentry)) { dput(dentry); dentry = ERR_PTR(-EEXIST); } if (IS_ERR(dentry)) { - inode_unlock(parent->d_inode); + inode_unlock(d_inode(parent)); simple_release_fs(&tracefs_mount, &tracefs_mount_count); } @@ -423,7 +423,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) static struct dentry *failed_creating(struct dentry *dentry) { - inode_unlock(dentry->d_parent->d_inode); + inode_unlock(d_inode(dentry->d_parent)); dput(dentry); simple_release_fs(&tracefs_mount, &tracefs_mount_count); return NULL; @@ -431,7 +431,7 @@ static struct dentry *failed_creating(struct dentry *dentry) static struct dentry *end_creating(struct dentry *dentry) { - inode_unlock(dentry->d_parent->d_inode); + 
inode_unlock(d_inode(dentry->d_parent)); return dentry; } @@ -489,7 +489,7 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, inode->i_uid = d_inode(dentry->d_parent)->i_uid; inode->i_gid = d_inode(dentry->d_parent)->i_gid; d_instantiate(dentry, inode); - fsnotify_create(dentry->d_parent->d_inode, dentry); + fsnotify_create(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } @@ -516,8 +516,8 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent, /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); d_instantiate(dentry, inode); - inc_nlink(dentry->d_parent->d_inode); - fsnotify_mkdir(dentry->d_parent->d_inode, dentry); + inc_nlink(d_inode(dentry->d_parent)); + fsnotify_mkdir(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile index 5c4b845754a7..314c80b24a76 100644 --- a/fs/ubifs/Makefile +++ b/fs/ubifs/Makefile @@ -5,7 +5,7 @@ ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o debug.o -ubifs-y += misc.o +ubifs-y += misc.o sysfs.o ubifs-$(CONFIG_FS_ENCRYPTION) += crypto.o ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o ubifs-$(CONFIG_UBIFS_FS_AUTHENTICATION) += auth.o diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 7c61d0ec0159..dbe72f664abf 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1207,7 +1207,7 @@ out_budg: * @inode1: first inode * @inode2: second inode * @inode3: third inode - * @inode4: fouth inode + * @inode4: fourth inode * * This function is used for 'ubifs_rename()' and @inode1 may be the same as * @inode2 whereas @inode3 and @inode4 may be %NULL. @@ -1233,7 +1233,7 @@ static void lock_4_inodes(struct inode *inode1, struct inode *inode2, * @inode1: first inode * @inode2: second inode * @inode3: third inode - * @inode4: fouth inode + * @inode4: fourth inode */ static void unlock_4_inodes(struct inode *inode1, struct inode *inode2, struct inode *inode3, struct inode *inode4) diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index dc3e26e9ed7b..3134d070fcc0 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -692,6 +692,9 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) for (i = 0; ; i++) { int space_before, space_after; + /* Maybe continue after find and break before find */ + lp.lnum = -1; + cond_resched(); /* Give the commit an opportunity to run */ @@ -753,8 +756,19 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) * caller instead of the original '-EAGAIN'. */ err = ubifs_return_leb(c, lp.lnum); - if (err) + if (err) { ret = err; + /* + * An LEB may always be "taken", + * so setting ubifs to read-only, + * and then executing sync wbuf will + * return -EROFS and enter the "out" + * error branch. 
+ */ + ubifs_ro_mode(c, ret); + } + /* Maybe double return LEB if goto out */ + lp.lnum = -1; break; } goto out; @@ -843,7 +857,8 @@ out: ubifs_wbuf_sync_nolock(wbuf); ubifs_ro_mode(c, ret); mutex_unlock(&wbuf->io_mutex); - ubifs_return_leb(c, lp.lnum); + if (lp.lnum != -1) + ubifs_return_leb(c, lp.lnum); return ret; } diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 00b61dba62b7..789a7813f3fa 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -194,6 +194,24 @@ int ubifs_is_mapped(const struct ubifs_info *c, int lnum) return err; } +static void record_magic_error(struct ubifs_stats_info *stats) +{ + if (stats) + stats->magic_errors++; +} + +static void record_node_error(struct ubifs_stats_info *stats) +{ + if (stats) + stats->node_errors++; +} + +static void record_crc_error(struct ubifs_stats_info *stats) +{ + if (stats) + stats->crc_errors++; +} + /** * ubifs_check_node - check node. * @c: UBIFS file-system description object @@ -238,6 +256,7 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int len, if (!quiet) ubifs_err(c, "bad magic %#08x, expected %#08x", magic, UBIFS_NODE_MAGIC); + record_magic_error(c->stats); err = -EUCLEAN; goto out; } @@ -246,6 +265,7 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int len, if (type < 0 || type >= UBIFS_NODE_TYPES_CNT) { if (!quiet) ubifs_err(c, "bad node type %d", type); + record_node_error(c->stats); goto out; } @@ -270,6 +290,7 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int len, if (!quiet) ubifs_err(c, "bad CRC: calculated %#08x, read %#08x", crc, node_crc); + record_crc_error(c->stats); err = -EUCLEAN; goto out; } diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 5260d3e531bb..4211e4456b1e 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -106,7 +106,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct bud_entry *b) * property values should be @lp->free == @c->leb_size and * @lp->dirty == 0, but that is not the case. The reason is that * the LEB had been garbage collected before it became the bud, - * and there was not commit inbetween. The garbage collector + * and there was no commit in between. The garbage collector * resets the free and dirty space without recording it * anywhere except lprops, so if there was no commit then * lprops does not have that information. 
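The gc.c hunks above protect the shared out: path with an lp.lnum = -1 sentinel, so a LEB is returned only when one is actually held and is never returned twice after the early break. A stand-alone sketch of that guard pattern; the leb_* helpers and the error value are stubs for illustration, not UBIFS functions:

/* Generic "invalid handle" guard, in the shape added to ubifs_garbage_collect(). */
#include <errno.h>
#include <stdio.h>

#define NO_LEB (-1)

static int leb_take(int *lnum)   { *lnum = 7; return 0; }            /* stub */
static int leb_work(int lnum)    { (void)lnum; return -EAGAIN; }     /* stub */
static void leb_return(int lnum) { printf("returned LEB %d\n", lnum); }

static int gc_pass(void)
{
        int lnum = NO_LEB;      /* nothing held yet */
        int err;

        err = leb_take(&lnum);
        if (err)
                goto out;

        err = leb_work(lnum);
        if (err == -EAGAIN) {
                leb_return(lnum);
                lnum = NO_LEB;  /* mark it as already given back */
                goto out;
        }
out:
        if (lnum != NO_LEB)     /* release only what is still held */
                leb_return(lnum);
        return err;
}

int main(void)
{
        return gc_pass() == -EAGAIN ? 0 : 1;
}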
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index f0fb25727d96..aa7a1381c457 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1264,6 +1264,10 @@ static int mount_ubifs(struct ubifs_info *c) if (err) return err; + err = ubifs_sysfs_register(c); + if (err) + goto out_debugging; + err = check_volume_empty(c); if (err) goto out_free; @@ -1367,7 +1371,7 @@ static int mount_ubifs(struct ubifs_info *c) sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); if (!c->ro_mount) { /* Create background thread */ - c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); + c->bgt = kthread_run(ubifs_bg_thread, c, "%s", c->bgt_name); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; @@ -1375,7 +1379,6 @@ static int mount_ubifs(struct ubifs_info *c) c->bgt_name, err); goto out_wbufs; } - wake_up_process(c->bgt); } err = ubifs_read_master(c); @@ -1641,6 +1644,8 @@ out_free: vfree(c->sbuf); kfree(c->bottom_up_buf); kfree(c->sup_node); + ubifs_sysfs_unregister(c); +out_debugging: ubifs_debugging_exit(c); return err; } @@ -1684,6 +1689,7 @@ static void ubifs_umount(struct ubifs_info *c) kfree(c->bottom_up_buf); kfree(c->sup_node); ubifs_debugging_exit(c); + ubifs_sysfs_unregister(c); } /** @@ -1780,7 +1786,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) goto out; /* Create background thread */ - c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); + c->bgt = kthread_run(ubifs_bg_thread, c, "%s", c->bgt_name); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; @@ -1788,7 +1794,6 @@ static int ubifs_remount_rw(struct ubifs_info *c) c->bgt_name, err); goto out; } - wake_up_process(c->bgt); c->orph_buf = vmalloc(c->leb_size); if (!c->orph_buf) { @@ -1853,7 +1858,6 @@ out: kthread_stop(c->bgt); c->bgt = NULL; } - free_wbufs(c); kfree(c->write_reserve_buf); c->write_reserve_buf = NULL; vfree(c->ileb_buf); @@ -2436,14 +2440,20 @@ static int __init ubifs_init(void) dbg_debugfs_init(); + err = ubifs_sysfs_init(); + if (err) + goto out_dbg; + err = register_filesystem(&ubifs_fs_type); if (err) { pr_err("UBIFS error (pid %d): cannot register file system, error %d", current->pid, err); - goto out_dbg; + goto out_sysfs; } return 0; +out_sysfs: + ubifs_sysfs_exit(); out_dbg: dbg_debugfs_exit(); ubifs_compressors_exit(); @@ -2462,6 +2472,7 @@ static void __exit ubifs_exit(void) WARN_ON(atomic_long_read(&ubifs_clean_zn_cnt) != 0); dbg_debugfs_exit(); + ubifs_sysfs_exit(); ubifs_compressors_exit(); unregister_shrinker(&ubifs_shrinker_info); diff --git a/fs/ubifs/sysfs.c b/fs/ubifs/sysfs.c new file mode 100644 index 000000000000..7acc5a74e5fa --- /dev/null +++ b/fs/ubifs/sysfs.c @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This file is part of UBIFS. 
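Both background-thread hunks in super.c above replace kthread_create() plus an explicit wake_up_process() with kthread_run(), the stock helper that performs exactly that pair in one step. A minimal sketch of the pattern outside UBIFS; the thread body and name are placeholders:

#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/err.h>

static int example_bg_thread(void *data)
{
        while (!kthread_should_stop())
                msleep_interruptible(1000);     /* placeholder work loop */
        return 0;
}

static struct task_struct *example_start_bg(void *ctx)
{
        /* kthread_run() == kthread_create() + wake_up_process() */
        struct task_struct *t = kthread_run(example_bg_thread, ctx, "example_bgt");

        return IS_ERR(t) ? NULL : t;    /* caller may inspect PTR_ERR(t) instead */
}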
+ * + * Copyright (C) 2021 Cisco Systems + * + * Author: Stefan Schaeckeler + */ + + +#include <linux/fs.h> +#include "ubifs.h" + +enum attr_id_t { + attr_errors_magic, + attr_errors_node, + attr_errors_crc, +}; + +struct ubifs_attr { + struct attribute attr; + enum attr_id_t attr_id; +}; + +#define UBIFS_ATTR(_name, _mode, _id) \ +static struct ubifs_attr ubifs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr_id = attr_##_id, \ +} + +#define UBIFS_ATTR_FUNC(_name, _mode) UBIFS_ATTR(_name, _mode, _name) + +UBIFS_ATTR_FUNC(errors_magic, 0444); +UBIFS_ATTR_FUNC(errors_crc, 0444); +UBIFS_ATTR_FUNC(errors_node, 0444); + +#define ATTR_LIST(name) (&ubifs_attr_##name.attr) + +static struct attribute *ubifs_attrs[] = { + ATTR_LIST(errors_magic), + ATTR_LIST(errors_node), + ATTR_LIST(errors_crc), + NULL, +}; + +static ssize_t ubifs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ubifs_info *sbi = container_of(kobj, struct ubifs_info, + kobj); + + struct ubifs_attr *a = container_of(attr, struct ubifs_attr, attr); + + switch (a->attr_id) { + case attr_errors_magic: + return sysfs_emit(buf, "%u\n", sbi->stats->magic_errors); + case attr_errors_node: + return sysfs_emit(buf, "%u\n", sbi->stats->node_errors); + case attr_errors_crc: + return sysfs_emit(buf, "%u\n", sbi->stats->crc_errors); + } + return 0; +}; + +static void ubifs_sb_release(struct kobject *kobj) +{ + struct ubifs_info *c = container_of(kobj, struct ubifs_info, kobj); + + complete(&c->kobj_unregister); +} + +static const struct sysfs_ops ubifs_attr_ops = { + .show = ubifs_attr_show, +}; + +static struct kobj_type ubifs_sb_ktype = { + .default_attrs = ubifs_attrs, + .sysfs_ops = &ubifs_attr_ops, + .release = ubifs_sb_release, +}; + +static struct kobj_type ubifs_ktype = { + .sysfs_ops = &ubifs_attr_ops, +}; + +static struct kset ubifs_kset = { + .kobj = {.ktype = &ubifs_ktype}, +}; + +int ubifs_sysfs_register(struct ubifs_info *c) +{ + int ret, n; + char dfs_dir_name[UBIFS_DFS_DIR_LEN+1]; + + c->stats = kzalloc(sizeof(struct ubifs_stats_info), GFP_KERNEL); + if (!c->stats) { + ret = -ENOMEM; + goto out_last; + } + n = snprintf(dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME, + c->vi.ubi_num, c->vi.vol_id); + + if (n > UBIFS_DFS_DIR_LEN) { + /* The array size is too small */ + ret = -EINVAL; + goto out_free; + } + + c->kobj.kset = &ubifs_kset; + init_completion(&c->kobj_unregister); + + ret = kobject_init_and_add(&c->kobj, &ubifs_sb_ktype, NULL, + "%s", dfs_dir_name); + if (ret) + goto out_put; + + return 0; + +out_put: + kobject_put(&c->kobj); + wait_for_completion(&c->kobj_unregister); +out_free: + kfree(c->stats); +out_last: + ubifs_err(c, "cannot create sysfs entry for ubifs%d_%d, error %d\n", + c->vi.ubi_num, c->vi.vol_id, ret); + return ret; +} + +void ubifs_sysfs_unregister(struct ubifs_info *c) +{ + kobject_del(&c->kobj); + kobject_put(&c->kobj); + wait_for_completion(&c->kobj_unregister); + + kfree(c->stats); +} + +int __init ubifs_sysfs_init(void) +{ + int ret; + + kobject_set_name(&ubifs_kset.kobj, "ubifs"); + ubifs_kset.kobj.parent = fs_kobj; + ret = kset_register(&ubifs_kset); + + return ret; +} + +void ubifs_sysfs_exit(void) +{ + kset_unregister(&ubifs_kset); +} diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index c38066ce9ab0..f55828c0a300 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -27,6 +27,8 @@ #include <linux/security.h> #include <linux/xattr.h> #include <linux/random.h> +#include <linux/sysfs.h> +#include <linux/completion.h> 
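Together with the record_*_error() hooks added to io.c further up, the new sysfs.c exposes three read-only counters per mounted volume under /sys/fs/ubifs/, in a directory named after the "ubi%d_%d" pattern. A small userspace sketch that dumps them; ubi0_0 is only an example volume name:

/* Read the per-mount UBIFS error counters introduced by this series. */
#include <stdio.h>

int main(void)
{
        static const char *files[] = {
                "/sys/fs/ubifs/ubi0_0/errors_magic",
                "/sys/fs/ubifs/ubi0_0/errors_node",
                "/sys/fs/ubifs/ubi0_0/errors_crc",
        };
        char buf[32];

        for (int i = 0; i < 3; i++) {
                FILE *f = fopen(files[i], "r");

                if (!f || !fgets(buf, sizeof(buf), f)) {
                        perror(files[i]);
                        if (f)
                                fclose(f);
                        continue;
                }
                printf("%s: %s", files[i], buf);        /* value includes '\n' */
                fclose(f);
        }
        return 0;
}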
#include <crypto/hash_info.h> #include <crypto/hash.h> #include <crypto/algapi.h> @@ -156,6 +158,13 @@ #endif /* + * The UBIFS sysfs directory name pattern and maximum name length (3 for "ubi" + * + 1 for "_" and plus 2x2 for 2 UBI numbers and 1 for the trailing zero byte. + */ +#define UBIFS_DFS_DIR_NAME "ubi%d_%d" +#define UBIFS_DFS_DIR_LEN (3 + 1 + 2*2 + 1) + +/* * Lockdep classes for UBIFS inode @ui_mutex. */ enum { @@ -990,6 +999,18 @@ struct ubifs_budg_info { int dent_budget; }; +/** + * ubifs_stats_info - per-FS statistics information. + * @magic_errors: number of bad magic numbers (will be reset with a new mount). + * @node_errors: number of bad nodes (will be reset with a new mount). + * @crc_errors: number of bad crcs (will be reset with a new mount). + */ +struct ubifs_stats_info { + unsigned int magic_errors; + unsigned int node_errors; + unsigned int crc_errors; +}; + struct ubifs_debug_info; /** @@ -1251,6 +1272,10 @@ struct ubifs_debug_info; * @mount_opts: UBIFS-specific mount options * * @dbg: debugging-related information + * @stats: statistics exported over sysfs + * + * @kobj: kobject for /sys/fs/ubifs/ + * @kobj_unregister: completion to unregister sysfs kobject */ struct ubifs_info { struct super_block *vfs_sb; @@ -1286,6 +1311,9 @@ struct ubifs_info { spinlock_t cs_lock; wait_queue_head_t cmt_wq; + struct kobject kobj; + struct completion kobj_unregister; + unsigned int big_lpt:1; unsigned int space_fixup:1; unsigned int double_hash:1; @@ -1493,6 +1521,7 @@ struct ubifs_info { struct ubifs_mount_opts mount_opts; struct ubifs_debug_info *dbg; + struct ubifs_stats_info *stats; }; extern struct list_head ubifs_infos; @@ -2072,6 +2101,12 @@ void ubifs_compress(const struct ubifs_info *c, const void *in_buf, int in_len, int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len, void *out, int *out_len, int compr_type); +/* sysfs.c */ +int ubifs_sysfs_init(void); +void ubifs_sysfs_exit(void); +int ubifs_sysfs_register(struct ubifs_info *c); +void ubifs_sysfs_unregister(struct ubifs_info *c); + #include "debug.h" #include "misc.h" #include "key.h" diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 2ecf0e87660e..b5d611cee749 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -77,6 +77,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode) GFP_KERNEL); } if (!iinfo->i_data) { + make_bad_inode(inode); iput(inode); return ERR_PTR(-ENOMEM); } @@ -86,6 +87,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode) dinfo->i_location.partitionReferenceNum, start, &err); if (err) { + make_bad_inode(inode); iput(inode); return ERR_PTR(err); } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 1d6b7a50736b..ea8f6cd01f50 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -258,10 +258,6 @@ int udf_expand_file_adinicb(struct inode *inode) char *kaddr; struct udf_inode_info *iinfo = UDF_I(inode); int err; - struct writeback_control udf_wbc = { - .sync_mode = WB_SYNC_NONE, - .nr_to_write = 1, - }; WARN_ON_ONCE(!inode_is_locked(inode)); if (!iinfo->i_lenAlloc) { @@ -305,8 +301,10 @@ int udf_expand_file_adinicb(struct inode *inode) iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; /* from now on we have normal address_space methods */ inode->i_data.a_ops = &udf_aops; + set_page_dirty(page); + unlock_page(page); up_write(&iinfo->i_data_sem); - err = inode->i_data.a_ops->writepage(page, &udf_wbc); + err = filemap_fdatawrite(inode->i_mapping); if (err) { /* Restore everything back so that we don't lose data... 
*/ lock_page(page); @@ -317,6 +315,7 @@ int udf_expand_file_adinicb(struct inode *inode) unlock_page(page); iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; inode->i_data.a_ops = &udf_adinicb_aops; + iinfo->i_lenAlloc = inode->i_size; up_write(&iinfo->i_data_sem); } put_page(page); diff --git a/fs/unicode/.gitignore b/fs/unicode/.gitignore index 361294571ab0..51cdf3fb4dd4 100644 --- a/fs/unicode/.gitignore +++ b/fs/unicode/.gitignore @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only /mkutf8data -/utf8data.h +/utf8data.c diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig index 2c27b9a5cd6c..da786a687fdc 100644 --- a/fs/unicode/Kconfig +++ b/fs/unicode/Kconfig @@ -3,12 +3,13 @@ # UTF-8 normalization # config UNICODE - bool "UTF-8 normalization and casefolding support" + tristate "UTF-8 normalization and casefolding support" help Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding - support. + support. If you say M here the large table of case foldings will + be a separate loadable module that gets requested only when a file + system actually use it. config UNICODE_NORMALIZATION_SELFTEST tristate "Test UTF-8 normalization support" depends on UNICODE - default n diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile index b88aecc86550..0cc87423de82 100644 --- a/fs/unicode/Makefile +++ b/fs/unicode/Makefile @@ -1,15 +1,18 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_UNICODE) += unicode.o +ifneq ($(CONFIG_UNICODE),) +obj-y += unicode.o +endif +obj-$(CONFIG_UNICODE) += utf8data.o obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o unicode-y := utf8-norm.o utf8-core.o -$(obj)/utf8-norm.o: $(obj)/utf8data.h +$(obj)/utf8-data.o: $(obj)/utf8data.c -# In the normal build, the checked-in utf8data.h is just shipped. +# In the normal build, the checked-in utf8data.c is just shipped. # -# To generate utf8data.h from UCD, put *.txt files in this directory +# To generate utf8data.c from UCD, put *.txt files in this directory # and pass REGENERATE_UTF8DATA=1 from the command line. ifdef REGENERATE_UTF8DATA @@ -24,15 +27,15 @@ quiet_cmd_utf8data = GEN $@ -t $(srctree)/$(src)/NormalizationTest.txt \ -o $@ -$(obj)/utf8data.h: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE +$(obj)/utf8data.c: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE $(call if_changed,utf8data) else -$(obj)/utf8data.h: $(src)/utf8data.h_shipped FORCE +$(obj)/utf8data.c: $(src)/utf8data.c_shipped FORCE $(call if_changed,shipped) endif -targets += utf8data.h +targets += utf8data.c hostprogs += mkutf8data diff --git a/fs/unicode/mkutf8data.c b/fs/unicode/mkutf8data.c index ff2025ac5a32..bc1a7c8b5c8d 100644 --- a/fs/unicode/mkutf8data.c +++ b/fs/unicode/mkutf8data.c @@ -3287,12 +3287,10 @@ static void write_file(void) open_fail(utf8_name, errno); fprintf(file, "/* This file is generated code, do not edit. 
*/\n"); - fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n"); - fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n"); - fprintf(file, "#endif\n"); fprintf(file, "\n"); - fprintf(file, "static const unsigned int utf8vers = %#x;\n", - unicode_maxage); + fprintf(file, "#include <linux/module.h>\n"); + fprintf(file, "#include <linux/kernel.h>\n"); + fprintf(file, "#include \"utf8n.h\"\n"); fprintf(file, "\n"); fprintf(file, "static const unsigned int utf8agetab[] = {\n"); for (i = 0; i != ages_count; i++) @@ -3339,6 +3337,22 @@ static void write_file(void) fprintf(file, "\n"); } fprintf(file, "};\n"); + fprintf(file, "\n"); + fprintf(file, "struct utf8data_table utf8_data_table = {\n"); + fprintf(file, "\t.utf8agetab = utf8agetab,\n"); + fprintf(file, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n"); + fprintf(file, "\n"); + fprintf(file, "\t.utf8nfdicfdata = utf8nfdicfdata,\n"); + fprintf(file, "\t.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),\n"); + fprintf(file, "\n"); + fprintf(file, "\t.utf8nfdidata = utf8nfdidata,\n"); + fprintf(file, "\t.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),\n"); + fprintf(file, "\n"); + fprintf(file, "\t.utf8data = utf8data,\n"); + fprintf(file, "};\n"); + fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);"); + fprintf(file, "\n"); + fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n"); fclose(file); } diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c index dc25823bfed9..67aaadc3ab07 100644 --- a/fs/unicode/utf8-core.c +++ b/fs/unicode/utf8-core.c @@ -5,16 +5,13 @@ #include <linux/slab.h> #include <linux/parser.h> #include <linux/errno.h> -#include <linux/unicode.h> #include <linux/stringhash.h> #include "utf8n.h" int utf8_validate(const struct unicode_map *um, const struct qstr *str) { - const struct utf8data *data = utf8nfdi(um->version); - - if (utf8nlen(data, str->name, str->len) < 0) + if (utf8nlen(um, UTF8_NFDI, str->name, str->len) < 0) return -1; return 0; } @@ -23,14 +20,13 @@ EXPORT_SYMBOL(utf8_validate); int utf8_strncmp(const struct unicode_map *um, const struct qstr *s1, const struct qstr *s2) { - const struct utf8data *data = utf8nfdi(um->version); struct utf8cursor cur1, cur2; int c1, c2; - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0) + if (utf8ncursor(&cur1, um, UTF8_NFDI, s1->name, s1->len) < 0) return -EINVAL; - if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0) + if (utf8ncursor(&cur2, um, UTF8_NFDI, s2->name, s2->len) < 0) return -EINVAL; do { @@ -50,14 +46,13 @@ EXPORT_SYMBOL(utf8_strncmp); int utf8_strncasecmp(const struct unicode_map *um, const struct qstr *s1, const struct qstr *s2) { - const struct utf8data *data = utf8nfdicf(um->version); struct utf8cursor cur1, cur2; int c1, c2; - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0) + if (utf8ncursor(&cur1, um, UTF8_NFDICF, s1->name, s1->len) < 0) return -EINVAL; - if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0) + if (utf8ncursor(&cur2, um, UTF8_NFDICF, s2->name, s2->len) < 0) return -EINVAL; do { @@ -81,12 +76,11 @@ int utf8_strncasecmp_folded(const struct unicode_map *um, const struct qstr *cf, const struct qstr *s1) { - const struct utf8data *data = utf8nfdicf(um->version); struct utf8cursor cur1; int c1, c2; int i = 0; - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0) + if (utf8ncursor(&cur1, um, UTF8_NFDICF, s1->name, s1->len) < 0) return -EINVAL; do { @@ -105,11 +99,10 @@ EXPORT_SYMBOL(utf8_strncasecmp_folded); int utf8_casefold(const struct unicode_map *um, const struct qstr *str, unsigned char *dest, size_t dlen) { - const 
struct utf8data *data = utf8nfdicf(um->version); struct utf8cursor cur; size_t nlen = 0; - if (utf8ncursor(&cur, data, str->name, str->len) < 0) + if (utf8ncursor(&cur, um, UTF8_NFDICF, str->name, str->len) < 0) return -EINVAL; for (nlen = 0; nlen < dlen; nlen++) { @@ -128,12 +121,11 @@ EXPORT_SYMBOL(utf8_casefold); int utf8_casefold_hash(const struct unicode_map *um, const void *salt, struct qstr *str) { - const struct utf8data *data = utf8nfdicf(um->version); struct utf8cursor cur; int c; unsigned long hash = init_name_hash(salt); - if (utf8ncursor(&cur, data, str->name, str->len) < 0) + if (utf8ncursor(&cur, um, UTF8_NFDICF, str->name, str->len) < 0) return -EINVAL; while ((c = utf8byte(&cur))) { @@ -149,11 +141,10 @@ EXPORT_SYMBOL(utf8_casefold_hash); int utf8_normalize(const struct unicode_map *um, const struct qstr *str, unsigned char *dest, size_t dlen) { - const struct utf8data *data = utf8nfdi(um->version); struct utf8cursor cur; ssize_t nlen = 0; - if (utf8ncursor(&cur, data, str->name, str->len) < 0) + if (utf8ncursor(&cur, um, UTF8_NFDI, str->name, str->len) < 0) return -EINVAL; for (nlen = 0; nlen < dlen; nlen++) { @@ -167,69 +158,59 @@ int utf8_normalize(const struct unicode_map *um, const struct qstr *str, } return -EINVAL; } - EXPORT_SYMBOL(utf8_normalize); -static int utf8_parse_version(const char *version, unsigned int *maj, - unsigned int *min, unsigned int *rev) +static const struct utf8data *find_table_version(const struct utf8data *table, + size_t nr_entries, unsigned int version) { - substring_t args[3]; - char version_string[12]; - static const struct match_token token[] = { - {1, "%d.%d.%d"}, - {0, NULL} - }; - - strncpy(version_string, version, sizeof(version_string)); - - if (match_token(version_string, token, args) != 1) - return -EINVAL; - - if (match_int(&args[0], maj) || match_int(&args[1], min) || - match_int(&args[2], rev)) - return -EINVAL; + size_t i = nr_entries - 1; - return 0; + while (version < table[i].maxage) + i--; + if (version > table[i].maxage) + return NULL; + return &table[i]; } -struct unicode_map *utf8_load(const char *version) +struct unicode_map *utf8_load(unsigned int version) { - struct unicode_map *um = NULL; - int unicode_version; - - if (version) { - unsigned int maj, min, rev; - - if (utf8_parse_version(version, &maj, &min, &rev) < 0) - return ERR_PTR(-EINVAL); - - if (!utf8version_is_supported(maj, min, rev)) - return ERR_PTR(-EINVAL); - - unicode_version = UNICODE_AGE(maj, min, rev); - } else { - unicode_version = utf8version_latest(); - printk(KERN_WARNING"UTF-8 version not specified. 
" - "Assuming latest supported version (%d.%d.%d).", - (unicode_version >> 16) & 0xff, - (unicode_version >> 8) & 0xff, - (unicode_version & 0xff)); - } + struct unicode_map *um; um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL); if (!um) return ERR_PTR(-ENOMEM); - - um->charset = "UTF-8"; - um->version = unicode_version; - + um->version = version; + + um->tables = symbol_request(utf8_data_table); + if (!um->tables) + goto out_free_um; + + if (!utf8version_is_supported(um, version)) + goto out_symbol_put; + um->ntab[UTF8_NFDI] = find_table_version(um->tables->utf8nfdidata, + um->tables->utf8nfdidata_size, um->version); + if (!um->ntab[UTF8_NFDI]) + goto out_symbol_put; + um->ntab[UTF8_NFDICF] = find_table_version(um->tables->utf8nfdicfdata, + um->tables->utf8nfdicfdata_size, um->version); + if (!um->ntab[UTF8_NFDICF]) + goto out_symbol_put; return um; + +out_symbol_put: + symbol_put(um->tables); +out_free_um: + kfree(um); + return ERR_PTR(-EINVAL); } EXPORT_SYMBOL(utf8_load); void utf8_unload(struct unicode_map *um) { - kfree(um); + if (um) { + symbol_put(utf8_data_table); + kfree(um); + } } EXPORT_SYMBOL(utf8_unload); diff --git a/fs/unicode/utf8-norm.c b/fs/unicode/utf8-norm.c index 1d2d2e5b906a..768f8ab448b8 100644 --- a/fs/unicode/utf8-norm.c +++ b/fs/unicode/utf8-norm.c @@ -6,34 +6,17 @@ #include "utf8n.h" -struct utf8data { - unsigned int maxage; - unsigned int offset; -}; - -#define __INCLUDED_FROM_UTF8NORM_C__ -#include "utf8data.h" -#undef __INCLUDED_FROM_UTF8NORM_C__ - -int utf8version_is_supported(u8 maj, u8 min, u8 rev) +int utf8version_is_supported(const struct unicode_map *um, unsigned int version) { - int i = ARRAY_SIZE(utf8agetab) - 1; - unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev); + int i = um->tables->utf8agetab_size - 1; - while (i >= 0 && utf8agetab[i] != 0) { - if (sb_utf8version == utf8agetab[i]) + while (i >= 0 && um->tables->utf8agetab[i] != 0) { + if (version == um->tables->utf8agetab[i]) return 1; i--; } return 0; } -EXPORT_SYMBOL(utf8version_is_supported); - -int utf8version_latest(void) -{ - return utf8vers; -} -EXPORT_SYMBOL(utf8version_latest); /* * UTF-8 valid ranges. @@ -168,7 +151,7 @@ typedef const unsigned char utf8trie_t; * underlying datatype: unsigned char. * * leaf[0]: The unicode version, stored as a generation number that is - * an index into utf8agetab[]. With this we can filter code + * an index into ->utf8agetab[]. With this we can filter code * points based on the unicode version in which they were * defined. The CCC of a non-defined code point is 0. * leaf[1]: Canonical Combining Class. During normalization, we need @@ -316,21 +299,19 @@ utf8hangul(const char *str, unsigned char *hangul) * is well-formed and corresponds to a known unicode code point. The * shorthand for this will be "is valid UTF-8 unicode". */ -static utf8leaf_t *utf8nlookup(const struct utf8data *data, - unsigned char *hangul, const char *s, size_t len) +static utf8leaf_t *utf8nlookup(const struct unicode_map *um, + enum utf8_normalization n, unsigned char *hangul, const char *s, + size_t len) { - utf8trie_t *trie = NULL; + utf8trie_t *trie = um->tables->utf8data + um->ntab[n]->offset; int offlen; int offset; int mask; int node; - if (!data) - return NULL; if (len == 0) return NULL; - trie = utf8data + data->offset; node = 1; while (node) { offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT; @@ -392,172 +373,29 @@ static utf8leaf_t *utf8nlookup(const struct utf8data *data, * * Forwards to utf8nlookup(). 
*/ -static utf8leaf_t *utf8lookup(const struct utf8data *data, - unsigned char *hangul, const char *s) +static utf8leaf_t *utf8lookup(const struct unicode_map *um, + enum utf8_normalization n, unsigned char *hangul, const char *s) { - return utf8nlookup(data, hangul, s, (size_t)-1); -} - -/* - * Maximum age of any character in s. - * Return -1 if s is not valid UTF-8 unicode. - * Return 0 if only non-assigned code points are used. - */ -int utf8agemax(const struct utf8data *data, const char *s) -{ - utf8leaf_t *leaf; - int age = 0; - int leaf_age; - unsigned char hangul[UTF8HANGULLEAF]; - - if (!data) - return -1; - - while (*s) { - leaf = utf8lookup(data, hangul, s); - if (!leaf) - return -1; - - leaf_age = utf8agetab[LEAF_GEN(leaf)]; - if (leaf_age <= data->maxage && leaf_age > age) - age = leaf_age; - s += utf8clen(s); - } - return age; + return utf8nlookup(um, n, hangul, s, (size_t)-1); } -EXPORT_SYMBOL(utf8agemax); - -/* - * Minimum age of any character in s. - * Return -1 if s is not valid UTF-8 unicode. - * Return 0 if non-assigned code points are used. - */ -int utf8agemin(const struct utf8data *data, const char *s) -{ - utf8leaf_t *leaf; - int age; - int leaf_age; - unsigned char hangul[UTF8HANGULLEAF]; - - if (!data) - return -1; - age = data->maxage; - while (*s) { - leaf = utf8lookup(data, hangul, s); - if (!leaf) - return -1; - leaf_age = utf8agetab[LEAF_GEN(leaf)]; - if (leaf_age <= data->maxage && leaf_age < age) - age = leaf_age; - s += utf8clen(s); - } - return age; -} -EXPORT_SYMBOL(utf8agemin); - -/* - * Maximum age of any character in s, touch at most len bytes. - * Return -1 if s is not valid UTF-8 unicode. - */ -int utf8nagemax(const struct utf8data *data, const char *s, size_t len) -{ - utf8leaf_t *leaf; - int age = 0; - int leaf_age; - unsigned char hangul[UTF8HANGULLEAF]; - - if (!data) - return -1; - - while (len && *s) { - leaf = utf8nlookup(data, hangul, s, len); - if (!leaf) - return -1; - leaf_age = utf8agetab[LEAF_GEN(leaf)]; - if (leaf_age <= data->maxage && leaf_age > age) - age = leaf_age; - len -= utf8clen(s); - s += utf8clen(s); - } - return age; -} -EXPORT_SYMBOL(utf8nagemax); - -/* - * Maximum age of any character in s, touch at most len bytes. - * Return -1 if s is not valid UTF-8 unicode. - */ -int utf8nagemin(const struct utf8data *data, const char *s, size_t len) -{ - utf8leaf_t *leaf; - int leaf_age; - int age; - unsigned char hangul[UTF8HANGULLEAF]; - - if (!data) - return -1; - age = data->maxage; - while (len && *s) { - leaf = utf8nlookup(data, hangul, s, len); - if (!leaf) - return -1; - leaf_age = utf8agetab[LEAF_GEN(leaf)]; - if (leaf_age <= data->maxage && leaf_age < age) - age = leaf_age; - len -= utf8clen(s); - s += utf8clen(s); - } - return age; -} -EXPORT_SYMBOL(utf8nagemin); - -/* - * Length of the normalization of s. - * Return -1 if s is not valid UTF-8 unicode. - * - * A string of Default_Ignorable_Code_Point has length 0. - */ -ssize_t utf8len(const struct utf8data *data, const char *s) -{ - utf8leaf_t *leaf; - size_t ret = 0; - unsigned char hangul[UTF8HANGULLEAF]; - - if (!data) - return -1; - while (*s) { - leaf = utf8lookup(data, hangul, s); - if (!leaf) - return -1; - if (utf8agetab[LEAF_GEN(leaf)] > data->maxage) - ret += utf8clen(s); - else if (LEAF_CCC(leaf) == DECOMPOSE) - ret += strlen(LEAF_STR(leaf)); - else - ret += utf8clen(s); - s += utf8clen(s); - } - return ret; -} -EXPORT_SYMBOL(utf8len); /* * Length of the normalization of s, touch at most len bytes. * Return -1 if s is not valid UTF-8 unicode. 
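With the tables split out into a separate utf8data module, utf8_load() above now takes an encoded version number and resolves both normalization tables via symbol_request(). A hedged sketch of a caller using the reworked interface; examplefs_names_match() is invented, and UNICODE_AGE() is assumed to remain available from linux/unicode.h after this series moves it out of utf8n.h:

#include <linux/unicode.h>
#include <linux/dcache.h>
#include <linux/err.h>

/* Sketch: case-insensitive comparison of two names with the new API. */
static int examplefs_names_match(const struct qstr *a, const struct qstr *b)
{
        struct unicode_map *um;
        int ret;

        um = utf8_load(UNICODE_AGE(12, 1, 0));  /* was utf8_load("12.1.0") */
        if (IS_ERR(um))
                return PTR_ERR(um);

        /* 0 on match, non-zero on mismatch, negative errno on bad UTF-8 */
        ret = utf8_strncasecmp(um, a, b);
        utf8_unload(um);
        return ret;
}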
*/ -ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len) +ssize_t utf8nlen(const struct unicode_map *um, enum utf8_normalization n, + const char *s, size_t len) { utf8leaf_t *leaf; size_t ret = 0; unsigned char hangul[UTF8HANGULLEAF]; - if (!data) - return -1; while (len && *s) { - leaf = utf8nlookup(data, hangul, s, len); + leaf = utf8nlookup(um, n, hangul, s, len); if (!leaf) return -1; - if (utf8agetab[LEAF_GEN(leaf)] > data->maxage) + if (um->tables->utf8agetab[LEAF_GEN(leaf)] > + um->ntab[n]->maxage) ret += utf8clen(s); else if (LEAF_CCC(leaf) == DECOMPOSE) ret += strlen(LEAF_STR(leaf)); @@ -568,7 +406,6 @@ ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len) } return ret; } -EXPORT_SYMBOL(utf8nlen); /* * Set up an utf8cursor for use by utf8byte(). @@ -580,14 +417,13 @@ EXPORT_SYMBOL(utf8nlen); * * Returns -1 on error, 0 on success. */ -int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data, - const char *s, size_t len) +int utf8ncursor(struct utf8cursor *u8c, const struct unicode_map *um, + enum utf8_normalization n, const char *s, size_t len) { - if (!data) - return -1; if (!s) return -1; - u8c->data = data; + u8c->um = um; + u8c->n = n; u8c->s = s; u8c->p = NULL; u8c->ss = NULL; @@ -604,23 +440,6 @@ int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data, return -1; return 0; } -EXPORT_SYMBOL(utf8ncursor); - -/* - * Set up an utf8cursor for use by utf8byte(). - * - * u8c : pointer to cursor. - * data : const struct utf8data to use for normalization. - * s : NUL-terminated string. - * - * Returns -1 on error, 0 on success. - */ -int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data, - const char *s) -{ - return utf8ncursor(u8c, data, s, (unsigned int)-1); -} -EXPORT_SYMBOL(utf8cursor); /* * Get one byte from the normalized form of the string described by u8c. @@ -678,9 +497,9 @@ int utf8byte(struct utf8cursor *u8c) /* Look up the data for the current character. */ if (u8c->p) { - leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s); + leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s); } else { - leaf = utf8nlookup(u8c->data, u8c->hangul, + leaf = utf8nlookup(u8c->um, u8c->n, u8c->hangul, u8c->s, u8c->len); } @@ -690,7 +509,8 @@ int utf8byte(struct utf8cursor *u8c) ccc = LEAF_CCC(leaf); /* Characters that are too new have CCC 0. 
*/ - if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) { + if (u8c->um->tables->utf8agetab[LEAF_GEN(leaf)] > + u8c->um->ntab[u8c->n]->maxage) { ccc = STOPPER; } else if (ccc == DECOMPOSE) { u8c->len -= utf8clen(u8c->s); @@ -704,7 +524,7 @@ int utf8byte(struct utf8cursor *u8c) goto ccc_mismatch; } - leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s); + leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s); if (!leaf) return -1; ccc = LEAF_CCC(leaf); @@ -765,28 +585,10 @@ ccc_mismatch: } } } -EXPORT_SYMBOL(utf8byte); - -const struct utf8data *utf8nfdi(unsigned int maxage) -{ - int i = ARRAY_SIZE(utf8nfdidata) - 1; - - while (maxage < utf8nfdidata[i].maxage) - i--; - if (maxage > utf8nfdidata[i].maxage) - return NULL; - return &utf8nfdidata[i]; -} -EXPORT_SYMBOL(utf8nfdi); - -const struct utf8data *utf8nfdicf(unsigned int maxage) -{ - int i = ARRAY_SIZE(utf8nfdicfdata) - 1; - while (maxage < utf8nfdicfdata[i].maxage) - i--; - if (maxage > utf8nfdicfdata[i].maxage) - return NULL; - return &utf8nfdicfdata[i]; -} -EXPORT_SYMBOL(utf8nfdicf); +#ifdef CONFIG_UNICODE_NORMALIZATION_SELFTEST_MODULE +EXPORT_SYMBOL_GPL(utf8version_is_supported); +EXPORT_SYMBOL_GPL(utf8nlen); +EXPORT_SYMBOL_GPL(utf8ncursor); +EXPORT_SYMBOL_GPL(utf8byte); +#endif diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c index 6fe8af7edccb..eb2bbdd688d7 100644 --- a/fs/unicode/utf8-selftest.c +++ b/fs/unicode/utf8-selftest.c @@ -18,9 +18,7 @@ unsigned int failed_tests; unsigned int total_tests; /* Tests will be based on this version. */ -#define latest_maj 12 -#define latest_min 1 -#define latest_rev 0 +#define UTF8_LATEST UNICODE_AGE(12, 1, 0) #define _test(cond, func, line, fmt, ...) do { \ total_tests++; \ @@ -160,18 +158,22 @@ static const struct { } }; -static void check_utf8_nfdi(void) +static ssize_t utf8len(const struct unicode_map *um, enum utf8_normalization n, + const char *s) +{ + return utf8nlen(um, n, s, (size_t)-1); +} + +static int utf8cursor(struct utf8cursor *u8c, const struct unicode_map *um, + enum utf8_normalization n, const char *s) +{ + return utf8ncursor(u8c, um, n, s, (unsigned int)-1); +} + +static void check_utf8_nfdi(struct unicode_map *um) { int i; struct utf8cursor u8c; - const struct utf8data *data; - - data = utf8nfdi(UNICODE_AGE(latest_maj, latest_min, latest_rev)); - if (!data) { - pr_err("%s: Unable to load utf8-%d.%d.%d. Skipping.\n", - __func__, latest_maj, latest_min, latest_rev); - return; - } for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { int len = strlen(nfdi_test_data[i].str); @@ -179,10 +181,11 @@ static void check_utf8_nfdi(void) int j = 0; unsigned char c; - test((utf8len(data, nfdi_test_data[i].str) == nlen)); - test((utf8nlen(data, nfdi_test_data[i].str, len) == nlen)); + test((utf8len(um, UTF8_NFDI, nfdi_test_data[i].str) == nlen)); + test((utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len) == + nlen)); - if (utf8cursor(&u8c, data, nfdi_test_data[i].str) < 0) + if (utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str) < 0) pr_err("can't create cursor\n"); while ((c = utf8byte(&u8c)) > 0) { @@ -196,18 +199,10 @@ static void check_utf8_nfdi(void) } } -static void check_utf8_nfdicf(void) +static void check_utf8_nfdicf(struct unicode_map *um) { int i; struct utf8cursor u8c; - const struct utf8data *data; - - data = utf8nfdicf(UNICODE_AGE(latest_maj, latest_min, latest_rev)); - if (!data) { - pr_err("%s: Unable to load utf8-%d.%d.%d. 
Skipping.\n", - __func__, latest_maj, latest_min, latest_rev); - return; - } for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { int len = strlen(nfdicf_test_data[i].str); @@ -215,10 +210,13 @@ static void check_utf8_nfdicf(void) int j = 0; unsigned char c; - test((utf8len(data, nfdicf_test_data[i].str) == nlen)); - test((utf8nlen(data, nfdicf_test_data[i].str, len) == nlen)); + test((utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str) == + nlen)); + test((utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len) == + nlen)); - if (utf8cursor(&u8c, data, nfdicf_test_data[i].str) < 0) + if (utf8cursor(&u8c, um, UTF8_NFDICF, + nfdicf_test_data[i].str) < 0) pr_err("can't create cursor\n"); while ((c = utf8byte(&u8c)) > 0) { @@ -232,16 +230,9 @@ static void check_utf8_nfdicf(void) } } -static void check_utf8_comparisons(void) +static void check_utf8_comparisons(struct unicode_map *table) { int i; - struct unicode_map *table = utf8_load("12.1.0"); - - if (IS_ERR(table)) { - pr_err("%s: Unable to load utf8 %d.%d.%d. Skipping.\n", - __func__, latest_maj, latest_min, latest_rev); - return; - } for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { const struct qstr s1 = {.name = nfdi_test_data[i].str, @@ -262,42 +253,49 @@ static void check_utf8_comparisons(void) test_f(!utf8_strncasecmp(table, &s1, &s2), "%s %s comparison mismatch\n", s1.name, s2.name); } - - utf8_unload(table); } -static void check_supported_versions(void) +static void check_supported_versions(struct unicode_map *um) { /* Unicode 7.0.0 should be supported. */ - test(utf8version_is_supported(7, 0, 0)); + test(utf8version_is_supported(um, UNICODE_AGE(7, 0, 0))); /* Unicode 9.0.0 should be supported. */ - test(utf8version_is_supported(9, 0, 0)); + test(utf8version_is_supported(um, UNICODE_AGE(9, 0, 0))); /* Unicode 1x.0.0 (the latest version) should be supported. */ - test(utf8version_is_supported(latest_maj, latest_min, latest_rev)); + test(utf8version_is_supported(um, UTF8_LATEST)); /* Next versions don't exist. */ - test(!utf8version_is_supported(13, 0, 0)); - test(!utf8version_is_supported(0, 0, 0)); - test(!utf8version_is_supported(-1, -1, -1)); + test(!utf8version_is_supported(um, UNICODE_AGE(13, 0, 0))); + test(!utf8version_is_supported(um, UNICODE_AGE(0, 0, 0))); + test(!utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1))); } static int __init init_test_ucd(void) { + struct unicode_map *um; + failed_tests = 0; total_tests = 0; - check_supported_versions(); - check_utf8_nfdi(); - check_utf8_nfdicf(); - check_utf8_comparisons(); + um = utf8_load(UTF8_LATEST); + if (IS_ERR(um)) { + pr_err("%s: Unable to load utf8 table.\n", __func__); + return PTR_ERR(um); + } + + check_supported_versions(um); + check_utf8_nfdi(um); + check_utf8_nfdicf(um); + check_utf8_comparisons(um); if (!failed_tests) pr_info("All %u tests passed\n", total_tests); else pr_err("%u out of %u tests failed\n", failed_tests, total_tests); + utf8_unload(um); return 0; } diff --git a/fs/unicode/utf8data.h_shipped b/fs/unicode/utf8data.c_shipped index 76e4f0e1b089..d9b62901aa96 100644 --- a/fs/unicode/utf8data.h_shipped +++ b/fs/unicode/utf8data.c_shipped @@ -1,9 +1,8 @@ /* This file is generated code, do not edit. */ -#ifndef __INCLUDED_FROM_UTF8NORM_C__ -#error Only nls_utf8-norm.c should include this file. 
-#endif -static const unsigned int utf8vers = 0xc0100; +#include <linux/module.h> +#include <linux/kernel.h> +#include "utf8n.h" static const unsigned int utf8agetab[] = { 0, @@ -4107,3 +4106,18 @@ static const unsigned char utf8data[64256] = { 0x52,0x04,0x00,0x00,0x11,0x04,0x00,0x00,0x02,0x00,0xcf,0x86,0xcf,0x06,0x02,0x00, 0x81,0x80,0xcf,0x86,0x85,0x84,0xcf,0x86,0xcf,0x06,0x02,0x00,0x00,0x00,0x00,0x00 }; + +struct utf8data_table utf8_data_table = { + .utf8agetab = utf8agetab, + .utf8agetab_size = ARRAY_SIZE(utf8agetab), + + .utf8nfdicfdata = utf8nfdicfdata, + .utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata), + + .utf8nfdidata = utf8nfdidata, + .utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata), + + .utf8data = utf8data, +}; +EXPORT_SYMBOL_GPL(utf8_data_table); +MODULE_LICENSE("GPL v2"); diff --git a/fs/unicode/utf8n.h b/fs/unicode/utf8n.h index 0acd530c2c79..bd00d587747a 100644 --- a/fs/unicode/utf8n.h +++ b/fs/unicode/utf8n.h @@ -11,53 +11,9 @@ #include <linux/export.h> #include <linux/string.h> #include <linux/module.h> +#include <linux/unicode.h> -/* Encoding a unicode version number as a single unsigned int. */ -#define UNICODE_MAJ_SHIFT (16) -#define UNICODE_MIN_SHIFT (8) - -#define UNICODE_AGE(MAJ, MIN, REV) \ - (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \ - ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \ - ((unsigned int)(REV))) - -/* Highest unicode version supported by the data tables. */ -extern int utf8version_is_supported(u8 maj, u8 min, u8 rev); -extern int utf8version_latest(void); - -/* - * Look for the correct const struct utf8data for a unicode version. - * Returns NULL if the version requested is too new. - * - * Two normalization forms are supported: nfdi and nfdicf. - * - * nfdi: - * - Apply unicode normalization form NFD. - * - Remove any Default_Ignorable_Code_Point. - * - * nfdicf: - * - Apply unicode normalization form NFD. - * - Remove any Default_Ignorable_Code_Point. - * - Apply a full casefold (C + F). - */ -extern const struct utf8data *utf8nfdi(unsigned int maxage); -extern const struct utf8data *utf8nfdicf(unsigned int maxage); - -/* - * Determine the maximum age of any unicode character in the string. - * Returns 0 if only unassigned code points are present. - * Returns -1 if the input is not valid UTF-8. - */ -extern int utf8agemax(const struct utf8data *data, const char *s); -extern int utf8nagemax(const struct utf8data *data, const char *s, size_t len); - -/* - * Determine the minimum age of any unicode character in the string. - * Returns 0 if any unassigned code points are present. - * Returns -1 if the input is not valid UTF-8. - */ -extern int utf8agemin(const struct utf8data *data, const char *s); -extern int utf8nagemin(const struct utf8data *data, const char *s, size_t len); +int utf8version_is_supported(const struct unicode_map *um, unsigned int version); /* * Determine the length of the normalized from of the string, @@ -65,8 +21,8 @@ extern int utf8nagemin(const struct utf8data *data, const char *s, size_t len); * Returns 0 if only ignorable code points are present. * Returns -1 if the input is not valid UTF-8. */ -extern ssize_t utf8len(const struct utf8data *data, const char *s); -extern ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len); +ssize_t utf8nlen(const struct unicode_map *um, enum utf8_normalization n, + const char *s, size_t len); /* Needed in struct utf8cursor below. 
*/ #define UTF8HANGULLEAF (12) @@ -75,7 +31,8 @@ extern ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len); * Cursor structure used by the normalizer. */ struct utf8cursor { - const struct utf8data *data; + const struct unicode_map *um; + enum utf8_normalization n; const char *s; const char *p; const char *ss; @@ -92,10 +49,8 @@ struct utf8cursor { * Returns 0 on success. * Returns -1 on failure. */ -extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data, - const char *s); -extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data, - const char *s, size_t len); +int utf8ncursor(struct utf8cursor *u8c, const struct unicode_map *um, + enum utf8_normalization n, const char *s, size_t len); /* * Get the next byte in the normalization. @@ -105,4 +60,24 @@ extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data, */ extern int utf8byte(struct utf8cursor *u8c); +struct utf8data { + unsigned int maxage; + unsigned int offset; +}; + +struct utf8data_table { + const unsigned int *utf8agetab; + int utf8agetab_size; + + const struct utf8data *utf8nfdicfdata; + int utf8nfdicfdata_size; + + const struct utf8data *utf8nfdidata; + int utf8nfdidata_size; + + const unsigned char *utf8data; +}; + +extern struct utf8data_table utf8_data_table; + #endif /* UTF8NORM_H */ diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 22bf14ab2d16..e26b10132d47 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -15,6 +15,7 @@ #include <linux/sched/signal.h> #include <linux/sched/mm.h> #include <linux/mm.h> +#include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/poll.h> #include <linux/slab.h> @@ -877,7 +878,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX); + NULL_VM_UFFD_CTX, vma_anon_name(vma)); if (prev) vma = prev; else @@ -1436,7 +1437,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - ((struct vm_userfaultfd_ctx){ ctx })); + ((struct vm_userfaultfd_ctx){ ctx }), + vma_anon_name(vma)); if (prev) { vma = prev; goto next; @@ -1613,7 +1615,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX); + NULL_VM_UFFD_CTX, vma_anon_name(vma)); if (prev) { vma = prev; goto next; diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 6f49bf39183c..c557a030acfe 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -4,7 +4,6 @@ * All Rights Reserved. */ #include "xfs.h" -#include <linux/backing-dev.h> #include "xfs_message.h" #include "xfs_trace.h" @@ -26,6 +25,6 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) "%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)", current->comm, current->pid, (unsigned int)size, __func__, lflags); - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(lflags); } while (1); } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 4dccd4d90622..74198dd82b03 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4551,7 +4551,7 @@ xfs_bmapi_convert_delalloc( * the extent. Just return the real extent at this offset. 
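The kmem.c hunk above swaps congestion_wait() for memalloc_retry_wait() in XFS's allocation retry loop; the new helper sleeps briefly in a manner appropriate to the gfp mask instead of waiting on block-device congestion. A schematic version of such a retry loop, not the XFS helper itself:

#include <linux/slab.h>
#include <linux/sched/mm.h>

/* Sketch: retry a small allocation forever, backing off between attempts. */
static void *example_alloc_retry(size_t size, gfp_t gfp)
{
        void *p;

        do {
                p = kmalloc(size, gfp | __GFP_NOWARN);
                if (p)
                        return p;
                memalloc_retry_wait(gfp);       /* brief, reclaim-aware pause */
        } while (1);
}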
*/ if (!isnullstartblock(bma.got.br_startblock)) { - xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); *seq = READ_ONCE(ifp->if_seq); goto out_trans_cancel; } @@ -4598,7 +4598,7 @@ xfs_bmapi_convert_delalloc( XFS_STATS_INC(mp, xs_xstrat_quick); ASSERT(!isnullstartblock(bma.got.br_startblock)); - xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); *seq = READ_ONCE(ifp->if_seq); if (whichfork == XFS_COW_FORK) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index c43877c8a279..505533c43a92 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -93,21 +93,6 @@ struct getbmapx { #define XFS_FMR_OWN_DEFECTIVE FMR_OWNER('X', 8) /* bad blocks */ /* - * Structure for XFS_IOC_FSSETDM. - * For use by backup and restore programs to set the XFS on-disk inode - * fields di_dmevmask and di_dmstate. These must be set to exactly and - * only values previously obtained via xfs_bulkstat! (Specifically the - * struct xfs_bstat fields bs_dmevmask and bs_dmstate.) - */ -#ifndef HAVE_FSDMIDATA -struct fsdmidata { - __u32 fsd_dmevmask; /* corresponds to di_dmevmask */ - __u16 fsd_padding; - __u16 fsd_dmstate; /* corresponds to di_dmstate */ -}; -#endif - -/* * File segment locking set data type for 64 bit access. * Also used for all the RESV/FREE interfaces. */ @@ -562,16 +547,10 @@ typedef struct xfs_fsop_handlereq { /* * Compound structures for passing args through Handle Request interfaces - * xfs_fssetdm_by_handle, xfs_attrlist_by_handle, xfs_attrmulti_by_handle - * - ioctls: XFS_IOC_FSSETDM_BY_HANDLE, XFS_IOC_ATTRLIST_BY_HANDLE, and - * XFS_IOC_ATTRMULTI_BY_HANDLE + * xfs_attrlist_by_handle, xfs_attrmulti_by_handle + * - ioctls: XFS_IOC_ATTRLIST_BY_HANDLE, and XFS_IOC_ATTRMULTI_BY_HANDLE */ -typedef struct xfs_fsop_setdm_handlereq { - struct xfs_fsop_handlereq hreq; /* handle information */ - struct fsdmidata __user *data; /* DMAPI data */ -} xfs_fsop_setdm_handlereq_t; - /* * Flags passed in xfs_attr_multiop.am_flags for the attr ioctl interface. * @@ -781,15 +760,15 @@ struct xfs_scrub_metadata { * For 'documentation' purposed more than anything else, * the "cmd #" field reflects the IRIX fcntl number. 
*/ -#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) -#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) +/* XFS_IOC_ALLOCSP ------- deprecated 10 */ +/* XFS_IOC_FREESP -------- deprecated 11 */ #define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr) #define XFS_IOC_FSGETXATTR FS_IOC_FSGETXATTR #define XFS_IOC_FSSETXATTR FS_IOC_FSSETXATTR -#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) -#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) +/* XFS_IOC_ALLOCSP64 ----- deprecated 36 */ +/* XFS_IOC_FREESP64 ------ deprecated 37 */ #define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap) -#define XFS_IOC_FSSETDM _IOW ('X', 39, struct fsdmidata) +/* XFS_IOC_FSSETDM ------- deprecated 39 */ #define XFS_IOC_RESVSP _IOW ('X', 40, struct xfs_flock64) #define XFS_IOC_UNRESVSP _IOW ('X', 41, struct xfs_flock64) #define XFS_IOC_RESVSP64 _IOW ('X', 42, struct xfs_flock64) @@ -831,7 +810,7 @@ struct xfs_scrub_metadata { #define XFS_IOC_FREEZE _IOWR('X', 119, int) /* aka FIFREEZE */ #define XFS_IOC_THAW _IOWR('X', 120, int) /* aka FITHAW */ -#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) +/* XFS_IOC_FSSETDM_BY_HANDLE -- deprecated 121 */ #define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) #define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) #define XFS_IOC_FSGEOMETRY_V4 _IOR ('X', 124, struct xfs_fsop_geom_v4) diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index bed798792226..90aebfe9dc5f 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -281,7 +281,7 @@ xchk_superblock( features_mask = cpu_to_be32(XFS_SB_VERSION2_ATTR2BIT); if ((sb->sb_features2 & features_mask) != (cpu_to_be32(mp->m_sb.sb_features2) & features_mask)) - xchk_block_set_corrupt(sc, bp); + xchk_block_set_preen(sc, bp); if (!xfs_has_crc(mp)) { /* all v5 fields must be zero */ @@ -290,39 +290,38 @@ xchk_superblock( offsetof(struct xfs_dsb, sb_features_compat))) xchk_block_set_corrupt(sc, bp); } else { - /* Check compat flags; all are set at mkfs time. */ - features_mask = cpu_to_be32(XFS_SB_FEAT_COMPAT_UNKNOWN); - if ((sb->sb_features_compat & features_mask) != - (cpu_to_be32(mp->m_sb.sb_features_compat) & features_mask)) + /* compat features must match */ + if (sb->sb_features_compat != + cpu_to_be32(mp->m_sb.sb_features_compat)) xchk_block_set_corrupt(sc, bp); - /* Check ro compat flags; all are set at mkfs time. */ - features_mask = cpu_to_be32(XFS_SB_FEAT_RO_COMPAT_UNKNOWN | - XFS_SB_FEAT_RO_COMPAT_FINOBT | - XFS_SB_FEAT_RO_COMPAT_RMAPBT | - XFS_SB_FEAT_RO_COMPAT_REFLINK); - if ((sb->sb_features_ro_compat & features_mask) != - (cpu_to_be32(mp->m_sb.sb_features_ro_compat) & - features_mask)) + /* ro compat features must match */ + if (sb->sb_features_ro_compat != + cpu_to_be32(mp->m_sb.sb_features_ro_compat)) xchk_block_set_corrupt(sc, bp); - /* Check incompat flags; all are set at mkfs time. */ - features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_UNKNOWN | - XFS_SB_FEAT_INCOMPAT_FTYPE | - XFS_SB_FEAT_INCOMPAT_SPINODES | - XFS_SB_FEAT_INCOMPAT_META_UUID); - if ((sb->sb_features_incompat & features_mask) != - (cpu_to_be32(mp->m_sb.sb_features_incompat) & - features_mask)) - xchk_block_set_corrupt(sc, bp); + /* + * NEEDSREPAIR is ignored on a secondary super, so we should + * clear it when we find it, though it's not a corruption. 
+ */ + features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR); + if ((cpu_to_be32(mp->m_sb.sb_features_incompat) ^ + sb->sb_features_incompat) & features_mask) + xchk_block_set_preen(sc, bp); - /* Check log incompat flags; all are set at mkfs time. */ - features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN); - if ((sb->sb_features_log_incompat & features_mask) != - (cpu_to_be32(mp->m_sb.sb_features_log_incompat) & - features_mask)) + /* all other incompat features must match */ + if ((cpu_to_be32(mp->m_sb.sb_features_incompat) ^ + sb->sb_features_incompat) & ~features_mask) xchk_block_set_corrupt(sc, bp); + /* + * log incompat features protect newer log record types from + * older log recovery code. Log recovery doesn't check the + * secondary supers, so we can clear these if needed. + */ + if (sb->sb_features_log_incompat) + xchk_block_set_preen(sc, bp); + /* Don't care about sb_crc */ if (sb->sb_spino_align != cpu_to_be32(mp->m_sb.sb_spino_align)) diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index d7bfed52f4cd..6da7f2ca77de 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -52,6 +52,18 @@ xrep_superblock( xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); xfs_sb_to_disk(bp->b_addr, &mp->m_sb); + /* + * Don't write out a secondary super with NEEDSREPAIR or log incompat + * features set, since both are ignored when set on a secondary. + */ + if (xfs_has_crc(mp)) { + struct xfs_dsb *sb = bp->b_addr; + + sb->sb_features_incompat &= + ~cpu_to_be32(XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR); + sb->sb_features_log_incompat = 0; + } + /* Write this to disk. */ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1); diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 200a63f58fe7..38897adde7b5 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -497,6 +497,7 @@ STATIC int xchk_directory_leaf1_bestfree( struct xfs_scrub *sc, struct xfs_da_args *args, + xfs_dir2_db_t last_data_db, xfs_dablk_t lblk) { struct xfs_dir3_icleaf_hdr leafhdr; @@ -534,10 +535,14 @@ xchk_directory_leaf1_bestfree( } /* - * There should be as many bestfree slots as there are dir data - * blocks that can fit under i_size. + * There must be enough bestfree slots to cover all the directory data + * blocks that we scanned. It is possible for there to be a hole + * between the last data block and i_disk_size. This seems like an + * oversight to the scrub author, but as we have been writing out + * directories like this (and xfs_repair doesn't mind them) for years, + * that's what we have to check. 
*/ - if (bestcount != xfs_dir2_byte_to_db(geo, sc->ip->i_disk_size)) { + if (bestcount != last_data_db + 1) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); goto out; } @@ -669,6 +674,7 @@ xchk_directory_blocks( xfs_fileoff_t lblk; struct xfs_iext_cursor icur; xfs_dablk_t dabno; + xfs_dir2_db_t last_data_db = 0; bool found; int is_block = 0; int error; @@ -712,6 +718,7 @@ xchk_directory_blocks( args.geo->fsbcount); lblk < got.br_startoff + got.br_blockcount; lblk += args.geo->fsbcount) { + last_data_db = xfs_dir2_da_to_db(args.geo, lblk); error = xchk_directory_data_bestfree(sc, lblk, is_block); if (error) @@ -734,7 +741,7 @@ xchk_directory_blocks( xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); goto out; } - error = xchk_directory_leaf1_bestfree(sc, &args, + error = xchk_directory_leaf1_bestfree(sc, &args, last_data_db, leaf_lblk); if (error) goto out; diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 2405b09d03d0..eac15af7b08c 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -233,6 +233,7 @@ xchk_dinode( unsigned long long isize; uint64_t flags2; uint32_t nextents; + prid_t prid; uint16_t flags; uint16_t mode; @@ -267,6 +268,7 @@ xchk_dinode( * so just mark this inode for preening. */ xchk_ino_set_preen(sc, ino); + prid = 0; break; case 2: case 3: @@ -279,12 +281,17 @@ xchk_dinode( if (dip->di_projid_hi != 0 && !xfs_has_projid32(mp)) xchk_ino_set_corrupt(sc, ino); + + prid = be16_to_cpu(dip->di_projid_lo); break; default: xchk_ino_set_corrupt(sc, ino); return; } + if (xfs_has_projid32(mp)) + prid |= (prid_t)be16_to_cpu(dip->di_projid_hi) << 16; + /* * di_uid/di_gid -- -1 isn't invalid, but there's no way that * userspace could have created that. @@ -293,6 +300,13 @@ xchk_dinode( dip->di_gid == cpu_to_be32(-1U)) xchk_ino_set_warning(sc, ino); + /* + * project id of -1 isn't supposed to be valid, but the kernel didn't + * always validate that. 
+ */ + if (prid == -1U) + xchk_ino_set_warning(sc, ino); + /* di_format */ switch (dip->di_format) { case XFS_DINODE_FMT_DEV: diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index d6c1b00a4fc8..3c7506c7553c 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -48,10 +48,10 @@ xchk_setup_quota( dqtype = xchk_quota_to_dqtype(sc); if (dqtype == 0) return -EINVAL; - sc->flags |= XCHK_HAS_QUOTAOFFLOCK; - mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock); + if (!xfs_this_quota_on(sc->mp, dqtype)) return -ENOENT; + error = xchk_setup_fs(sc); if (error) return error; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 8f3cba14ada3..1e7b6b209ee8 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -25,6 +25,7 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_quota.h" +#include "xfs_qm.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -912,11 +913,13 @@ xrep_force_quotacheck( if (!(flag & sc->mp->m_qflags)) return; + mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock); sc->mp->m_qflags &= ~flag; spin_lock(&sc->mp->m_sb_lock); sc->mp->m_sb.sb_qflags &= ~flag; spin_unlock(&sc->mp->m_sb_lock); xfs_log_sb(sc->tp); + mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock); } /* diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 8d528d35b725..b11870d07c56 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -173,10 +173,6 @@ xchk_teardown( mnt_drop_write_file(sc->file); if (sc->flags & XCHK_REAPING_DISABLED) xchk_start_reaping(sc); - if (sc->flags & XCHK_HAS_QUOTAOFFLOCK) { - mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock); - sc->flags &= ~XCHK_HAS_QUOTAOFFLOCK; - } if (sc->buf) { kmem_free(sc->buf); sc->buf = NULL; diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 80e5026bba44..3de5287e98d8 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -88,7 +88,6 @@ struct xfs_scrub { /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */ #define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */ -#define XCHK_HAS_QUOTAOFFLOCK (1 << 1) /* we hold the quotaoff lock */ #define XCHK_REAPING_DISABLED (1 << 2) /* background block reaping paused */ #define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index c8c15c3c3147..9d6a67c7d227 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -136,7 +136,20 @@ done: memalloc_nofs_restore(nofs_flag); } -/* Finish all pending io completions. */ +/* + * Finish all pending IO completions that require transactional modifications. + * + * We try to merge physical and logically contiguous ioends before completion to + * minimise the number of transactions we need to perform during IO completion. + * Both unwritten extent conversion and COW remapping need to iterate and modify + * one physical extent at a time, so we gain nothing by merging physically + * discontiguous extents here. + * + * The ioend chain length that we can be processing here is largely unbound in + * length and we may have to perform significant amounts of work on each ioend + * to complete it. Hence we have to be careful about holding the CPU for too + * long in this loop. 
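The loop below gains a cond_resched() call for exactly this reason. As a minimal sketch of the general pattern — splice off the pending list in one go, complete one item at a time, and yield the CPU between items — with hypothetical names rather than the real xfs_end_io() internals:

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

struct demo_item {
	struct list_head	io_list;
};

static void demo_complete_all(spinlock_t *lock, struct list_head *pending,
			      void (*complete)(struct demo_item *))
{
	struct demo_item	*item;
	LIST_HEAD(done);

	/* Grab everything queued so far in one shot. */
	spin_lock(lock);
	list_splice_init(pending, &done);
	spin_unlock(lock);

	while ((item = list_first_entry_or_null(&done, struct demo_item,
						io_list))) {
		list_del_init(&item->io_list);
		complete(item);		/* may run a transaction per item */
		cond_resched();		/* the chain length is unbounded */
	}
}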
+ */ void xfs_end_io( struct work_struct *work) @@ -157,6 +170,7 @@ xfs_end_io( list_del_init(&ioend->io_list); iomap_ioend_try_merge(ioend, &tmp); xfs_end_ioend(ioend); + cond_resched(); } } @@ -359,7 +373,7 @@ retry: isnullstartblock(imap.br_startblock)) goto allocate_blocks; - xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0); + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0); trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); return 0; allocate_blocks: @@ -437,37 +451,37 @@ xfs_prepare_ioend( * see a ENOSPC in writeback). */ static void -xfs_discard_page( - struct page *page, - loff_t fileoff) +xfs_discard_folio( + struct folio *folio, + loff_t pos) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - unsigned int pageoff = offset_in_page(fileoff); - xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, fileoff); - xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, pageoff); + size_t offset = offset_in_folio(folio, pos); + xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, pos); + xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, offset); int error; if (xfs_is_shutdown(mp)) goto out_invalidate; xfs_alert_ratelimited(mp, - "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.", - page, ip->i_ino, fileoff); + "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.", + folio, ip->i_ino, pos); error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - i_blocks_per_page(inode, page) - pageoff_fsb); + i_blocks_per_folio(inode, folio) - pageoff_fsb); if (error && !xfs_is_shutdown(mp)) xfs_alert(mp, "page discard unable to remove delalloc mapping."); out_invalidate: - iomap_invalidatepage(page, pageoff, PAGE_SIZE - pageoff); + iomap_invalidate_folio(folio, offset, folio_size(folio) - offset); } static const struct iomap_writeback_ops xfs_writeback_ops = { .map_blocks = xfs_map_blocks, .prepare_ioend = xfs_prepare_ioend, - .discard_page = xfs_discard_page, + .discard_folio = xfs_discard_folio, }; STATIC int diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 73a36b7be3bd..eb2e387ba528 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -771,8 +771,7 @@ int xfs_alloc_file_space( struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len, - int alloc_type) + xfs_off_t len) { xfs_mount_t *mp = ip->i_mount; xfs_off_t count; @@ -851,9 +850,6 @@ xfs_alloc_file_space( rblocks = 0; } - /* - * Allocate and setup the transaction. - */ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks, rblocks, false, &tp); if (error) @@ -865,14 +861,14 @@ xfs_alloc_file_space( goto error; error = xfs_bmapi_write(tp, ip, startoffset_fsb, - allocatesize_fsb, alloc_type, 0, imapp, - &nimaps); + allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp, + &nimaps); if (error) goto error; - /* - * Complete the transaction - */ + ip->i_diflags |= XFS_DIFLAG_PREALLOC; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) @@ -1001,7 +997,7 @@ xfs_free_file_space( /* * Now that we've unmap all full blocks we'll have to zero out any - * partial block at the beginning and/or end. iomap_zero_range is smart + * partial block at the beginning and/or end. xfs_zero_range is smart * enough to skip any holes, including those we just created, but we * must take care not to zero beyond EOF and enlarge i_size. 
*/ @@ -1009,15 +1005,14 @@ xfs_free_file_space( return 0; if (offset + len > XFS_ISIZE(ip)) len = XFS_ISIZE(ip) - offset; - error = iomap_zero_range(VFS_I(ip), offset, len, NULL, - &xfs_buffered_write_iomap_ops); + error = xfs_zero_range(ip, offset, len, NULL); if (error) return error; /* * If we zeroed right up to EOF and EOF straddles a page boundary we * must make sure that the post-EOF area is also zeroed because the - * page could be mmap'd and iomap_zero_range doesn't do that for us. + * page could be mmap'd and xfs_zero_range doesn't do that for us. * Writeback of the eof page will do this, albeit clumsily. */ if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) { diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 9f993168b55b..24b37d211f1d 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -54,7 +54,7 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip, /* preallocation and hole punch interface */ int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len, int alloc_type); + xfs_off_t len); int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len); int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 631c5a61d89b..b45e0d50a405 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -394,7 +394,7 @@ xfs_buf_alloc_pages( } XFS_STATS_INC(bp->b_mount, xb_page_retries); - congestion_wait(BLK_RW_ASYNC, HZ / 50); + memalloc_retry_wait(gfp_mask); } return 0; } @@ -1892,6 +1892,7 @@ xfs_free_buftarg( list_lru_destroy(&btp->bt_lru); blkdev_issue_flush(btp->bt_bdev); + fs_put_dax(btp->bt_daxdev); kmem_free(btp); } @@ -1932,11 +1933,10 @@ xfs_setsize_buftarg_early( return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev)); } -xfs_buftarg_t * +struct xfs_buftarg * xfs_alloc_buftarg( struct xfs_mount *mp, - struct block_device *bdev, - struct dax_device *dax_dev) + struct block_device *bdev) { xfs_buftarg_t *btp; @@ -1945,7 +1945,7 @@ xfs_alloc_buftarg( btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; - btp->bt_daxdev = dax_dev; + btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off); /* * Buffer IO error rate limiting. Limit it to no more than 10 messages diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 6b0200b8007d..edcb6254fa6a 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -89,6 +89,7 @@ typedef struct xfs_buftarg { dev_t bt_dev; struct block_device *bt_bdev; struct dax_device *bt_daxdev; + u64 bt_dax_part_off; struct xfs_mount *bt_mount; unsigned int bt_meta_sectorsize; size_t bt_meta_sectormask; @@ -338,8 +339,8 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) /* * Handling of buftargs. 
*/ -extern struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *, - struct block_device *, struct dax_device *); +struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp, + struct block_device *bdev); extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 70ca5751b13e..e484251dc9c8 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -816,7 +816,7 @@ xlog_recover_get_buf_lsn( } if (lsn != (xfs_lsn_t)-1) { - if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid)) goto recover_immediately; return lsn; } diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 8310005af00f..a7174a5b3203 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -138,7 +138,8 @@ xfs_dir2_sf_getdents( STATIC int xfs_dir2_block_getdents( struct xfs_da_args *args, - struct dir_context *ctx) + struct dir_context *ctx, + unsigned int *lock_mode) { struct xfs_inode *dp = args->dp; /* incore directory inode */ struct xfs_buf *bp; /* buffer for block */ @@ -146,7 +147,6 @@ xfs_dir2_block_getdents( int wantoff; /* starting block offset */ xfs_off_t cook; struct xfs_da_geometry *geo = args->geo; - int lock_mode; unsigned int offset, next_offset; unsigned int end; @@ -156,12 +156,13 @@ xfs_dir2_block_getdents( if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) return 0; - lock_mode = xfs_ilock_data_map_shared(dp); error = xfs_dir3_block_read(args->trans, dp, &bp); - xfs_iunlock(dp, lock_mode); if (error) return error; + xfs_iunlock(dp, *lock_mode); + *lock_mode = 0; + /* * Extract the byte offset we start at from the seek pointer. * We'll skip entries before this. @@ -344,7 +345,8 @@ STATIC int xfs_dir2_leaf_getdents( struct xfs_da_args *args, struct dir_context *ctx, - size_t bufsize) + size_t bufsize, + unsigned int *lock_mode) { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; @@ -356,7 +358,6 @@ xfs_dir2_leaf_getdents( xfs_dir2_off_t curoff; /* current overall offset */ int length; /* temporary length value */ int byteoff; /* offset in current block */ - int lock_mode; unsigned int offset = 0; int error = 0; /* error return value */ @@ -390,13 +391,16 @@ xfs_dir2_leaf_getdents( bp = NULL; } - lock_mode = xfs_ilock_data_map_shared(dp); + if (*lock_mode == 0) + *lock_mode = xfs_ilock_data_map_shared(dp); error = xfs_dir2_leaf_readbuf(args, bufsize, &curoff, &rablk, &bp); - xfs_iunlock(dp, lock_mode); if (error || !bp) break; + xfs_iunlock(dp, *lock_mode); + *lock_mode = 0; + xfs_dir3_data_check(dp, bp); /* * Find our position in the block. @@ -496,7 +500,7 @@ xfs_dir2_leaf_getdents( * * If supplied, the transaction collects locked dir buffers to avoid * nested buffer deadlocks. This function does not dirty the - * transaction. The caller should ensure that the inode is locked + * transaction. The caller must hold the IOLOCK (shared or exclusive) * before calling this function. 
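A hypothetical in-kernel caller satisfying that requirement could look like the sketch below; in practice the IOLOCK maps to the VFS i_rwsem, which iterate_dir() already holds when ->iterate_shared runs, and the buffer size here is arbitrary:

#include "xfs.h"
#include "xfs_inode.h"
#include "xfs_dir2.h"

/* Hypothetical helper: walk a directory with the required IOLOCK held. */
static int demo_list_dir(struct xfs_inode *dp, struct dir_context *ctx)
{
	size_t	bufsize = 32 * 1024;	/* arbitrary for this sketch */
	int	error;

	xfs_ilock(dp, XFS_IOLOCK_SHARED);
	error = xfs_readdir(NULL, dp, ctx, bufsize);
	xfs_iunlock(dp, XFS_IOLOCK_SHARED);
	return error;
}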
*/ int @@ -507,8 +511,9 @@ xfs_readdir( size_t bufsize) { struct xfs_da_args args = { NULL }; - int rval; - int v; + unsigned int lock_mode; + int isblock; + int error; trace_xfs_readdir(dp); @@ -516,6 +521,7 @@ xfs_readdir( return -EIO; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); + ASSERT(xfs_isilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); XFS_STATS_INC(dp->i_mount, xs_dir_getdents); args.dp = dp; @@ -523,13 +529,22 @@ xfs_readdir( args.trans = tp; if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_getdents(&args, ctx); - else if ((rval = xfs_dir2_isblock(&args, &v))) - ; - else if (v) - rval = xfs_dir2_block_getdents(&args, ctx); - else - rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize); + return xfs_dir2_sf_getdents(&args, ctx); - return rval; + lock_mode = xfs_ilock_data_map_shared(dp); + error = xfs_dir2_isblock(&args, &isblock); + if (error) + goto out_unlock; + + if (isblock) { + error = xfs_dir2_block_getdents(&args, ctx, &lock_mode); + goto out_unlock; + } + + error = xfs_dir2_leaf_getdents(&args, ctx, bufsize, &lock_mode); + +out_unlock: + if (lock_mode) + xfs_iunlock(dp, lock_mode); + return error; } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index e48ae227bb11..5afedcbc78c7 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -289,13 +289,12 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) */ STATIC int xfs_dquot_disk_alloc( - struct xfs_trans **tpp, struct xfs_dquot *dqp, struct xfs_buf **bpp) { struct xfs_bmbt_irec map; - struct xfs_trans *tp = *tpp; - struct xfs_mount *mp = tp->t_mountp; + struct xfs_trans *tp; + struct xfs_mount *mp = dqp->q_mount; struct xfs_buf *bp; xfs_dqtype_t qtype = xfs_dquot_type(dqp); struct xfs_inode *quotip = xfs_quota_inode(mp, qtype); @@ -304,29 +303,35 @@ xfs_dquot_disk_alloc( trace_xfs_dqalloc(dqp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc, + XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp); + if (error) + return error; + xfs_ilock(quotip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, quotip, 0); + if (!xfs_this_quota_on(dqp->q_mount, qtype)) { /* * Return if this type of quotas is turned off while we didn't * have an inode lock */ - xfs_iunlock(quotip, XFS_ILOCK_EXCL); - return -ESRCH; + error = -ESRCH; + goto err_cancel; } - xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); - error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); if (error) - return error; + goto err_cancel; /* Create the block mapping. */ error = xfs_bmapi_write(tp, quotip, dqp->q_fileoffset, XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, &map, &nmaps); if (error) - return error; + goto err_cancel; + ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); ASSERT(nmaps == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && @@ -341,7 +346,7 @@ xfs_dquot_disk_alloc( error = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0, &bp); if (error) - return error; + goto err_cancel; bp->b_ops = &xfs_dquot_buf_ops; /* @@ -371,16 +376,25 @@ xfs_dquot_disk_alloc( * is responsible for unlocking any buffer passed back, either * manually or by committing the transaction. On error, the buffer is * released and not passed back. + * + * Keep the quota inode ILOCKed until after the transaction commit to + * maintain the atomicity of bmap/rmap updates. 
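The commit path below relies on xfs_trans_bhold() so the freshly initialised buffer stays locked and referenced across xfs_trans_commit(), leaving the caller to release it by hand if the commit fails. A stripped-down sketch of that pattern (hypothetical helper, not the dquot code itself):

static int demo_alloc_and_keep_buf(struct xfs_mount *mp, xfs_daddr_t blkno,
				   int numblks, struct xfs_buf **bpp)
{
	struct xfs_trans	*tp;
	struct xfs_buf		*bp;
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
	if (error)
		return error;

	error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, numblks, 0, &bp);
	if (error) {
		xfs_trans_cancel(tp);
		return error;
	}

	/* Keep @bp locked and referenced once the transaction commits. */
	xfs_trans_bhold(tp, bp);
	error = xfs_trans_commit(tp);
	if (error) {
		xfs_buf_relse(bp);	/* commit failed; we still own the lock */
		return error;
	}

	*bpp = bp;			/* caller releases with xfs_buf_relse() */
	return 0;
}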
*/ xfs_trans_bhold(tp, bp); - error = xfs_defer_finish(tpp); + error = xfs_trans_commit(tp); + xfs_iunlock(quotip, XFS_ILOCK_EXCL); if (error) { - xfs_trans_bhold_release(*tpp, bp); - xfs_trans_brelse(*tpp, bp); + xfs_buf_relse(bp); return error; } + *bpp = bp; return 0; + +err_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(quotip, XFS_ILOCK_EXCL); + return error; } /* @@ -629,43 +643,6 @@ xfs_dquot_to_disk( ddqp->d_rtbtimer = xfs_dquot_to_disk_ts(dqp, dqp->q_rtb.timer); } -/* Allocate and initialize the dquot buffer for this in-core dquot. */ -static int -xfs_qm_dqread_alloc( - struct xfs_mount *mp, - struct xfs_dquot *dqp, - struct xfs_buf **bpp) -{ - struct xfs_trans *tp; - int error; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc, - XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp); - if (error) - goto err; - - error = xfs_dquot_disk_alloc(&tp, dqp, bpp); - if (error) - goto err_cancel; - - error = xfs_trans_commit(tp); - if (error) { - /* - * Buffer was held to the transaction, so we have to unlock it - * manually here because we're not passing it back. - */ - xfs_buf_relse(*bpp); - *bpp = NULL; - goto err; - } - return 0; - -err_cancel: - xfs_trans_cancel(tp); -err: - return error; -} - /* * Read in the ondisk dquot using dqtobp() then copy it to an incore version, * and release the buffer immediately. If @can_alloc is true, fill any @@ -689,7 +666,7 @@ xfs_qm_dqread( /* Try to read the buffer, allocating if necessary. */ error = xfs_dquot_disk_read(mp, dqp, &bp); if (error == -ENOENT && can_alloc) - error = xfs_qm_dqread_alloc(mp, dqp, &bp); + error = xfs_dquot_disk_alloc(dqp, &bp); if (error) goto err; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 81c445e9489b..749fd18c4f32 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -213,11 +213,12 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(ag_resv_fail), NULL, }; +ATTRIBUTE_GROUPS(xfs_errortag); static struct kobj_type xfs_errortag_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_errortag_sysfs_ops, - .default_attrs = xfs_errortag_attrs, + .default_groups = xfs_errortag_groups, }; int diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 27594738b0d1..5bddb1e9e0b3 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -66,40 +66,6 @@ xfs_is_falloc_aligned( return !((pos | len) & mask); } -int -xfs_update_prealloc_flags( - struct xfs_inode *ip, - enum xfs_prealloc_flags flags) -{ - struct xfs_trans *tp; - int error; - - error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid, - 0, 0, 0, &tp); - if (error) - return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - - if (!(flags & XFS_PREALLOC_INVISIBLE)) { - VFS_I(ip)->i_mode &= ~S_ISUID; - if (VFS_I(ip)->i_mode & S_IXGRP) - VFS_I(ip)->i_mode &= ~S_ISGID; - xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - } - - if (flags & XFS_PREALLOC_SET) - ip->i_diflags |= XFS_DIFLAG_PREALLOC; - if (flags & XFS_PREALLOC_CLEAR) - ip->i_diflags &= ~XFS_DIFLAG_PREALLOC; - - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (flags & XFS_PREALLOC_SYNC) - xfs_trans_set_sync(tp); - return xfs_trans_commit(tp); -} - /* * Fsync operations on directories are much simpler than on regular files, * as there is no file data to flush, and thus also no need for explicit @@ -437,8 +403,7 @@ restart: } trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); - error = iomap_zero_range(inode, isize, iocb->ki_pos - isize, - NULL, &xfs_buffered_write_iomap_ops); + error = xfs_zero_range(ip, 
isize, iocb->ki_pos - isize, NULL); if (error) return error; } else @@ -896,6 +861,21 @@ xfs_break_layouts( return error; } +/* Does this file, inode, or mount want synchronous writes? */ +static inline bool xfs_file_sync_writes(struct file *filp) +{ + struct xfs_inode *ip = XFS_I(file_inode(filp)); + + if (xfs_has_wsync(ip->i_mount)) + return true; + if (filp->f_flags & (__O_SYNC | O_DSYNC)) + return true; + if (IS_SYNC(file_inode(filp))) + return true; + + return false; +} + #define XFS_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ @@ -911,7 +891,6 @@ xfs_file_fallocate( struct inode *inode = file_inode(file); struct xfs_inode *ip = XFS_I(inode); long error; - enum xfs_prealloc_flags flags = 0; uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; loff_t new_size = 0; bool do_file_insert = false; @@ -956,6 +935,10 @@ xfs_file_fallocate( goto out_unlock; } + error = file_modified(file); + if (error) + goto out_unlock; + if (mode & FALLOC_FL_PUNCH_HOLE) { error = xfs_free_file_space(ip, offset, len); if (error) @@ -1005,8 +988,6 @@ xfs_file_fallocate( } do_file_insert = true; } else { - flags |= XFS_PREALLOC_SET; - if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > i_size_read(inode)) { new_size = offset + len; @@ -1052,20 +1033,12 @@ xfs_file_fallocate( } if (!xfs_is_always_cow_inode(ip)) { - error = xfs_alloc_file_space(ip, offset, len, - XFS_BMAPI_PREALLOC); + error = xfs_alloc_file_space(ip, offset, len); if (error) goto out_unlock; } } - if (file->f_flags & O_DSYNC) - flags |= XFS_PREALLOC_SYNC; - - error = xfs_update_prealloc_flags(ip, flags); - if (error) - goto out_unlock; - /* Change file size if needed */ if (new_size) { struct iattr iattr; @@ -1084,8 +1057,14 @@ xfs_file_fallocate( * leave shifted extents past EOF and hence losing access to * the data that is contained within them. */ - if (do_file_insert) + if (do_file_insert) { error = xfs_insert_file_space(ip, offset, len); + if (error) + goto out_unlock; + } + + if (xfs_file_sync_writes(file)) + error = xfs_log_force_inode(ip); out_unlock: xfs_iunlock(ip, iolock); @@ -1117,21 +1096,6 @@ xfs_file_fadvise( return ret; } -/* Does this file, inode, or mount want synchronous writes? */ -static inline bool xfs_file_sync_writes(struct file *filp) -{ - struct xfs_inode *ip = XFS_I(file_inode(filp)); - - if (xfs_has_wsync(ip->i_mount)) - return true; - if (filp->f_flags & (__O_SYNC | O_DSYNC)) - return true; - if (IS_SYNC(file_inode(filp))) - return true; - - return false; -} - STATIC loff_t xfs_file_remap_range( struct file *file_in, diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index da4af2142a2b..9644f938990c 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -87,6 +87,7 @@ xfs_inode_alloc( /* VFS doesn't initialise i_mode or i_state! */ VFS_I(ip)->i_mode = 0; VFS_I(ip)->i_state = 0; + mapping_set_large_folios(VFS_I(ip)->i_mapping); XFS_STATS_INC(mp, vn_active); ASSERT(atomic_read(&ip->i_pincount) == 0); @@ -320,6 +321,7 @@ xfs_reinit_inode( inode->i_rdev = dev; inode->i_uid = uid; inode->i_gid = gid; + mapping_set_large_folios(inode->i_mapping); return error; } @@ -749,7 +751,8 @@ again: /* * If we have a real type for an on-disk inode, we can setup the inode - * now. If it's a new inode being created, xfs_ialloc will handle it. + * now. If it's a new inode being created, xfs_init_new_inode will + * handle it. 
*/ if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0) xfs_setup_existing_inode(ip); @@ -1851,28 +1854,20 @@ xfs_inodegc_worker( } /* - * Force all currently queued inode inactivation work to run immediately, and - * wait for the work to finish. Two pass - queue all the work first pass, wait - * for it in a second pass. + * Force all currently queued inode inactivation work to run immediately and + * wait for the work to finish. */ void xfs_inodegc_flush( struct xfs_mount *mp) { - struct xfs_inodegc *gc; - int cpu; - if (!xfs_is_inodegc_enabled(mp)) return; trace_xfs_inodegc_flush(mp, __return_address); xfs_inodegc_queue_all(mp); - - for_each_online_cpu(cpu) { - gc = per_cpu_ptr(mp->m_inodegc, cpu); - flush_work(&gc->work); - } + flush_workqueue(mp->m_inodegc_wq); } /* @@ -1883,18 +1878,12 @@ void xfs_inodegc_stop( struct xfs_mount *mp) { - struct xfs_inodegc *gc; - int cpu; - if (!xfs_clear_inodegc_enabled(mp)) return; xfs_inodegc_queue_all(mp); + drain_workqueue(mp->m_inodegc_wq); - for_each_online_cpu(cpu) { - gc = per_cpu_ptr(mp->m_inodegc, cpu); - cancel_work_sync(&gc->work); - } trace_xfs_inodegc_stop(mp, __return_address); } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 6771f357ad2c..04bf467b1090 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -988,8 +988,8 @@ xfs_create( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns), - mapped_fsgid(mnt_userns), prid, + error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns), + mapped_fsgid(mnt_userns, &init_user_ns), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -1142,8 +1142,8 @@ xfs_create_tmpfile( /* * Make sure that we have allocated dquot(s) on disk. 
*/ - error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns), - mapped_fsgid(mnt_userns), prid, + error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns), + mapped_fsgid(mnt_userns, &init_user_ns), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index c447bf04205a..b7e8f14d9fca 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -462,15 +462,6 @@ xfs_itruncate_extents( } /* from xfs_file.c */ -enum xfs_prealloc_flags { - XFS_PREALLOC_SET = (1 << 1), - XFS_PREALLOC_CLEAR = (1 << 2), - XFS_PREALLOC_SYNC = (1 << 3), - XFS_PREALLOC_INVISIBLE = (1 << 4), -}; - -int xfs_update_prealloc_flags(struct xfs_inode *ip, - enum xfs_prealloc_flags flags); int xfs_break_layouts(struct inode *inode, uint *iolock, enum layout_break_reason reason); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 174cd8950cb6..2515fe8299e1 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -372,7 +372,7 @@ int xfs_ioc_attr_list( struct xfs_inode *dp, void __user *ubuf, - int bufsize, + size_t bufsize, int flags, struct xfs_attrlist_cursor __user *ucursor) { @@ -627,86 +627,6 @@ xfs_attrmulti_by_handle( return error; } -int -xfs_ioc_space( - struct file *filp, - xfs_flock64_t *bf) -{ - struct inode *inode = file_inode(filp); - struct xfs_inode *ip = XFS_I(inode); - struct iattr iattr; - enum xfs_prealloc_flags flags = XFS_PREALLOC_CLEAR; - uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - int error; - - if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) - return -EPERM; - - if (!(filp->f_mode & FMODE_WRITE)) - return -EBADF; - - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - - if (xfs_is_always_cow_inode(ip)) - return -EOPNOTSUPP; - - if (filp->f_flags & O_DSYNC) - flags |= XFS_PREALLOC_SYNC; - if (filp->f_mode & FMODE_NOCMTIME) - flags |= XFS_PREALLOC_INVISIBLE; - - error = mnt_want_write_file(filp); - if (error) - return error; - - xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); - if (error) - goto out_unlock; - inode_dio_wait(inode); - - switch (bf->l_whence) { - case 0: /*SEEK_SET*/ - break; - case 1: /*SEEK_CUR*/ - bf->l_start += filp->f_pos; - break; - case 2: /*SEEK_END*/ - bf->l_start += XFS_ISIZE(ip); - break; - default: - error = -EINVAL; - goto out_unlock; - } - - if (bf->l_start < 0 || bf->l_start > inode->i_sb->s_maxbytes) { - error = -EINVAL; - goto out_unlock; - } - - if (bf->l_start > XFS_ISIZE(ip)) { - error = xfs_alloc_file_space(ip, XFS_ISIZE(ip), - bf->l_start - XFS_ISIZE(ip), 0); - if (error) - goto out_unlock; - } - - iattr.ia_valid = ATTR_SIZE; - iattr.ia_size = bf->l_start; - error = xfs_vn_setattr_size(file_mnt_user_ns(filp), file_dentry(filp), - &iattr); - if (error) - goto out_unlock; - - error = xfs_update_prealloc_flags(ip, flags); - -out_unlock: - xfs_iunlock(ip, iolock); - mnt_drop_write_file(filp); - return error; -} - /* Return 0 on success or positive error */ int xfs_fsbulkstat_one_fmt( @@ -1544,7 +1464,7 @@ xfs_ioc_getbmap( if (bmx.bmv_count < 2) return -EINVAL; - if (bmx.bmv_count > ULONG_MAX / recsize) + if (bmx.bmv_count >= INT_MAX / recsize) return -ENOMEM; buf = kvcalloc(bmx.bmv_count, sizeof(*buf), GFP_KERNEL); @@ -1935,6 +1855,15 @@ xfs_fs_eofblocks_from_user( } /* + * These long-unused ioctls were removed from the official ioctl API in 5.17, + * but retain these definitions so that we can log warnings about them. 
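For userspace that still issues these calls, the removed behaviour (set the file size to l_start, allocating real blocks when growing) can be approximated with the standard interfaces the warning points at. A rough sketch with a hypothetical helper name:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

/* Roughly mimic the retired ALLOCSP/FREESP semantics with fallocate/ftruncate. */
static int demo_set_size_allocated(int fd, off_t newsize)
{
	struct stat st;

	if (fstat(fd, &st) < 0)
		return -1;
	if (newsize > st.st_size) {
		/* Grow: allocate blocks and extend i_size, as ALLOCSP did. */
		return fallocate(fd, 0, 0, newsize);
	}
	/* Shrink (or no change): truncate, which is what FREESP amounted to. */
	return ftruncate(fd, newsize);
}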
+ */ +#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) +#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) +#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) +#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) + +/* * Note: some of the ioctl's return positive numbers as a * byte count indicating success, such as readlink_by_handle. * So we don't "sign flip" like most other routines. This means @@ -1964,13 +1893,11 @@ xfs_file_ioctl( case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: case XFS_IOC_ALLOCSP64: - case XFS_IOC_FREESP64: { - xfs_flock64_t bf; - - if (copy_from_user(&bf, arg, sizeof(bf))) - return -EFAULT; - return xfs_ioc_space(filp, &bf); - } + case XFS_IOC_FREESP64: + xfs_warn_once(mp, + "%s should use fallocate; XFS_IOC_{ALLOC,FREE}SP ioctl unsupported", + current->comm); + return -ENOTTY; case XFS_IOC_DIOINFO: { struct xfs_buftarg *target = xfs_inode_buftarg(ip); struct dioattr da; diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 28453a6d4461..d4abba2c13c1 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -10,12 +10,6 @@ struct xfs_bstat; struct xfs_ibulk; struct xfs_inogrp; - -extern int -xfs_ioc_space( - struct file *filp, - xfs_flock64_t *bf); - int xfs_ioc_swapext( xfs_swapext_t *sxp); @@ -38,8 +32,9 @@ xfs_readlink_by_handle( int xfs_ioc_attrmulti_one(struct file *parfilp, struct inode *inode, uint32_t opcode, void __user *uname, void __user *value, uint32_t *len, uint32_t flags); -int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf, int bufsize, - int flags, struct xfs_attrlist_cursor __user *ucursor); +int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf, + size_t bufsize, int flags, + struct xfs_attrlist_cursor __user *ucursor); extern struct dentry * xfs_handle_to_dentry( diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 8783af203cfc..004ed2a251e8 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -28,22 +28,6 @@ #ifdef BROKEN_X86_ALIGNMENT STATIC int -xfs_compat_flock64_copyin( - xfs_flock64_t *bf, - compat_xfs_flock64_t __user *arg32) -{ - if (get_user(bf->l_type, &arg32->l_type) || - get_user(bf->l_whence, &arg32->l_whence) || - get_user(bf->l_start, &arg32->l_start) || - get_user(bf->l_len, &arg32->l_len) || - get_user(bf->l_sysid, &arg32->l_sysid) || - get_user(bf->l_pid, &arg32->l_pid) || - copy_from_user(bf->l_pad, &arg32->l_pad, 4*sizeof(u32))) - return -EFAULT; - return 0; -} - -STATIC int xfs_compat_ioc_fsgeometry_v1( struct xfs_mount *mp, compat_xfs_fsop_geom_v1_t __user *arg32) @@ -445,17 +429,6 @@ xfs_file_compat_ioctl( switch (cmd) { #if defined(BROKEN_X86_ALIGNMENT) - case XFS_IOC_ALLOCSP_32: - case XFS_IOC_FREESP_32: - case XFS_IOC_ALLOCSP64_32: - case XFS_IOC_FREESP64_32: { - struct xfs_flock64 bf; - - if (xfs_compat_flock64_copyin(&bf, arg)) - return -EFAULT; - cmd = _NATIVE_IOC(cmd, struct xfs_flock64); - return xfs_ioc_space(filp, &bf); - } case XFS_IOC_FSGEOMETRY_V1_32: return xfs_compat_ioc_fsgeometry_v1(ip->i_mount, arg); case XFS_IOC_FSGROWFSDATA_32: { diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h index 9929482bf358..c14852362fce 100644 --- a/fs/xfs/xfs_ioctl32.h +++ b/fs/xfs/xfs_ioctl32.h @@ -142,28 +142,6 @@ typedef struct compat_xfs_fsop_attrmulti_handlereq { _IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq) #ifdef BROKEN_X86_ALIGNMENT -/* on ia32 l_start is on a 32-bit boundary */ -typedef struct compat_xfs_flock64 { - __s16 l_type; - __s16 l_whence; - __s64 l_start __attribute__((packed)); - /* len == 0 means until end of file 
*/ - __s64 l_len __attribute__((packed)); - __s32 l_sysid; - __u32 l_pid; - __s32 l_pad[4]; /* reserve area */ -} compat_xfs_flock64_t; - -#define XFS_IOC_ALLOCSP_32 _IOW('X', 10, struct compat_xfs_flock64) -#define XFS_IOC_FREESP_32 _IOW('X', 11, struct compat_xfs_flock64) -#define XFS_IOC_ALLOCSP64_32 _IOW('X', 36, struct compat_xfs_flock64) -#define XFS_IOC_FREESP64_32 _IOW('X', 37, struct compat_xfs_flock64) -#define XFS_IOC_RESVSP_32 _IOW('X', 40, struct compat_xfs_flock64) -#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64) -#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64) -#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64) -#define XFS_IOC_ZERO_RANGE_32 _IOW('X', 57, struct compat_xfs_flock64) - typedef struct compat_xfs_fsop_geom_v1 { __u32 blocksize; /* filesystem (data) block size */ __u32 rtextsize; /* realtime extent size */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 093758440ad5..e552ce541ec2 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -28,7 +28,6 @@ #include "xfs_dquot.h" #include "xfs_reflink.h" - #define XFS_ALLOC_ALIGN(mp, off) \ (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) @@ -54,7 +53,8 @@ xfs_bmbt_to_iomap( struct xfs_inode *ip, struct iomap *iomap, struct xfs_bmbt_irec *imap, - u16 flags) + unsigned int mapping_flags, + u16 iomap_flags) { struct xfs_mount *mp = ip->i_mount; struct xfs_buftarg *target = xfs_inode_buftarg(ip); @@ -71,16 +71,22 @@ xfs_bmbt_to_iomap( iomap->type = IOMAP_DELALLOC; } else { iomap->addr = BBTOB(xfs_fsb_to_db(ip, imap->br_startblock)); + if (mapping_flags & IOMAP_DAX) + iomap->addr += target->bt_dax_part_off; + if (imap->br_state == XFS_EXT_UNWRITTEN) iomap->type = IOMAP_UNWRITTEN; else iomap->type = IOMAP_MAPPED; + } iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); - iomap->bdev = target->bt_bdev; - iomap->dax_dev = target->bt_daxdev; - iomap->flags = flags; + if (mapping_flags & IOMAP_DAX) + iomap->dax_dev = target->bt_daxdev; + else + iomap->bdev = target->bt_bdev; + iomap->flags = iomap_flags; if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) @@ -188,6 +194,7 @@ xfs_iomap_write_direct( struct xfs_inode *ip, xfs_fileoff_t offset_fsb, xfs_fileoff_t count_fsb, + unsigned int flags, struct xfs_bmbt_irec *imap) { struct xfs_mount *mp = ip->i_mount; @@ -229,7 +236,7 @@ xfs_iomap_write_direct( * the reserve block pool for bmbt block allocation if there is no space * left but we need to do unwritten extent conversion. 
*/ - if (IS_DAX(VFS_I(ip))) { + if (flags & IOMAP_DAX) { bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; if (imap->br_state == XFS_EXT_UNWRITTEN) { force = true; @@ -620,7 +627,7 @@ imap_needs_alloc( imap->br_startblock == DELAYSTARTBLOCK) return true; /* we convert unwritten extents before copying the data for DAX */ - if (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN) + if ((flags & IOMAP_DAX) && imap->br_state == XFS_EXT_UNWRITTEN) return true; return false; } @@ -800,7 +807,7 @@ xfs_direct_write_iomap_begin( xfs_iunlock(ip, lockmode); trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags); allocate_blocks: error = -EAGAIN; @@ -826,23 +833,24 @@ allocate_blocks: xfs_iunlock(ip, lockmode); error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb, - &imap); + flags, &imap); if (error) return error; trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags | IOMAP_F_NEW); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, + iomap_flags | IOMAP_F_NEW); out_found_cow: xfs_iunlock(ip, lockmode); length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); if (imap.br_startblock != HOLESTARTBLOCK) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); if (error) return error; } - return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED); out_unlock: if (lockmode) @@ -1052,23 +1060,24 @@ retry: */ xfs_iunlock(ip, XFS_ILOCK_EXCL); trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, IOMAP_F_NEW); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW); found_imap: xfs_iunlock(ip, XFS_ILOCK_EXCL); - return xfs_bmbt_to_iomap(ip, iomap, &imap, 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); found_cow: xfs_iunlock(ip, XFS_ILOCK_EXCL); if (imap.br_startoff <= offset_fsb) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); if (error) return error; - return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, + IOMAP_F_SHARED); } xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); - return xfs_bmbt_to_iomap(ip, iomap, &cmap, 0); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1177,7 +1186,8 @@ xfs_read_iomap_begin( if (error) return error; trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, shared ? IOMAP_F_SHARED : 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, + shared ? 
IOMAP_F_SHARED : 0); } const struct iomap_ops xfs_read_iomap_ops = { @@ -1236,7 +1246,8 @@ xfs_seek_iomap_begin( if (data_fsb < cow_fsb + cmap.br_blockcount) end_fsb = min(end_fsb, data_fsb); xfs_trim_extent(&cmap, offset_fsb, end_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, + IOMAP_F_SHARED); /* * This is a COW extent, so we must probe the page cache * because there could be dirty page cache being backed @@ -1258,7 +1269,7 @@ xfs_seek_iomap_begin( imap.br_state = XFS_EXT_NORM; done: xfs_trim_extent(&imap, offset_fsb, end_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); out_unlock: xfs_iunlock(ip, lockmode); return error; @@ -1305,9 +1316,40 @@ out_unlock: if (error) return error; ASSERT(nimaps); - return xfs_bmbt_to_iomap(ip, iomap, &imap, 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); } const struct iomap_ops xfs_xattr_iomap_ops = { .iomap_begin = xfs_xattr_iomap_begin, }; + +int +xfs_zero_range( + struct xfs_inode *ip, + loff_t pos, + loff_t len, + bool *did_zero) +{ + struct inode *inode = VFS_I(ip); + + if (IS_DAX(inode)) + return dax_zero_range(inode, pos, len, did_zero, + &xfs_direct_write_iomap_ops); + return iomap_zero_range(inode, pos, len, did_zero, + &xfs_buffered_write_iomap_ops); +} + +int +xfs_truncate_page( + struct xfs_inode *ip, + loff_t pos, + bool *did_zero) +{ + struct inode *inode = VFS_I(ip); + + if (IS_DAX(inode)) + return dax_truncate_page(inode, pos, did_zero, + &xfs_direct_write_iomap_ops); + return iomap_truncate_page(inode, pos, did_zero, + &xfs_buffered_write_iomap_ops); +} diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 7d3703556d0e..e88dc162c785 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -12,13 +12,19 @@ struct xfs_inode; struct xfs_bmbt_irec; int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, - xfs_fileoff_t count_fsb, struct xfs_bmbt_irec *imap); + xfs_fileoff_t count_fsb, unsigned int flags, + struct xfs_bmbt_irec *imap); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip, xfs_fileoff_t end_fsb); -int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, - struct xfs_bmbt_irec *, u16); +int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap, + struct xfs_bmbt_irec *imap, unsigned int mapping_flags, + u16 iomap_flags); + +int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len, + bool *did_zero); +int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero); static inline xfs_filblks_t xfs_aligned_fsb_count( diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index a607d6aca5c4..b79b3846e71b 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -511,27 +511,6 @@ xfs_vn_get_link( return ERR_PTR(error); } -STATIC const char * -xfs_vn_get_link_inline( - struct dentry *dentry, - struct inode *inode, - struct delayed_call *done) -{ - struct xfs_inode *ip = XFS_I(inode); - char *link; - - ASSERT(ip->i_df.if_format == XFS_DINODE_FMT_LOCAL); - - /* - * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED if - * if_data is junk. 
- */ - link = ip->i_df.if_u1.if_data; - if (XFS_IS_CORRUPT(ip->i_mount, !link)) - return ERR_PTR(-EFSCORRUPTED); - return link; -} - static uint32_t xfs_stat_blksize( struct xfs_inode *ip) @@ -911,8 +890,8 @@ xfs_setattr_size( */ if (newsize > oldsize) { trace_xfs_zero_eof(ip, oldsize, newsize - oldsize); - error = iomap_zero_range(inode, oldsize, newsize - oldsize, - &did_zeroing, &xfs_buffered_write_iomap_ops); + error = xfs_zero_range(ip, oldsize, newsize - oldsize, + &did_zeroing); } else { /* * iomap won't detect a dirty page over an unwritten block (or a @@ -924,8 +903,7 @@ xfs_setattr_size( newsize); if (error) return error; - error = iomap_truncate_page(inode, newsize, &did_zeroing, - &xfs_buffered_write_iomap_ops); + error = xfs_truncate_page(ip, newsize, &did_zeroing); } if (error) @@ -1250,14 +1228,6 @@ static const struct inode_operations xfs_symlink_inode_operations = { .update_time = xfs_vn_update_time, }; -static const struct inode_operations xfs_inline_symlink_inode_operations = { - .get_link = xfs_vn_get_link_inline, - .getattr = xfs_vn_getattr, - .setattr = xfs_vn_setattr, - .listxattr = xfs_vn_listxattr, - .update_time = xfs_vn_update_time, -}; - /* Figure out if this file actually supports DAX. */ static bool xfs_inode_supports_dax( @@ -1332,9 +1302,9 @@ xfs_diflags_to_iflags( * Initialize the Linux inode. * * When reading existing inodes from disk this is called directly from xfs_iget, - * when creating a new inode it is called from xfs_ialloc after setting up the - * inode. These callers have different criteria for clearing XFS_INEW, so leave - * it up to the caller to deal with unlocking the inode appropriately. + * when creating a new inode it is called from xfs_init_new_inode after setting + * up the inode. These callers have different criteria for clearing XFS_INEW, so + * leave it up to the caller to deal with unlocking the inode appropriately. */ void xfs_setup_inode( @@ -1408,10 +1378,7 @@ xfs_setup_iops( inode->i_fop = &xfs_dir_file_operations; break; case S_IFLNK: - if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) - inode->i_op = &xfs_inline_symlink_inode_operations; - else - inode->i_op = &xfs_symlink_inode_operations; + inode->i_op = &xfs_symlink_inode_operations; break; default: inode->i_op = &xfs_inode_operations; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index c174262a074e..09a8fba84ff9 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -61,6 +61,7 @@ typedef __u32 xfs_nlink_t; #include <linux/ratelimit.h> #include <linux/rhashtable.h> #include <linux/xattr.h> +#include <linux/mnt_idmapping.h> #include <asm/page.h> #include <asm/div64.h> diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 6c93c8ada6f3..83a039762b81 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -103,6 +103,39 @@ xlog_cil_iovec_space( } /* + * shadow buffers can be large, so we need to use kvmalloc() here to ensure + * success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts to fall + * back to vmalloc, so we can't actually do anything useful with gfp flags to + * control the kmalloc() behaviour within kvmalloc(). Hence kmalloc() will do + * direct reclaim and compaction in the slow path, both of which are + * horrendously expensive. We just want kmalloc to fail fast and fall back to + * vmalloc if it can't get somethign straight away from the free lists or buddy + * allocator. Hence we have to open code kvmalloc outselves here. 
+ * + * Also, we are in memalloc_nofs_save task context here, so despite the use of + * GFP_KERNEL here, we are actually going to be doing GFP_NOFS allocations. This + * is actually the only way to make vmalloc() do GFP_NOFS allocations, so lets + * just all pretend this is a GFP_KERNEL context operation.... + */ +static inline void * +xlog_cil_kvmalloc( + size_t buf_size) +{ + gfp_t flags = GFP_KERNEL; + void *p; + + flags &= ~__GFP_DIRECT_RECLAIM; + flags |= __GFP_NOWARN | __GFP_NORETRY; + do { + p = kmalloc(buf_size, flags); + if (!p) + p = vmalloc(buf_size); + } while (!p); + + return p; +} + +/* * Allocate or pin log vector buffers for CIL insertion. * * The CIL currently uses disposable buffers for copying a snapshot of the @@ -203,25 +236,16 @@ xlog_cil_alloc_shadow_bufs( */ if (!lip->li_lv_shadow || buf_size > lip->li_lv_shadow->lv_size) { - /* * We free and allocate here as a realloc would copy - * unnecessary data. We don't use kmem_zalloc() for the + * unnecessary data. We don't use kvzalloc() for the * same reason - we don't need to zero the data area in * the buffer, only the log vector header and the iovec * storage. */ kmem_free(lip->li_lv_shadow); + lv = xlog_cil_kvmalloc(buf_size); - /* - * We are in transaction context, which means this - * allocation will pick up GFP_NOFS from the - * memalloc_nofs_save/restore context the transaction - * holds. This means we can use GFP_KERNEL here so the - * generic kvmalloc() code will run vmalloc on - * contiguous page allocation failure as we require. - */ - lv = kvmalloc(buf_size, GFP_KERNEL); memset(lv, 0, xlog_cil_iovec_space(niovecs)); lv->lv_item = lip; @@ -1442,9 +1466,9 @@ out_shutdown: */ bool xfs_log_item_in_current_chkpt( - struct xfs_log_item *lip) + struct xfs_log_item *lip) { - struct xfs_cil_ctx *ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; + struct xfs_cil *cil = lip->li_mountp->m_log->l_cilp; if (list_empty(&lip->li_cil)) return false; @@ -1454,7 +1478,7 @@ xfs_log_item_in_current_chkpt( * first checkpoint it is written to. Hence if it is different to the * current sequence, we're in a new checkpoint. */ - return lip->li_seq == ctx->sequence; + return lip->li_seq == READ_ONCE(cil->xc_current_sequence); } /* diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 53366cc0bc9e..96c997ed2ec8 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -27,7 +27,7 @@ #include "xfs_buf_item.h" #include "xfs_ag.h" #include "xfs_quota.h" - +#include "xfs_reflink.h" #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) @@ -3498,6 +3498,28 @@ xlog_recover_finish( xlog_recover_process_iunlinks(log); xlog_recover_check_summary(log); + + /* + * Recover any CoW staging blocks that are still referenced by the + * ondisk refcount metadata. During mount there cannot be any live + * staging extents as we have not permitted any user modifications. + * Therefore, it is safe to free them all right now, even on a + * read-only mount. + */ + error = xfs_reflink_recover_cow(log->l_mp); + if (error) { + xfs_alert(log->l_mp, + "Failed to recover leftover CoW staging extents, err %d.", + error); + /* + * If we get an error here, make sure the log is shut down + * but return zero so that any log items committed since the + * end of intents processing can be pushed through the CIL + * and AIL. 
+	 */
+	xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+	}
+
 	return 0;
 }
 
@@ -3528,8 +3550,6 @@ xlog_recover_check_summary(
 	uint64_t		ifree;
 	int			error;
 
-	mp = log->l_mp;
-
 	freeblks = 0LL;
 	itotal = 0LL;
 	ifree = 0LL;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 359109b6f0d3..bed73e8002a5 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -936,15 +936,6 @@ xfs_mountfs(
 		xfs_warn(mp,
 	"Unable to allocate reserve blocks. Continuing without reserve pool.");
 
-	/* Recover any CoW blocks that never got remapped. */
-	error = xfs_reflink_recover_cow(mp);
-	if (error) {
-		xfs_err(mp,
-			"Error %d recovering leftover CoW allocations.", error);
-		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-		goto out_quota;
-	}
-
 	/* Reserve AG blocks for future btree expansion. */
 	error = xfs_fs_reserve_ag_blocks(mp);
 	if (error && error != -ENOSPC)
@@ -955,7 +946,6 @@ xfs_mountfs(
  out_agresv:
 	xfs_fs_unreserve_ag_blocks(mp);
- out_quota:
 	xfs_qm_unmount_quotas(mp);
  out_rtunmount:
 	xfs_rtunmount_inodes(mp);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 5e1d29d8b2e7..4abe17312c2b 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -71,6 +71,40 @@ xfs_fs_get_uuid(
 }
 
 /*
+ * We cannot use file based VFS helpers such as file_modified() to update
+ * inode state as we modify the data/metadata in the inode here. Hence we have
+ * to open code the timestamp updates and SUID/SGID stripping. We also need
+ * to set the inode prealloc flag to ensure that the extents we allocate are not
+ * removed if the inode is reclaimed from memory before xfs_fs_block_commit()
+ * is from the client to indicate that data has been written and the file size
+ * can be extended.
+ */
+static int
+xfs_fs_map_update_inode(
+	struct xfs_inode	*ip)
+{
+	struct xfs_trans	*tp;
+	int			error;
+
+	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
+			0, 0, 0, &tp);
+	if (error)
+		return error;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+	VFS_I(ip)->i_mode &= ~S_ISUID;
+	if (VFS_I(ip)->i_mode & S_IXGRP)
+		VFS_I(ip)->i_mode &= ~S_ISGID;
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	ip->i_diflags |= XFS_DIFLAG_PREALLOC;
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	return xfs_trans_commit(tp);
+}
+
+/*
  * Get a layout for the pNFS client.
  */
 int
@@ -155,7 +189,7 @@ xfs_fs_map_blocks(
 		xfs_iunlock(ip, lock_flags);
 
 		error = xfs_iomap_write_direct(ip, offset_fsb,
-				end_fsb - offset_fsb, &imap);
+				end_fsb - offset_fsb, 0, &imap);
 		if (error)
 			goto out_unlock;
 
@@ -164,16 +198,18 @@ xfs_fs_map_blocks(
 		 * that the blocks allocated and handed out to the client are
 		 * guaranteed to be present even after a server crash.
 		 */
-		error = xfs_update_prealloc_flags(ip,
-				XFS_PREALLOC_SET | XFS_PREALLOC_SYNC);
+		error = xfs_fs_map_update_inode(ip);
+		if (!error)
+			error = xfs_log_force_inode(ip);
 		if (error)
 			goto out_unlock;
+
 	} else {
 		xfs_iunlock(ip, lock_flags);
 	}
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 
-	error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
+	error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0);
 	*device_generation = mp->m_generation;
 	return error;
 out_unlock:
@@ -255,7 +291,7 @@ xfs_fs_commit_blocks(
 		length = end - start;
 		if (!length)
 			continue;
-
+
 		/*
 		 * Make sure reads through the pagecache see the new data.
 		 */
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 47fe60e1a887..7d5a31827681 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -303,13 +303,6 @@ xfs_qm_scall_setqlim(
 		return 0;
 
 	/*
-	 * We don't want to race with a quotaoff so take the quotaoff lock.
-	 * We don't hold an inode lock, so there's nothing else to stop
-	 * a quotaoff from happening.
-	 */
-	mutex_lock(&q->qi_quotaofflock);
-
-	/*
 	 * Get the dquot (locked) before we start, as we need to do a
 	 * transaction to allocate it if it doesn't exist. Once we have the
 	 * dquot, unlock it so we can start the next transaction safely. We hold
@@ -319,7 +312,7 @@ xfs_qm_scall_setqlim(
 	error = xfs_qm_dqget(mp, id, type, true, &dqp);
 	if (error) {
 		ASSERT(error != -ENOENT);
-		goto out_unlock;
+		return error;
 	}
 
 	defq = xfs_get_defquota(q, xfs_dquot_type(dqp));
@@ -415,8 +408,6 @@ xfs_qm_scall_setqlim(
 
 out_rele:
 	xfs_qm_dqrele(dqp);
-out_unlock:
-	mutex_unlock(&q->qi_quotaofflock);
 	return error;
 }
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index cb0edb1d68ef..db70060e7bf6 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -749,7 +749,10 @@ xfs_reflink_end_cow(
 }
 
 /*
- * Free leftover CoW reservations that didn't get cleaned out.
+ * Free all CoW staging blocks that are still referenced by the ondisk refcount
+ * metadata. The ondisk metadata does not track which inode created the
+ * staging extent, so callers must ensure that there are no cached inodes with
+ * live CoW staging extents.
  */
 int
 xfs_reflink_recover_cow(
@@ -1269,8 +1272,7 @@ xfs_reflink_zero_posteof(
 		return 0;
 
 	trace_xfs_zero_eof(ip, isize, pos - isize);
-	return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
-			&xfs_buffered_write_iomap_ops);
+	return xfs_zero_range(ip, isize, pos - isize, NULL);
 }
 
 /*
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 778b57b1f020..4c0dee78b2f8 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -331,13 +331,34 @@ xfs_set_inode_alloc(
 	return xfs_is_inode32(mp) ? maxagi : agcount;
 }
 
-static bool
-xfs_buftarg_is_dax(
-	struct super_block	*sb,
-	struct xfs_buftarg	*bt)
+static int
+xfs_setup_dax_always(
+	struct xfs_mount	*mp)
 {
-	return dax_supported(bt->bt_daxdev, bt->bt_bdev, sb->s_blocksize, 0,
-			bdev_nr_sectors(bt->bt_bdev));
+	if (!mp->m_ddev_targp->bt_daxdev &&
+	   (!mp->m_rtdev_targp || !mp->m_rtdev_targp->bt_daxdev)) {
+		xfs_alert(mp,
+			"DAX unsupported by block device. Turning off DAX.");
+		goto disable_dax;
+	}
+
+	if (mp->m_super->s_blocksize != PAGE_SIZE) {
+		xfs_alert(mp,
+			"DAX not supported for blocksize. Turning off DAX.");
+		goto disable_dax;
+	}
+
+	if (xfs_has_reflink(mp)) {
+		xfs_alert(mp, "DAX and reflink cannot be used together!");
+		return -EINVAL;
+	}
+
+	xfs_warn(mp, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+	return 0;
+
+disable_dax:
+	xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER);
+	return 0;
 }
 
 STATIC int
@@ -370,26 +391,19 @@ STATIC void
 xfs_close_devices(
 	struct xfs_mount	*mp)
 {
-	struct dax_device *dax_ddev = mp->m_ddev_targp->bt_daxdev;
-
 	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
 		struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
-		struct dax_device *dax_logdev = mp->m_logdev_targp->bt_daxdev;
 
 		xfs_free_buftarg(mp->m_logdev_targp);
 		xfs_blkdev_put(logdev);
-		fs_put_dax(dax_logdev);
 	}
 	if (mp->m_rtdev_targp) {
 		struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
-		struct dax_device *dax_rtdev = mp->m_rtdev_targp->bt_daxdev;
 
 		xfs_free_buftarg(mp->m_rtdev_targp);
 		xfs_blkdev_put(rtdev);
-		fs_put_dax(dax_rtdev);
 	}
 	xfs_free_buftarg(mp->m_ddev_targp);
-	fs_put_dax(dax_ddev);
 }
 
 /*
@@ -407,8 +421,6 @@ xfs_open_devices(
 	struct xfs_mount	*mp)
 {
 	struct block_device	*ddev = mp->m_super->s_bdev;
-	struct dax_device	*dax_ddev = fs_dax_get_by_bdev(ddev);
-	struct dax_device	*dax_logdev = NULL, *dax_rtdev = NULL;
 	struct block_device	*logdev = NULL, *rtdev = NULL;
 	int			error;
 
@@ -418,8 +430,7 @@ xfs_open_devices(
 	if (mp->m_logname) {
 		error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
 		if (error)
-			goto out;
-		dax_logdev = fs_dax_get_by_bdev(logdev);
+			return error;
 	}
 
 	if (mp->m_rtname) {
@@ -433,25 +444,24 @@ xfs_open_devices(
 			error = -EINVAL;
 			goto out_close_rtdev;
 		}
-		dax_rtdev = fs_dax_get_by_bdev(rtdev);
 	}
 
 	/*
 	 * Setup xfs_mount buffer target pointers
 	 */
 	error = -ENOMEM;
-	mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, dax_ddev);
+	mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev);
 	if (!mp->m_ddev_targp)
 		goto out_close_rtdev;
 
 	if (rtdev) {
-		mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, dax_rtdev);
+		mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev);
 		if (!mp->m_rtdev_targp)
 			goto out_free_ddev_targ;
 	}
 
 	if (logdev && logdev != ddev) {
-		mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, dax_logdev);
+		mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev);
 		if (!mp->m_logdev_targp)
 			goto out_free_rtdev_targ;
 	} else {
@@ -467,14 +477,9 @@ xfs_open_devices(
 	xfs_free_buftarg(mp->m_ddev_targp);
  out_close_rtdev:
 	xfs_blkdev_put(rtdev);
-	fs_put_dax(dax_rtdev);
 out_close_logdev:
-	if (logdev && logdev != ddev) {
+	if (logdev && logdev != ddev)
 		xfs_blkdev_put(logdev);
-		fs_put_dax(dax_logdev);
-	}
- out:
-	fs_put_dax(dax_ddev);
 	return error;
 }
 
@@ -730,6 +735,7 @@ xfs_fs_sync_fs(
 	int			wait)
 {
 	struct xfs_mount	*mp = XFS_M(sb);
+	int			error;
 
 	trace_xfs_fs_sync_fs(mp, __return_address);
 
@@ -739,7 +745,10 @@ xfs_fs_sync_fs(
 	if (!wait)
 		return 0;
 
-	xfs_log_force(mp, XFS_LOG_SYNC);
+	error = xfs_log_force(mp, XFS_LOG_SYNC);
+	if (error)
+		return error;
+
 	if (laptop_mode) {
 		/*
 		 * The disk must be active because we're syncing.
@@ -1593,26 +1602,9 @@ xfs_fs_fill_super(
 	sb->s_flags |= SB_I_VERSION;
 
 	if (xfs_has_dax_always(mp)) {
-		bool rtdev_is_dax = false, datadev_is_dax;
-
-		xfs_warn(mp,
-		"DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
-
-		datadev_is_dax = xfs_buftarg_is_dax(sb, mp->m_ddev_targp);
-		if (mp->m_rtdev_targp)
-			rtdev_is_dax = xfs_buftarg_is_dax(sb,
-						mp->m_rtdev_targp);
-		if (!rtdev_is_dax && !datadev_is_dax) {
-			xfs_alert(mp,
-			"DAX unsupported by block device. Turning off DAX.");
-			xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER);
-		}
-		if (xfs_has_reflink(mp)) {
-			xfs_alert(mp,
-			"DAX and reflink cannot be used together!");
-			error = -EINVAL;
+		error = xfs_setup_dax_always(mp);
+		if (error)
 			goto out_filestream_unmount;
-		}
 	}
 
 	if (xfs_has_discard(mp)) {
@@ -1739,15 +1731,6 @@ xfs_remount_rw(
 	 */
 	xfs_restore_resvblks(mp);
 	xfs_log_work_queue(mp);
-
-	/* Recover any CoW blocks that never got remapped. */
-	error = xfs_reflink_recover_cow(mp);
-	if (error) {
-		xfs_err(mp,
-			"Error %d recovering leftover CoW allocations.", error);
-		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-		return error;
-	}
 	xfs_blockgc_start(mp);
 
 	/* Create the per-AG metadata reservation pool .*/
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index fc2c6a404647..affbedf78160 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -22,6 +22,7 @@
 #include "xfs_trace.h"
 #include "xfs_trans.h"
 #include "xfs_ialloc.h"
+#include "xfs_error.h"
 
 /* ----- Kernel only functions below ----- */
 int
@@ -96,17 +97,15 @@ xfs_readlink_bmap_ilocked(
 
 int
 xfs_readlink(
-	struct xfs_inode *ip,
-	char		 *link)
+	struct xfs_inode	*ip,
+	char			*link)
 {
-	struct xfs_mount *mp = ip->i_mount;
-	xfs_fsize_t	pathlen;
-	int		error = 0;
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fsize_t		pathlen;
+	int			error = -EFSCORRUPTED;
 
 	trace_xfs_readlink(ip);
 
-	ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_LOCAL);
-
 	if (xfs_is_shutdown(mp))
 		return -EIO;
 
@@ -121,12 +120,22 @@ xfs_readlink(
 			 __func__, (unsigned long long) ip->i_ino,
 			 (long long) pathlen);
 		ASSERT(0);
-		error = -EFSCORRUPTED;
 		goto out;
 	}
 
-
-	error = xfs_readlink_bmap_ilocked(ip, link);
+	if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+		/*
+		 * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED
+		 * if if_data is junk.
+		 */
+		if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_u1.if_data))
+			goto out;
+
+		memcpy(link, ip->i_df.if_u1.if_data, pathlen + 1);
+		error = 0;
+	} else {
+		error = xfs_readlink_bmap_ilocked(ip, link);
+	}
 
  out:
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -184,8 +193,8 @@ xfs_symlink(
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns),
-			mapped_fsgid(mnt_userns), prid,
+	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns),
+			mapped_fsgid(mnt_userns, &init_user_ns), prid,
 			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
 			&udqp, &gdqp, &pdqp);
 	if (error)
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 8608f804388f..574b80c29fe1 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -67,11 +67,12 @@ static const struct sysfs_ops xfs_sysfs_ops = {
 static struct attribute *xfs_mp_attrs[] = {
 	NULL,
 };
+ATTRIBUTE_GROUPS(xfs_mp);
 
 struct kobj_type xfs_mp_ktype = {
 	.release = xfs_sysfs_release,
 	.sysfs_ops = &xfs_sysfs_ops,
-	.default_attrs = xfs_mp_attrs,
+	.default_groups = xfs_mp_groups,
 };
 
 #ifdef DEBUG
@@ -239,11 +240,12 @@ static struct attribute *xfs_dbg_attrs[] = {
 #endif
 	NULL,
 };
+ATTRIBUTE_GROUPS(xfs_dbg);
 
 struct kobj_type xfs_dbg_ktype = {
 	.release = xfs_sysfs_release,
 	.sysfs_ops = &xfs_sysfs_ops,
-	.default_attrs = xfs_dbg_attrs,
+	.default_groups = xfs_dbg_groups,
 };
 
 #endif /* DEBUG */
@@ -296,11 +298,12 @@ static struct attribute *xfs_stats_attrs[] = {
 	ATTR_LIST(stats_clear),
 	NULL,
 };
+ATTRIBUTE_GROUPS(xfs_stats);
 
 struct kobj_type xfs_stats_ktype = {
 	.release = xfs_sysfs_release,
 	.sysfs_ops = &xfs_sysfs_ops,
-	.default_attrs = xfs_stats_attrs,
+	.default_groups = xfs_stats_groups,
 };
 
 /* xlog */
@@ -381,11 +384,12 @@ static struct attribute *xfs_log_attrs[] = {
 	ATTR_LIST(write_grant_head),
 	NULL,
 };
+ATTRIBUTE_GROUPS(xfs_log);
 
 struct kobj_type xfs_log_ktype = {
 	.release = xfs_sysfs_release,
 	.sysfs_ops = &xfs_sysfs_ops,
-	.default_attrs = xfs_log_attrs,
+	.default_groups = xfs_log_groups,
 };
 
 /*
@@ -534,12 +538,12 @@ static struct attribute *xfs_error_attrs[] = {
 	ATTR_LIST(retry_timeout_seconds),
 	NULL,
 };
-
+ATTRIBUTE_GROUPS(xfs_error);
 
 static struct kobj_type xfs_error_cfg_ktype = {
 	.release = xfs_sysfs_release,
 	.sysfs_ops = &xfs_sysfs_ops,
-	.default_attrs = xfs_error_attrs,
+	.default_groups = xfs_error_groups,
 };
 
 static struct kobj_type xfs_error_ktype = {
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 234a9d9c2f43..59e2f9031b9f 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -942,8 +942,17 @@ xfs_trans_cancel(
 
 	trace_xfs_trans_cancel(tp, _RET_IP_);
 
-	if (tp->t_flags & XFS_TRANS_PERM_LOG_RES)
+	/*
+	 * It's never valid to cancel a transaction with deferred ops attached,
+	 * because the transaction is effectively dirty. Complain about this
+	 * loudly before freeing the in-memory defer items.
+	 */
+	if (!list_empty(&tp->t_dfops)) {
+		ASSERT(xfs_is_shutdown(mp) || list_empty(&tp->t_dfops));
+		ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+		dirty = true;
 		xfs_defer_cancel(tp);
+	}
 
 	/*
 	 * See if the caller is relying on us to shut down the