From 2ff1e97587f4d398686f52c07afde3faf3da4e5c Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 19 Mar 2024 10:00:09 +0000 Subject: netfs: Replace PG_fscache by setting folio->private and marking dirty When dirty data is being written to the cache, setting/waiting on/clearing the fscache flag is always done in tandem with setting/waiting on/clearing the writeback flag. The netfslib buffered write routines wait on and set both flags and the write request cleanup clears both flags, so the fscache flag is almost superfluous. The reason it isn't superfluous is because the fscache flag is also used to indicate that data just read from the server is being written to the cache. The flag is used to prevent a race involving overlapping direct-I/O writes to the cache. Change this to indicate that a page is in need of being copied to the cache by placing a magic value in folio->private and marking the folios dirty. Then when the writeback code sees a folio marked in this way, it only writes it to the cache and not to the server. If a folio that has this magic value set is modified, the value is just replaced and the folio will then be uploaded too. With this, PG_fscache is no longer required by the netfslib core, 9p and afs. Ceph and nfs, however, still need to use the old PG_fscache-based tracking. To deal with this, a flag, NETFS_ICTX_USE_PGPRIV2, now has to be set on the flags in the netfs_inode struct for those filesystems. This reenables the use of PG_fscache in that inode. 9p and afs use the netfslib write helpers so get switched over; cifs, for the moment, does page-by-page manual access to the cache, so doesn't use PG_fscache and is unaffected. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: Matthew Wilcox (Oracle) cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Dominique Martinet cc: Christian Schoenebeck cc: Marc Dionne cc: Ilya Dryomov cc: Xiubo Li cc: Steve French cc: Paulo Alcantara cc: Ronnie Sahlberg cc: Shyam Prasad N cc: Tom Talpey cc: Bharath SM cc: Trond Myklebust cc: Anna Schumaker cc: netfs@lists.linux.dev cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: ceph-devel@vger.kernel.org cc: linux-cifs@vger.kernel.org cc: linux-nfs@vger.kernel.org cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'include/linux/netfs.h') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 100cbb261269..f5e9c5f84a0c 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -143,6 +143,8 @@ struct netfs_inode { #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ #define NETFS_ICTX_WRITETHROUGH 2 /* Write-through caching */ #define NETFS_ICTX_NO_WRITE_STREAMING 3 /* Don't engage in write-streaming */ +#define NETFS_ICTX_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark + * write to cache on read */ }; /* @@ -165,16 +167,25 @@ struct netfs_folio { unsigned int dirty_len; /* Write-streaming dirty data length */ }; #define NETFS_FOLIO_INFO 0x1UL /* OR'd with folio->private. 
*/ +#define NETFS_FOLIO_COPY_TO_CACHE ((struct netfs_group *)0x356UL) /* Write to the cache only */ -static inline struct netfs_folio *netfs_folio_info(struct folio *folio) +static inline bool netfs_is_folio_info(const void *priv) { - void *priv = folio_get_private(folio); + return (unsigned long)priv & NETFS_FOLIO_INFO; +} - if ((unsigned long)priv & NETFS_FOLIO_INFO) +static inline struct netfs_folio *__netfs_folio_info(const void *priv) +{ + if (netfs_is_folio_info(priv)) return (struct netfs_folio *)((unsigned long)priv & ~NETFS_FOLIO_INFO); return NULL; } +static inline struct netfs_folio *netfs_folio_info(struct folio *folio) +{ + return __netfs_folio_info(folio_get_private(folio)); +} + static inline struct netfs_group *netfs_folio_group(struct folio *folio) { struct netfs_folio *finfo; @@ -230,6 +241,7 @@ enum netfs_io_origin { NETFS_READAHEAD, /* This read was triggered by readahead */ NETFS_READPAGE, /* This read is a synchronous read */ NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ + NETFS_COPY_TO_CACHE, /* This write is to copy a read to the cache */ NETFS_WRITEBACK, /* This write was triggered by writepages */ NETFS_WRITETHROUGH, /* This write was made by netfs_perform_write() */ NETFS_LAUNDER_WRITE, /* This is triggered by ->launder_folio() */ @@ -287,6 +299,8 @@ struct netfs_io_request { #define NETFS_RREQ_UPLOAD_TO_SERVER 8 /* Need to write to the server */ #define NETFS_RREQ_NONBLOCK 9 /* Don't block if possible (O_NONBLOCK) */ #define NETFS_RREQ_BLOCKED 10 /* We blocked */ +#define NETFS_RREQ_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark + * write to cache on read */ const struct netfs_request_ops *netfs_ops; void (*cleanup)(struct netfs_io_request *req); }; -- cgit v1.2.3 From 2e9d7e4b984a61823c41ba65e1b58b98ca9912bb Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 19 Mar 2024 11:13:26 +0000 Subject: mm: Remove the PG_fscache alias for PG_private_2 Remove the PG_fscache alias for PG_private_2 and use the latter 
directly. Use of this flag for marking pages undergoing writing to the cache should be considered deprecated and the folios should be marked dirty instead and the write done in ->writepages(). Note that PG_private_2 itself should be considered deprecated and up for future removal by the MM folks too. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: Matthew Wilcox (Oracle) cc: Ilya Dryomov cc: Xiubo Li cc: Steve French cc: Paulo Alcantara cc: Ronnie Sahlberg cc: Shyam Prasad N cc: Tom Talpey cc: Bharath SM cc: Trond Myklebust cc: Anna Schumaker cc: netfs@lists.linux.dev cc: ceph-devel@vger.kernel.org cc: linux-cifs@vger.kernel.org cc: linux-nfs@vger.kernel.org cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/ceph/addr.c | 11 ++++--- fs/netfs/buffered_read.c | 4 +-- fs/netfs/fscache_io.c | 2 +- fs/netfs/io.c | 2 +- fs/nfs/file.c | 8 ++--- fs/nfs/fscache.h | 4 +-- fs/nfs/write.c | 4 +-- fs/smb/client/file.c | 16 +++++----- include/linux/netfs.h | 80 +++--------------------------------------------- mm/filemap.c | 6 ++-- 10 files changed, 33 insertions(+), 104 deletions(-) (limited to 'include/linux/netfs.h') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 28ae4976a4f9..18ddacb00511 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -500,7 +500,7 @@ const struct netfs_request_ops ceph_netfs_ops = { #ifdef CONFIG_CEPH_FSCACHE static void ceph_set_page_fscache(struct page *page) { - set_page_fscache(page); + folio_start_private_2(page_folio(page)); /* [DEPRECATED] */ } static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async) @@ -800,7 +800,7 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) return AOP_WRITEPAGE_ACTIVATE; } - wait_on_page_fscache(page); + folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */ err = writepage_nounlock(page, wbc); if (err == -ERESTARTSYS) { @@ -1075,7 +1075,8 @@ get_more_pages: unlock_page(page); break; } - if (PageWriteback(page) || PageFsCache(page)) { 
+ if (PageWriteback(page) || + PagePrivate2(page) /* [DEPRECATED] */) { if (wbc->sync_mode == WB_SYNC_NONE) { doutc(cl, "%p under writeback\n", page); unlock_page(page); @@ -1083,7 +1084,7 @@ get_more_pages: } doutc(cl, "waiting on writeback %p\n", page); wait_on_page_writeback(page); - wait_on_page_fscache(page); + folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */ } if (!clear_page_dirty_for_io(page)) { @@ -1513,7 +1514,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, if (r < 0) return r; - folio_wait_fscache(folio); + folio_wait_private_2(folio); /* [DEPRECATED] */ WARN_ON_ONCE(!folio_test_locked(folio)); *pagep = &folio->page; return 0; diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 6d49319c82c6..b3fd6e1fa322 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -70,7 +70,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); - folio_start_fscache(folio); + folio_start_private_2(folio); folio_started = true; } } else { @@ -506,7 +506,7 @@ retry: have_folio: if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags)) { - ret = folio_wait_fscache_killable(folio); + ret = folio_wait_private_2_killable(folio); if (ret < 0) goto error; } diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c index 5028f2ae30da..38637e5c9b57 100644 --- a/fs/netfs/fscache_io.c +++ b/fs/netfs/fscache_io.c @@ -183,7 +183,7 @@ void __fscache_clear_page_bits(struct address_space *mapping, rcu_read_lock(); xas_for_each(&xas, page, last) { - end_page_fscache(page); + folio_end_private_2(page_folio(page)); } rcu_read_unlock(); } diff --git a/fs/netfs/io.c b/fs/netfs/io.c index b3b9827a9709..60a19f96e0ce 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -129,7 +129,7 @@ static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq, continue; unlocked = 
folio_next_index(folio) - 1; trace_netfs_folio(folio, netfs_folio_trace_end_copy); - folio_end_fscache(folio); + folio_end_private_2(folio); have_unlocked = true; } } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 407c6e15afe2..6bd127e6683d 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -433,7 +433,7 @@ static void nfs_invalidate_folio(struct folio *folio, size_t offset, return; /* Cancel any unstarted writes on this page */ nfs_wb_folio_cancel(inode, folio); - folio_wait_fscache(folio); + folio_wait_private_2(folio); /* [DEPRECATED] */ trace_nfs_invalidate_folio(inode, folio); } @@ -500,7 +500,7 @@ static int nfs_launder_folio(struct folio *folio) dfprintk(PAGECACHE, "NFS: launder_folio(%ld, %llu)\n", inode->i_ino, folio_pos(folio)); - folio_wait_fscache(folio); + folio_wait_private_2(folio); /* [DEPRECATED] */ ret = nfs_wb_folio(inode, folio); trace_nfs_launder_folio_done(inode, folio, ret); return ret; @@ -593,8 +593,8 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); /* make sure the cache has finished storing the page */ - if (folio_test_fscache(folio) && - folio_wait_fscache_killable(folio) < 0) { + if (folio_test_private_2(folio) && /* [DEPRECATED] */ + folio_wait_private_2_killable(folio) < 0) { ret = VM_FAULT_RETRY; goto out; } diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index 814363d1d7c7..fbed0027996f 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -103,10 +103,10 @@ extern int nfs_netfs_read_folio(struct file *file, struct folio *folio); static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp) { - if (folio_test_fscache(folio)) { + if (folio_test_private_2(folio)) { /* [DEPRECATED] */ if (current_is_kswapd() || !(gfp & __GFP_FS)) return false; - folio_wait_fscache(folio); + folio_wait_private_2(folio); } fscache_note_page_release(netfs_i_cookie(netfs_inode(folio->mapping->host))); return true; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 5de85d725fb9..2329cbb0e446 
100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -2120,10 +2120,10 @@ int nfs_migrate_folio(struct address_space *mapping, struct folio *dst, if (folio_test_private(src)) return -EBUSY; - if (folio_test_fscache(src)) { + if (folio_test_private_2(src)) { /* [DEPRECATED] */ if (mode == MIGRATE_ASYNC) return -EBUSY; - folio_wait_fscache(src); + folio_wait_private_2(src); } return migrate_folio(mapping, dst, src, mode); diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 9be37d0fe724..b39caae652f6 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -3032,12 +3032,12 @@ lock_again: } if (folio_test_writeback(folio) || - folio_test_fscache(folio)) { + folio_test_private_2(folio)) { /* [DEPRECATED] */ folio_unlock(folio); if (wbc->sync_mode != WB_SYNC_NONE) { folio_wait_writeback(folio); #ifdef CONFIG_CIFS_FSCACHE - folio_wait_fscache(folio); + folio_wait_private_2(folio); #endif goto lock_again; } @@ -4510,8 +4510,8 @@ static vm_fault_t cifs_page_mkwrite(struct vm_fault *vmf) * be modified. We then assume the entire folio will need writing back. 
*/ #ifdef CONFIG_CIFS_FSCACHE - if (folio_test_fscache(folio) && - folio_wait_fscache_killable(folio) < 0) + if (folio_test_private_2(folio) && /* [DEPRECATED] */ + folio_wait_private_2_killable(folio) < 0) return VM_FAULT_RETRY; #endif @@ -4977,10 +4977,10 @@ static bool cifs_release_folio(struct folio *folio, gfp_t gfp) { if (folio_test_private(folio)) return 0; - if (folio_test_fscache(folio)) { + if (folio_test_private_2(folio)) { /* [DEPRECATED] */ if (current_is_kswapd() || !(gfp & __GFP_FS)) return false; - folio_wait_fscache(folio); + folio_wait_private_2(folio); } fscache_note_page_release(cifs_inode_cookie(folio->mapping->host)); return true; @@ -4989,7 +4989,7 @@ static bool cifs_release_folio(struct folio *folio, gfp_t gfp) static void cifs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { - folio_wait_fscache(folio); + folio_wait_private_2(folio); /* [DEPRECATED] */ } static int cifs_launder_folio(struct folio *folio) @@ -5009,7 +5009,7 @@ static int cifs_launder_folio(struct folio *folio) if (folio_clear_dirty_for_io(folio)) rc = cifs_writepage_locked(&folio->page, &wbc); - folio_wait_fscache(folio); + folio_wait_private_2(folio); /* [DEPRECATED] */ return rc; } diff --git a/include/linux/netfs.h b/include/linux/netfs.h index f5e9c5f84a0c..f36a6d8163d1 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -21,94 +21,22 @@ enum netfs_sreq_ref_trace; -/* - * Overload PG_private_2 to give us PG_fscache - this is used to indicate that - * a page is currently backed by a local disk cache - */ -#define folio_test_fscache(folio) folio_test_private_2(folio) -#define PageFsCache(page) PagePrivate2((page)) -#define SetPageFsCache(page) SetPagePrivate2((page)) -#define ClearPageFsCache(page) ClearPagePrivate2((page)) -#define TestSetPageFsCache(page) TestSetPagePrivate2((page)) -#define TestClearPageFsCache(page) TestClearPagePrivate2((page)) - /** - * folio_start_fscache - Start an fscache write on a folio. 
+ * folio_start_private_2 - Start an fscache write on a folio. [DEPRECATED] * @folio: The folio. * * Call this function before writing a folio to a local cache. Starting a * second write before the first one finishes is not allowed. + * + * Note that this should no longer be used. */ -static inline void folio_start_fscache(struct folio *folio) +static inline void folio_start_private_2(struct folio *folio) { VM_BUG_ON_FOLIO(folio_test_private_2(folio), folio); folio_get(folio); folio_set_private_2(folio); } -/** - * folio_end_fscache - End an fscache write on a folio. - * @folio: The folio. - * - * Call this function after the folio has been written to the local cache. - * This will wake any sleepers waiting on this folio. - */ -static inline void folio_end_fscache(struct folio *folio) -{ - folio_end_private_2(folio); -} - -/** - * folio_wait_fscache - Wait for an fscache write on this folio to end. - * @folio: The folio. - * - * If this folio is currently being written to a local cache, wait for - * the write to finish. Another write may start after this one finishes, - * unless the caller holds the folio lock. - */ -static inline void folio_wait_fscache(struct folio *folio) -{ - folio_wait_private_2(folio); -} - -/** - * folio_wait_fscache_killable - Wait for an fscache write on this folio to end. - * @folio: The folio. - * - * If this folio is currently being written to a local cache, wait - * for the write to finish or for a fatal signal to be received. - * Another write may start after this one finishes, unless the caller - * holds the folio lock. - * - * Return: - * - 0 if successful. - * - -EINTR if a fatal signal was encountered. 
- */ -static inline int folio_wait_fscache_killable(struct folio *folio) -{ - return folio_wait_private_2_killable(folio); -} - -static inline void set_page_fscache(struct page *page) -{ - folio_start_fscache(page_folio(page)); -} - -static inline void end_page_fscache(struct page *page) -{ - folio_end_private_2(page_folio(page)); -} - -static inline void wait_on_page_fscache(struct page *page) -{ - folio_wait_private_2(page_folio(page)); -} - -static inline int wait_on_page_fscache_killable(struct page *page) -{ - return folio_wait_private_2_killable(page_folio(page)); -} - /* Marks used on xarray-based buffers */ #define NETFS_BUF_PUT_MARK XA_MARK_0 /* - Page needs putting */ #define NETFS_BUF_PAGECACHE_MARK XA_MARK_1 /* - Page needs wb/dirty flag wrangling */ diff --git a/mm/filemap.c b/mm/filemap.c index 30de18c4fd28..9a2e28bf298a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1540,7 +1540,7 @@ EXPORT_SYMBOL(folio_end_private_2); * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio. * @folio: The folio to wait on. * - * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio. + * Wait for PG_private_2 to be cleared on a folio. */ void folio_wait_private_2(struct folio *folio) { @@ -1553,8 +1553,8 @@ EXPORT_SYMBOL(folio_wait_private_2); * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio. * @folio: The folio to wait on. * - * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio or until a - * fatal signal is received by the calling task. + * Wait for PG_private_2 to be cleared on a folio or until a fatal signal is + * received by the calling task. * * Return: * - 0 if successful. 
-- cgit v1.2.3 From 93bf1cc0096fa1e02244078db3334ca7fa1d88c1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 18 Mar 2024 17:15:57 +0000 Subject: netfs: Make netfs_io_request::subreq_counter an atomic_t Make the netfs_io_request::subreq_counter, used to generate values for netfs_io_subrequest::debug_index, into an atomic_t so that it can be called from the retry thread at the same time as the app thread issuing writes. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org --- fs/netfs/output.c | 2 +- include/linux/netfs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/netfs.h') diff --git a/fs/netfs/output.c b/fs/netfs/output.c index 625eb68f3e5a..fbdbb4f78234 100644 --- a/fs/netfs/output.c +++ b/fs/netfs/output.c @@ -37,7 +37,7 @@ struct netfs_io_subrequest *netfs_create_write_request(struct netfs_io_request * subreq->source = dest; subreq->start = start; subreq->len = len; - subreq->debug_index = wreq->subreq_counter++; + subreq->debug_index = atomic_inc_return(&wreq->subreq_counter); switch (subreq->source) { case NETFS_UPLOAD_TO_SERVER: diff --git a/include/linux/netfs.h b/include/linux/netfs.h index f36a6d8163d1..ddafc6ebff42 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -202,7 +202,7 @@ struct netfs_io_request { unsigned int debug_id; unsigned int rsize; /* Maximum read size (0 for none) */ unsigned int wsize; /* Maximum write size (0 for none) */ - unsigned int subreq_counter; /* Next subreq->debug_index */ + atomic_t subreq_counter; /* Next subreq->debug_index */ atomic_t nr_outstanding; /* Number of ops in progress */ atomic_t nr_copy_ops; /* Number of copy-to-cache ops in progress */ size_t submitted; /* Amount submitted for I/O so far */ -- cgit v1.2.3 From b4ff7b178bda0ce4ec9f799c6a85579ba17f0df3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 27 Mar 2024 17:58:39 +0000 Subject: netfs: Remove ->launder_folio() support Remove 
support for ->launder_folio() from netfslib and expect filesystems to use filemap_invalidate_inode() instead. netfs_launder_folio() can then be got rid of. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Dominique Martinet cc: Christian Schoenebeck cc: David Howells cc: Marc Dionne cc: Steve French cc: Matthew Wilcox cc: linux-mm@kvack.org cc: linux-fsdevel@vger.kernel.org cc: netfs@lists.linux.dev cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: ceph-devel@vger.kernel.org cc: linux-cifs@vger.kernel.org cc: devel@lists.orangefs.org --- fs/netfs/buffered_write.c | 74 -------------------------------------------- fs/netfs/main.c | 1 - include/linux/netfs.h | 2 -- include/trace/events/netfs.h | 3 -- 4 files changed, 80 deletions(-) (limited to 'include/linux/netfs.h') diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 57c6eab01261..d8f66ce94575 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -1200,77 +1200,3 @@ out: return ret; } EXPORT_SYMBOL(netfs_writepages); - -/* - * Deal with the disposition of a laundered folio. - */ -static void netfs_cleanup_launder_folio(struct netfs_io_request *wreq) -{ - if (wreq->error) { - pr_notice("R=%08x Laundering error %d\n", wreq->debug_id, wreq->error); - mapping_set_error(wreq->mapping, wreq->error); - } -} - -/** - * netfs_launder_folio - Clean up a dirty folio that's being invalidated - * @folio: The folio to clean - * - * This is called to write back a folio that's being invalidated when an inode - * is getting torn down. Ideally, writepages would be used instead. 
- */ -int netfs_launder_folio(struct folio *folio) -{ - struct netfs_io_request *wreq; - struct address_space *mapping = folio->mapping; - struct netfs_folio *finfo = netfs_folio_info(folio); - struct netfs_group *group = netfs_folio_group(folio); - struct bio_vec bvec; - unsigned long long i_size = i_size_read(mapping->host); - unsigned long long start = folio_pos(folio); - size_t offset = 0, len; - int ret = 0; - - if (finfo) { - offset = finfo->dirty_offset; - start += offset; - len = finfo->dirty_len; - } else { - len = folio_size(folio); - } - len = min_t(unsigned long long, len, i_size - start); - - wreq = netfs_alloc_request(mapping, NULL, start, len, NETFS_LAUNDER_WRITE); - if (IS_ERR(wreq)) { - ret = PTR_ERR(wreq); - goto out; - } - - if (!folio_clear_dirty_for_io(folio)) - goto out_put; - - trace_netfs_folio(folio, netfs_folio_trace_launder); - - _debug("launder %llx-%llx", start, start + len - 1); - - /* Speculatively write to the cache. We have to fix this up later if - * the store fails. 
- */ - wreq->cleanup = netfs_cleanup_launder_folio; - - bvec_set_folio(&bvec, folio, len, offset); - iov_iter_bvec(&wreq->iter, ITER_SOURCE, &bvec, 1, len); - if (group != NETFS_FOLIO_COPY_TO_CACHE) - __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); - ret = netfs_begin_write(wreq, true, netfs_write_trace_launder); - -out_put: - folio_detach_private(folio); - netfs_put_group(group); - kfree(finfo); - netfs_put_request(wreq, false, netfs_rreq_trace_put_return); -out: - _leave(" = %d", ret); - return ret; -} -EXPORT_SYMBOL(netfs_launder_folio); diff --git a/fs/netfs/main.c b/fs/netfs/main.c index c5a73c9ed126..844efbb2e7a2 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -34,7 +34,6 @@ static const char *netfs_origins[nr__netfs_io_origin] = { [NETFS_COPY_TO_CACHE] = "CC", [NETFS_WRITEBACK] = "WB", [NETFS_WRITETHROUGH] = "WT", - [NETFS_LAUNDER_WRITE] = "LW", [NETFS_UNBUFFERED_WRITE] = "UW", [NETFS_DIO_READ] = "DR", [NETFS_DIO_WRITE] = "DW", diff --git a/include/linux/netfs.h b/include/linux/netfs.h index ddafc6ebff42..3af589dabd7f 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -172,7 +172,6 @@ enum netfs_io_origin { NETFS_COPY_TO_CACHE, /* This write is to copy a read to the cache */ NETFS_WRITEBACK, /* This write was triggered by writepages */ NETFS_WRITETHROUGH, /* This write was made by netfs_perform_write() */ - NETFS_LAUNDER_WRITE, /* This is triggered by ->launder_folio() */ NETFS_UNBUFFERED_WRITE, /* This is an unbuffered write */ NETFS_DIO_READ, /* This is a direct I/O read */ NETFS_DIO_WRITE, /* This is a direct I/O write */ @@ -352,7 +351,6 @@ int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc); void netfs_clear_inode_writeback(struct inode *inode, const void *aux); void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length); bool netfs_release_folio(struct folio *folio, gfp_t gfp); -int netfs_launder_folio(struct folio *folio); /* VMA operations API. 
*/ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group); diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index e03fafb0c1e3..30769103638f 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -26,7 +26,6 @@ #define netfs_write_traces \ EM(netfs_write_trace_copy_to_cache, "COPY2CACH") \ EM(netfs_write_trace_dio_write, "DIO-WRITE") \ - EM(netfs_write_trace_launder, "LAUNDER ") \ EM(netfs_write_trace_unbuffered_write, "UNB-WRITE") \ EM(netfs_write_trace_writeback, "WRITEBACK") \ E_(netfs_write_trace_writethrough, "WRITETHRU") @@ -38,7 +37,6 @@ EM(NETFS_COPY_TO_CACHE, "CC") \ EM(NETFS_WRITEBACK, "WB") \ EM(NETFS_WRITETHROUGH, "WT") \ - EM(NETFS_LAUNDER_WRITE, "LW") \ EM(NETFS_UNBUFFERED_WRITE, "UW") \ EM(NETFS_DIO_READ, "DR") \ E_(NETFS_DIO_WRITE, "DW") @@ -135,7 +133,6 @@ EM(netfs_folio_trace_end_copy, "end-copy") \ EM(netfs_folio_trace_filled_gaps, "filled-gaps") \ EM(netfs_folio_trace_kill, "kill") \ - EM(netfs_folio_trace_launder, "launder") \ EM(netfs_folio_trace_mkwrite, "mkwrite") \ EM(netfs_folio_trace_mkwrite_plus, "mkwrite+") \ EM(netfs_folio_trace_read_gaps, "read-gaps") \ -- cgit v1.2.3 From d9f85a04fb0eee0171f451fb4c4875b8a00eeaec Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 15 Mar 2024 14:37:18 +0000 Subject: netfs: Use mempools for allocating requests and subrequests Use mempools for allocating requests and subrequests in an effort to make sure that allocation always succeeds so that when performing writeback we can always make progress. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/internal.h | 2 ++ fs/netfs/main.c | 51 ++++++++++++++++++++++++++++++++++++++------ fs/netfs/objects.c | 59 ++++++++++++++++++++++++++++++++++----------------- include/linux/netfs.h | 5 +++-- 4 files changed, 89 insertions(+), 28 deletions(-) (limited to 'include/linux/netfs.h') diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 156ab138e224..c67da478cd2b 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -37,6 +37,8 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync); extern unsigned int netfs_debug; extern struct list_head netfs_io_requests; extern spinlock_t netfs_proc_lock; +extern mempool_t netfs_request_pool; +extern mempool_t netfs_subrequest_pool; #ifdef CONFIG_PROC_FS static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq) diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 844efbb2e7a2..4805b9377364 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include "internal.h" @@ -23,6 +24,11 @@ unsigned netfs_debug; module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO); MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask"); +static struct kmem_cache *netfs_request_slab; +static struct kmem_cache *netfs_subrequest_slab; +mempool_t netfs_request_pool; +mempool_t netfs_subrequest_pool; + #ifdef CONFIG_PROC_FS LIST_HEAD(netfs_io_requests); DEFINE_SPINLOCK(netfs_proc_lock); @@ -98,25 +104,54 @@ static int __init netfs_init(void) { int ret = -ENOMEM; + netfs_request_slab = kmem_cache_create("netfs_request", + sizeof(struct netfs_io_request), 0, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, + NULL); + if (!netfs_request_slab) + goto error_req; + + if (mempool_init_slab_pool(&netfs_request_pool, 100, netfs_request_slab) < 0) + goto error_reqpool; + + netfs_subrequest_slab = 
kmem_cache_create("netfs_subrequest", + sizeof(struct netfs_io_subrequest), 0, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, + NULL); + if (!netfs_subrequest_slab) + goto error_subreq; + + if (mempool_init_slab_pool(&netfs_subrequest_pool, 100, netfs_subrequest_slab) < 0) + goto error_subreqpool; + if (!proc_mkdir("fs/netfs", NULL)) - goto error; + goto error_proc; if (!proc_create_seq("fs/netfs/requests", S_IFREG | 0444, NULL, &netfs_requests_seq_ops)) - goto error_proc; + goto error_procfile; #ifdef CONFIG_FSCACHE_STATS if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL, netfs_stats_show)) - goto error_proc; + goto error_procfile; #endif ret = fscache_init(); if (ret < 0) - goto error_proc; + goto error_fscache; return 0; -error_proc: +error_fscache: +error_procfile: remove_proc_entry("fs/netfs", NULL); -error: +error_proc: + mempool_exit(&netfs_subrequest_pool); +error_subreqpool: + kmem_cache_destroy(netfs_subrequest_slab); +error_subreq: + mempool_exit(&netfs_request_pool); +error_reqpool: + kmem_cache_destroy(netfs_request_slab); +error_req: return ret; } fs_initcall(netfs_init); @@ -125,5 +160,9 @@ static void __exit netfs_exit(void) { fscache_exit(); remove_proc_entry("fs/netfs", NULL); + mempool_exit(&netfs_subrequest_pool); + kmem_cache_destroy(netfs_subrequest_slab); + mempool_exit(&netfs_request_pool); + kmem_cache_destroy(netfs_request_slab); } module_exit(netfs_exit); diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index 8acc03a64059..1a4e2ce735ce 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -6,6 +6,8 @@ */ #include +#include +#include #include "internal.h" /* @@ -20,17 +22,22 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, struct inode *inode = file ? 
file_inode(file) : mapping->host; struct netfs_inode *ctx = netfs_inode(inode); struct netfs_io_request *rreq; + mempool_t *mempool = ctx->ops->request_pool ?: &netfs_request_pool; + struct kmem_cache *cache = mempool->pool_data; bool is_unbuffered = (origin == NETFS_UNBUFFERED_WRITE || origin == NETFS_DIO_READ || origin == NETFS_DIO_WRITE); bool cached = !is_unbuffered && netfs_is_cache_enabled(ctx); int ret; - rreq = kzalloc(ctx->ops->io_request_size ?: sizeof(struct netfs_io_request), - GFP_KERNEL); - if (!rreq) - return ERR_PTR(-ENOMEM); + for (;;) { + rreq = mempool_alloc(mempool, GFP_KERNEL); + if (rreq) + break; + msleep(10); + } + memset(rreq, 0, kmem_cache_size(cache)); rreq->start = start; rreq->len = len; rreq->upper_len = len; @@ -56,7 +63,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, if (rreq->netfs_ops->init_request) { ret = rreq->netfs_ops->init_request(rreq, file); if (ret < 0) { - kfree(rreq); + mempool_free(rreq, rreq->netfs_ops->request_pool ?: &netfs_request_pool); return ERR_PTR(ret); } } @@ -88,6 +95,14 @@ void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async) } } +static void netfs_free_request_rcu(struct rcu_head *rcu) +{ + struct netfs_io_request *rreq = container_of(rcu, struct netfs_io_request, rcu); + + mempool_free(rreq, rreq->netfs_ops->request_pool ?: &netfs_request_pool); + netfs_stat_d(&netfs_n_rh_rreq); +} + static void netfs_free_request(struct work_struct *work) { struct netfs_io_request *rreq = @@ -110,8 +125,7 @@ static void netfs_free_request(struct work_struct *work) } kvfree(rreq->direct_bv); } - kfree_rcu(rreq, rcu); - netfs_stat_d(&netfs_n_rh_rreq); + call_rcu(&rreq->rcu, netfs_free_request_rcu); } void netfs_put_request(struct netfs_io_request *rreq, bool was_async, @@ -143,20 +157,25 @@ void netfs_put_request(struct netfs_io_request *rreq, bool was_async, struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq) { struct netfs_io_subrequest 
*subreq; - - subreq = kzalloc(rreq->netfs_ops->io_subrequest_size ?: - sizeof(struct netfs_io_subrequest), - GFP_KERNEL); - if (subreq) { - INIT_WORK(&subreq->work, NULL); - INIT_LIST_HEAD(&subreq->rreq_link); - refcount_set(&subreq->ref, 2); - subreq->rreq = rreq; - subreq->debug_index = atomic_inc_return(&rreq->subreq_counter); - netfs_get_request(rreq, netfs_rreq_trace_get_subreq); - netfs_stat(&netfs_n_rh_sreq); + mempool_t *mempool = rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool; + struct kmem_cache *cache = mempool->pool_data; + + for (;;) { + subreq = mempool_alloc(rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool, + GFP_KERNEL); + if (subreq) + break; + msleep(10); } + memset(subreq, 0, kmem_cache_size(cache)); + INIT_WORK(&subreq->work, NULL); + INIT_LIST_HEAD(&subreq->rreq_link); + refcount_set(&subreq->ref, 2); + subreq->rreq = rreq; + subreq->debug_index = atomic_inc_return(&rreq->subreq_counter); + netfs_get_request(rreq, netfs_rreq_trace_get_subreq); + netfs_stat(&netfs_n_rh_sreq); return subreq; } @@ -178,7 +197,7 @@ static void netfs_free_subrequest(struct netfs_io_subrequest *subreq, trace_netfs_sreq(subreq, netfs_sreq_trace_free); if (rreq->netfs_ops->free_subrequest) rreq->netfs_ops->free_subrequest(subreq); - kfree(subreq); + mempool_free(subreq, rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool); netfs_stat_d(&netfs_n_rh_sreq); netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq); } diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 3af589dabd7f..0b6c2c2d3c23 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -20,6 +20,7 @@ #include enum netfs_sreq_ref_trace; +typedef struct mempool_s mempool_t; /** * folio_start_private_2 - Start an fscache write on a folio. [DEPRECATED] @@ -236,8 +237,8 @@ struct netfs_io_request { * Operations the network filesystem can/must provide to the helpers. 
*/ struct netfs_request_ops { - unsigned int io_request_size; /* Alloc size for netfs_io_request struct */ - unsigned int io_subrequest_size; /* Alloc size for netfs_io_subrequest struct */ + mempool_t *request_pool; + mempool_t *subrequest_pool; int (*init_request)(struct netfs_io_request *rreq, struct file *file); void (*free_request)(struct netfs_io_request *rreq); void (*free_subrequest)(struct netfs_io_subrequest *rreq); -- cgit v1.2.3 From 7ba167c4c73ed96eb002c98a9d7d49317dfb0191 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 18 Mar 2024 16:57:31 +0000 Subject: netfs: Switch to using unsigned long long rather than loff_t Switch to using unsigned long long rather than loff_t in netfslib to avoid problems with the sign flipping in the maths when we're dealing with the byte at position 0x7fffffffffffffff. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: Ilya Dryomov cc: Xiubo Li cc: netfs@lists.linux.dev cc: ceph-devel@vger.kernel.org cc: linux-fsdevel@vger.kernel.org --- fs/cachefiles/io.c | 2 +- fs/ceph/addr.c | 2 +- fs/netfs/buffered_read.c | 4 +++- fs/netfs/buffered_write.c | 2 +- fs/netfs/io.c | 6 +++--- fs/netfs/main.c | 2 +- fs/netfs/output.c | 4 ++-- include/linux/netfs.h | 16 +++++++++------- include/trace/events/netfs.h | 6 +++--- 9 files changed, 24 insertions(+), 20 deletions(-) (limited to 'include/linux/netfs.h') diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 1d685357e67f..5ba5c7814fe4 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -493,7 +493,7 @@ out_no_object: * boundary as appropriate. 
*/ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq, - loff_t i_size) + unsigned long long i_size) { return cachefiles_do_prepare_read(&subreq->rreq->cache_resources, subreq->start, &subreq->len, i_size, diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 74bfd10b1b1a..8c16bc5250ef 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -193,7 +193,7 @@ static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq) * block, but do not exceed the file size, unless the original * request already exceeds it. */ - new_end = min(round_up(end, lo->stripe_unit), rreq->i_size); + new_end = umin(round_up(end, lo->stripe_unit), rreq->i_size); if (new_end > end && new_end <= rreq->start + max_len) rreq->len = new_end - rreq->start; diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 1622cce535a3..47603f08680e 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -130,7 +130,9 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) } static void netfs_cache_expand_readahead(struct netfs_io_request *rreq, - loff_t *_start, size_t *_len, loff_t i_size) + unsigned long long *_start, + unsigned long long *_len, + unsigned long long i_size) { struct netfs_cache_resources *cres = &rreq->cache_resources; diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index d8f66ce94575..eba49bfafe64 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -664,7 +664,7 @@ static void netfs_pages_written_back(struct netfs_io_request *wreq) last = (wreq->start + wreq->len - 1) / PAGE_SIZE; xas_for_each(&xas, folio, last) { WARN(!folio_test_writeback(folio), - "bad %zx @%llx page %lx %lx\n", + "bad %llx @%llx page %lx %lx\n", wreq->len, wreq->start, folio->index, last); if ((finfo = netfs_folio_info(folio))) { diff --git a/fs/netfs/io.c b/fs/netfs/io.c index 8de581ac0cfb..6cfecfcd02e1 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -476,7 +476,7 @@ netfs_rreq_prepare_read(struct 
netfs_io_request *rreq, set: if (subreq->len > rreq->len) - pr_warn("R=%08x[%u] SREQ>RREQ %zx > %zx\n", + pr_warn("R=%08x[%u] SREQ>RREQ %zx > %llx\n", rreq->debug_id, subreq->debug_index, subreq->len, rreq->len); @@ -513,7 +513,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq, subreq->start = rreq->start + rreq->submitted; subreq->len = io_iter->count; - _debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted); + _debug("slice %llx,%zx,%llx", subreq->start, subreq->len, rreq->submitted); list_add_tail(&subreq->rreq_link, &rreq->subrequests); /* Call out to the cache to find out what it can do with the remaining @@ -588,7 +588,7 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) atomic_set(&rreq->nr_outstanding, 1); io_iter = rreq->io_iter; do { - _debug("submit %llx + %zx >= %llx", + _debug("submit %llx + %llx >= %llx", rreq->start, rreq->submitted, rreq->i_size); if (rreq->origin == NETFS_DIO_READ && rreq->start + rreq->submitted >= rreq->i_size) diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 4805b9377364..5f0f438e5d21 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -62,7 +62,7 @@ static int netfs_requests_seq_show(struct seq_file *m, void *v) rreq = list_entry(v, struct netfs_io_request, proc_link); seq_printf(m, - "%08x %s %3d %2lx %4d %3d @%04llx %zx/%zx", + "%08x %s %3d %2lx %4d %3d @%04llx %llx/%llx", rreq->debug_id, netfs_origins[rreq->origin], refcount_read(&rreq->ref), diff --git a/fs/netfs/output.c b/fs/netfs/output.c index e586396d6b72..85374322f10f 100644 --- a/fs/netfs/output.c +++ b/fs/netfs/output.c @@ -439,7 +439,7 @@ static void netfs_submit_writethrough(struct netfs_io_request *wreq, bool final) */ int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end) { - _enter("ic=%zu sb=%zu ws=%u cp=%zu tp=%u", + _enter("ic=%zu sb=%llu ws=%u cp=%zu tp=%u", wreq->iter.count, wreq->submitted, wreq->wsize, copied, to_page_end); wreq->iter.count += copied; @@ 
-457,7 +457,7 @@ int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb) { int ret = -EIOCBQUEUED; - _enter("ic=%zu sb=%zu ws=%u", + _enter("ic=%zu sb=%llu ws=%u", wreq->iter.count, wreq->submitted, wreq->wsize); if (wreq->submitted < wreq->io_iter.count) diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 0b6c2c2d3c23..88269681d4fc 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -149,7 +149,7 @@ struct netfs_io_subrequest { struct work_struct work; struct list_head rreq_link; /* Link in rreq->subrequests */ struct iov_iter io_iter; /* Iterator for this subrequest */ - loff_t start; /* Where to start the I/O */ + unsigned long long start; /* Where to start the I/O */ size_t len; /* Size of the I/O */ size_t transferred; /* Amount of data transferred */ refcount_t ref; @@ -205,15 +205,15 @@ struct netfs_io_request { atomic_t subreq_counter; /* Next subreq->debug_index */ atomic_t nr_outstanding; /* Number of ops in progress */ atomic_t nr_copy_ops; /* Number of copy-to-cache ops in progress */ - size_t submitted; /* Amount submitted for I/O so far */ - size_t len; /* Length of the request */ size_t upper_len; /* Length can be extended to here */ + unsigned long long submitted; /* Amount submitted for I/O so far */ + unsigned long long len; /* Length of the request */ size_t transferred; /* Amount to be indicated as transferred */ short error; /* 0 or error that occurred */ enum netfs_io_origin origin; /* Origin of the request */ bool direct_bv_unpin; /* T if direct_bv[] must be unpinned */ - loff_t i_size; /* Size of the file */ - loff_t start; /* Start position */ + unsigned long long i_size; /* Size of the file */ + unsigned long long start; /* Start position */ pgoff_t no_unlock_folio; /* Don't unlock this folio after read */ refcount_t ref; unsigned long flags; @@ -294,13 +294,15 @@ struct netfs_cache_ops { /* Expand readahead request */ void (*expand_readahead)(struct netfs_cache_resources *cres, - loff_t 
*_start, size_t *_len, loff_t i_size); + unsigned long long *_start, + unsigned long long *_len, + unsigned long long i_size); /* Prepare a read operation, shortening it to a cached/uncached * boundary as appropriate. */ enum netfs_io_source (*prepare_read)(struct netfs_io_subrequest *subreq, - loff_t i_size); + unsigned long long i_size); /* Prepare a write operation, working out what part of the write we can * actually do. diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 30769103638f..7126d2ea459c 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -280,7 +280,7 @@ TRACE_EVENT(netfs_sreq, __entry->start = sreq->start; ), - TP_printk("R=%08x[%u] %s %s f=%02x s=%llx %zx/%zx e=%d", + TP_printk("R=%08x[%x] %s %s f=%02x s=%llx %zx/%zx e=%d", __entry->rreq, __entry->index, __print_symbolic(__entry->source, netfs_sreq_sources), __print_symbolic(__entry->what, netfs_sreq_traces), @@ -320,7 +320,7 @@ TRACE_EVENT(netfs_failure, __entry->start = sreq ? sreq->start : 0; ), - TP_printk("R=%08x[%d] %s f=%02x s=%llx %zx/%zx %s e=%d", + TP_printk("R=%08x[%x] %s f=%02x s=%llx %zx/%zx %s e=%d", __entry->rreq, __entry->index, __print_symbolic(__entry->source, netfs_sreq_sources), __entry->flags, @@ -436,7 +436,7 @@ TRACE_EVENT(netfs_write, __field(unsigned int, cookie ) __field(enum netfs_write_trace, what ) __field(unsigned long long, start ) - __field(size_t, len ) + __field(unsigned long long, len ) ), TP_fast_assign( -- cgit v1.2.3 From 288ace2f57c9d06dd2e42bd80d03747d879a4068 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 18 Mar 2024 16:52:05 +0000 Subject: netfs: New writeback implementation The current netfslib writeback implementation creates writeback requests of contiguous folio data and then separately tiles subrequests over the space twice, once for the server and once for the cache. 
This creates a few issues: (1) Every time there's a discontiguity or a change between writing to only one destination or writing to both, it must create a new request. This makes it harder to do vectored writes. (2) The folios don't have the writeback mark removed until the end of the request - and a request could be hundreds of megabytes. (3) In future, I want to support a larger cache granularity, which will require aggregation of some folios that contain unmodified data (which only need to go to the cache) and some which contain modifications (which need to be uploaded and stored to the cache) - but, currently, these are treated as discontiguous. There's also a move to get everyone to use writeback_iter() to extract writable folios from the pagecache. That said, currently writeback_iter() has some issues that make it less than ideal: (1) there's no way to cancel the iteration, even if you find a "temporary" error that means the current folio and all subsequent folios are going to fail; (2) there's no way to filter the folios being written back - something that will impact Ceph with its ordered snap system; (3) and if you get a folio you can't immediately deal with (say you need to flush the preceding writes), you are left with a folio hanging in the locked state for the duration, when really we should unlock it and relock it later. In this new implementation, I use writeback_iter() to pump folios, progressively creating two parallel, but separate streams and cleaning up the finished folios as the subrequests complete. Either or both streams can contain gaps, and the subrequests in each stream can be of variable size, don't need to align with each other and don't need to align with the folios.
Indeed, subrequests can cross folio boundaries, may cover several folios or a folio may be spanned by multiple subrequests, e.g.: +---+---+-----+-----+---+----------+ Folios: | | | | | | | +---+---+-----+-----+---+----------+ +------+------+ +----+----+ Upload: | | |.....| | | +------+------+ +----+----+ +------+------+------+------+------+ Cache: | | | | | | +------+------+------+------+------+ The progressive subrequest construction permits the algorithm to be preparing both the next upload to the server and the next write to the cache whilst the previous ones are already in progress. Throttling can be applied to control the rate of production of subrequests - and, in any case, we probably want to write them to the server in ascending order, particularly if the file will be extended. Content crypto can also be prepared at the same time as the subrequests and run asynchronously, with the prepped requests being stalled until the crypto catches up with them. This might also be useful for transport crypto, but that happens at a lower layer, so probably would be harder to pull off. The algorithm is split into three parts: (1) The issuer. This walks through the data, packaging it up, encrypting it and creating subrequests. The part of this that generates subrequests only deals with file positions and spans and so is usable for DIO/unbuffered writes as well as buffered writes. (2) The collector. This asynchronously collects completed subrequests, unlocks folios, frees crypto buffers and performs any retries. This runs in a work queue so that the issuer can return to the caller for writeback (so that the VM can have its kswapd thread back) or async writes. (3) The retryer. This pauses the issuer, waits for all outstanding subrequests to complete and then goes through the failed subrequests to reissue them.
This may involve reprepping them (with cifs, the credits must be renegotiated, and a subrequest may need splitting), and doing RMW for content crypto if there's a conflicting change on the server. [!] Note that some of the functions are prefixed with "new_" to avoid clashes with existing functions. These will be renamed in a later patch that cuts over to the new algorithm. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Dominique Martinet cc: Christian Schoenebeck cc: Marc Dionne cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org --- fs/netfs/Makefile | 4 +- fs/netfs/buffered_write.c | 4 - fs/netfs/internal.h | 27 ++ fs/netfs/objects.c | 17 + fs/netfs/write_collect.c | 803 +++++++++++++++++++++++++++++++++++++++++++ fs/netfs/write_issue.c | 683 ++++++++++++++++++++++++++++++++++++ include/linux/netfs.h | 68 +++- include/trace/events/netfs.h | 232 ++++++++++++- 8 files changed, 1829 insertions(+), 9 deletions(-) create mode 100644 fs/netfs/write_collect.c create mode 100644 fs/netfs/write_issue.c (limited to 'include/linux/netfs.h') diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index d4d1d799819e..1eb86e34b5a9 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -11,7 +11,9 @@ netfs-y := \ main.o \ misc.o \ objects.o \ - output.o + output.o \ + write_collect.o \ + write_issue.o netfs-$(CONFIG_NETFS_STATS) += stats.o diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index eba49bfafe64..84ac95ee4b4d 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -74,16 +74,12 @@ static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx, if (file->f_mode & FMODE_READ) goto no_write_streaming; - if (test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags)) - goto no_write_streaming; if (netfs_is_cache_enabled(ctx)) { /* We don't want to get a streaming write on a file that loses * caching service 
temporarily because the backing store got * culled. */ - if (!test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags)) - set_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags); goto no_write_streaming; } diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index c67da478cd2b..dc11d1f67363 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -150,6 +150,33 @@ static inline void netfs_stat_d(atomic_t *stat) #define netfs_stat_d(x) do {} while(0) #endif +/* + * write_collect.c + */ +int netfs_folio_written_back(struct folio *folio); +void netfs_write_collection_worker(struct work_struct *work); +void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async); + +/* + * write_issue.c + */ +struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, + struct file *file, + loff_t start, + enum netfs_io_origin origin); +void netfs_reissue_write(struct netfs_io_stream *stream, + struct netfs_io_subrequest *subreq); +int netfs_advance_write(struct netfs_io_request *wreq, + struct netfs_io_stream *stream, + loff_t start, size_t len, bool to_eof); +struct netfs_io_request *new_netfs_begin_writethrough(struct kiocb *iocb, size_t len); +int new_netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, + struct folio *folio, size_t copied, bool to_page_end, + struct folio **writethrough_cache); +int new_netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, + struct folio *writethrough_cache); +int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len); + /* * Miscellaneous functions. 
*/ diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index 1a4e2ce735ce..c90d482b1650 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -47,6 +47,10 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, rreq->inode = inode; rreq->i_size = i_size_read(inode); rreq->debug_id = atomic_inc_return(&debug_ids); + rreq->wsize = INT_MAX; + spin_lock_init(&rreq->lock); + INIT_LIST_HEAD(&rreq->io_streams[0].subrequests); + INIT_LIST_HEAD(&rreq->io_streams[1].subrequests); INIT_LIST_HEAD(&rreq->subrequests); INIT_WORK(&rreq->work, NULL); refcount_set(&rreq->ref, 1); @@ -85,6 +89,8 @@ void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async) { struct netfs_io_subrequest *subreq; + struct netfs_io_stream *stream; + int s; while (!list_empty(&rreq->subrequests)) { subreq = list_first_entry(&rreq->subrequests, @@ -93,6 +99,17 @@ void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async) netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_clear); } + + for (s = 0; s < ARRAY_SIZE(rreq->io_streams); s++) { + stream = &rreq->io_streams[s]; + while (!list_empty(&stream->subrequests)) { + subreq = list_first_entry(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); + list_del(&subreq->rreq_link); + netfs_put_subrequest(subreq, was_async, + netfs_sreq_trace_put_clear); + } + } } static void netfs_free_request_rcu(struct rcu_head *rcu) diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c new file mode 100644 index 000000000000..b8c1d3ca724a --- /dev/null +++ b/fs/netfs/write_collect.c @@ -0,0 +1,803 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Network filesystem write subrequest result collection, assessment + * and retrying. + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include +#include +#include +#include +#include "internal.h" + +/* Notes made in the collector */ +#define HIT_PENDING 0x01 /* A front op was still pending */ +#define SOME_EMPTY 0x02 /* One or more streams are empty */ +#define ALL_EMPTY 0x04 /* All streams are empty */ +#define MAYBE_DISCONTIG 0x08 /* A front op may be discontiguous (rounded to PAGE_SIZE) */ +#define NEED_REASSESS 0x10 /* Need to loop round and reassess */ +#define REASSESS_DISCONTIG 0x20 /* Reassess discontiguity if contiguity advances */ +#define MADE_PROGRESS 0x40 /* Made progress cleaning up a stream or the folio set */ +#define BUFFERED 0x80 /* The pagecache needs cleaning up */ +#define NEED_RETRY 0x100 /* A front op requests retrying */ +#define SAW_FAILURE 0x200 /* One or more streams hit a permanent failure */ + +/* + * Successful completion of write of a folio to the server and/or cache. Note + * that we are not allowed to lock the folio here on pain of deadlocking with + * truncate. + */ +int netfs_folio_written_back(struct folio *folio) +{ + enum netfs_folio_trace why = netfs_folio_trace_clear; + struct netfs_folio *finfo; + struct netfs_group *group = NULL; + int gcount = 0; + + if ((finfo = netfs_folio_info(folio))) { + /* Streaming writes cannot be redirtied whilst under writeback, + * so discard the streaming record. + */ + folio_detach_private(folio); + group = finfo->netfs_group; + gcount++; + kfree(finfo); + why = netfs_folio_trace_clear_s; + goto end_wb; + } + + if ((group = netfs_folio_group(folio))) { + if (group == NETFS_FOLIO_COPY_TO_CACHE) { + why = netfs_folio_trace_clear_cc; + folio_detach_private(folio); + goto end_wb; + } + + /* Need to detach the group pointer if the page didn't get + * redirtied. If it has been redirtied, then it must be within + * the same group.
+ */ + why = netfs_folio_trace_redirtied; + if (!folio_test_dirty(folio)) { + folio_detach_private(folio); + gcount++; + why = netfs_folio_trace_clear_g; + } + } + +end_wb: + trace_netfs_folio(folio, why); + folio_end_writeback(folio); + return gcount; +} + +/* + * Get hold of a folio we have under writeback. We don't want to get the + * refcount on it. + */ +static struct folio *netfs_writeback_lookup_folio(struct netfs_io_request *wreq, loff_t pos) +{ + XA_STATE(xas, &wreq->mapping->i_pages, pos / PAGE_SIZE); + struct folio *folio; + + rcu_read_lock(); + + for (;;) { + xas_reset(&xas); + folio = xas_load(&xas); + if (xas_retry(&xas, folio)) + continue; + + if (!folio || xa_is_value(folio)) + kdebug("R=%08x: folio %lx (%llx) not present", + wreq->debug_id, xas.xa_index, pos / PAGE_SIZE); + BUG_ON(!folio || xa_is_value(folio)); + + if (folio == xas_reload(&xas)) + break; + } + + rcu_read_unlock(); + + if (WARN_ONCE(!folio_test_writeback(folio), + "R=%08x: folio %lx is not under writeback\n", + wreq->debug_id, folio->index)) { + trace_netfs_folio(folio, netfs_folio_trace_not_under_wback); + } + return folio; +} + +/* + * Unlock any folios we've finished with. + */ +static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq, + unsigned long long collected_to, + unsigned int *notes) +{ + for (;;) { + struct folio *folio; + struct netfs_folio *finfo; + unsigned long long fpos, fend; + size_t fsize, flen; + + folio = netfs_writeback_lookup_folio(wreq, wreq->cleaned_to); + + fpos = folio_pos(folio); + fsize = folio_size(folio); + finfo = netfs_folio_info(folio); + flen = finfo ? 
finfo->dirty_offset + finfo->dirty_len : fsize; + + fend = min_t(unsigned long long, fpos + flen, wreq->i_size); + + trace_netfs_collect_folio(wreq, folio, fend, collected_to); + + if (fpos + fsize > wreq->contiguity) { + trace_netfs_collect_contig(wreq, fpos + fsize, + netfs_contig_trace_unlock); + wreq->contiguity = fpos + fsize; + } + + /* Unlock any folio we've transferred all of. */ + if (collected_to < fend) + break; + + wreq->nr_group_rel += netfs_folio_written_back(folio); + wreq->cleaned_to = fpos + fsize; + *notes |= MADE_PROGRESS; + + if (fpos + fsize >= collected_to) + break; + } +} + +/* + * Perform retries on the streams that need it. + */ +static void netfs_retry_write_stream(struct netfs_io_request *wreq, + struct netfs_io_stream *stream) +{ + struct list_head *next; + + _enter("R=%x[%x:]", wreq->debug_id, stream->stream_nr); + + if (unlikely(stream->failed)) + return; + + /* If there's no renegotiation to do, just resend each failed subreq. */ + if (!stream->prepare_write) { + struct netfs_io_subrequest *subreq; + + list_for_each_entry(subreq, &stream->subrequests, rreq_link) { + if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) + break; + if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { + __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); + netfs_reissue_write(stream, subreq); + } + } + return; + } + + if (list_empty(&stream->subrequests)) + return; + next = stream->subrequests.next; + + do { + struct netfs_io_subrequest *subreq = NULL, *from, *to, *tmp; + unsigned long long start, len; + size_t part; + bool boundary = false; + + /* Go through the stream and find the next span of contiguous + * data that we then rejig (cifs, for example, needs the wsize + * renegotiating) and reissue. 
+ */ + from = list_entry(next, struct netfs_io_subrequest, rreq_link); + to = from; + start = from->start + from->transferred; + len = from->len - from->transferred; + + if (test_bit(NETFS_SREQ_FAILED, &from->flags) || + !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags)) + return; + + list_for_each_continue(next, &stream->subrequests) { + subreq = list_entry(next, struct netfs_io_subrequest, rreq_link); + if (subreq->start + subreq->transferred != start + len || + test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags) || + !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) + break; + to = subreq; + len += to->len; + } + + /* Work through the sublist. */ + subreq = from; + list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) { + if (!len) + break; + /* Renegotiate max_len (wsize) */ + trace_netfs_sreq(subreq, netfs_sreq_trace_retry); + __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + stream->prepare_write(subreq); + + part = min(len, subreq->max_len); + subreq->len = part; + subreq->start = start; + subreq->transferred = 0; + len -= part; + start += part; + if (len && subreq == to && + __test_and_clear_bit(NETFS_SREQ_BOUNDARY, &to->flags)) + boundary = true; + + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); + netfs_reissue_write(stream, subreq); + if (subreq == to) + break; + } + + /* If we managed to use fewer subreqs, we can discard the + * excess; if we used the same number, then we're done. + */ + if (!len) { + if (subreq == to) + continue; + list_for_each_entry_safe_from(subreq, tmp, + &stream->subrequests, rreq_link) { + trace_netfs_sreq(subreq, netfs_sreq_trace_discard); + list_del(&subreq->rreq_link); + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); + if (subreq == to) + break; + } + continue; + } + + /* We ran out of subrequests, so we need to allocate some more + * and insert them after. 
+ */ + do { + subreq = netfs_alloc_subrequest(wreq); + subreq->source = to->source; + subreq->start = start; + subreq->max_len = len; + subreq->max_nr_segs = INT_MAX; + subreq->debug_index = atomic_inc_return(&wreq->subreq_counter); + subreq->stream_nr = to->stream_nr; + __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + + trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, + refcount_read(&subreq->ref), + netfs_sreq_trace_new); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); + + list_add(&subreq->rreq_link, &to->rreq_link); + to = list_next_entry(to, rreq_link); + trace_netfs_sreq(subreq, netfs_sreq_trace_retry); + + switch (stream->source) { + case NETFS_UPLOAD_TO_SERVER: + netfs_stat(&netfs_n_wh_upload); + subreq->max_len = min(len, wreq->wsize); + break; + case NETFS_WRITE_TO_CACHE: + netfs_stat(&netfs_n_wh_write); + break; + default: + WARN_ON_ONCE(1); + } + + stream->prepare_write(subreq); + + part = min(len, subreq->max_len); + subreq->len = subreq->transferred + part; + len -= part; + start += part; + if (!len && boundary) { + __set_bit(NETFS_SREQ_BOUNDARY, &to->flags); + boundary = false; + } + + netfs_reissue_write(stream, subreq); + if (!len) + break; + + } while (len); + + } while (!list_is_head(next, &stream->subrequests)); +} + +/* + * Perform retries on the streams that need it. If we're doing content + * encryption and the server copy changed due to a third-party write, we may + * need to do an RMW cycle and also rewrite the data to the cache. + */ +static void netfs_retry_writes(struct netfs_io_request *wreq) +{ + struct netfs_io_subrequest *subreq; + struct netfs_io_stream *stream; + int s; + + /* Wait for all outstanding I/O to quiesce before performing retries as + * we may need to renegotiate the I/O sizes. 
+ */ + for (s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + if (!stream->active) + continue; + + list_for_each_entry(subreq, &stream->subrequests, rreq_link) { + wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); + } + } + + // TODO: Enc: Fetch changed partial pages + // TODO: Enc: Reencrypt content if needed. + // TODO: Enc: Wind back transferred point. + // TODO: Enc: Mark cache pages for retry. + + for (s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + if (stream->need_retry) { + stream->need_retry = false; + netfs_retry_write_stream(wreq, stream); + } + } +} + +/* + * Collect and assess the results of various write subrequests. We may need to + * retry some of the results - or even do an RMW cycle for content crypto. + * + * Note that we have a number of parallel, overlapping lists of subrequests, + * one to the server and one to the local cache for example, which may not be + * the same size or starting position and may not even correspond in boundary + * alignment. + */ +static void netfs_collect_write_results(struct netfs_io_request *wreq) +{ + struct netfs_io_subrequest *front, *remove; + struct netfs_io_stream *stream; + unsigned long long collected_to; + unsigned int notes; + int s; + + _enter("%llx-%llx", wreq->start, wreq->start + wreq->len); + trace_netfs_collect(wreq); + trace_netfs_rreq(wreq, netfs_rreq_trace_collect); + +reassess_streams: + smp_rmb(); + collected_to = ULLONG_MAX; + if (wreq->origin == NETFS_WRITEBACK) + notes = ALL_EMPTY | BUFFERED | MAYBE_DISCONTIG; + else if (wreq->origin == NETFS_WRITETHROUGH) + notes = ALL_EMPTY | BUFFERED; + else + notes = ALL_EMPTY; + + /* Remove completed subrequests from the front of the streams and + * advance the completion point on each stream. We stop when we hit + * something that's in progress. The issuer thread may be adding stuff + * to the tail whilst we're doing this. 
+ * + * We must not, however, merge in discontiguities that span whole + * folios that aren't under writeback. This is made more complicated + * by the folios in the gap being of unpredictable sizes - if they even + * exist - but we don't want to look them up. + */ + for (s = 0; s < NR_IO_STREAMS; s++) { + loff_t rstart, rend; + + stream = &wreq->io_streams[s]; + /* Read active flag before list pointers */ + if (!smp_load_acquire(&stream->active)) + continue; + + front = stream->front; + while (front) { + trace_netfs_collect_sreq(wreq, front); + //_debug("sreq [%x] %llx %zx/%zx", + // front->debug_index, front->start, front->transferred, front->len); + + /* Stall if there may be a discontinuity. */ + rstart = round_down(front->start, PAGE_SIZE); + if (rstart > wreq->contiguity) { + if (wreq->contiguity > stream->collected_to) { + trace_netfs_collect_gap(wreq, stream, + wreq->contiguity, 'D'); + stream->collected_to = wreq->contiguity; + } + notes |= REASSESS_DISCONTIG; + break; + } + rend = round_up(front->start + front->len, PAGE_SIZE); + if (rend > wreq->contiguity) { + trace_netfs_collect_contig(wreq, rend, + netfs_contig_trace_collect); + wreq->contiguity = rend; + if (notes & REASSESS_DISCONTIG) + notes |= NEED_REASSESS; + } + notes &= ~MAYBE_DISCONTIG; + + /* Stall if the front is still undergoing I/O. */ + if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags)) { + notes |= HIT_PENDING; + break; + } + smp_rmb(); /* Read counters after I-P flag. 
*/ + + if (stream->failed) { + stream->collected_to = front->start + front->len; + notes |= MADE_PROGRESS | SAW_FAILURE; + goto cancel; + } + if (front->start + front->transferred > stream->collected_to) { + stream->collected_to = front->start + front->transferred; + stream->transferred = stream->collected_to - wreq->start; + notes |= MADE_PROGRESS; + } + if (test_bit(NETFS_SREQ_FAILED, &front->flags)) { + stream->failed = true; + stream->error = front->error; + if (stream->source == NETFS_UPLOAD_TO_SERVER) + mapping_set_error(wreq->mapping, front->error); + notes |= NEED_REASSESS | SAW_FAILURE; + break; + } + if (front->transferred < front->len) { + stream->need_retry = true; + notes |= NEED_RETRY | MADE_PROGRESS; + break; + } + + cancel: + /* Remove if completely consumed. */ + spin_lock(&wreq->lock); + + remove = front; + list_del_init(&front->rreq_link); + front = list_first_entry_or_null(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); + stream->front = front; + if (!front) { + unsigned long long jump_to = atomic64_read(&wreq->issued_to); + + if (stream->collected_to < jump_to) { + trace_netfs_collect_gap(wreq, stream, jump_to, 'A'); + stream->collected_to = jump_to; + } + } + + spin_unlock(&wreq->lock); + netfs_put_subrequest(remove, false, + notes & SAW_FAILURE ? + netfs_sreq_trace_put_cancel : + netfs_sreq_trace_put_done); + } + + if (front) + notes &= ~ALL_EMPTY; + else + notes |= SOME_EMPTY; + + if (stream->collected_to < collected_to) + collected_to = stream->collected_to; + } + + if (collected_to != ULLONG_MAX && collected_to > wreq->collected_to) + wreq->collected_to = collected_to; + + /* If we have an empty stream, we need to jump it forward over any gap + * otherwise the collection point will never advance. 
+ * + * Note that the issuer always adds to the stream with the lowest + * so-far submitted start, so if we see two consecutive subreqs in one + * stream with nothing between then in another stream, then the second + * stream has a gap that can be jumped. + */ + if (notes & SOME_EMPTY) { + unsigned long long jump_to = wreq->start + wreq->len; + + for (s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + if (stream->active && + stream->front && + stream->front->start < jump_to) + jump_to = stream->front->start; + } + + for (s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + if (stream->active && + !stream->front && + stream->collected_to < jump_to) { + trace_netfs_collect_gap(wreq, stream, jump_to, 'B'); + stream->collected_to = jump_to; + } + } + } + + for (s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + if (stream->active) + trace_netfs_collect_stream(wreq, stream); + } + + trace_netfs_collect_state(wreq, wreq->collected_to, notes); + + /* Unlock any folios that we have now finished with. */ + if (notes & BUFFERED) { + unsigned long long clean_to = min(wreq->collected_to, wreq->contiguity); + + if (wreq->cleaned_to < clean_to) + netfs_writeback_unlock_folios(wreq, clean_to, ¬es); + } else { + wreq->cleaned_to = wreq->collected_to; + } + + // TODO: Discard encryption buffers + + /* If all streams are discontiguous with the last folio we cleared, we + * may need to skip a set of folios. 
+ */ + if ((notes & (MAYBE_DISCONTIG | ALL_EMPTY)) == MAYBE_DISCONTIG) { + unsigned long long jump_to = ULLONG_MAX; + + for (s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + if (stream->active && stream->front && + stream->front->start < jump_to) + jump_to = stream->front->start; + } + + trace_netfs_collect_contig(wreq, jump_to, netfs_contig_trace_jump); + wreq->contiguity = jump_to; + wreq->cleaned_to = jump_to; + wreq->collected_to = jump_to; + for (s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + if (stream->collected_to < jump_to) + stream->collected_to = jump_to; + } + //cond_resched(); + notes |= MADE_PROGRESS; + goto reassess_streams; + } + + if (notes & NEED_RETRY) + goto need_retry; + if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { + trace_netfs_rreq(wreq, netfs_rreq_trace_unpause); + clear_bit_unlock(NETFS_RREQ_PAUSE, &wreq->flags); + wake_up_bit(&wreq->flags, NETFS_RREQ_PAUSE); + } + + if (notes & NEED_REASSESS) { + //cond_resched(); + goto reassess_streams; + } + if (notes & MADE_PROGRESS) { + //cond_resched(); + goto reassess_streams; + } + +out: + netfs_put_group_many(wreq->group, wreq->nr_group_rel); + wreq->nr_group_rel = 0; + _leave(" = %x", notes); + return; + +need_retry: + /* Okay... We're going to have to retry one or both streams. Note + * that any partially completed op will have had any wholly transferred + * folios removed from it. + */ + _debug("retry"); + netfs_retry_writes(wreq); + goto out; +} + +/* + * Perform the collection of subrequests, folios and encryption buffers. 
+ */
+void netfs_write_collection_worker(struct work_struct *work)
+{
+	struct netfs_io_request *wreq = container_of(work, struct netfs_io_request, work);
+	struct netfs_inode *ictx = netfs_inode(wreq->inode);
+	size_t transferred;
+	int s;
+
+	_enter("R=%x", wreq->debug_id);
+
+	netfs_see_request(wreq, netfs_rreq_trace_see_work);
+	if (!test_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags)) {
+		netfs_put_request(wreq, false, netfs_rreq_trace_put_work); /* Drop the work item's ref */
+		return;
+	}
+
+	netfs_collect_write_results(wreq);
+
+	/* We're done when the app thread has finished posting subreqs and all
+	 * the queues in all the streams are empty.  Until then, just drop the
+	 * ref that was taken for this run of the work item and return.
+	 */
+	if (!test_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags)) {
+		netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
+		return;
+	}
+	smp_rmb(); /* Read ALL_QUEUED before lists. */
+
+	transferred = LONG_MAX;	/* Reduced below to the minimum over the
+				 * active streams. */
+	for (s = 0; s < NR_IO_STREAMS; s++) {
+		struct netfs_io_stream *stream = &wreq->io_streams[s];
+		if (!stream->active)
+			continue;
+		if (!list_empty(&stream->subrequests)) {
+			netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
+			return;
+		}
+		if (stream->transferred < transferred)
+			transferred = stream->transferred;
+	}
+
+	/* Okay, declare that all I/O is complete.  The request-level count is
+	 * the smallest amount transferred by any active stream.
+	 */
+	wreq->transferred = transferred;
+	trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
+
+	if (wreq->io_streams[1].active &&
+	    wreq->io_streams[1].failed) {
+		/* Cache write failure doesn't prevent writeback completion
+		 * unless we're in disconnected mode.
+		 */
+		ictx->ops->invalidate_cache(wreq);
+	}
+
+	if (wreq->cleanup)
+		wreq->cleanup(wreq);
+
+	if (wreq->origin == NETFS_DIO_WRITE &&
+	    wreq->mapping->nrpages) {
+		/* mmap may have got underfoot and we may now have folios
+		 * locally covering the region we just wrote.  Attempt to
+		 * discard the folios, but leave in place any modified locally.
+		 * ->write_iter() is prevented from interfering by the DIO
+		 * counter.
+		 */
+		pgoff_t first = wreq->start >> PAGE_SHIFT;
+		pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;
+		invalidate_inode_pages2_range(wreq->mapping, first, last);
+	}
+
+	if (wreq->origin == NETFS_DIO_WRITE)
+		inode_dio_end(wreq->inode);
+
+	_debug("finished");
+	trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
+	clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
+	wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS);
+
+	if (wreq->iocb) {
+		wreq->iocb->ki_pos += wreq->transferred;
+		if (wreq->iocb->ki_complete)
+			wreq->iocb->ki_complete(
+				wreq->iocb, wreq->error ? wreq->error : wreq->transferred);
+		wreq->iocb = VFS_PTR_POISON;	/* The iocb must not be touched again */
+	}
+
+	netfs_clear_subrequests(wreq, false);
+	netfs_put_request(wreq, false, netfs_rreq_trace_put_work_complete);
+}
+
+/*
+ * Wake the collection work item.
+ *
+ * If the work item isn't already pending, a ref is taken on the request on
+ * behalf of the work item and the item is queued on system_unbound_wq.  Should
+ * queue_work() report that it didn't queue the item, the ref is dropped again.
+ */
+void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async)
+{
+	if (!work_pending(&wreq->work)) {
+		netfs_get_request(wreq, netfs_rreq_trace_get_work);
+		if (!queue_work(system_unbound_wq, &wreq->work))
+			netfs_put_request(wreq, was_async, netfs_rreq_trace_put_work_nq);
+	}
+}
+
+/**
+ * new_netfs_write_subrequest_terminated - Note the termination of a write operation.
+ * @_op: The I/O subrequest that has terminated (a struct netfs_io_subrequest).
+ * @transferred_or_error: The amount of data transferred or an error code.
+ * @was_async: The termination was asynchronous
+ *
+ * This tells the library that a contributory write I/O operation has
+ * terminated, one way or another, and that it should collect the results.
+ *
+ * The caller indicates in @transferred_or_error the outcome of the operation,
+ * supplying a positive value to indicate the number of bytes transferred or a
+ * negative error code.  The library will look after reissuing I/O operations
+ * as appropriate and writing downloaded data to the cache.
+ *
+ * An error of -EAGAIN marks the subrequest as needing a retry; any other error
+ * marks it as failed.  Either way, an error also sets NETFS_RREQ_PAUSE on the
+ * request.  A short transfer likewise marks the subrequest as needing a retry.
+ *
+ * If @was_async is true, the caller might be running in softirq or interrupt
+ * context and we can't sleep.
+ *
+ * When this is called, ownership of the subrequest is transferred back to the
+ * library, along with a ref.
+ *
+ * Note that %_op is a void* so that the function can be passed to
+ * kiocb::term_func without the need for a casting wrapper.
+ */
+void new_netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
+					   bool was_async)
+{
+	struct netfs_io_subrequest *subreq = _op;
+	struct netfs_io_request *wreq = subreq->rreq;
+	struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr];
+
+	_enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);
+
+	switch (subreq->source) {
+	case NETFS_UPLOAD_TO_SERVER:
+		netfs_stat(&netfs_n_wh_upload_done);
+		break;
+	case NETFS_WRITE_TO_CACHE:
+		netfs_stat(&netfs_n_wh_write_done);
+		break;
+	case NETFS_INVALID_WRITE:
+		break;
+	default:
+		BUG();
+	}
+
+	if (IS_ERR_VALUE(transferred_or_error)) {
+		subreq->error = transferred_or_error;
+		if (subreq->error == -EAGAIN)
+			set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+		else
+			set_bit(NETFS_SREQ_FAILED, &subreq->flags);
+		trace_netfs_failure(wreq, subreq, transferred_or_error, netfs_fail_write);
+
+		switch (subreq->source) {
+		case NETFS_WRITE_TO_CACHE:
+			netfs_stat(&netfs_n_wh_write_failed);
+			break;
+		case NETFS_UPLOAD_TO_SERVER:
+			netfs_stat(&netfs_n_wh_upload_failed);
+			break;
+		default:
+			break;
+		}
+		trace_netfs_rreq(wreq, netfs_rreq_trace_set_pause);
+		set_bit(NETFS_RREQ_PAUSE, &wreq->flags);
+	} else {
+		if (WARN(transferred_or_error > subreq->len - subreq->transferred,
+			 "Subreq excess write: R=%x[%x] %zd > %zu - %zu",
+			 wreq->debug_id, subreq->debug_index,
+			 transferred_or_error, subreq->len, subreq->transferred))
+			transferred_or_error = subreq->len - subreq->transferred; /* Clamp to what was outstanding */
+
+		subreq->error = 0;
+		subreq->transferred += transferred_or_error;
+
+		if (subreq->transferred < subreq->len)
+			set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+	}
+
+	trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
+
+	clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+	wake_up_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS);
+
+	/* If we are at the head of the queue, wake up the collector,
+	 * transferring a ref to it if we were the ones to do so.
+	 */
+	if (list_is_first(&subreq->rreq_link, &stream->subrequests))
+		netfs_wake_write_collector(wreq, was_async);
+
+	netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
+}
+EXPORT_SYMBOL(new_netfs_write_subrequest_terminated);
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
new file mode 100644
index 000000000000..1b2e69934fbd
--- /dev/null
+++ b/fs/netfs/write_issue.c
@@ -0,0 +1,683 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem high-level (buffered) writeback.
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ *
+ * To support network filesystems with local caching, we manage a situation
+ * that can be envisioned like the following:
+ *
+ *               +---+---+-----+-----+---+----------+
+ *    Folios:    |   |   |     |     |   |          |
+ *               +---+---+-----+-----+---+----------+
+ *
+ *                 +------+------+     +----+----+
+ *    Upload:      |      |      |.....|    |    |
+ *    (Stream 0)   +------+------+     +----+----+
+ *
+ *               +------+------+------+------+------+
+ *    Cache:     |      |      |      |      |      |
+ *    (Stream 1) +------+------+------+------+------+
+ *
+ * Where we have a sequence of folios of varying sizes that we need to overlay
+ * with multiple parallel streams of I/O requests, where the I/O requests in a
+ * stream may also be of various sizes (in cifs, for example, the sizes are
+ * negotiated with the server; in something like ceph, they may represent the
+ * sizes of storage objects).
+ *
+ * The sequence in each stream may contain gaps and noncontiguous subrequests
+ * may be glued together into single vectored write RPCs.
+ */
+
+#include	/* NOTE(review): the <...> header names on these four #include
+		 * lines were stripped during extraction - TODO restore them
+		 * from the upstream commit. */
+#include
+#include
+#include
+#include "internal.h"
+
+/*
+ * Throw away every remaining dirty folio after an unrecoverable error.  The
+ * first folio is one the caller already got (locked) from writeback_iter();
+ * the rest are pulled from the same iteration.  Each folio has its private
+ * data detached and is cycled straight through the writeback state so that it
+ * ends up clean, and any group ref or streaming-write record is released.
+ */
+static void netfs_kill_dirty_pages(struct address_space *mapping,
+				   struct writeback_control *wbc,
+				   struct folio *folio)
+{
+	int error = 0;
+
+	do {
+		enum netfs_folio_trace why;
+		struct netfs_group *group = NULL;
+		struct netfs_folio *finfo = NULL;
+		void *priv = folio_detach_private(folio);
+
+		if (priv)
+			finfo = __netfs_folio_info(priv);
+
+		if (!priv) {
+			/* Plain folio with nothing attached */
+			why = netfs_folio_trace_kill;
+		} else if (finfo) {
+			/* Kill folio from streaming write. */
+			group = finfo->netfs_group;
+			why = netfs_folio_trace_kill_s;
+		} else if (priv == NETFS_FOLIO_COPY_TO_CACHE) {
+			/* Kill copy-to-cache folio */
+			why = netfs_folio_trace_kill_cc;
+		} else {
+			/* Kill folio with group */
+			group = priv;
+			why = netfs_folio_trace_kill_g;
+		}
+
+		trace_netfs_folio(folio, why);
+
+		folio_start_writeback(folio);
+		folio_unlock(folio);
+		folio_end_writeback(folio);
+
+		netfs_put_group(group);
+		kfree(finfo);
+
+	} while ((folio = writeback_iter(mapping, wbc, folio, &error)));
+}
+
+/*
+ * Create a write request and set it up appropriately for the origin type.
+ */
+struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
+						struct file *file,
+						loff_t start,
+						enum netfs_io_origin origin)
+{
+	struct netfs_io_request *wreq;
+	struct netfs_io_stream *upload, *cache;
+	struct netfs_inode *ictx;
+
+	wreq = netfs_alloc_request(mapping, file, start, 0, origin);
+	if (IS_ERR(wreq))
+		return wreq;
+
+	_enter("R=%x", wreq->debug_id);
+
+	ictx = netfs_inode(wreq->inode);
+	if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags))
+		fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx));
+
+	/* Collection tracking starts at the beginning of the request. */
+	wreq->contiguity = wreq->start;
+	wreq->cleaned_to = wreq->start;
+	INIT_WORK(&wreq->work, netfs_write_collection_worker);
+
+	/* Stream 0 uploads to the server using the filesystem's ops. */
+	upload = &wreq->io_streams[0];
+	upload->stream_nr	= 0;
+	upload->source		= NETFS_UPLOAD_TO_SERVER;
+	upload->prepare_write	= ictx->ops->prepare_write;
+	upload->issue_write	= ictx->ops->issue_write;
+	upload->collected_to	= start;
+	upload->transferred	= LONG_MAX;
+
+	/* Stream 1 writes to the cache; it only becomes available if the
+	 * cache resources are valid.
+	 */
+	cache = &wreq->io_streams[1];
+	cache->stream_nr	= 1;
+	cache->source		= NETFS_WRITE_TO_CACHE;
+	cache->collected_to	= start;
+	cache->transferred	= LONG_MAX;
+	if (!fscache_resources_valid(&wreq->cache_resources))
+		return wreq;
+
+	cache->avail		= true;
+	cache->prepare_write	= wreq->cache_resources.ops->prepare_write_subreq;
+	cache->issue_write	= wreq->cache_resources.ops->issue_write;
+	return wreq;
+}
+
+/**
+ * netfs_prepare_write_failed - Note write preparation failed
+ * @subreq: The subrequest to mark
+ *
+ * Flag a subrequest as failed because it could not be prepared for writing,
+ * and emit the corresponding trace event.
+ */
+void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq)
+{
+	__set_bit(NETFS_SREQ_FAILED, &subreq->flags);
+	trace_netfs_sreq(subreq, netfs_sreq_trace_prep_failed);
+}
+EXPORT_SYMBOL(netfs_prepare_write_failed);
+
+/*
+ * Prepare a write subrequest.  We need to allocate a new subrequest
+ * if we don't have one.
+ */
+static void netfs_prepare_write(struct netfs_io_request *wreq,
+				struct netfs_io_stream *stream,
+				loff_t start)
+{
+	struct netfs_io_subrequest *subreq;
+
+	subreq = netfs_alloc_subrequest(wreq);
+	subreq->source		= stream->source;
+	subreq->start		= start;
+	subreq->max_len		= ULONG_MAX;	/* Narrowed to wsize below for uploads */
+	subreq->max_nr_segs	= INT_MAX;
+	subreq->stream_nr	= stream->stream_nr;
+
+	_enter("R=%x[%x]", wreq->debug_id, subreq->debug_index);
+
+	trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
+			     refcount_read(&subreq->ref),
+			     netfs_sreq_trace_new);
+
+	trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+
+	switch (stream->source) {
+	case NETFS_UPLOAD_TO_SERVER:
+		netfs_stat(&netfs_n_wh_upload);
+		subreq->max_len = wreq->wsize;
+		break;
+	case NETFS_WRITE_TO_CACHE:
+		netfs_stat(&netfs_n_wh_write);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
+	}
+
+	if (stream->prepare_write)
+		stream->prepare_write(subreq);	/* Stream-specific preparation hook */
+
+	__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+
+	/* We add to the end of the list whilst the collector may be walking
+	 * the list.  The collector only goes nextwards and uses the lock to
+	 * remove entries off of the front.
+	 *
+	 * If the new subrequest is the only entry, it also becomes the
+	 * stream's front and, on first use, the stream is marked active with
+	 * its collection point set to the subrequest's start.
+	 */
+	spin_lock(&wreq->lock);
+	list_add_tail(&subreq->rreq_link, &stream->subrequests);
+	if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
+		stream->front = subreq;
+		if (!stream->active) {
+			stream->collected_to = stream->front->start;
+			/* Write list pointers before active flag */
+			smp_store_release(&stream->active, true);
+		}
+	}
+
+	spin_unlock(&wreq->lock);
+
+	stream->construct = subreq;	/* Now the subrequest being built up */
+}
+
+/*
+ * Set the I/O iterator for the filesystem/cache to use and dispatch the I/O
+ * operation.  The operation may be asynchronous and should call
+ * netfs_write_subrequest_terminated() when complete.
+ */
+static void netfs_do_issue_write(struct netfs_io_stream *stream,
+				 struct netfs_io_subrequest *subreq)
+{
+	struct netfs_io_request *wreq = subreq->rreq;
+
+	_enter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len);
+
+	/* A subrequest that failed preparation is not issued; it is completed
+	 * immediately instead.  It is already queued on the stream and marked
+	 * NETFS_SREQ_IN_PROGRESS, so it must be terminated through the
+	 * collector-aware routine, which clears that flag and wakes the write
+	 * collector.
+	 */
+	if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
+		return new_netfs_write_subrequest_terminated(subreq, subreq->error, false);
+
+	// TODO: Use encrypted buffer
+	if (test_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags)) {
+		/* Carve the untransferred part of this subrequest out of the
+		 * request-wide iterator.
+		 */
+		subreq->io_iter = wreq->io_iter;
+		iov_iter_advance(&subreq->io_iter,
+				 subreq->start + subreq->transferred - wreq->start);
+		iov_iter_truncate(&subreq->io_iter,
+				  subreq->len - subreq->transferred);
+	} else {
+		/* Otherwise read the data directly from the pagecache. */
+		iov_iter_xarray(&subreq->io_iter, ITER_SOURCE, &wreq->mapping->i_pages,
+				subreq->start + subreq->transferred,
+				subreq->len - subreq->transferred);
+	}
+
+	trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+	stream->issue_write(subreq);
+}
+
+/*
+ * Reissue a subrequest that is already on the stream's list: mark it as being
+ * in progress again and dispatch it.
+ */
+void netfs_reissue_write(struct netfs_io_stream *stream,
+			 struct netfs_io_subrequest *subreq)
+{
+	__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+	netfs_do_issue_write(stream, subreq);
+}
+
+/*
+ * Dispatch the subrequest currently under construction on a stream, if there
+ * is one, and detach it from the stream's construction slot.  The request's
+ * length and submitted count are extended to cover the end of the subrequest
+ * if it reaches beyond what has been submitted so far.
+ */
+static void netfs_issue_write(struct netfs_io_request *wreq,
+			      struct netfs_io_stream *stream)
+{
+	struct netfs_io_subrequest *subreq = stream->construct;
+
+	if (!subreq)
+		return;
+	stream->construct = NULL;
+
+	if (subreq->start + subreq->len > wreq->start + wreq->submitted)
+		wreq->len = wreq->submitted = subreq->start + subreq->len - wreq->start;
+	netfs_do_issue_write(stream, subreq);
+}
+
+/*
+ * Add data to the write subrequest, dispatching each as we fill it up or if it
+ * is discontiguous with the previous.  We only fill one part at a time so that
+ * we can avoid overrunning the credits obtained (cifs) and try to parallelise
+ * content-crypto preparation with network writes.
+ */
+int netfs_advance_write(struct netfs_io_request *wreq,
+			struct netfs_io_stream *stream,
+			loff_t start, size_t len, bool to_eof)
+{
+	struct netfs_io_subrequest *subreq = stream->construct;
+	size_t part;
+
+	if (!stream->avail) {
+		_leave("no write");
+		return len;	/* Stream not in use: report it all as consumed */
+	}
+
+	_enter("R=%x[%x]", wreq->debug_id, subreq ? subreq->debug_index : 0);
+
+	if (subreq && start != subreq->start + subreq->len) {
+		netfs_issue_write(wreq, stream);	/* Discontiguous: flush what we have */
+		subreq = NULL;
+	}
+
+	if (!stream->construct)
+		netfs_prepare_write(wreq, stream, start);
+	subreq = stream->construct;
+
+	part = min(subreq->max_len - subreq->len, len);
+	_debug("part %zx/%zx %zx/%zx", subreq->len, subreq->max_len, part, len);
+	subreq->len += part;
+	subreq->nr_segs++;
+
+	if (subreq->len >= subreq->max_len ||
+	    subreq->nr_segs >= subreq->max_nr_segs ||
+	    to_eof) {
+		netfs_issue_write(wreq, stream);	/* Full, or final piece: dispatch now */
+		subreq = NULL;
+	}
+
+	return part;	/* Amount of @len actually added */
+}
+
+/*
+ * Write some of a pending folio data back to the server.
+ *
+ * The folio is offered to each I/O stream of the request (upload and cache)
+ * as appropriate for its state.  The folio is expected to be locked on entry
+ * and is unlocked here.  Always returns 0.
+ */
+static int netfs_write_folio(struct netfs_io_request *wreq,
+			     struct writeback_control *wbc,
+			     struct folio *folio)
+{
+	struct netfs_io_stream *upload = &wreq->io_streams[0];
+	struct netfs_io_stream *cache  = &wreq->io_streams[1];
+	struct netfs_io_stream *stream;
+	struct netfs_group *fgroup; /* TODO: Use this with ceph */
+	struct netfs_folio *finfo;
+	size_t fsize = folio_size(folio), flen = fsize, foff = 0;
+	loff_t fpos = folio_pos(folio), i_size;
+	bool to_eof = false, streamw = false;
+	bool debug = false;
+
+	_enter("");
+
+	/* netfs_perform_write() may shift i_size around the page or from out
+	 * of the page to beyond it, but cannot move i_size into or through the
+	 * page since we have it locked.
+	 */
+	i_size = i_size_read(wreq->inode);
+
+	if (fpos >= i_size) {
+		/* mmap beyond eof.  Nothing to write: pass the folio straight
+		 * through the writeback state and release any group refs.
+		 */
+		_debug("beyond eof");
+		folio_start_writeback(folio);
+		folio_unlock(folio);
+		wreq->nr_group_rel += netfs_folio_written_back(folio);
+		netfs_put_group_many(wreq->group, wreq->nr_group_rel);
+		wreq->nr_group_rel = 0;
+		return 0;
+	}
+
+	if (fpos + fsize > wreq->i_size)
+		wreq->i_size = i_size;
+
+	fgroup = netfs_folio_group(folio);
+	finfo = netfs_folio_info(folio);
+	if (finfo) {
+		foff = finfo->dirty_offset;
+		flen = foff + finfo->dirty_len;	/* For now, the end offset of the dirty data */
+		streamw = true;
+	}
+
+	if (wreq->origin == NETFS_WRITETHROUGH) {
+		to_eof = false;
+		if (flen > i_size - fpos)
+			flen = i_size - fpos;
+	} else if (flen > i_size - fpos) {
+		flen = i_size - fpos;
+		if (!streamw)
+			folio_zero_segment(folio, flen, fsize);
+		to_eof = true;
+	} else if (flen == i_size - fpos) {
+		to_eof = true;
+	}
+	flen -= foff;	/* Convert the end offset into a length */
+
+	_debug("folio %zx %zx %zx", foff, flen, fsize);
+
+	/* Deal with discontinuities in the stream of dirty pages.  These can
+	 * arise from a number of sources:
+	 *
+	 * (1) Intervening non-dirty pages from random-access writes, multiple
+	 *     flushers writing back different parts simultaneously and manual
+	 *     syncing.
+	 *
+	 * (2) Partially-written pages from write-streaming.
+	 *
+	 * (3) Pages that belong to a different write-back group (eg.  Ceph
+	 *     snapshots).
+	 *
+	 * (4) Actually-clean pages that were marked for write to the cache
+	 *     when they were read.  Note that these appear as a special
+	 *     write-back group.
+	 */
+	if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {
+		netfs_issue_write(wreq, upload);
+	} else if (fgroup != wreq->group) {
+		/* We can't write this page to the server yet. */
+		kdebug("wrong group");
+		folio_redirty_for_writepage(wbc, folio);
+		folio_unlock(folio);
+		netfs_issue_write(wreq, upload);
+		netfs_issue_write(wreq, cache);
+		return 0;
+	}
+
+	if (foff > 0)
+		netfs_issue_write(wreq, upload);
+	if (streamw)
+		netfs_issue_write(wreq, cache);
+
+	/* Flip the page to the writeback state and unlock.  If we're called
+	 * from write-through, then the page has already been put into the wb
+	 * state.
+	 */
+	if (wreq->origin == NETFS_WRITEBACK)
+		folio_start_writeback(folio);
+	folio_unlock(folio);
+
+	if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {
+		if (!fscache_resources_valid(&wreq->cache_resources)) {
+			trace_netfs_folio(folio, netfs_folio_trace_cancel_copy);
+			netfs_issue_write(wreq, upload);
+			netfs_folio_written_back(folio);
+			return 0;
+		}
+		trace_netfs_folio(folio, netfs_folio_trace_store_copy);
+	} else if (!upload->construct) {
+		trace_netfs_folio(folio, netfs_folio_trace_store);
+	} else {
+		trace_netfs_folio(folio, netfs_folio_trace_store_plus);
+	}
+
+	/* Move the submission point forward to allow for write-streaming data
+	 * not starting at the front of the page.  We don't do write-streaming
+	 * with the cache as the cache requires DIO alignment.
+	 *
+	 * Also skip uploading for data that's been read and just needs copying
+	 * to the cache.
+	 */
+	for (int s = 0; s < NR_IO_STREAMS; s++) {
+		stream = &wreq->io_streams[s];
+		stream->submit_max_len = fsize;
+		stream->submit_off = foff;
+		stream->submit_len = flen;
+		if ((stream->source == NETFS_WRITE_TO_CACHE && streamw) ||
+		    (stream->source == NETFS_UPLOAD_TO_SERVER &&
+		     fgroup == NETFS_FOLIO_COPY_TO_CACHE)) {
+			stream->submit_off = UINT_MAX;
+			stream->submit_len = 0;	/* Nothing for this stream from this folio */
+			stream->submit_max_len = 0;
+		}
+	}
+
+	/* Attach the folio to one or more subrequests.  For a big folio, we
+	 * could end up with thousands of subrequests if the wsize is small -
+	 * but we might need to wait during the creation of subrequests for
+	 * network resources (eg. SMB credits).
+	 */
+	for (;;) {
+		ssize_t part;
+		size_t lowest_off = ULONG_MAX;
+		int choose_s = -1;
+
+		/* Always add to the lowest-submitted stream first. */
+		for (int s = 0; s < NR_IO_STREAMS; s++) {
+			stream = &wreq->io_streams[s];
+			if (stream->submit_len > 0 &&
+			    stream->submit_off < lowest_off) {
+				lowest_off = stream->submit_off;
+				choose_s = s;
+			}
+		}
+
+		if (choose_s < 0)
+			break;	/* No stream has anything left to submit */
+		stream = &wreq->io_streams[choose_s];
+
+		part = netfs_advance_write(wreq, stream, fpos + stream->submit_off,
+					   stream->submit_len, to_eof);
+		atomic64_set(&wreq->issued_to, fpos + stream->submit_off);
+		stream->submit_off += part;
+		stream->submit_max_len -= part;
+		if (part > stream->submit_len)
+			stream->submit_len = 0;
+		else
+			stream->submit_len -= part;
+		if (part > 0)
+			debug = true;
+	}
+
+	atomic64_set(&wreq->issued_to, fpos + fsize);
+
+	if (!debug)
+		kdebug("R=%x: No submit", wreq->debug_id);
+
+	if (flen < fsize)
+		for (int s = 0; s < NR_IO_STREAMS; s++)
+			netfs_issue_write(wreq, &wreq->io_streams[s]);
+
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * Write some of the pending data back to the server
+ *
+ * Writeback is serialised on the inode's wb_lock: WB_SYNC_ALL callers wait
+ * for the lock, other callers give up and return 0 if it is contended.  The
+ * first dirty folio is obtained before the request is created so that its
+ * position can be used as the request's start.
+ */
+int new_netfs_writepages(struct address_space *mapping,
+			 struct writeback_control *wbc)
+{
+	struct netfs_inode *ictx = netfs_inode(mapping->host);
+	struct netfs_io_request *wreq = NULL;
+	struct folio *folio;
+	int error = 0;
+
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		mutex_lock(&ictx->wb_lock);
+	else if (!mutex_trylock(&ictx->wb_lock))
+		return 0;
+
+	/* Need the first folio to be able to set up the op. */
+	folio = writeback_iter(mapping, wbc, NULL, &error);
+	if (!folio)
+		goto out;
+
+	wreq = netfs_create_write_req(mapping, NULL, folio_pos(folio), NETFS_WRITEBACK);
+	if (IS_ERR(wreq)) {
+		error = PTR_ERR(wreq);
+		goto couldnt_start;
+	}
+
+	trace_netfs_write(wreq, netfs_write_trace_writeback);
+
+	do {
+		_debug("wbiter %lx %llx", folio->index, wreq->start + wreq->submitted);
+
+		/* It appears we don't have to handle cyclic writeback wrapping. */
+		WARN_ON_ONCE(wreq && folio_pos(folio) < wreq->start + wreq->submitted);
+
+		if (netfs_folio_group(folio) != NETFS_FOLIO_COPY_TO_CACHE &&
+		    unlikely(!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))) {
+			set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+			wreq->netfs_ops->begin_writeback(wreq);	/* Called once, on the first folio needing upload */
+		}
+
+		error = netfs_write_folio(wreq, wbc, folio);
+		if (error < 0)
+			break;
+	} while ((folio = writeback_iter(mapping, wbc, folio, &error)));
+
+	for (int s = 0; s < NR_IO_STREAMS; s++)
+		netfs_issue_write(wreq, &wreq->io_streams[s]);
+	smp_wmb(); /* Write lists before ALL_QUEUED. */
+	set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
+
+	mutex_unlock(&ictx->wb_lock);
+
+	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+	_leave(" = %d", error);
+	return error;
+
+couldnt_start:
+	netfs_kill_dirty_pages(mapping, wbc, folio);
+out:
+	mutex_unlock(&ictx->wb_lock);
+	_leave(" = %d", error);
+	return error;
+}
+EXPORT_SYMBOL(new_netfs_writepages);
+
+/*
+ * Begin a write operation for writing through the pagecache.
+ *
+ * On success, this returns with the inode's wb_lock held; on failure the lock
+ * is dropped again before the error pointer is returned.
+ */
+struct netfs_io_request *new_netfs_begin_writethrough(struct kiocb *iocb, size_t len)
+{
+	struct netfs_io_request *wreq = NULL;
+	struct netfs_inode *ictx = netfs_inode(file_inode(iocb->ki_filp));
+
+	mutex_lock(&ictx->wb_lock);
+
+	wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp,
+				      iocb->ki_pos, NETFS_WRITETHROUGH);
+	if (IS_ERR(wreq)) {
+		mutex_unlock(&ictx->wb_lock);
+		return wreq;
+	}
+
+	wreq->io_streams[0].avail = true;
+	trace_netfs_write(wreq, netfs_write_trace_writethrough);
+	return wreq;
+}
+
+/*
+ * Advance the state of the write operation used when writing through the
+ * pagecache.  Data has been copied into the pagecache that we need to append
+ * to the request.  If we've added more than wsize then we need to create a new
+ * subrequest.
+ */
+int new_netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+				   struct folio *folio, size_t copied, bool to_page_end,
+				   struct folio **writethrough_cache)
+{
+	_enter("R=%x ic=%zu ws=%u cp=%zu tp=%u",
+	       wreq->debug_id, wreq->iter.count, wreq->wsize, copied, to_page_end);
+
+	if (!*writethrough_cache) {
+		/* First write into this folio: move it into the writeback
+		 * state and remember it so that further writes to the same
+		 * folio don't repeat this.
+		 */
+		if (folio_test_dirty(folio)) {
+			/* Sigh.  mmap. */
+			folio_clear_dirty_for_io(folio);
+		}
+
+		/* We can make multiple writes to the folio... */
+		folio_start_writeback(folio);
+		trace_netfs_folio(folio, wreq->len == 0 ?
+				  netfs_folio_trace_wthru :
+				  netfs_folio_trace_wthru_plus);
+		*writethrough_cache = folio;
+	}
+
+	wreq->len += copied;
+	if (to_page_end) {
+		/* The folio is complete, so hand it over for writing. */
+		*writethrough_cache = NULL;
+		return netfs_write_folio(wreq, wbc, folio);
+	}
+
+	return 0;
+}
+
+/*
+ * Finish off a write-through operation: write out any folio still being
+ * accumulated, flush the subrequests under construction on each stream, mark
+ * the request as fully queued and release the inode's wb_lock.  Returns the
+ * request's error as sampled before our ref on the request is dropped.
+ */
+int new_netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+			       struct folio *writethrough_cache)
+{
+	struct netfs_inode *ictx = netfs_inode(wreq->inode);
+	int ret;
+
+	_enter("R=%x", wreq->debug_id);
+
+	if (writethrough_cache)
+		netfs_write_folio(wreq, wbc, writethrough_cache);
+
+	for (int s = 0; s < NR_IO_STREAMS; s++)
+		netfs_issue_write(wreq, &wreq->io_streams[s]);
+	smp_wmb(); /* Write lists before ALL_QUEUED. */
+	set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
+
+	mutex_unlock(&ictx->wb_lock);
+
+	ret = wreq->error;
+	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+	return ret;
+}
+
+/*
+ * Write data to the server without going through the pagecache and without
+ * writing it to the local cache.
+ */
+int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len)
+{
+	struct netfs_io_stream *upload = &wreq->io_streams[0];
+	ssize_t part;
+	loff_t start = wreq->start;
+	int error = 0;	/* Never changed below, so this function returns 0 */
+
+	_enter("%zx", len);
+
+	if (wreq->origin == NETFS_DIO_WRITE)
+		inode_dio_begin(wreq->inode);
+
+	while (len) {
+		// TODO: Prepare content encryption
+
+		_debug("unbuffered %zx", len);
+		part = netfs_advance_write(wreq, upload, start, len, false);
+		start += part;
+		len -= part;
+		if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
+			trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause);
+			wait_on_bit(&wreq->flags, NETFS_RREQ_PAUSE, TASK_UNINTERRUPTIBLE);
+		}
+		if (test_bit(NETFS_RREQ_FAILED, &wreq->flags))
+			break;	/* Stop generating subrequests */
+	}
+
+	netfs_issue_write(wreq, upload);	/* Flush any partly-built subrequest */
+
+	smp_wmb(); /* Write lists before ALL_QUEUED. */
+	set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
+	if (list_empty(&upload->subrequests))
+		netfs_wake_write_collector(wreq, false);	/* Nothing left queued to wake it for us */
+
+	_leave(" = %d", error);
+	return error;
+}
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 88269681d4fc..42dba05a428b 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -64,6 +64,7 @@ struct netfs_inode {
 #if IS_ENABLED(CONFIG_FSCACHE)
 	struct fscache_cookie	*cache;
 #endif
+	struct mutex		wb_lock;	/* Writeback serialisation */
 	loff_t			remote_i_size;	/* Size of the remote file */
 	loff_t			zero_point;	/* Size after which we assume there's no data
 						 * on the server */
@@ -71,7 +72,6 @@ struct netfs_inode {
 #define NETFS_ICTX_ODIRECT	0		/* The file has DIO in progress */
 #define NETFS_ICTX_UNBUFFERED	1		/* I/O should not use the pagecache */
 #define NETFS_ICTX_WRITETHROUGH	2		/* Write-through caching */
-#define NETFS_ICTX_NO_WRITE_STREAMING	3	/* Don't engage in write-streaming */
 #define NETFS_ICTX_USE_PGPRIV2	31		/* [DEPRECATED] Use PG_private_2 to mark
 						 * write to cache on read */
 };
@@ -126,6 +126,33 @@ static inline struct netfs_group *netfs_folio_group(struct folio *folio)
 	return priv;
 }

+/*
+ * 
Stream of I/O subrequests going to a particular destination, such as the
+ * server or the local cache.  This is mainly intended for writing where we may
+ * have to write to multiple destinations concurrently.
+ *
+ * The first group of members is used while subrequests are being generated;
+ * the second group is used while their results are being collected.
+ */
+struct netfs_io_stream {
+	/* Submission tracking */
+	struct netfs_io_subrequest *construct;	/* Op being constructed */
+	unsigned int		submit_off;	/* Folio offset we're submitting from */
+	unsigned int		submit_len;	/* Amount of data left to submit */
+	unsigned int		submit_max_len;	/* Amount I/O can be rounded up to */
+	void (*prepare_write)(struct netfs_io_subrequest *subreq);
+	void (*issue_write)(struct netfs_io_subrequest *subreq);
+	/* Collection tracking */
+	struct list_head	subrequests;	/* Contributory I/O operations */
+	struct netfs_io_subrequest *front;	/* Op being collected */
+	unsigned long long	collected_to;	/* Position we've collected results to */
+	size_t			transferred;	/* The amount transferred from this stream */
+	enum netfs_io_source	source;		/* Where to read from/write to */
+	short			error;		/* Aggregate error for the stream; signed as it
+						 * is copied from subreq->error (0 or -errno) */
+	unsigned char		stream_nr;	/* Index of stream in parent table */
+	bool			avail;		/* T if stream is available */
+	bool			active;		/* T if stream is active */
+	bool			need_retry;	/* T if this stream needs retrying */
+	bool			failed;		/* T if this stream failed */
+};
+
 /*
  * Resources required to do operations on a cache.
*/ @@ -150,13 +177,16 @@ struct netfs_io_subrequest { struct list_head rreq_link; /* Link in rreq->subrequests */ struct iov_iter io_iter; /* Iterator for this subrequest */ unsigned long long start; /* Where to start the I/O */ + size_t max_len; /* Maximum size of the I/O */ size_t len; /* Size of the I/O */ size_t transferred; /* Amount of data transferred */ refcount_t ref; short error; /* 0 or error that occurred */ unsigned short debug_index; /* Index in list (for debugging output) */ + unsigned int nr_segs; /* Number of segs in io_iter */ unsigned int max_nr_segs; /* 0 or max number of segments in an iterator */ enum netfs_io_source source; /* Where to read from/write to */ + unsigned char stream_nr; /* I/O stream this belongs to */ unsigned long flags; #define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */ #define NETFS_SREQ_CLEAR_TAIL 1 /* Set if the rest of the read should be cleared */ @@ -164,6 +194,11 @@ struct netfs_io_subrequest { #define NETFS_SREQ_SEEK_DATA_READ 3 /* Set if ->read() should SEEK_DATA first */ #define NETFS_SREQ_NO_PROGRESS 4 /* Set if we didn't manage to read any data */ #define NETFS_SREQ_ONDEMAND 5 /* Set if it's from on-demand read mode */ +#define NETFS_SREQ_BOUNDARY 6 /* Set if ends on hard boundary (eg. 
ceph object) */ +#define NETFS_SREQ_IN_PROGRESS 8 /* Unlocked when the subrequest completes */ +#define NETFS_SREQ_NEED_RETRY 9 /* Set if the filesystem requests a retry */ +#define NETFS_SREQ_RETRYING 10 /* Set if we're retrying */ +#define NETFS_SREQ_FAILED 11 /* Set if the subreq failed unretryably */ }; enum netfs_io_origin { @@ -194,6 +229,9 @@ struct netfs_io_request { struct netfs_cache_resources cache_resources; struct list_head proc_link; /* Link in netfs_iorequests */ struct list_head subrequests; /* Contributory I/O operations */ + struct netfs_io_stream io_streams[2]; /* Streams of parallel I/O operations */ +#define NR_IO_STREAMS 2 //wreq->nr_io_streams + struct netfs_group *group; /* Writeback group being written back */ struct iov_iter iter; /* Unencrypted-side iterator */ struct iov_iter io_iter; /* I/O (Encrypted-side) iterator */ void *netfs_priv; /* Private data for the netfs */ @@ -203,6 +241,8 @@ struct netfs_io_request { unsigned int rsize; /* Maximum read size (0 for none) */ unsigned int wsize; /* Maximum write size (0 for none) */ atomic_t subreq_counter; /* Next subreq->debug_index */ + unsigned int nr_group_rel; /* Number of refs to release on ->group */ + spinlock_t lock; /* Lock for queuing subreqs */ atomic_t nr_outstanding; /* Number of ops in progress */ atomic_t nr_copy_ops; /* Number of copy-to-cache ops in progress */ size_t upper_len; /* Length can be extended to here */ @@ -214,6 +254,10 @@ struct netfs_io_request { bool direct_bv_unpin; /* T if direct_bv[] must be unpinned */ unsigned long long i_size; /* Size of the file */ unsigned long long start; /* Start position */ + atomic64_t issued_to; /* Write issuer folio cursor */ + unsigned long long contiguity; /* Tracking for gaps in the writeback sequence */ + unsigned long long collected_to; /* Point we've collected to */ + unsigned long long cleaned_to; /* Position we've cleaned folios to */ pgoff_t no_unlock_folio; /* Don't unlock this folio after read */ refcount_t ref; 
unsigned long flags; @@ -227,6 +271,9 @@ struct netfs_io_request { #define NETFS_RREQ_UPLOAD_TO_SERVER 8 /* Need to write to the server */ #define NETFS_RREQ_NONBLOCK 9 /* Don't block if possible (O_NONBLOCK) */ #define NETFS_RREQ_BLOCKED 10 /* We blocked */ +#define NETFS_RREQ_PAUSE 11 /* Pause subrequest generation */ +#define NETFS_RREQ_USE_IO_ITER 12 /* Use ->io_iter rather than ->i_pages */ +#define NETFS_RREQ_ALL_QUEUED 13 /* All subreqs are now queued */ #define NETFS_RREQ_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark * write to cache on read */ const struct netfs_request_ops *netfs_ops; @@ -258,6 +305,9 @@ struct netfs_request_ops { /* Write request handling */ void (*create_write_requests)(struct netfs_io_request *wreq, loff_t start, size_t len); + void (*begin_writeback)(struct netfs_io_request *wreq); + void (*prepare_write)(struct netfs_io_subrequest *subreq); + void (*issue_write)(struct netfs_io_subrequest *subreq); void (*invalidate_cache)(struct netfs_io_request *wreq); }; @@ -292,6 +342,9 @@ struct netfs_cache_ops { netfs_io_terminated_t term_func, void *term_func_priv); + /* Write data to the cache from a netfs subrequest. */ + void (*issue_write)(struct netfs_io_subrequest *subreq); + /* Expand readahead request */ void (*expand_readahead)(struct netfs_cache_resources *cres, unsigned long long *_start, @@ -304,6 +357,13 @@ struct netfs_cache_ops { enum netfs_io_source (*prepare_read)(struct netfs_io_subrequest *subreq, unsigned long long i_size); + /* Prepare a write subrequest, working out if we're allowed to do it + * and finding out the maximum amount of data to gather before + * attempting to submit. If we're not permitted to do it, the + * subrequest should be marked failed. + */ + void (*prepare_write_subreq)(struct netfs_io_subrequest *subreq); + /* Prepare a write operation, working out what part of the write we can * actually do. 
*/ @@ -349,6 +409,8 @@ int netfs_write_begin(struct netfs_inode *, struct file *, struct folio **, void **fsdata); int netfs_writepages(struct address_space *mapping, struct writeback_control *wbc); +int new_netfs_writepages(struct address_space *mapping, + struct writeback_control *wbc); bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio); int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc); void netfs_clear_inode_writeback(struct inode *inode, const void *aux); @@ -372,8 +434,11 @@ size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset, struct netfs_io_subrequest *netfs_create_write_request( struct netfs_io_request *wreq, enum netfs_io_source dest, loff_t start, size_t len, work_func_t worker); +void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq); void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, bool was_async); +void new_netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, + bool was_async); void netfs_queue_write_request(struct netfs_io_subrequest *subreq); int netfs_start_io_read(struct inode *inode); @@ -415,6 +480,7 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, #if IS_ENABLED(CONFIG_FSCACHE) ctx->cache = NULL; #endif + mutex_init(&ctx->wb_lock); /* ->releasepage() drives zero_point */ if (use_zero_point) { ctx->zero_point = ctx->remote_i_size; diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 7126d2ea459c..e7700172ae7e 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -44,14 +44,18 @@ #define netfs_rreq_traces \ EM(netfs_rreq_trace_assess, "ASSESS ") \ EM(netfs_rreq_trace_copy, "COPY ") \ + EM(netfs_rreq_trace_collect, "COLLECT") \ EM(netfs_rreq_trace_done, "DONE ") \ EM(netfs_rreq_trace_free, "FREE ") \ EM(netfs_rreq_trace_redirty, "REDIRTY") \ EM(netfs_rreq_trace_resubmit, "RESUBMT") \ + EM(netfs_rreq_trace_set_pause, "PAUSE ") \ 
EM(netfs_rreq_trace_unlock, "UNLOCK ") \ EM(netfs_rreq_trace_unmark, "UNMARK ") \ EM(netfs_rreq_trace_wait_ip, "WAIT-IP") \ + EM(netfs_rreq_trace_wait_pause, "WT-PAUS") \ EM(netfs_rreq_trace_wake_ip, "WAKE-IP") \ + EM(netfs_rreq_trace_unpause, "UNPAUSE") \ E_(netfs_rreq_trace_write_done, "WR-DONE") #define netfs_sreq_sources \ @@ -64,11 +68,15 @@ E_(NETFS_INVALID_WRITE, "INVL") #define netfs_sreq_traces \ + EM(netfs_sreq_trace_discard, "DSCRD") \ EM(netfs_sreq_trace_download_instead, "RDOWN") \ + EM(netfs_sreq_trace_fail, "FAIL ") \ EM(netfs_sreq_trace_free, "FREE ") \ EM(netfs_sreq_trace_limited, "LIMIT") \ EM(netfs_sreq_trace_prepare, "PREP ") \ + EM(netfs_sreq_trace_prep_failed, "PRPFL") \ EM(netfs_sreq_trace_resubmit_short, "SHORT") \ + EM(netfs_sreq_trace_retry, "RETRY") \ EM(netfs_sreq_trace_submit, "SUBMT") \ EM(netfs_sreq_trace_terminated, "TERM ") \ EM(netfs_sreq_trace_write, "WRITE") \ @@ -88,6 +96,7 @@ #define netfs_rreq_ref_traces \ EM(netfs_rreq_trace_get_for_outstanding,"GET OUTSTND") \ EM(netfs_rreq_trace_get_subreq, "GET SUBREQ ") \ + EM(netfs_rreq_trace_get_work, "GET WORK ") \ EM(netfs_rreq_trace_put_complete, "PUT COMPLT ") \ EM(netfs_rreq_trace_put_discard, "PUT DISCARD") \ EM(netfs_rreq_trace_put_failed, "PUT FAILED ") \ @@ -95,6 +104,8 @@ EM(netfs_rreq_trace_put_return, "PUT RETURN ") \ EM(netfs_rreq_trace_put_subreq, "PUT SUBREQ ") \ EM(netfs_rreq_trace_put_work, "PUT WORK ") \ + EM(netfs_rreq_trace_put_work_complete, "PUT WORK CP") \ + EM(netfs_rreq_trace_put_work_nq, "PUT WORK NQ") \ EM(netfs_rreq_trace_see_work, "SEE WORK ") \ E_(netfs_rreq_trace_new, "NEW ") @@ -103,11 +114,14 @@ EM(netfs_sreq_trace_get_resubmit, "GET RESUBMIT") \ EM(netfs_sreq_trace_get_short_read, "GET SHORTRD") \ EM(netfs_sreq_trace_new, "NEW ") \ + EM(netfs_sreq_trace_put_cancel, "PUT CANCEL ") \ EM(netfs_sreq_trace_put_clear, "PUT CLEAR ") \ EM(netfs_sreq_trace_put_discard, "PUT DISCARD") \ + EM(netfs_sreq_trace_put_done, "PUT DONE ") \ 
EM(netfs_sreq_trace_put_failed, "PUT FAILED ") \ EM(netfs_sreq_trace_put_merged, "PUT MERGED ") \ EM(netfs_sreq_trace_put_no_copy, "PUT NO COPY") \ + EM(netfs_sreq_trace_put_oom, "PUT OOM ") \ EM(netfs_sreq_trace_put_wip, "PUT WIP ") \ EM(netfs_sreq_trace_put_work, "PUT WORK ") \ E_(netfs_sreq_trace_put_terminated, "PUT TERM ") @@ -124,7 +138,9 @@ EM(netfs_streaming_filled_page, "mod-streamw-f") \ EM(netfs_streaming_cont_filled_page, "mod-streamw-f+") \ /* The rest are for writeback */ \ + EM(netfs_folio_trace_cancel_copy, "cancel-copy") \ EM(netfs_folio_trace_clear, "clear") \ + EM(netfs_folio_trace_clear_cc, "clear-cc") \ EM(netfs_folio_trace_clear_s, "clear-s") \ EM(netfs_folio_trace_clear_g, "clear-g") \ EM(netfs_folio_trace_copy, "copy") \ @@ -133,16 +149,26 @@ EM(netfs_folio_trace_end_copy, "end-copy") \ EM(netfs_folio_trace_filled_gaps, "filled-gaps") \ EM(netfs_folio_trace_kill, "kill") \ + EM(netfs_folio_trace_kill_cc, "kill-cc") \ + EM(netfs_folio_trace_kill_g, "kill-g") \ + EM(netfs_folio_trace_kill_s, "kill-s") \ EM(netfs_folio_trace_mkwrite, "mkwrite") \ EM(netfs_folio_trace_mkwrite_plus, "mkwrite+") \ + EM(netfs_folio_trace_not_under_wback, "!wback") \ EM(netfs_folio_trace_read_gaps, "read-gaps") \ EM(netfs_folio_trace_redirty, "redirty") \ EM(netfs_folio_trace_redirtied, "redirtied") \ EM(netfs_folio_trace_store, "store") \ + EM(netfs_folio_trace_store_copy, "store-copy") \ EM(netfs_folio_trace_store_plus, "store+") \ EM(netfs_folio_trace_wthru, "wthru") \ E_(netfs_folio_trace_wthru_plus, "wthru+") +#define netfs_collect_contig_traces \ + EM(netfs_contig_trace_collect, "Collect") \ + EM(netfs_contig_trace_jump, "-->JUMP-->") \ + E_(netfs_contig_trace_unlock, "Unlock") + #ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY #define __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY @@ -159,6 +185,7 @@ enum netfs_failure { netfs_failures } __mode(byte); enum netfs_rreq_ref_trace { netfs_rreq_ref_traces } __mode(byte); enum netfs_sreq_ref_trace { netfs_sreq_ref_traces } 
__mode(byte); enum netfs_folio_trace { netfs_folio_traces } __mode(byte); +enum netfs_collect_contig_trace { netfs_collect_contig_traces } __mode(byte); #endif @@ -180,6 +207,7 @@ netfs_failures; netfs_rreq_ref_traces; netfs_sreq_ref_traces; netfs_folio_traces; +netfs_collect_contig_traces; /* * Now redefine the EM() and E_() macros to map the enums to the strings that @@ -413,16 +441,18 @@ TRACE_EVENT(netfs_write_iter, __field(unsigned long long, start ) __field(size_t, len ) __field(unsigned int, flags ) + __field(unsigned int, ino ) ), TP_fast_assign( __entry->start = iocb->ki_pos; __entry->len = iov_iter_count(from); + __entry->ino = iocb->ki_filp->f_inode->i_ino; __entry->flags = iocb->ki_flags; ), - TP_printk("WRITE-ITER s=%llx l=%zx f=%x", - __entry->start, __entry->len, __entry->flags) + TP_printk("WRITE-ITER i=%x s=%llx l=%zx f=%x", + __entry->ino, __entry->start, __entry->len, __entry->flags) ); TRACE_EVENT(netfs_write, @@ -434,6 +464,7 @@ TRACE_EVENT(netfs_write, TP_STRUCT__entry( __field(unsigned int, wreq ) __field(unsigned int, cookie ) + __field(unsigned int, ino ) __field(enum netfs_write_trace, what ) __field(unsigned long long, start ) __field(unsigned long long, len ) @@ -444,18 +475,213 @@ TRACE_EVENT(netfs_write, struct fscache_cookie *__cookie = netfs_i_cookie(__ctx); __entry->wreq = wreq->debug_id; __entry->cookie = __cookie ? 
__cookie->debug_id : 0; + __entry->ino = wreq->inode->i_ino; __entry->what = what; __entry->start = wreq->start; __entry->len = wreq->len; ), - TP_printk("R=%08x %s c=%08x by=%llx-%llx", + TP_printk("R=%08x %s c=%08x i=%x by=%llx-%llx", __entry->wreq, __print_symbolic(__entry->what, netfs_write_traces), __entry->cookie, + __entry->ino, __entry->start, __entry->start + __entry->len - 1) ); +TRACE_EVENT(netfs_collect, + TP_PROTO(const struct netfs_io_request *wreq), + + TP_ARGS(wreq), + + TP_STRUCT__entry( + __field(unsigned int, wreq ) + __field(unsigned int, len ) + __field(unsigned long long, transferred ) + __field(unsigned long long, start ) + ), + + TP_fast_assign( + __entry->wreq = wreq->debug_id; + __entry->start = wreq->start; + __entry->len = wreq->len; + __entry->transferred = wreq->transferred; + ), + + TP_printk("R=%08x s=%llx-%llx", + __entry->wreq, + __entry->start + __entry->transferred, + __entry->start + __entry->len) + ); + +TRACE_EVENT(netfs_collect_contig, + TP_PROTO(const struct netfs_io_request *wreq, unsigned long long to, + enum netfs_collect_contig_trace type), + + TP_ARGS(wreq, to, type), + + TP_STRUCT__entry( + __field(unsigned int, wreq) + __field(enum netfs_collect_contig_trace, type) + __field(unsigned long long, contiguity) + __field(unsigned long long, to) + ), + + TP_fast_assign( + __entry->wreq = wreq->debug_id; + __entry->type = type; + __entry->contiguity = wreq->contiguity; + __entry->to = to; + ), + + TP_printk("R=%08x %llx -> %llx %s", + __entry->wreq, + __entry->contiguity, + __entry->to, + __print_symbolic(__entry->type, netfs_collect_contig_traces)) + ); + +TRACE_EVENT(netfs_collect_sreq, + TP_PROTO(const struct netfs_io_request *wreq, + const struct netfs_io_subrequest *subreq), + + TP_ARGS(wreq, subreq), + + TP_STRUCT__entry( + __field(unsigned int, wreq ) + __field(unsigned int, subreq ) + __field(unsigned int, stream ) + __field(unsigned int, len ) + __field(unsigned int, transferred ) + __field(unsigned long long, start 
) + ), + + TP_fast_assign( + __entry->wreq = wreq->debug_id; + __entry->subreq = subreq->debug_index; + __entry->stream = subreq->stream_nr; + __entry->start = subreq->start; + __entry->len = subreq->len; + __entry->transferred = subreq->transferred; + ), + + TP_printk("R=%08x[%u:%02x] s=%llx t=%x/%x", + __entry->wreq, __entry->stream, __entry->subreq, + __entry->start, __entry->transferred, __entry->len) + ); + +TRACE_EVENT(netfs_collect_folio, + TP_PROTO(const struct netfs_io_request *wreq, + const struct folio *folio, + unsigned long long fend, + unsigned long long collected_to), + + TP_ARGS(wreq, folio, fend, collected_to), + + TP_STRUCT__entry( + __field(unsigned int, wreq ) + __field(unsigned long, index ) + __field(unsigned long long, fend ) + __field(unsigned long long, cleaned_to ) + __field(unsigned long long, collected_to ) + ), + + TP_fast_assign( + __entry->wreq = wreq->debug_id; + __entry->index = folio->index; + __entry->fend = fend; + __entry->cleaned_to = wreq->cleaned_to; + __entry->collected_to = collected_to; + ), + + TP_printk("R=%08x ix=%05lx r=%llx-%llx t=%llx/%llx", + __entry->wreq, __entry->index, + (unsigned long long)__entry->index * PAGE_SIZE, __entry->fend, + __entry->cleaned_to, __entry->collected_to) + ); + +TRACE_EVENT(netfs_collect_state, + TP_PROTO(const struct netfs_io_request *wreq, + unsigned long long collected_to, + unsigned int notes), + + TP_ARGS(wreq, collected_to, notes), + + TP_STRUCT__entry( + __field(unsigned int, wreq ) + __field(unsigned int, notes ) + __field(unsigned long long, collected_to ) + __field(unsigned long long, cleaned_to ) + __field(unsigned long long, contiguity ) + ), + + TP_fast_assign( + __entry->wreq = wreq->debug_id; + __entry->notes = notes; + __entry->collected_to = collected_to; + __entry->cleaned_to = wreq->cleaned_to; + __entry->contiguity = wreq->contiguity; + ), + + TP_printk("R=%08x cto=%llx fto=%llx ctg=%llx n=%x", + __entry->wreq, __entry->collected_to, + __entry->cleaned_to, 
__entry->contiguity, + __entry->notes) + ); + +TRACE_EVENT(netfs_collect_gap, + TP_PROTO(const struct netfs_io_request *wreq, + const struct netfs_io_stream *stream, + unsigned long long jump_to, char type), + + TP_ARGS(wreq, stream, jump_to, type), + + TP_STRUCT__entry( + __field(unsigned int, wreq) + __field(unsigned char, stream) + __field(unsigned char, type) + __field(unsigned long long, from) + __field(unsigned long long, to) + ), + + TP_fast_assign( + __entry->wreq = wreq->debug_id; + __entry->stream = stream->stream_nr; + __entry->from = stream->collected_to; + __entry->to = jump_to; + __entry->type = type; + ), + + TP_printk("R=%08x[%x:] %llx->%llx %c", + __entry->wreq, __entry->stream, + __entry->from, __entry->to, __entry->type) + ); + +TRACE_EVENT(netfs_collect_stream, + TP_PROTO(const struct netfs_io_request *wreq, + const struct netfs_io_stream *stream), + + TP_ARGS(wreq, stream), + + TP_STRUCT__entry( + __field(unsigned int, wreq) + __field(unsigned char, stream) + __field(unsigned long long, collected_to) + __field(unsigned long long, front) + ), + + TP_fast_assign( + __entry->wreq = wreq->debug_id; + __entry->stream = stream->stream_nr; + __entry->collected_to = stream->collected_to; + __entry->front = stream->front ? stream->front->start : UINT_MAX; + ), + + TP_printk("R=%08x[%x:] cto=%llx frn=%llx", + __entry->wreq, __entry->stream, + __entry->collected_to, __entry->front) + ); + #undef EM #undef E_ #endif /* _TRACE_NETFS_H */ -- cgit v1.2.3 From 2df86547b23dabcd02ab000a24ed7813606c269f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 8 Mar 2024 12:36:05 +0000 Subject: netfs: Cut over to using new writeback code Cut over to using the new writeback code. The old code is #ifdef'd out or otherwise removed from compilation to avoid conflicts and will be removed in a future patch. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Dominique Martinet cc: Christian Schoenebeck cc: Marc Dionne cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org --- fs/9p/vfs_addr.c | 6 ++---- fs/afs/file.c | 3 +-- fs/afs/internal.h | 1 - fs/afs/write.c | 2 ++ fs/netfs/Makefile | 1 - fs/netfs/buffered_write.c | 45 ++++++++++++++++++++++++--------------------- fs/netfs/direct_write.c | 26 ++++++++++++++------------ fs/netfs/internal.h | 21 ++++++--------------- fs/netfs/write_collect.c | 8 ++++---- fs/netfs/write_issue.c | 18 +++++++++--------- include/linux/netfs.h | 9 --------- 11 files changed, 62 insertions(+), 78 deletions(-) (limited to 'include/linux/netfs.h') diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 07d03efdd594..4845e655bc39 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -60,6 +60,7 @@ static void v9fs_issue_write(struct netfs_io_subrequest *subreq) netfs_write_subrequest_terminated(subreq, len ?: err, false); } +#if 0 // TODO: Remove static void v9fs_upload_to_server(struct netfs_io_subrequest *subreq) { struct p9_fid *fid = subreq->rreq->netfs_priv; @@ -91,6 +92,7 @@ static void v9fs_create_write_requests(struct netfs_io_request *wreq, loff_t sta if (subreq) netfs_queue_write_request(subreq); } +#endif /** * v9fs_issue_read - Issue a read from 9P @@ -121,18 +123,15 @@ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file) { struct p9_fid *fid; bool writing = (rreq->origin == NETFS_READ_FOR_WRITE || - rreq->origin == NETFS_WRITEBACK || rreq->origin == NETFS_WRITETHROUGH || rreq->origin == NETFS_UNBUFFERED_WRITE || rreq->origin == NETFS_DIO_WRITE); -#if 0 // TODO: Cut over if (rreq->origin == NETFS_WRITEBACK) return 0; /* We don't get the write handle until we find we * have actually dirty data and not just * copy-to-cache data. 
*/ -#endif if (file) { fid = file->private_data; @@ -179,7 +178,6 @@ const struct netfs_request_ops v9fs_req_ops = { .issue_read = v9fs_issue_read, .begin_writeback = v9fs_begin_writeback, .issue_write = v9fs_issue_write, - .create_write_requests = v9fs_create_write_requests, }; const struct address_space_operations v9fs_addr_operations = { diff --git a/fs/afs/file.c b/fs/afs/file.c index db9ebae84fa2..8f983e3ecae7 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -353,7 +353,7 @@ static int afs_init_request(struct netfs_io_request *rreq, struct file *file) if (file) rreq->netfs_priv = key_get(afs_file_key(file)); rreq->rsize = 256 * 1024; - rreq->wsize = 256 * 1024; + rreq->wsize = 256 * 1024 * 1024; return 0; } @@ -399,7 +399,6 @@ const struct netfs_request_ops afs_req_ops = { .issue_read = afs_issue_read, .update_i_size = afs_update_i_size, .invalidate_cache = afs_netfs_invalidate_cache, - .create_write_requests = afs_create_write_requests, .begin_writeback = afs_begin_writeback, .prepare_write = afs_prepare_write, .issue_write = afs_issue_write, diff --git a/fs/afs/internal.h b/fs/afs/internal.h index dcf0ae0323d3..887245f9336d 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1605,7 +1605,6 @@ extern int afs_writepages(struct address_space *, struct writeback_control *); extern int afs_fsync(struct file *, loff_t, loff_t, int); extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf); extern void afs_prune_wb_keys(struct afs_vnode *); -void afs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len); /* * xattr.c diff --git a/fs/afs/write.c b/fs/afs/write.c index 34595f482718..35db74627563 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -156,6 +156,7 @@ try_next_key: return afs_put_operation(op); } +#if 0 // TODO: Remove static void afs_upload_to_server(struct netfs_io_subrequest *subreq) { struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode); @@ -193,6 +194,7 @@ void afs_create_write_requests(struct netfs_io_request 
*wreq, loff_t start, size if (subreq) netfs_queue_write_request(subreq); } +#endif /* * Writeback calls this when it finds a folio that needs uploading. This isn't diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index 1eb86e34b5a9..8e6781e0b10b 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -11,7 +11,6 @@ netfs-y := \ main.o \ misc.o \ objects.o \ - output.o \ write_collect.o \ write_issue.o diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 33ea4c20e7e7..ee8d9e3216bd 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -26,8 +26,6 @@ enum netfs_how_to_modify { NETFS_FLUSH_CONTENT, /* Flush incompatible content. */ }; -static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq); - static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group) { void *priv = folio_get_private(folio); @@ -180,7 +178,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, }; struct netfs_io_request *wreq = NULL; struct netfs_folio *finfo; - struct folio *folio; + struct folio *folio, *writethrough = NULL; enum netfs_how_to_modify howto; enum netfs_folio_trace trace; unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0: BDP_ASYNC; @@ -209,7 +207,6 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, } if (!is_sync_kiocb(iocb)) wreq->iocb = iocb; - wreq->cleanup = netfs_cleanup_buffered_write; netfs_stat(&netfs_n_wh_writethrough); } else { netfs_stat(&netfs_n_wh_buffered_write); @@ -253,6 +250,16 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, offset = pos & (flen - 1); part = min_t(size_t, flen - offset, part); + /* Wait for writeback to complete. The writeback engine owns + * the info in folio->private and may change it until it + * removes the WB mark. + */ + if (folio_get_private(folio) && + folio_wait_writeback_killable(folio)) { + ret = written ? 
-EINTR : -ERESTARTSYS; + goto error_folio_unlock; + } + if (signal_pending(current)) { ret = written ? -EINTR : -ERESTARTSYS; goto error_folio_unlock; @@ -327,6 +334,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, maybe_trouble = true; iov_iter_revert(iter, copied); copied = 0; + folio_unlock(folio); goto retry; } netfs_set_group(folio, netfs_group); @@ -382,23 +390,14 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, if (likely(!wreq)) { folio_mark_dirty(folio); + folio_unlock(folio); } else { - if (folio_test_dirty(folio)) - /* Sigh. mmap. */ - folio_clear_dirty_for_io(folio); - /* We make multiple writes to the folio... */ - if (!folio_test_writeback(folio)) { - folio_start_writeback(folio); - if (wreq->iter.count == 0) - trace_netfs_folio(folio, netfs_folio_trace_wthru); - else - trace_netfs_folio(folio, netfs_folio_trace_wthru_plus); - } - netfs_advance_writethrough(wreq, copied, - offset + copied == flen); + netfs_advance_writethrough(wreq, &wbc, folio, copied, + offset + copied == flen, + &writethrough); + /* Folio unlocked */ } retry: - folio_unlock(folio); folio_put(folio); folio = NULL; @@ -407,7 +406,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, out: if (unlikely(wreq)) { - ret2 = netfs_end_writethrough(wreq, iocb); + ret2 = netfs_end_writethrough(wreq, &wbc, writethrough); wbc_detach_inode(&wbc); if (ret2 == -EIOCBQUEUED) return ret2; @@ -529,11 +528,13 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr sb_start_pagefault(inode->i_sb); - if (folio_wait_writeback_killable(folio)) + if (folio_lock_killable(folio) < 0) goto out; - if (folio_lock_killable(folio) < 0) + if (folio_wait_writeback_killable(folio)) { + ret = VM_FAULT_LOCKED; goto out; + } /* Can we see a streaming write here? 
*/ if (WARN_ON(!folio_test_uptodate(folio))) { @@ -573,6 +574,7 @@ out: } EXPORT_SYMBOL(netfs_page_mkwrite); +#if 0 // TODO: Remove /* * Kill all the pages in the given range */ @@ -1199,3 +1201,4 @@ out: return ret; } EXPORT_SYMBOL(netfs_writepages); +#endif diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index 36b6db504500..608ba6416919 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -34,6 +34,7 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov unsigned long long start = iocb->ki_pos; unsigned long long end = start + iov_iter_count(iter); ssize_t ret, n; + size_t len = iov_iter_count(iter); bool async = !is_sync_kiocb(iocb); _enter(""); @@ -46,13 +47,17 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov _debug("uw %llx-%llx", start, end); - wreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp, - start, end - start, - iocb->ki_flags & IOCB_DIRECT ? - NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE); + wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp, start, + iocb->ki_flags & IOCB_DIRECT ? + NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE); if (IS_ERR(wreq)) return PTR_ERR(wreq); + wreq->io_streams[0].avail = true; + trace_netfs_write(wreq, (iocb->ki_flags & IOCB_DIRECT ? + netfs_write_trace_dio_write : + netfs_write_trace_unbuffered_write)); + { /* If this is an async op and we're not using a bounce buffer, * we have to save the source buffer as the iterator is only @@ -63,7 +68,7 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov * request. 
*/ if (async || user_backed_iter(iter)) { - n = netfs_extract_user_iter(iter, wreq->len, &wreq->iter, 0); + n = netfs_extract_user_iter(iter, len, &wreq->iter, 0); if (n < 0) { ret = n; goto out; @@ -71,7 +76,6 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov wreq->direct_bv = (struct bio_vec *)wreq->iter.bvec; wreq->direct_bv_count = n; wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); - wreq->len = iov_iter_count(&wreq->iter); } else { wreq->iter = *iter; } @@ -79,6 +83,8 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov wreq->io_iter = wreq->iter; } + __set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags); + /* Copy the data into the bounce buffer and encrypt it. */ // TODO @@ -87,10 +93,7 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov if (async) wreq->iocb = iocb; wreq->cleanup = netfs_cleanup_dio_write; - ret = netfs_begin_write(wreq, is_sync_kiocb(iocb), - iocb->ki_flags & IOCB_DIRECT ? 
- netfs_write_trace_dio_write : - netfs_write_trace_unbuffered_write); + ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), iov_iter_count(&wreq->io_iter)); if (ret < 0) { _debug("begin = %zd", ret); goto out; @@ -100,9 +103,8 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip); wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE); - + smp_rmb(); /* Read error/transferred after RIP flag */ ret = wreq->error; - _debug("waited = %zd", ret); if (ret == 0) { ret = wreq->transferred; iocb->ki_pos += ret; diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 5d3f74a70fa7..95e281a8af78 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -92,15 +92,6 @@ static inline void netfs_see_request(struct netfs_io_request *rreq, trace_netfs_rreq_ref(rreq->debug_id, refcount_read(&rreq->ref), what); } -/* - * output.c - */ -int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait, - enum netfs_write_trace what); -struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len); -int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end); -int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb); - /* * stats.c */ @@ -172,12 +163,12 @@ void netfs_reissue_write(struct netfs_io_stream *stream, int netfs_advance_write(struct netfs_io_request *wreq, struct netfs_io_stream *stream, loff_t start, size_t len, bool to_eof); -struct netfs_io_request *new_netfs_begin_writethrough(struct kiocb *iocb, size_t len); -int new_netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, - struct folio *folio, size_t copied, bool to_page_end, - struct folio **writethrough_cache); -int new_netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, - struct folio *writethrough_cache); +struct netfs_io_request *netfs_begin_writethrough(struct kiocb 
*iocb, size_t len); +int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, + struct folio *folio, size_t copied, bool to_page_end, + struct folio **writethrough_cache); +int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, + struct folio *writethrough_cache); int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len); /* diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index b8c1d3ca724a..f14c08bf605d 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -709,7 +709,7 @@ void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async) } /** - * new_netfs_write_subrequest_terminated - Note the termination of a write operation. + * netfs_write_subrequest_terminated - Note the termination of a write operation. * @_op: The I/O request that has terminated. * @transferred_or_error: The amount of data transferred or an error code. * @was_async: The termination was asynchronous @@ -731,8 +731,8 @@ void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async) * Note that %_op is a void* so that the function can be passed to * kiocb::term_func without the need for a casting wrapper. 
*/ -void new_netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, - bool was_async) +void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, + bool was_async) { struct netfs_io_subrequest *subreq = _op; struct netfs_io_request *wreq = subreq->rreq; @@ -800,4 +800,4 @@ void new_netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_err netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); } -EXPORT_SYMBOL(new_netfs_write_subrequest_terminated); +EXPORT_SYMBOL(netfs_write_subrequest_terminated); diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 7ea86e33382c..e190043bc0da 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -494,8 +494,8 @@ static int netfs_write_folio(struct netfs_io_request *wreq, /* * Write some of the pending data back to the server */ -int new_netfs_writepages(struct address_space *mapping, - struct writeback_control *wbc) +int netfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) { struct netfs_inode *ictx = netfs_inode(mapping->host); struct netfs_io_request *wreq = NULL; @@ -556,12 +556,12 @@ out: _leave(" = %d", error); return error; } -EXPORT_SYMBOL(new_netfs_writepages); +EXPORT_SYMBOL(netfs_writepages); /* * Begin a write operation for writing through the pagecache. */ -struct netfs_io_request *new_netfs_begin_writethrough(struct kiocb *iocb, size_t len) +struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len) { struct netfs_io_request *wreq = NULL; struct netfs_inode *ictx = netfs_inode(file_inode(iocb->ki_filp)); @@ -586,9 +586,9 @@ struct netfs_io_request *new_netfs_begin_writethrough(struct kiocb *iocb, size_t * to the request. If we've added more than wsize then we need to create a new * subrequest. 
*/ -int new_netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, - struct folio *folio, size_t copied, bool to_page_end, - struct folio **writethrough_cache) +int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, + struct folio *folio, size_t copied, bool to_page_end, + struct folio **writethrough_cache) { _enter("R=%x ic=%zu ws=%u cp=%zu tp=%u", wreq->debug_id, wreq->iter.count, wreq->wsize, copied, to_page_end); @@ -618,8 +618,8 @@ int new_netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeba /* * End a write operation used when writing through the pagecache. */ -int new_netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, - struct folio *writethrough_cache) +int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, + struct folio *writethrough_cache) { struct netfs_inode *ictx = netfs_inode(wreq->inode); int ret; diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 42dba05a428b..c2ba364041b0 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -303,8 +303,6 @@ struct netfs_request_ops { void (*update_i_size)(struct inode *inode, loff_t i_size); /* Write request handling */ - void (*create_write_requests)(struct netfs_io_request *wreq, - loff_t start, size_t len); void (*begin_writeback)(struct netfs_io_request *wreq); void (*prepare_write)(struct netfs_io_subrequest *subreq); void (*issue_write)(struct netfs_io_subrequest *subreq); @@ -409,8 +407,6 @@ int netfs_write_begin(struct netfs_inode *, struct file *, struct folio **, void **fsdata); int netfs_writepages(struct address_space *mapping, struct writeback_control *wbc); -int new_netfs_writepages(struct address_space *mapping, - struct writeback_control *wbc); bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio); int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc); void 
netfs_clear_inode_writeback(struct inode *inode, const void *aux); @@ -431,14 +427,9 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, iov_iter_extraction_t extraction_flags); size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset, size_t max_size, size_t max_segs); -struct netfs_io_subrequest *netfs_create_write_request( - struct netfs_io_request *wreq, enum netfs_io_source dest, - loff_t start, size_t len, work_func_t worker); void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq); void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, bool was_async); -void new_netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, - bool was_async); void netfs_queue_write_request(struct netfs_io_subrequest *subreq); int netfs_start_io_read(struct inode *inode); -- cgit v1.2.3 From 1ecb146f7cd82e44277de448d4f736b98741f3cb Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 15 Mar 2024 15:15:44 +0000 Subject: netfs, afs: Use writeback retry to deal with alternate keys Use a hook in the new writeback code's retry algorithm to rotate the keys once all the outstanding subreqs have failed rather than doing it separately on each subreq. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org --- fs/afs/file.c | 1 + fs/afs/internal.h | 1 + fs/afs/write.c | 191 +++++++++++++++++++++++------------------------ fs/netfs/write_collect.c | 9 ++- include/linux/netfs.h | 2 + 5 files changed, 104 insertions(+), 100 deletions(-) (limited to 'include/linux/netfs.h') diff --git a/fs/afs/file.c b/fs/afs/file.c index 8f983e3ecae7..c3f0c45ae9a9 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -368,6 +368,7 @@ static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len, static void afs_free_request(struct netfs_io_request *rreq) { key_put(rreq->netfs_priv); + afs_put_wb_key(rreq->netfs_priv2); } static void afs_update_i_size(struct inode *inode, loff_t new_i_size) diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 887245f9336d..6e1d3c4daf72 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1601,6 +1601,7 @@ extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *); void afs_prepare_write(struct netfs_io_subrequest *subreq); void afs_issue_write(struct netfs_io_subrequest *subreq); void afs_begin_writeback(struct netfs_io_request *wreq); +void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *stream); extern int afs_writepages(struct address_space *, struct writeback_control *); extern int afs_fsync(struct file *, loff_t, loff_t, int); extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf); diff --git a/fs/afs/write.c b/fs/afs/write.c index b8505a8b622a..e959640694c2 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -29,43 +29,39 @@ static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsign /* * Find a key to use for the writeback. We cached the keys used to author the - * writes on the vnode. *_wbk will contain the last writeback key used or NULL - * and we need to start from there if it's set. 
+ * writes on the vnode. wreq->netfs_priv2 will contain the last writeback key + * record used or NULL and we need to start from there if it's set. + * wreq->netfs_priv will be set to the key itself or NULL. */ -static int afs_get_writeback_key(struct afs_vnode *vnode, - struct afs_wb_key **_wbk) +static void afs_get_writeback_key(struct netfs_io_request *wreq) { - struct afs_wb_key *wbk = NULL; - struct list_head *p; - int ret = -ENOKEY, ret2; + struct afs_wb_key *wbk, *old = wreq->netfs_priv2; + struct afs_vnode *vnode = AFS_FS_I(wreq->inode); + + key_put(wreq->netfs_priv); + wreq->netfs_priv = NULL; + wreq->netfs_priv2 = NULL; spin_lock(&vnode->wb_lock); - if (*_wbk) - p = (*_wbk)->vnode_link.next; + if (old) + wbk = list_next_entry(old, vnode_link); else - p = vnode->wb_keys.next; + wbk = list_first_entry(&vnode->wb_keys, struct afs_wb_key, vnode_link); - while (p != &vnode->wb_keys) { - wbk = list_entry(p, struct afs_wb_key, vnode_link); + list_for_each_entry_from(wbk, &vnode->wb_keys, vnode_link) { _debug("wbk %u", key_serial(wbk->key)); - ret2 = key_validate(wbk->key); - if (ret2 == 0) { + if (key_validate(wbk->key) == 0) { refcount_inc(&wbk->usage); + wreq->netfs_priv = key_get(wbk->key); + wreq->netfs_priv2 = wbk; _debug("USE WB KEY %u", key_serial(wbk->key)); break; } - - wbk = NULL; - if (ret == -ENOKEY) - ret = ret2; - p = p->next; } spin_unlock(&vnode->wb_lock); - if (*_wbk) - afs_put_wb_key(*_wbk); - *_wbk = wbk; - return 0; + + afs_put_wb_key(old); } static void afs_store_data_success(struct afs_operation *op) @@ -88,72 +84,91 @@ static const struct afs_operation_ops afs_store_data_operation = { }; /* - * write to a file + * Prepare a subrequest to write to the server. This sets the max_len + * parameter. 
*/ -static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t pos) +void afs_prepare_write(struct netfs_io_subrequest *subreq) { + //if (test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) + // subreq->max_len = 512 * 1024; + //else + subreq->max_len = 256 * 1024 * 1024; +} + +/* + * Issue a subrequest to write to the server. + */ +static void afs_issue_write_worker(struct work_struct *work) +{ + struct netfs_io_subrequest *subreq = container_of(work, struct netfs_io_subrequest, work); + struct netfs_io_request *wreq = subreq->rreq; struct afs_operation *op; - struct afs_wb_key *wbk = NULL; - loff_t size = iov_iter_count(iter); + struct afs_vnode *vnode = AFS_FS_I(wreq->inode); + unsigned long long pos = subreq->start + subreq->transferred; + size_t len = subreq->len - subreq->transferred; int ret = -ENOKEY; - _enter("%s{%llx:%llu.%u},%llx,%llx", + _enter("R=%x[%x],%s{%llx:%llu.%u},%llx,%zx", + wreq->debug_id, subreq->debug_index, vnode->volume->name, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, - size, pos); + pos, len); - ret = afs_get_writeback_key(vnode, &wbk); - if (ret) { - _leave(" = %d [no keys]", ret); - return ret; - } +#if 0 // Error injection + if (subreq->debug_index == 3) + return netfs_write_subrequest_terminated(subreq, -ENOANO, false); - op = afs_alloc_operation(wbk->key, vnode->volume); - if (IS_ERR(op)) { - afs_put_wb_key(wbk); - return -ENOMEM; + if (!test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) { + set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + return netfs_write_subrequest_terminated(subreq, -EAGAIN, false); } +#endif + + op = afs_alloc_operation(wreq->netfs_priv, vnode->volume); + if (IS_ERR(op)) + return netfs_write_subrequest_terminated(subreq, -EAGAIN, false); afs_op_set_vnode(op, 0, vnode); - op->file[0].dv_delta = 1; + op->file[0].dv_delta = 1; op->file[0].modification = true; - op->store.pos = pos; - op->store.size = size; - op->flags |= AFS_OPERATION_UNINTR; - op->ops = &afs_store_data_operation; + 
op->store.pos = pos; + op->store.size = len; + op->flags |= AFS_OPERATION_UNINTR; + op->ops = &afs_store_data_operation; -try_next_key: afs_begin_vnode_operation(op); - op->store.write_iter = iter; - op->store.i_size = max(pos + size, vnode->netfs.remote_i_size); - op->mtime = inode_get_mtime(&vnode->netfs.inode); + op->store.write_iter = &subreq->io_iter; + op->store.i_size = umax(pos + len, vnode->netfs.remote_i_size); + op->mtime = inode_get_mtime(&vnode->netfs.inode); afs_wait_for_operation(op); - - switch (afs_op_error(op)) { + ret = afs_put_operation(op); + switch (ret) { case -EACCES: case -EPERM: case -ENOKEY: case -EKEYEXPIRED: case -EKEYREJECTED: case -EKEYREVOKED: - _debug("next"); - - ret = afs_get_writeback_key(vnode, &wbk); - if (ret == 0) { - key_put(op->key); - op->key = key_get(wbk->key); - goto try_next_key; - } + /* If there are more keys we can try, use the retry algorithm + * to rotate the keys. + */ + if (wreq->netfs_priv2) + set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); break; } - afs_put_wb_key(wbk); - _leave(" = %d", afs_op_error(op)); - return afs_put_operation(op); + netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len, false); +} + +void afs_issue_write(struct netfs_io_subrequest *subreq) +{ + subreq->work.func = afs_issue_write_worker; + if (!queue_work(system_unbound_wq, &subreq->work)) + WARN_ON_ONCE(1); } /* @@ -162,52 +177,32 @@ try_next_key: */ void afs_begin_writeback(struct netfs_io_request *wreq) { + afs_get_writeback_key(wreq); wreq->io_streams[0].avail = true; } /* - * Prepare a subrequest to write to the server. This sets the max_len - * parameter. - */ -void afs_prepare_write(struct netfs_io_subrequest *subreq) -{ - //if (test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) - // subreq->max_len = 512 * 1024; - //else - subreq->max_len = 256 * 1024 * 1024; -} - -/* - * Issue a subrequest to write to the server. + * Prepare to retry the writes in request. Use this to try rotating the + * available writeback keys. 
*/ -static void afs_issue_write_worker(struct work_struct *work) +void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *stream) { - struct netfs_io_subrequest *subreq = container_of(work, struct netfs_io_subrequest, work); - struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode); - ssize_t ret; - - _enter("%x[%x],%zx", - subreq->rreq->debug_id, subreq->debug_index, subreq->io_iter.count); - -#if 0 // Error injection - if (subreq->debug_index == 3) - return netfs_write_subrequest_terminated(subreq, -ENOANO, false); + struct netfs_io_subrequest *subreq = + list_first_entry(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); - if (!test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) { - set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - return netfs_write_subrequest_terminated(subreq, -EAGAIN, false); + switch (subreq->error) { + case -EACCES: + case -EPERM: + case -ENOKEY: + case -EKEYEXPIRED: + case -EKEYREJECTED: + case -EKEYREVOKED: + afs_get_writeback_key(wreq); + if (!wreq->netfs_priv) + stream->failed = true; + break; } -#endif - - ret = afs_store_data(vnode, &subreq->io_iter, subreq->start); - netfs_write_subrequest_terminated(subreq, ret < 0 ? 
ret : subreq->len, false); -} - -void afs_issue_write(struct netfs_io_subrequest *subreq) -{ - subreq->work.func = afs_issue_write_worker; - if (!queue_work(system_unbound_wq, &subreq->work)) - WARN_ON_ONCE(1); } /* diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index f14c08bf605d..60112e4b2c5e 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -163,6 +163,13 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, _enter("R=%x[%x:]", wreq->debug_id, stream->stream_nr); + if (list_empty(&stream->subrequests)) + return; + + if (stream->source == NETFS_UPLOAD_TO_SERVER && + wreq->netfs_ops->retry_request) + wreq->netfs_ops->retry_request(wreq, stream); + if (unlikely(stream->failed)) return; @@ -182,8 +189,6 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, return; } - if (list_empty(&stream->subrequests)) - return; next = stream->subrequests.next; do { diff --git a/include/linux/netfs.h b/include/linux/netfs.h index c2ba364041b0..298552f5122c 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -235,6 +235,7 @@ struct netfs_io_request { struct iov_iter iter; /* Unencrypted-side iterator */ struct iov_iter io_iter; /* I/O (Encrypted-side) iterator */ void *netfs_priv; /* Private data for the netfs */ + void *netfs_priv2; /* Private data for the netfs */ struct bio_vec *direct_bv; /* DIO buffer list (when handling iovec-iter) */ unsigned int direct_bv_count; /* Number of elements in direct_bv[] */ unsigned int debug_id; @@ -306,6 +307,7 @@ struct netfs_request_ops { void (*begin_writeback)(struct netfs_io_request *wreq); void (*prepare_write)(struct netfs_io_subrequest *subreq); void (*issue_write)(struct netfs_io_subrequest *subreq); + void (*retry_request)(struct netfs_io_request *wreq, struct netfs_io_stream *stream); void (*invalidate_cache)(struct netfs_io_request *wreq); }; -- cgit v1.2.3 From 69c3c023af25edb5433a2db824d3e7cc328f0183 Mon Sep 17 00:00:00 2001 From: David 
Howells Date: Fri, 6 Oct 2023 18:16:15 +0100 Subject: cifs: Implement netfslib hooks Provide an implementation of the netfslib hooks that will be used by netfslib to ask cifs to set up and perform operations. Of particular note are: (*) cifs_clamp_length() - This is used to negotiate the size of the next subrequest in a read request, taking into account the credit available and the rsize. The credits are attached to the subrequest. (*) cifs_req_issue_read() - This is used to issue a subrequest that has been set up and clamped. (*) cifs_prepare_write() - This prepares to fill a subrequest by picking a channel, reopening the file and requesting credits so that we can set the maximum size of the subrequest and also sets the maximum number of segments if we're doing RDMA. (*) cifs_issue_write() - This releases any unneeded credits and issues an asynchronous data write for the contiguous slice of file covered by the subrequest. This should possibly be folded into all the ->async_writev() ops, which would then be called directly. (*) cifs_begin_writeback() - This gets the cached writable handle through which we do writeback (this does not affect writethrough, unbuffered or direct writes). At this point, cifs is not wired up to actually *use* netfslib; that will be done in a subsequent patch.
Signed-off-by: David Howells cc: Steve French cc: Shyam Prasad N cc: Rohith Surabattula cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/buffered_write.c | 6 + fs/smb/client/Kconfig | 1 + fs/smb/client/cifsfs.c | 2 +- fs/smb/client/cifsfs.h | 1 + fs/smb/client/cifsglob.h | 28 ++-- fs/smb/client/file.c | 315 +++++++++++++++++++++++++++++++++++++++++++ include/linux/netfs.h | 1 + include/trace/events/netfs.h | 1 + 8 files changed, 345 insertions(+), 10 deletions(-) (limited to 'include/linux/netfs.h') diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 825e6632ee4f..1121601536d1 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -405,6 +405,9 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, } while (iov_iter_count(iter)); out: + if (likely(written) && ctx->ops->post_modify) + ctx->ops->post_modify(inode); + if (unlikely(wreq)) { ret2 = netfs_end_writethrough(wreq, &wbc, writethrough); wbc_detach_inode(&wbc); @@ -521,6 +524,7 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr struct folio *folio = page_folio(vmf->page); struct file *file = vmf->vma->vm_file; struct inode *inode = file_inode(file); + struct netfs_inode *ictx = netfs_inode(inode); vm_fault_t ret = VM_FAULT_RETRY; int err; @@ -567,6 +571,8 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr trace_netfs_folio(folio, netfs_folio_trace_mkwrite); netfs_set_group(folio, netfs_group); file_update_time(file); + if (ictx->ops->post_modify) + ictx->ops->post_modify(inode); ret = VM_FAULT_LOCKED; out: sb_end_pagefault(inode->i_sb); diff --git a/fs/smb/client/Kconfig b/fs/smb/client/Kconfig index 2927bd174a88..2517dc242386 100644 --- a/fs/smb/client/Kconfig +++ b/fs/smb/client/Kconfig @@ -2,6 +2,7 @@ config CIFS tristate "SMB3 and CIFS support (advanced network filesystem)" depends on INET + 
select NETFS_SUPPORT select NLS select NLS_UCS2_UTILS select CRYPTO diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 763d17870e0b..8f7165567be7 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -1758,7 +1758,7 @@ static int cifs_init_netfs(void) { cifs_io_request_cachep = kmem_cache_create("cifs_io_request", - sizeof(struct netfs_io_request), 0, + sizeof(struct cifs_io_request), 0, SLAB_HWCACHE_ALIGN, NULL); if (!cifs_io_request_cachep) goto nomem_req; diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h index 1acf6bfc06de..922c10d7cfdd 100644 --- a/fs/smb/client/cifsfs.h +++ b/fs/smb/client/cifsfs.h @@ -84,6 +84,7 @@ extern const struct inode_operations cifs_namespace_inode_operations; /* Functions related to files and directories */ +extern const struct netfs_request_ops cifs_req_ops; extern const struct file_operations cifs_file_ops; extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */ extern const struct file_operations cifs_file_strict_ops; /* if strictio mnt */ diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 611f59c6d2c0..4e9033b2f191 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1491,15 +1491,24 @@ struct cifs_aio_ctx { bool direct_io; }; +struct cifs_io_request { + struct netfs_io_request rreq; + struct cifsFileInfo *cfile; +}; + /* asynchronous read support */ struct cifs_io_subrequest { - struct netfs_io_subrequest subreq; - struct cifsFileInfo *cfile; - struct address_space *mapping; - struct cifs_aio_ctx *ctx; + union { + struct netfs_io_subrequest subreq; + struct netfs_io_request *rreq; + struct cifs_io_request *req; + }; ssize_t got_bytes; pid_t pid; + unsigned int xid; int result; + bool have_xid; + bool replay; struct kvec iov[2]; struct TCP_Server_Info *server; #ifdef CONFIG_CIFS_SMB_DIRECT @@ -1507,15 +1516,16 @@ struct cifs_io_subrequest { #endif struct cifs_credits credits; - enum writeback_sync_modes sync_mode; - bool 
uncached; - bool replay; - struct bio_vec *bv; - // TODO: Remove following elements struct list_head list; struct completion done; struct work_struct work; + struct cifsFileInfo *cfile; + struct address_space *mapping; + struct cifs_aio_ctx *ctx; + enum writeback_sync_modes sync_mode; + bool uncached; + struct bio_vec *bv; }; /* diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 331446fc3d22..c9c33b3a54f3 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -36,6 +36,321 @@ #include "fs_context.h" #include "cifs_ioctl.h" #include "cached_dir.h" +#include + +static int cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush); + +/* + * Prepare a subrequest to upload to the server. We need to allocate credits + * so that we know the maximum amount of data that we can include in it. + */ +static void cifs_prepare_write(struct netfs_io_subrequest *subreq) +{ + struct cifs_io_subrequest *wdata = + container_of(subreq, struct cifs_io_subrequest, subreq); + struct cifs_io_request *req = wdata->req; + struct TCP_Server_Info *server; + struct cifsFileInfo *open_file = req->cfile; + size_t wsize = req->rreq.wsize; + int rc; + + if (!wdata->have_xid) { + wdata->xid = get_xid(); + wdata->have_xid = true; + } + + server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses); + wdata->server = server; + +retry: + if (open_file->invalidHandle) { + rc = cifs_reopen_file(open_file, false); + if (rc < 0) { + if (rc == -EAGAIN) + goto retry; + subreq->error = rc; + return netfs_prepare_write_failed(subreq); + } + } + + rc = server->ops->wait_mtu_credits(server, wsize, &wdata->subreq.max_len, + &wdata->credits); + if (rc < 0) { + subreq->error = rc; + return netfs_prepare_write_failed(subreq); + } + +#ifdef CONFIG_CIFS_SMB_DIRECT + if (server->smbd_conn) + subreq->max_nr_segs = server->smbd_conn->max_frmr_depth; +#endif +} + +/* + * Issue a subrequest to upload to the server. 
+ */ +static void cifs_issue_write(struct netfs_io_subrequest *subreq) +{ + struct cifs_io_subrequest *wdata = + container_of(subreq, struct cifs_io_subrequest, subreq); + struct cifs_sb_info *sbi = CIFS_SB(subreq->rreq->inode->i_sb); + int rc; + + if (cifs_forced_shutdown(sbi)) { + rc = -EIO; + goto fail; + } + + rc = adjust_credits(wdata->server, &wdata->credits, wdata->subreq.len); + if (rc) + goto fail; + + rc = -EAGAIN; + if (wdata->req->cfile->invalidHandle) + goto fail; + + wdata->server->ops->async_writev(wdata); +out: + return; + +fail: + if (rc == -EAGAIN) + trace_netfs_sreq(subreq, netfs_sreq_trace_retry); + else + trace_netfs_sreq(subreq, netfs_sreq_trace_fail); + add_credits_and_wake_if(wdata->server, &wdata->credits, 0); + netfs_write_subrequest_terminated(wdata, rc, false); + goto out; +} + +/* + * Split the read up according to how many credits we can get for each piece. + * It's okay to sleep here if we need to wait for more credit to become + * available. + * + * We also choose the server and allocate an operation ID to be cleaned up + * later. 
+ */ +static bool cifs_clamp_length(struct netfs_io_subrequest *subreq) +{ + struct netfs_io_request *rreq = subreq->rreq; + struct TCP_Server_Info *server; + struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); + struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); + struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); + size_t rsize = 0; + int rc; + + rdata->xid = get_xid(); + rdata->have_xid = true; + + server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses); + rdata->server = server; + + if (cifs_sb->ctx->rsize == 0) + cifs_sb->ctx->rsize = + server->ops->negotiate_rsize(tlink_tcon(req->cfile->tlink), + cifs_sb->ctx); + + + rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, &rsize, + &rdata->credits); + if (rc) { + subreq->error = rc; + return false; + } + + subreq->len = min_t(size_t, subreq->len, rsize); +#ifdef CONFIG_CIFS_SMB_DIRECT + if (server->smbd_conn) + subreq->max_nr_segs = server->smbd_conn->max_frmr_depth; +#endif + return true; +} + +/* + * Issue a read operation on behalf of the netfs helper functions. We're asked + * to make a read of a certain size at a point in the file. We are permitted + * to only read a portion of that, but as long as we read something, the netfs + * helper will call us again so that we can issue another read. + */ +static void cifs_req_issue_read(struct netfs_io_subrequest *subreq) +{ + struct netfs_io_request *rreq = subreq->rreq; + struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); + struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); + struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); + pid_t pid; + int rc = 0; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + pid = req->cfile->pid; + else + pid = current->tgid; // Ummm... 
This may be a workqueue + + cifs_dbg(FYI, "%s: op=%08x[%x] mapping=%p len=%zu/%zu\n", + __func__, rreq->debug_id, subreq->debug_index, rreq->mapping, + subreq->transferred, subreq->len); + + if (req->cfile->invalidHandle) { + do { + rc = cifs_reopen_file(req->cfile, true); + } while (rc == -EAGAIN); + if (rc) + goto out; + } + + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + rdata->pid = pid; + + rc = adjust_credits(rdata->server, &rdata->credits, rdata->subreq.len); + if (!rc) { + if (rdata->req->cfile->invalidHandle) + rc = -EAGAIN; + else + rc = rdata->server->ops->async_readv(rdata); + } + +out: + if (rc) + netfs_subreq_terminated(subreq, rc, false); +} + +/* + * Writeback calls this when it finds a folio that needs uploading. This isn't + * called if writeback only has copy-to-cache to deal with. + */ +static void cifs_begin_writeback(struct netfs_io_request *wreq) +{ + struct cifs_io_request *req = container_of(wreq, struct cifs_io_request, rreq); + int ret; + + ret = cifs_get_writable_file(CIFS_I(wreq->inode), FIND_WR_ANY, &req->cfile); + if (ret) { + cifs_dbg(VFS, "No writable handle in writepages ret=%d\n", ret); + return; + } + + wreq->io_streams[0].avail = true; +} + +/* + * Initialise a request. + */ +static int cifs_init_request(struct netfs_io_request *rreq, struct file *file) +{ + struct cifs_io_request *req = container_of(rreq, struct cifs_io_request, rreq); + struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); + struct cifsFileInfo *open_file = NULL; + + rreq->rsize = cifs_sb->ctx->rsize; + rreq->wsize = cifs_sb->ctx->wsize; + + if (file) { + open_file = file->private_data; + rreq->netfs_priv = file->private_data; + req->cfile = cifsFileInfo_get(open_file); + } else if (rreq->origin != NETFS_WRITEBACK) { + WARN_ON_ONCE(1); + return -EIO; + } + + return 0; +} + +/* + * Expand the size of a readahead to the size of the rsize, if at least as + * large as a page, allowing for the possibility that rsize is not pow-2 + * aligned. 
+ */ +static void cifs_expand_readahead(struct netfs_io_request *rreq) +{ + unsigned int rsize = rreq->rsize; + loff_t misalignment, i_size = i_size_read(rreq->inode); + + if (rsize < PAGE_SIZE) + return; + + if (rsize < INT_MAX) + rsize = roundup_pow_of_two(rsize); + else + rsize = ((unsigned int)INT_MAX + 1) / 2; + + misalignment = rreq->start & (rsize - 1); + if (misalignment) { + rreq->start -= misalignment; + rreq->len += misalignment; + } + + rreq->len = round_up(rreq->len, rsize); + if (rreq->start < i_size && rreq->len > i_size - rreq->start) + rreq->len = i_size - rreq->start; +} + +/* + * Completion of a request operation. + */ +static void cifs_rreq_done(struct netfs_io_request *rreq) +{ + struct timespec64 atime, mtime; + struct inode *inode = rreq->inode; + + /* we do not want atime to be less than mtime, it broke some apps */ + atime = inode_set_atime_to_ts(inode, current_time(inode)); + mtime = inode_get_mtime(inode); + if (timespec64_compare(&atime, &mtime)) + inode_set_atime_to_ts(inode, inode_get_mtime(inode)); +} + +static void cifs_post_modify(struct inode *inode) +{ + /* Indication to update ctime and mtime as close is deferred */ + set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags); +} + +static void cifs_free_request(struct netfs_io_request *rreq) +{ + struct cifs_io_request *req = container_of(rreq, struct cifs_io_request, rreq); + + if (req->cfile) + cifsFileInfo_put(req->cfile); +} + +static void cifs_free_subrequest(struct netfs_io_subrequest *subreq) +{ + struct cifs_io_subrequest *rdata = + container_of(subreq, struct cifs_io_subrequest, subreq); + int rc = subreq->error; + + if (rdata->subreq.source == NETFS_DOWNLOAD_FROM_SERVER) { +#ifdef CONFIG_CIFS_SMB_DIRECT + if (rdata->mr) { + smbd_deregister_mr(rdata->mr); + rdata->mr = NULL; + } +#endif + } + + add_credits_and_wake_if(rdata->server, &rdata->credits, 0); + if (rdata->have_xid) + free_xid(rdata->xid); +} + +const struct netfs_request_ops cifs_req_ops = { + .request_pool = 
&cifs_io_request_pool, + .subrequest_pool = &cifs_io_subrequest_pool, + .init_request = cifs_init_request, + .free_request = cifs_free_request, + .free_subrequest = cifs_free_subrequest, + .expand_readahead = cifs_expand_readahead, + .clamp_length = cifs_clamp_length, + .issue_read = cifs_req_issue_read, + .done = cifs_rreq_done, + .post_modify = cifs_post_modify, + .begin_writeback = cifs_begin_writeback, + .prepare_write = cifs_prepare_write, + .issue_write = cifs_issue_write, +}; /* * Remove the dirty flags from a span of pages. diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 298552f5122c..f45d06284f2f 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -302,6 +302,7 @@ struct netfs_request_ops { /* Modification handling */ void (*update_i_size)(struct inode *inode, loff_t i_size); + void (*post_modify)(struct inode *inode); /* Write request handling */ void (*begin_writeback)(struct netfs_io_request *wreq); diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 4ba553a6d71b..da23484268df 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -112,6 +112,7 @@ #define netfs_sreq_ref_traces \ EM(netfs_sreq_trace_get_copy_to_cache, "GET COPY2C ") \ EM(netfs_sreq_trace_get_resubmit, "GET RESUBMIT") \ + EM(netfs_sreq_trace_get_submit, "GET SUBMIT") \ EM(netfs_sreq_trace_get_short_read, "GET SHORTRD") \ EM(netfs_sreq_trace_new, "NEW ") \ EM(netfs_sreq_trace_put_cancel, "PUT CANCEL ") \ -- cgit v1.2.3 From 14b1cd25346b1d615616a9c2dfdad9b4e6581e0d Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 13 May 2024 17:02:05 -0500 Subject: cifs: Fix locking in cifs_strict_readv() Fix to take the i_rwsem (through the netfs locking wrappers) before taking cinode->lock_sem. 
Fixes: 3ee1a1fc3981 ("cifs: Cut over to using netfslib") Reported-by: Enzo Matsumiya Signed-off-by: David Howells Signed-off-by: Steve French --- fs/netfs/direct_read.c | 3 ++- fs/smb/client/cifsglob.h | 1 + fs/smb/client/file.c | 34 +++++++++++++++++++++++++--------- include/linux/netfs.h | 1 + 4 files changed, 29 insertions(+), 10 deletions(-) (limited to 'include/linux/netfs.h') diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c index ad4370b3935d..10a1e4da6bda 100644 --- a/fs/netfs/direct_read.c +++ b/fs/netfs/direct_read.c @@ -26,7 +26,7 @@ * * The caller must hold any appropriate locks. */ -static ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter) +ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter) { struct netfs_io_request *rreq; ssize_t ret; @@ -98,6 +98,7 @@ out: iov_iter_revert(iter, orig_count - iov_iter_count(iter)); return ret; } +EXPORT_SYMBOL(netfs_unbuffered_read_iter_locked); /** * netfs_unbuffered_read_iter - Perform an unbuffered or direct I/O read diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 65574e69ba4f..73482734a8d8 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1995,6 +1995,7 @@ require use of the stronger protocol */ * ->chans_need_reconnect * ->chans_in_reconnect * cifs_tcon->tc_lock (anything that is not protected by another lock and can change) + * inode->i_rwsem, taken by fs/netfs/locking.c e.g. 
should be taken before cifsInodeInfo locks * cifsInodeInfo->open_file_lock cifsInodeInfo->openFileList cifs_alloc_inode * cifsInodeInfo->writers_lock cifsInodeInfo->writers cifsInodeInfo_alloc * cifsInodeInfo->lock_sem cifsInodeInfo->llist cifs_init_once diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 4c981ce89f8a..9d38294a7e68 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -2916,16 +2916,32 @@ cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to) * We need to hold the sem to be sure nobody modifies lock list * with a brlock that prevents reading. */ - down_read(&cinode->lock_sem); - if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(to), - tcon->ses->server->vals->shared_lock_type, - 0, NULL, CIFS_READ_OP)) { - if (iocb->ki_flags & IOCB_DIRECT) - rc = netfs_unbuffered_read_iter(iocb, to); - else - rc = netfs_buffered_read_iter(iocb, to); + if (iocb->ki_flags & IOCB_DIRECT) { + rc = netfs_start_io_direct(inode); + if (rc < 0) + goto out; + down_read(&cinode->lock_sem); + if (!cifs_find_lock_conflict( + cfile, iocb->ki_pos, iov_iter_count(to), + tcon->ses->server->vals->shared_lock_type, + 0, NULL, CIFS_READ_OP)) + rc = netfs_unbuffered_read_iter_locked(iocb, to); + up_read(&cinode->lock_sem); + netfs_end_io_direct(inode); + } else { + rc = netfs_start_io_read(inode); + if (rc < 0) + goto out; + down_read(&cinode->lock_sem); + if (!cifs_find_lock_conflict( + cfile, iocb->ki_pos, iov_iter_count(to), + tcon->ses->server->vals->shared_lock_type, + 0, NULL, CIFS_READ_OP)) + rc = filemap_read(iocb, to, 0); + up_read(&cinode->lock_sem); + netfs_end_io_read(inode); } - up_read(&cinode->lock_sem); +out: return rc; } diff --git a/include/linux/netfs.h b/include/linux/netfs.h index f45d06284f2f..ca56a4428043 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -389,6 +389,7 @@ struct netfs_cache_ops { }; /* High-level read API. 
*/ +ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter); ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter); ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter); ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter); -- cgit v1.2.3 From 16e00683dc74cf1fcdf00046b90852bee05eb94a Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 15 May 2024 18:06:03 -0500 Subject: smb3: reenable swapfiles over SMB3 mounts With the changes to folios/netfs it is now easier to reenable swapfile support over SMB3 which fixes various xfstests Reviewed-by: David Howells Suggested-by: David Howells Fixes: e1209d3a7a67 ("mm: introduce ->swap_rw and use it for reads from SWP_FS_OPS swap-space") Signed-off-by: Steve French --- fs/netfs/direct_write.c | 3 ++- fs/smb/client/file.c | 23 +++++++++++++++++++++++ include/linux/netfs.h | 2 ++ 3 files changed, 27 insertions(+), 1 deletion(-) (limited to 'include/linux/netfs.h') diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index 608ba6416919..f516460e994e 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -27,7 +27,7 @@ static void netfs_cleanup_dio_write(struct netfs_io_request *wreq) * Perform an unbuffered write where we may have to do an RMW operation on an * encrypted file. This can also be used for direct I/O writes. 
*/ -static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter, +ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter, struct netfs_group *netfs_group) { struct netfs_io_request *wreq; @@ -117,6 +117,7 @@ out: netfs_put_request(wreq, false, netfs_rreq_trace_put_return); return ret; } +EXPORT_SYMBOL(netfs_unbuffered_write_iter_locked); /** * netfs_unbuffered_write_iter - Unbuffered write to a file diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 9d38294a7e68..9d5c2440abfc 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -3189,6 +3189,28 @@ static void cifs_swap_deactivate(struct file *file) /* do we need to unpin (or unlock) the file */ } +/** + * cifs_swap_rw - SMB3 address space operation for swap I/O + * @iocb: target I/O control block + * @iter: I/O buffer + * + * Perform IO to the swap-file. This is much like direct IO. + */ +static int cifs_swap_rw(struct kiocb *iocb, struct iov_iter *iter) +{ + ssize_t ret; + + WARN_ON_ONCE(iov_iter_count(iter) != PAGE_SIZE); + + if (iov_iter_rw(iter) == READ) + ret = netfs_unbuffered_read_iter_locked(iocb, iter); + else + ret = netfs_unbuffered_write_iter_locked(iocb, iter, NULL); + if (ret < 0) + return ret; + return 0; +} + const struct address_space_operations cifs_addr_ops = { .read_folio = netfs_read_folio, .readahead = netfs_readahead, @@ -3204,6 +3226,7 @@ const struct address_space_operations cifs_addr_ops = { */ .swap_activate = cifs_swap_activate, .swap_deactivate = cifs_swap_deactivate, + .swap_rw = cifs_swap_rw, }; /* diff --git a/include/linux/netfs.h b/include/linux/netfs.h index ca56a4428043..d2d291a9cdad 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -400,6 +400,8 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from, struct netfs_group *netfs_group); ssize_t netfs_unbuffered_write_iter(struct 
kiocb *iocb, struct iov_iter *from); +ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter, + struct netfs_group *netfs_group); ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from); /* Address operations API */ -- cgit v1.2.3 From f89ea63f1c65d3e93b255f14f9d9e05df87955fa Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 May 2024 15:26:11 +0100 Subject: netfs, 9p: Fix race between umount and async request completion There's a problem in 9p's interaction with netfslib whereby a crash occurs because the p9_fid structs get forcibly destroyed during client teardown (without paying attention to their refcounts) before netfslib has finished with them. However, it's not a simple case of deferring the clunking that p9_fid_put() does as that requires the p9_client record to still be present. The problem is that netfslib has to unlock pages and clear the IN_PROGRESS flag before destroying the objects involved - including the fid - and, in any case, nothing checks to see if writeback completed barring looking at the page flags. Fix this by keeping a count of outstanding I/O requests (of any type) and waiting for it to quiesce during inode eviction.
Reported-by: syzbot+df038d463cca332e8414@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/0000000000005be0aa061846f8d6@google.com/ Reported-by: syzbot+d7c7a495a5e466c031b6@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/000000000000b86c5e06130da9c6@google.com/ Reported-by: syzbot+1527696d41a634cc1819@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/000000000000041f960618206d7e@google.com/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/755891.1716560771@warthog.procyon.org.uk Tested-by: syzbot+d7c7a495a5e466c031b6@syzkaller.appspotmail.com Reviewed-by: Dominique Martinet cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Christian Schoenebeck cc: Jeff Layton cc: Steve French cc: Hillf Danton cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Reported-and-tested-by: syzbot+d7c7a495a5e466c031b6@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- fs/9p/vfs_inode.c | 1 + fs/afs/inode.c | 1 + fs/netfs/objects.c | 5 +++++ fs/smb/client/cifsfs.c | 1 + include/linux/netfs.h | 18 ++++++++++++++++++ 5 files changed, 26 insertions(+) (limited to 'include/linux/netfs.h') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 7a3308d77606..fd72fc38c8f5 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -348,6 +348,7 @@ void v9fs_evict_inode(struct inode *inode) __le32 __maybe_unused version; if (!is_bad_inode(inode)) { + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); version = cpu_to_le32(v9inode->qid.version); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 94fc049aff58..15bb7989c387 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -648,6 +648,7 @@ void afs_evict_inode(struct inode *inode) ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); afs_set_cache_aux(vnode, &aux); diff --git 
a/fs/netfs/objects.c b/fs/netfs/objects.c index c90d482b1650..f4a642727479 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -72,6 +72,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, } } + atomic_inc(&ctx->io_count); trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new); netfs_proc_add_rreq(rreq); netfs_stat(&netfs_n_rh_rreq); @@ -124,6 +125,7 @@ static void netfs_free_request(struct work_struct *work) { struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work); + struct netfs_inode *ictx = netfs_inode(rreq->inode); unsigned int i; trace_netfs_rreq(rreq, netfs_rreq_trace_free); @@ -142,6 +144,9 @@ static void netfs_free_request(struct work_struct *work) } kvfree(rreq->direct_bv); } + + if (atomic_dec_and_test(&ictx->io_count)) + wake_up_var(&ictx->io_count); call_rcu(&rreq->rcu, netfs_free_request_rcu); } diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index ec5b639f421a..14810ffd15c8 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -431,6 +431,7 @@ cifs_free_inode(struct inode *inode) static void cifs_evict_inode(struct inode *inode) { + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); if (inode->i_state & I_PINNING_NETFS_WB) cifs_fscache_unuse_inode_cookie(inode, true); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index ca56a4428043..3b22ce0d064c 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -68,6 +68,7 @@ struct netfs_inode { loff_t remote_i_size; /* Size of the remote file */ loff_t zero_point; /* Size after which we assume there's no data * on the server */ + atomic_t io_count; /* Number of outstanding reqs */ unsigned long flags; #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ @@ -472,6 +473,7 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, ctx->remote_i_size = i_size_read(&ctx->inode); 
ctx->zero_point = LLONG_MAX; ctx->flags = 0; + atomic_set(&ctx->io_count, 0); #if IS_ENABLED(CONFIG_FSCACHE) ctx->cache = NULL; #endif @@ -515,4 +517,20 @@ static inline struct fscache_cookie *netfs_i_cookie(struct netfs_inode *ctx) #endif } +/** + * netfs_wait_for_outstanding_io - Wait for outstanding I/O to complete + * @inode: The netfs inode to wait on + * + * Wait for outstanding I/O requests of any type to complete. This is intended + * to be called from inode eviction routines. This makes sure that any + * resources held by those requests are cleaned up before we let the inode get + * cleaned up. + */ +static inline void netfs_wait_for_outstanding_io(struct inode *inode) +{ + struct netfs_inode *ictx = netfs_inode(inode); + + wait_var_event(&ictx->io_count, atomic_read(&ictx->io_count) == 0); +} + #endif /* _LINUX_NETFS_H */ -- cgit v1.2.3