summaryrefslogtreecommitdiff
path: root/fs/netfs/read_collect.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/netfs/read_collect.c')
-rw-r--r--fs/netfs/read_collect.c544
1 files changed, 544 insertions, 0 deletions
diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c
new file mode 100644
index 000000000000..b18c65ba5580
--- /dev/null
+++ b/fs/netfs/read_collect.c
@@ -0,0 +1,544 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem read subrequest result collection, assessment and
+ * retrying.
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/task_io_accounting_ops.h>
+#include "internal.h"
+
+/*
+ * Clear the unread part of an I/O request.
+ */
+static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
+{
+ netfs_reset_iter(subreq);
+ WARN_ON_ONCE(subreq->len - subreq->transferred != iov_iter_count(&subreq->io_iter));
+ iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter);
+ if (subreq->start + subreq->transferred >= subreq->rreq->i_size)
+ __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
+}
+
+/*
+ * Flush, mark and unlock a folio that's now completely read. If we want to
+ * cache the folio, we set the group to NETFS_FOLIO_COPY_TO_CACHE, mark it
+ * dirty and let writeback handle it.
+ */
+static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq,
+ struct netfs_io_request *rreq,
+ struct folio_queue *folioq,
+ int slot)
+{
+ struct netfs_folio *finfo;
+ struct folio *folio = folioq_folio(folioq, slot);
+
+ flush_dcache_folio(folio);
+ folio_mark_uptodate(folio);
+
+ if (!test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) {
+ finfo = netfs_folio_info(folio);
+ if (finfo) {
+ trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
+ if (finfo->netfs_group)
+ folio_change_private(folio, finfo->netfs_group);
+ else
+ folio_detach_private(folio);
+ kfree(finfo);
+ }
+
+ if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
+ if (!WARN_ON_ONCE(folio_get_private(folio) != NULL)) {
+ trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
+ folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE);
+ folio_mark_dirty(folio);
+ }
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_read_done);
+ }
+ } else {
+ // TODO: Use of PG_private_2 is deprecated.
+ if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
+ netfs_pgpriv2_mark_copy_to_cache(subreq, rreq, folioq, slot);
+ }
+
+ if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
+ if (folio->index == rreq->no_unlock_folio &&
+ test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) {
+ _debug("no unlock");
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_read_unlock);
+ folio_unlock(folio);
+ }
+ }
+}
+
+/*
+ * Unlock any folios that are now completely read. Returns true if the
+ * subrequest is removed from the list.
+ */
+static bool netfs_consume_read_data(struct netfs_io_subrequest *subreq, bool was_async)
+{
+ struct netfs_io_subrequest *prev, *next;
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct folio_queue *folioq = subreq->curr_folioq;
+ size_t avail, prev_donated, next_donated, fsize, part, excess;
+ loff_t fpos, start;
+ loff_t fend;
+ int slot = subreq->curr_folioq_slot;
+
+ if (WARN(subreq->transferred > subreq->len,
+ "Subreq overread: R%x[%x] %zu > %zu",
+ rreq->debug_id, subreq->debug_index,
+ subreq->transferred, subreq->len))
+ subreq->transferred = subreq->len;
+
+next_folio:
+ fsize = PAGE_SIZE << subreq->curr_folio_order;
+ fpos = round_down(subreq->start + subreq->consumed, fsize);
+ fend = fpos + fsize;
+
+ if (WARN_ON_ONCE(!folioq) ||
+ WARN_ON_ONCE(!folioq_folio(folioq, slot)) ||
+ WARN_ON_ONCE(folioq_folio(folioq, slot)->index != fpos / PAGE_SIZE)) {
+ pr_err("R=%08x[%x] s=%llx-%llx ctl=%zx/%zx/%zx sl=%u\n",
+ rreq->debug_id, subreq->debug_index,
+ subreq->start, subreq->start + subreq->transferred - 1,
+ subreq->consumed, subreq->transferred, subreq->len,
+ slot);
+ if (folioq) {
+ struct folio *folio = folioq_folio(folioq, slot);
+
+ pr_err("folioq: orders=%02x%02x%02x%02x\n",
+ folioq->orders[0], folioq->orders[1],
+ folioq->orders[2], folioq->orders[3]);
+ if (folio)
+ pr_err("folio: %llx-%llx ix=%llx o=%u qo=%u\n",
+ fpos, fend - 1, folio_pos(folio), folio_order(folio),
+ folioq_folio_order(folioq, slot));
+ }
+ }
+
+donation_changed:
+ /* Try to consume the current folio if we've hit or passed the end of
+ * it. There's a possibility that this subreq doesn't start at the
+ * beginning of the folio, in which case we need to donate to/from the
+ * preceding subreq.
+ *
+ * We also need to include any potential donation back from the
+ * following subreq.
+ */
+ prev_donated = READ_ONCE(subreq->prev_donated);
+ next_donated = READ_ONCE(subreq->next_donated);
+ if (prev_donated || next_donated) {
+ spin_lock_bh(&rreq->lock);
+ prev_donated = subreq->prev_donated;
+ next_donated = subreq->next_donated;
+ subreq->start -= prev_donated;
+ subreq->len += prev_donated;
+ subreq->transferred += prev_donated;
+ prev_donated = subreq->prev_donated = 0;
+ if (subreq->transferred == subreq->len) {
+ subreq->len += next_donated;
+ subreq->transferred += next_donated;
+ next_donated = subreq->next_donated = 0;
+ }
+ trace_netfs_sreq(subreq, netfs_sreq_trace_add_donations);
+ spin_unlock_bh(&rreq->lock);
+ }
+
+ avail = subreq->transferred;
+ if (avail == subreq->len)
+ avail += next_donated;
+ start = subreq->start;
+ if (subreq->consumed == 0) {
+ start -= prev_donated;
+ avail += prev_donated;
+ } else {
+ start += subreq->consumed;
+ avail -= subreq->consumed;
+ }
+ part = umin(avail, fsize);
+
+ trace_netfs_progress(subreq, start, avail, part);
+
+ if (start + avail >= fend) {
+ if (fpos == start) {
+ /* Flush, unlock and mark for caching any folio we've just read. */
+ subreq->consumed = fend - subreq->start;
+ netfs_unlock_read_folio(subreq, rreq, folioq, slot);
+ folioq_mark2(folioq, slot);
+ if (subreq->consumed >= subreq->len)
+ goto remove_subreq;
+ } else if (fpos < start) {
+ excess = fend - subreq->start;
+
+ spin_lock_bh(&rreq->lock);
+ /* If we complete first on a folio split with the
+ * preceding subreq, donate to that subreq - otherwise
+ * we get the responsibility.
+ */
+ if (subreq->prev_donated != prev_donated) {
+ spin_unlock_bh(&rreq->lock);
+ goto donation_changed;
+ }
+
+ if (list_is_first(&subreq->rreq_link, &rreq->subrequests)) {
+ spin_unlock_bh(&rreq->lock);
+ pr_err("Can't donate prior to front\n");
+ goto bad;
+ }
+
+ prev = list_prev_entry(subreq, rreq_link);
+ WRITE_ONCE(prev->next_donated, prev->next_donated + excess);
+ subreq->start += excess;
+ subreq->len -= excess;
+ subreq->transferred -= excess;
+ trace_netfs_donate(rreq, subreq, prev, excess,
+ netfs_trace_donate_tail_to_prev);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev);
+
+ if (subreq->consumed >= subreq->len)
+ goto remove_subreq_locked;
+ spin_unlock_bh(&rreq->lock);
+ } else {
+ pr_err("fpos > start\n");
+ goto bad;
+ }
+
+ /* Advance the rolling buffer to the next folio. */
+ slot++;
+ if (slot >= folioq_nr_slots(folioq)) {
+ slot = 0;
+ folioq = folioq->next;
+ subreq->curr_folioq = folioq;
+ }
+ subreq->curr_folioq_slot = slot;
+ if (folioq && folioq_folio(folioq, slot))
+ subreq->curr_folio_order = folioq->orders[slot];
+ if (!was_async)
+ cond_resched();
+ goto next_folio;
+ }
+
+ /* Deal with partial progress. */
+ if (subreq->transferred < subreq->len)
+ return false;
+
+ /* Donate the remaining downloaded data to one of the neighbouring
+ * subrequests. Note that we may race with them doing the same thing.
+ */
+ spin_lock_bh(&rreq->lock);
+
+ if (subreq->prev_donated != prev_donated ||
+ subreq->next_donated != next_donated) {
+ spin_unlock_bh(&rreq->lock);
+ cond_resched();
+ goto donation_changed;
+ }
+
+ /* Deal with the trickiest case: that this subreq is in the middle of a
+ * folio, not touching either edge, but finishes first. In such a
+ * case, we donate to the previous subreq, if there is one, so that the
+ * donation is only handled when that completes - and remove this
+ * subreq from the list.
+ *
+ * If the previous subreq finished first, we will have acquired their
+ * donation and should be able to unlock folios and/or donate nextwards.
+ */
+ if (!subreq->consumed &&
+ !prev_donated &&
+ !list_is_first(&subreq->rreq_link, &rreq->subrequests)) {
+ prev = list_prev_entry(subreq, rreq_link);
+ WRITE_ONCE(prev->next_donated, prev->next_donated + subreq->len);
+ subreq->start += subreq->len;
+ subreq->len = 0;
+ subreq->transferred = 0;
+ trace_netfs_donate(rreq, subreq, prev, subreq->len,
+ netfs_trace_donate_to_prev);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev);
+ goto remove_subreq_locked;
+ }
+
+ /* If we can't donate down the chain, donate up the chain instead. */
+ excess = subreq->len - subreq->consumed + next_donated;
+
+ if (!subreq->consumed)
+ excess += prev_donated;
+
+ if (list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
+ rreq->prev_donated = excess;
+ trace_netfs_donate(rreq, subreq, NULL, excess,
+ netfs_trace_donate_to_deferred_next);
+ } else {
+ next = list_next_entry(subreq, rreq_link);
+ WRITE_ONCE(next->prev_donated, excess);
+ trace_netfs_donate(rreq, subreq, next, excess,
+ netfs_trace_donate_to_next);
+ }
+ trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_next);
+ subreq->len = subreq->consumed;
+ subreq->transferred = subreq->consumed;
+ goto remove_subreq_locked;
+
+remove_subreq:
+ spin_lock_bh(&rreq->lock);
+remove_subreq_locked:
+ subreq->consumed = subreq->len;
+ list_del(&subreq->rreq_link);
+ spin_unlock_bh(&rreq->lock);
+ netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_consumed);
+ return true;
+
+bad:
+ /* Errr... prev and next both donated to us, but insufficient to finish
+ * the folio.
+ */
+ printk("R=%08x[%x] s=%llx-%llx %zx/%zx/%zx\n",
+ rreq->debug_id, subreq->debug_index,
+ subreq->start, subreq->start + subreq->transferred - 1,
+ subreq->consumed, subreq->transferred, subreq->len);
+ printk("folio: %llx-%llx\n", fpos, fend - 1);
+ printk("donated: prev=%zx next=%zx\n", prev_donated, next_donated);
+ printk("s=%llx av=%zx part=%zx\n", start, avail, part);
+ BUG();
+}
+
+/*
+ * Do page flushing and suchlike after DIO.
+ */
+static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+ unsigned int i;
+
+ /* Collect unbuffered reads and direct reads, adding up the transfer
+ * sizes until we find the first short or failed subrequest.
+ */
+ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+ rreq->transferred += subreq->transferred;
+
+ if (subreq->transferred < subreq->len ||
+ test_bit(NETFS_SREQ_FAILED, &subreq->flags)) {
+ rreq->error = subreq->error;
+ break;
+ }
+ }
+
+ if (rreq->origin == NETFS_DIO_READ) {
+ for (i = 0; i < rreq->direct_bv_count; i++) {
+ flush_dcache_page(rreq->direct_bv[i].bv_page);
+ // TODO: cifs marks pages in the destination buffer
+ // dirty under some circumstances after a read. Do we
+ // need to do that too?
+ set_page_dirty(rreq->direct_bv[i].bv_page);
+ }
+ }
+
+ if (rreq->iocb) {
+ rreq->iocb->ki_pos += rreq->transferred;
+ if (rreq->iocb->ki_complete)
+ rreq->iocb->ki_complete(
+ rreq->iocb, rreq->error ? rreq->error : rreq->transferred);
+ }
+ if (rreq->netfs_ops->done)
+ rreq->netfs_ops->done(rreq);
+ if (rreq->origin == NETFS_DIO_READ)
+ inode_dio_end(rreq->inode);
+}
+
+/*
+ * Assess the state of a read request and decide what to do next.
+ *
+ * Note that we're in normal kernel thread context at this point, possibly
+ * running on a workqueue.
+ */
+static void netfs_rreq_assess(struct netfs_io_request *rreq)
+{
+ trace_netfs_rreq(rreq, netfs_rreq_trace_assess);
+
+ //netfs_rreq_is_still_valid(rreq);
+
+ if (test_and_clear_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags)) {
+ netfs_retry_reads(rreq);
+ return;
+ }
+
+ if (rreq->origin == NETFS_DIO_READ ||
+ rreq->origin == NETFS_READ_GAPS)
+ netfs_rreq_assess_dio(rreq);
+ task_io_account_read(rreq->transferred);
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
+ clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+ wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_done);
+ netfs_clear_subrequests(rreq, false);
+ netfs_unlock_abandoned_read_pages(rreq);
+ if (unlikely(test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)))
+ netfs_pgpriv2_write_to_the_cache(rreq);
+}
+
+void netfs_read_termination_worker(struct work_struct *work)
+{
+ struct netfs_io_request *rreq =
+ container_of(work, struct netfs_io_request, work);
+ netfs_see_request(rreq, netfs_rreq_trace_see_work);
+ netfs_rreq_assess(rreq);
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_work_complete);
+}
+
+/*
+ * Handle the completion of all outstanding I/O operations on a read request.
+ * We inherit a ref from the caller.
+ */
+void netfs_rreq_terminated(struct netfs_io_request *rreq, bool was_async)
+{
+ if (!was_async)
+ return netfs_rreq_assess(rreq);
+ if (!work_pending(&rreq->work)) {
+ netfs_get_request(rreq, netfs_rreq_trace_get_work);
+ if (!queue_work(system_unbound_wq, &rreq->work))
+ netfs_put_request(rreq, was_async, netfs_rreq_trace_put_work_nq);
+ }
+}
+
+/**
+ * netfs_read_subreq_progress - Note progress of a read operation.
+ * @subreq: The read request that has terminated.
+ * @was_async: True if we're in an asynchronous context.
+ *
+ * This tells the read side of netfs lib that a contributory I/O operation has
+ * made some progress and that it may be possible to unlock some folios.
+ *
+ * Before calling, the filesystem should update subreq->transferred to track
+ * the amount of data copied into the output buffer.
+ *
+ * If @was_async is true, the caller might be running in softirq or interrupt
+ * context and we can't sleep.
+ */
+void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq,
+ bool was_async)
+{
+ struct netfs_io_request *rreq = subreq->rreq;
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_progress);
+
+ if (subreq->transferred > subreq->consumed &&
+ (rreq->origin == NETFS_READAHEAD ||
+ rreq->origin == NETFS_READPAGE ||
+ rreq->origin == NETFS_READ_FOR_WRITE)) {
+ netfs_consume_read_data(subreq, was_async);
+ __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+ }
+}
+EXPORT_SYMBOL(netfs_read_subreq_progress);
+
+/**
+ * netfs_read_subreq_terminated - Note the termination of an I/O operation.
+ * @subreq: The I/O request that has terminated.
+ * @error: Error code indicating type of completion.
+ * @was_async: The termination was asynchronous
+ *
+ * This tells the read helper that a contributory I/O operation has terminated,
+ * one way or another, and that it should integrate the results.
+ *
+ * The caller indicates the outcome of the operation through @error, supplying
+ * 0 to indicate a successful or retryable transfer (if NETFS_SREQ_NEED_RETRY
+ * is set) or a negative error code. The helper will look after reissuing I/O
+ * operations as appropriate and writing downloaded data to the cache.
+ *
+ * Before calling, the filesystem should update subreq->transferred to track
+ * the amount of data copied into the output buffer.
+ *
+ * If @was_async is true, the caller might be running in softirq or interrupt
+ * context and we can't sleep.
+ */
+void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq,
+ int error, bool was_async)
+{
+ struct netfs_io_request *rreq = subreq->rreq;
+
+ switch (subreq->source) {
+ case NETFS_READ_FROM_CACHE:
+ netfs_stat(&netfs_n_rh_read_done);
+ break;
+ case NETFS_DOWNLOAD_FROM_SERVER:
+ netfs_stat(&netfs_n_rh_download_done);
+ break;
+ default:
+ break;
+ }
+
+ if (rreq->origin != NETFS_DIO_READ) {
+ /* Collect buffered reads.
+ *
+ * If the read completed validly short, then we can clear the
+ * tail before going on to unlock the folios.
+ */
+ if (error == 0 && subreq->transferred < subreq->len &&
+ (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags) ||
+ test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags))) {
+ netfs_clear_unread(subreq);
+ subreq->transferred = subreq->len;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_clear);
+ }
+ if (subreq->transferred > subreq->consumed &&
+ (rreq->origin == NETFS_READAHEAD ||
+ rreq->origin == NETFS_READPAGE ||
+ rreq->origin == NETFS_READ_FOR_WRITE)) {
+ netfs_consume_read_data(subreq, was_async);
+ __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+ }
+ rreq->transferred += subreq->transferred;
+ }
+
+ /* Deal with retry requests, short reads and errors. If we retry
+ * but don't make progress, we abandon the attempt.
+ */
+ if (!error && subreq->transferred < subreq->len) {
+ if (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) {
+ trace_netfs_sreq(subreq, netfs_sreq_trace_hit_eof);
+ } else {
+ trace_netfs_sreq(subreq, netfs_sreq_trace_short);
+ if (subreq->transferred > subreq->consumed) {
+ __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+ __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+ set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags);
+ } else if (!__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
+ __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+ set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags);
+ } else {
+ __set_bit(NETFS_SREQ_FAILED, &subreq->flags);
+ error = -ENODATA;
+ }
+ }
+ }
+
+ subreq->error = error;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
+
+ if (unlikely(error < 0)) {
+ trace_netfs_failure(rreq, subreq, error, netfs_fail_read);
+ if (subreq->source == NETFS_READ_FROM_CACHE) {
+ netfs_stat(&netfs_n_rh_read_failed);
+ } else {
+ netfs_stat(&netfs_n_rh_download_failed);
+ set_bit(NETFS_RREQ_FAILED, &rreq->flags);
+ rreq->error = subreq->error;
+ }
+ }
+
+ if (atomic_dec_and_test(&rreq->nr_outstanding))
+ netfs_rreq_terminated(rreq, was_async);
+
+ netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
+}
+EXPORT_SYMBOL(netfs_read_subreq_terminated);