summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorDavid Howells <dhowells@redhat.com>2024-07-02 02:40:22 +0300
committerChristian Brauner <brauner@kernel.org>2024-09-12 13:20:41 +0300
commitee4cdf7ba857a894ad1650d6ab77669cbbfa329e (patch)
tree8258e3b756adf109085d66a8b63cd08db03abad0 /include
parent2e45b922977c07bb339d76fd45e68f9b907fef7d (diff)
downloadlinux-ee4cdf7ba857a894ad1650d6ab77669cbbfa329e.tar.xz
netfs: Speed up buffered reading
Improve the efficiency of buffered reads in a number of ways: (1) Overhaul the algorithm in general so that it's a lot more compact and split the read submission code between buffered and unbuffered versions. The unbuffered version can be vastly simplified. (2) Read-result collection is handed off to a work queue rather than being done in the I/O thread. Multiple subrequests can be processes simultaneously. (3) When a subrequest is collected, any folios it fully spans are collected and "spare" data on either side is donated to either the previous or the next subrequest in the sequence. Notes: (*) Readahead expansion is massively slows down fio, presumably because it causes a load of extra allocations, both folio and xarray, up front before RPC requests can be transmitted. (*) RDMA with cifs does appear to work, both with SIW and RXE. (*) PG_private_2-based reading and copy-to-cache is split out into its own file and altered to use folio_queue. Note that the copy to the cache now creates a new write transaction against the cache and adds the folios to be copied into it. This allows it to use part of the writeback I/O code. Signed-off-by: David Howells <dhowells@redhat.com> cc: Jeff Layton <jlayton@kernel.org> cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20240814203850.2240469-20-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner <brauner@kernel.org>
Diffstat (limited to 'include')
-rw-r--r--include/linux/folio_queue.h18
-rw-r--r--include/linux/netfs.h26
-rw-r--r--include/trace/events/netfs.h103
3 files changed, 134 insertions, 13 deletions
diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h
index 52773613bf23..955680c3bb5f 100644
--- a/include/linux/folio_queue.h
+++ b/include/linux/folio_queue.h
@@ -27,6 +27,7 @@ struct folio_queue {
struct folio_queue *prev; /* Previous queue segment of NULL */
unsigned long marks; /* 1-bit mark per folio */
unsigned long marks2; /* Second 1-bit mark per folio */
+ unsigned long marks3; /* Third 1-bit mark per folio */
#if PAGEVEC_SIZE > BITS_PER_LONG
#error marks is not big enough
#endif
@@ -39,6 +40,7 @@ static inline void folioq_init(struct folio_queue *folioq)
folioq->prev = NULL;
folioq->marks = 0;
folioq->marks2 = 0;
+ folioq->marks3 = 0;
}
static inline unsigned int folioq_nr_slots(const struct folio_queue *folioq)
@@ -87,6 +89,21 @@ static inline void folioq_unmark2(struct folio_queue *folioq, unsigned int slot)
clear_bit(slot, &folioq->marks2);
}
+static inline bool folioq_is_marked3(const struct folio_queue *folioq, unsigned int slot)
+{
+ return test_bit(slot, &folioq->marks3);
+}
+
+static inline void folioq_mark3(struct folio_queue *folioq, unsigned int slot)
+{
+ set_bit(slot, &folioq->marks3);
+}
+
+static inline void folioq_unmark3(struct folio_queue *folioq, unsigned int slot)
+{
+ clear_bit(slot, &folioq->marks3);
+}
+
static inline unsigned int __folio_order(struct folio *folio)
{
if (!folio_test_large(folio))
@@ -133,6 +150,7 @@ static inline void folioq_clear(struct folio_queue *folioq, unsigned int slot)
folioq->vec.folios[slot] = NULL;
folioq_unmark(folioq, slot);
folioq_unmark2(folioq, slot);
+ folioq_unmark3(folioq, slot);
}
#endif /* _LINUX_FOLIO_QUEUE_H */
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index bd0e3d147822..c0f0c9c87d86 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -178,36 +178,43 @@ struct netfs_io_subrequest {
unsigned long long start; /* Where to start the I/O */
size_t len; /* Size of the I/O */
size_t transferred; /* Amount of data transferred */
+ size_t consumed; /* Amount of read data consumed */
+ size_t prev_donated; /* Amount of data donated from previous subreq */
+ size_t next_donated; /* Amount of data donated from next subreq */
refcount_t ref;
short error; /* 0 or error that occurred */
unsigned short debug_index; /* Index in list (for debugging output) */
unsigned int nr_segs; /* Number of segs in io_iter */
enum netfs_io_source source; /* Where to read from/write to */
unsigned char stream_nr; /* I/O stream this belongs to */
+ unsigned char curr_folioq_slot; /* Folio currently being read */
+ unsigned char curr_folio_order; /* Order of folio */
+ struct folio_queue *curr_folioq; /* Queue segment in which current folio resides */
unsigned long flags;
#define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */
#define NETFS_SREQ_CLEAR_TAIL 1 /* Set if the rest of the read should be cleared */
-#define NETFS_SREQ_SHORT_IO 2 /* Set if the I/O was short */
#define NETFS_SREQ_SEEK_DATA_READ 3 /* Set if ->read() should SEEK_DATA first */
#define NETFS_SREQ_NO_PROGRESS 4 /* Set if we didn't manage to read any data */
#define NETFS_SREQ_ONDEMAND 5 /* Set if it's from on-demand read mode */
#define NETFS_SREQ_BOUNDARY 6 /* Set if ends on hard boundary (eg. ceph object) */
+#define NETFS_SREQ_HIT_EOF 7 /* Set if short due to EOF */
#define NETFS_SREQ_IN_PROGRESS 8 /* Unlocked when the subrequest completes */
#define NETFS_SREQ_NEED_RETRY 9 /* Set if the filesystem requests a retry */
#define NETFS_SREQ_RETRYING 10 /* Set if we're retrying */
#define NETFS_SREQ_FAILED 11 /* Set if the subreq failed unretryably */
-#define NETFS_SREQ_HIT_EOF 12 /* Set if we hit the EOF */
};
enum netfs_io_origin {
NETFS_READAHEAD, /* This read was triggered by readahead */
NETFS_READPAGE, /* This read is a synchronous read */
+ NETFS_READ_GAPS, /* This read is a synchronous read to fill gaps */
NETFS_READ_FOR_WRITE, /* This read is to prepare a write */
NETFS_DIO_READ, /* This is a direct I/O read */
NETFS_WRITEBACK, /* This write was triggered by writepages */
NETFS_WRITETHROUGH, /* This write was made by netfs_perform_write() */
NETFS_UNBUFFERED_WRITE, /* This is an unbuffered write */
NETFS_DIO_WRITE, /* This is a direct I/O write */
+ NETFS_PGPRIV2_COPY_TO_CACHE, /* [DEPRECATED] This is writing read data to the cache */
nr__netfs_io_origin
} __mode(byte);
@@ -224,6 +231,7 @@ struct netfs_io_request {
struct address_space *mapping; /* The mapping being accessed */
struct kiocb *iocb; /* AIO completion vector */
struct netfs_cache_resources cache_resources;
+ struct readahead_control *ractl; /* Readahead descriptor */
struct list_head proc_link; /* Link in netfs_iorequests */
struct list_head subrequests; /* Contributory I/O operations */
struct netfs_io_stream io_streams[2]; /* Streams of parallel I/O operations */
@@ -244,12 +252,10 @@ struct netfs_io_request {
unsigned int nr_group_rel; /* Number of refs to release on ->group */
spinlock_t lock; /* Lock for queuing subreqs */
atomic_t nr_outstanding; /* Number of ops in progress */
- atomic_t nr_copy_ops; /* Number of copy-to-cache ops in progress */
- size_t upper_len; /* Length can be extended to here */
unsigned long long submitted; /* Amount submitted for I/O so far */
unsigned long long len; /* Length of the request */
size_t transferred; /* Amount to be indicated as transferred */
- short error; /* 0 or error that occurred */
+ long error; /* 0 or error that occurred */
enum netfs_io_origin origin; /* Origin of the request */
bool direct_bv_unpin; /* T if direct_bv[] must be unpinned */
u8 buffer_head_slot; /* First slot in ->buffer */
@@ -260,9 +266,9 @@ struct netfs_io_request {
unsigned long long collected_to; /* Point we've collected to */
unsigned long long cleaned_to; /* Position we've cleaned folios to */
pgoff_t no_unlock_folio; /* Don't unlock this folio after read */
+ size_t prev_donated; /* Fallback for subreq->prev_donated */
refcount_t ref;
unsigned long flags;
-#define NETFS_RREQ_INCOMPLETE_IO 0 /* Some ioreqs terminated short or with error */
#define NETFS_RREQ_COPY_TO_CACHE 1 /* Need to write to the cache */
#define NETFS_RREQ_NO_UNLOCK_FOLIO 2 /* Don't unlock no_unlock_folio on completion */
#define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */
@@ -274,6 +280,7 @@ struct netfs_io_request {
#define NETFS_RREQ_PAUSE 11 /* Pause subrequest generation */
#define NETFS_RREQ_USE_IO_ITER 12 /* Use ->io_iter rather than ->i_pages */
#define NETFS_RREQ_ALL_QUEUED 13 /* All subreqs are now queued */
+#define NETFS_RREQ_NEED_RETRY 14 /* Need to try retrying */
#define NETFS_RREQ_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark
* write to cache on read */
const struct netfs_request_ops *netfs_ops;
@@ -292,7 +299,7 @@ struct netfs_request_ops {
/* Read request handling */
void (*expand_readahead)(struct netfs_io_request *rreq);
- bool (*clamp_length)(struct netfs_io_subrequest *subreq);
+ int (*prepare_read)(struct netfs_io_subrequest *subreq);
void (*issue_read)(struct netfs_io_subrequest *subreq);
bool (*is_still_valid)(struct netfs_io_request *rreq);
int (*check_write_begin)(struct file *file, loff_t pos, unsigned len,
@@ -422,7 +429,10 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp);
vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group);
/* (Sub)request management API. */
-void netfs_subreq_terminated(struct netfs_io_subrequest *, ssize_t, bool);
+void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq,
+ bool was_async);
+void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq,
+ int error, bool was_async);
void netfs_get_subrequest(struct netfs_io_subrequest *subreq,
enum netfs_sreq_ref_trace what);
void netfs_put_subrequest(struct netfs_io_subrequest *subreq,
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index 58bf23002fc1..7b26463cb98f 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -20,6 +20,7 @@
EM(netfs_read_trace_expanded, "EXPANDED ") \
EM(netfs_read_trace_readahead, "READAHEAD") \
EM(netfs_read_trace_readpage, "READPAGE ") \
+ EM(netfs_read_trace_read_gaps, "READ-GAPS") \
EM(netfs_read_trace_prefetch_for_write, "PREFETCHW") \
E_(netfs_read_trace_write_begin, "WRITEBEGN")
@@ -33,12 +34,14 @@
#define netfs_rreq_origins \
EM(NETFS_READAHEAD, "RA") \
EM(NETFS_READPAGE, "RP") \
+ EM(NETFS_READ_GAPS, "RG") \
EM(NETFS_READ_FOR_WRITE, "RW") \
EM(NETFS_DIO_READ, "DR") \
EM(NETFS_WRITEBACK, "WB") \
EM(NETFS_WRITETHROUGH, "WT") \
EM(NETFS_UNBUFFERED_WRITE, "UW") \
- E_(NETFS_DIO_WRITE, "DW")
+ EM(NETFS_DIO_WRITE, "DW") \
+ E_(NETFS_PGPRIV2_COPY_TO_CACHE, "2C")
#define netfs_rreq_traces \
EM(netfs_rreq_trace_assess, "ASSESS ") \
@@ -69,15 +72,25 @@
E_(NETFS_INVALID_WRITE, "INVL")
#define netfs_sreq_traces \
+ EM(netfs_sreq_trace_add_donations, "+DON ") \
+ EM(netfs_sreq_trace_added, "ADD ") \
+ EM(netfs_sreq_trace_clear, "CLEAR") \
EM(netfs_sreq_trace_discard, "DSCRD") \
+ EM(netfs_sreq_trace_donate_to_prev, "DON-P") \
+ EM(netfs_sreq_trace_donate_to_next, "DON-N") \
EM(netfs_sreq_trace_download_instead, "RDOWN") \
EM(netfs_sreq_trace_fail, "FAIL ") \
EM(netfs_sreq_trace_free, "FREE ") \
+ EM(netfs_sreq_trace_hit_eof, "EOF ") \
+ EM(netfs_sreq_trace_io_progress, "IO ") \
EM(netfs_sreq_trace_limited, "LIMIT") \
EM(netfs_sreq_trace_prepare, "PREP ") \
EM(netfs_sreq_trace_prep_failed, "PRPFL") \
- EM(netfs_sreq_trace_resubmit_short, "SHORT") \
+ EM(netfs_sreq_trace_progress, "PRGRS") \
+ EM(netfs_sreq_trace_reprep_failed, "REPFL") \
EM(netfs_sreq_trace_retry, "RETRY") \
+ EM(netfs_sreq_trace_short, "SHORT") \
+ EM(netfs_sreq_trace_split, "SPLIT") \
EM(netfs_sreq_trace_submit, "SUBMT") \
EM(netfs_sreq_trace_terminated, "TERM ") \
EM(netfs_sreq_trace_write, "WRITE") \
@@ -118,7 +131,7 @@
EM(netfs_sreq_trace_new, "NEW ") \
EM(netfs_sreq_trace_put_cancel, "PUT CANCEL ") \
EM(netfs_sreq_trace_put_clear, "PUT CLEAR ") \
- EM(netfs_sreq_trace_put_discard, "PUT DISCARD") \
+ EM(netfs_sreq_trace_put_consumed, "PUT CONSUME") \
EM(netfs_sreq_trace_put_done, "PUT DONE ") \
EM(netfs_sreq_trace_put_failed, "PUT FAILED ") \
EM(netfs_sreq_trace_put_merged, "PUT MERGED ") \
@@ -138,6 +151,7 @@
EM(netfs_flush_content, "flush") \
EM(netfs_streaming_filled_page, "mod-streamw-f") \
EM(netfs_streaming_cont_filled_page, "mod-streamw-f+") \
+ EM(netfs_folio_trace_abandon, "abandon") \
EM(netfs_folio_trace_cancel_copy, "cancel-copy") \
EM(netfs_folio_trace_clear, "clear") \
EM(netfs_folio_trace_clear_cc, "clear-cc") \
@@ -154,7 +168,11 @@
EM(netfs_folio_trace_mkwrite_plus, "mkwrite+") \
EM(netfs_folio_trace_not_under_wback, "!wback") \
EM(netfs_folio_trace_put, "put") \
+ EM(netfs_folio_trace_read, "read") \
+ EM(netfs_folio_trace_read_done, "read-done") \
EM(netfs_folio_trace_read_gaps, "read-gaps") \
+ EM(netfs_folio_trace_read_put, "read-put") \
+ EM(netfs_folio_trace_read_unlock, "read-unlock") \
EM(netfs_folio_trace_redirtied, "redirtied") \
EM(netfs_folio_trace_store, "store") \
EM(netfs_folio_trace_store_copy, "store-copy") \
@@ -167,6 +185,12 @@
EM(netfs_contig_trace_jump, "-->JUMP-->") \
E_(netfs_contig_trace_unlock, "Unlock")
+#define netfs_donate_traces \
+ EM(netfs_trace_donate_tail_to_prev, "tail-to-prev") \
+ EM(netfs_trace_donate_to_prev, "to-prev") \
+ EM(netfs_trace_donate_to_next, "to-next") \
+ E_(netfs_trace_donate_to_deferred_next, "defer-next")
+
#ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY
#define __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY
@@ -184,6 +208,7 @@ enum netfs_rreq_ref_trace { netfs_rreq_ref_traces } __mode(byte);
enum netfs_sreq_ref_trace { netfs_sreq_ref_traces } __mode(byte);
enum netfs_folio_trace { netfs_folio_traces } __mode(byte);
enum netfs_collect_contig_trace { netfs_collect_contig_traces } __mode(byte);
+enum netfs_donate_trace { netfs_donate_traces } __mode(byte);
#endif
@@ -206,6 +231,7 @@ netfs_rreq_ref_traces;
netfs_sreq_ref_traces;
netfs_folio_traces;
netfs_collect_contig_traces;
+netfs_donate_traces;
/*
* Now redefine the EM() and E_() macros to map the enums to the strings that
@@ -226,6 +252,7 @@ TRACE_EVENT(netfs_read,
TP_STRUCT__entry(
__field(unsigned int, rreq )
__field(unsigned int, cookie )
+ __field(loff_t, i_size )
__field(loff_t, start )
__field(size_t, len )
__field(enum netfs_read_trace, what )
@@ -235,18 +262,19 @@ TRACE_EVENT(netfs_read,
TP_fast_assign(
__entry->rreq = rreq->debug_id;
__entry->cookie = rreq->cache_resources.debug_id;
+ __entry->i_size = rreq->i_size;
__entry->start = start;
__entry->len = len;
__entry->what = what;
__entry->netfs_inode = rreq->inode->i_ino;
),
- TP_printk("R=%08x %s c=%08x ni=%x s=%llx %zx",
+ TP_printk("R=%08x %s c=%08x ni=%x s=%llx l=%zx sz=%llx",
__entry->rreq,
__print_symbolic(__entry->what, netfs_read_traces),
__entry->cookie,
__entry->netfs_inode,
- __entry->start, __entry->len)
+ __entry->start, __entry->len, __entry->i_size)
);
TRACE_EVENT(netfs_rreq,
@@ -651,6 +679,71 @@ TRACE_EVENT(netfs_collect_stream,
__entry->collected_to, __entry->front)
);
+TRACE_EVENT(netfs_progress,
+ TP_PROTO(const struct netfs_io_subrequest *subreq,
+ unsigned long long start, size_t avail, size_t part),
+
+ TP_ARGS(subreq, start, avail, part),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, rreq)
+ __field(unsigned int, subreq)
+ __field(unsigned int, consumed)
+ __field(unsigned int, transferred)
+ __field(unsigned long long, f_start)
+ __field(unsigned int, f_avail)
+ __field(unsigned int, f_part)
+ __field(unsigned char, slot)
+ ),
+
+ TP_fast_assign(
+ __entry->rreq = subreq->rreq->debug_id;
+ __entry->subreq = subreq->debug_index;
+ __entry->consumed = subreq->consumed;
+ __entry->transferred = subreq->transferred;
+ __entry->f_start = start;
+ __entry->f_avail = avail;
+ __entry->f_part = part;
+ __entry->slot = subreq->curr_folioq_slot;
+ ),
+
+ TP_printk("R=%08x[%02x] s=%llx ct=%x/%x pa=%x/%x sl=%x",
+ __entry->rreq, __entry->subreq, __entry->f_start,
+ __entry->consumed, __entry->transferred,
+ __entry->f_part, __entry->f_avail, __entry->slot)
+ );
+
+TRACE_EVENT(netfs_donate,
+ TP_PROTO(const struct netfs_io_request *rreq,
+ const struct netfs_io_subrequest *from,
+ const struct netfs_io_subrequest *to,
+ size_t amount,
+ enum netfs_donate_trace trace),
+
+ TP_ARGS(rreq, from, to, amount, trace),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, rreq)
+ __field(unsigned int, from)
+ __field(unsigned int, to)
+ __field(unsigned int, amount)
+ __field(enum netfs_donate_trace, trace)
+ ),
+
+ TP_fast_assign(
+ __entry->rreq = rreq->debug_id;
+ __entry->from = from->debug_index;
+ __entry->to = to ? to->debug_index : -1;
+ __entry->amount = amount;
+ __entry->trace = trace;
+ ),
+
+ TP_printk("R=%08x[%02x] -> [%02x] %s am=%x",
+ __entry->rreq, __entry->from, __entry->to,
+ __print_symbolic(__entry->trace, netfs_donate_traces),
+ __entry->amount)
+ );
+
#undef EM
#undef E_
#endif /* _TRACE_NETFS_H */