summaryrefslogtreecommitdiff
path: root/include/linux
diff options
context:
space:
mode:
authorDavid Howells <dhowells@redhat.com>2024-06-19 02:20:42 +0300
committerChristian Brauner <brauner@kernel.org>2024-09-12 13:20:21 +0300
commitdb0aa2e9566fda2d23dc8f6c102856ead95578a4 (patch)
treed39f09396b198bfe21a67cc96d419cda12c3fa7e /include/linux
parent22de489d1e9d6c30bf9554a9c58bf1dc8d951f7d (diff)
downloadlinux-db0aa2e9566fda2d23dc8f6c102856ead95578a4.tar.xz
mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios
Define a data structure, struct folio_queue, to represent a sequence of folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a list of folio_queue structures to be used to provide a buffer to iov_iter-taking functions, such as sendmsg and recvmsg. The folio_queue structure looks like: struct folio_queue { struct folio_batch vec; u8 orders[PAGEVEC_SIZE]; struct folio_queue *next; struct folio_queue *prev; unsigned long marks; unsigned long marks2; }; It does not use a list_head so that next and/or prev can be set to NULL at the ends of the list, allowing iov_iter-handling routines to determine that they *are* the ends without needing to store a head pointer in the iov_iter struct. A folio_batch struct is used to hold the folio pointers which allows the batch to be passed to batch handling functions. Two mark bits are available per slot. The intention is to use at least one of them to mark folios that need putting, but that might not be ultimately necessary. Accessor functions are used to access the slots to do the masking and an additional accessor function is used to indicate the size of the array. The order of each folio is also stored in the structure to avoid the need for iov_iter_advance() and iov_iter_revert() to have to query each folio to find its size. With careful barriering, this can be used as an extending buffer with new folios inserted and new folio_queue structs added without the need for a lock. Further, provided we always keep at least one struct in the buffer, we can also remove consumed folios and consumed structs from the head end as we without the need for locks. [Questions/thoughts] (1) To manage this, I need a head pointer, a tail pointer, a tail slot number (assuming insertion happens at the tail end and the next pointers point from head to tail). Should I put these into a struct of their own, say "folio_queue_head" or "rolling_buffer"? I will end up with two of these in netfs_io_request eventually, one keeping track of the pagecache I'm dealing with for buffered I/O and the other to hold a bounce buffer when we need one. (2) Should I make the slots {folio,off,len} or bio_vec? (3) This is intended to replace ITER_XARRAY eventually. Using an xarray in I/O iteration requires the taking of the RCU read lock, doing copying under the RCU read lock, walking the xarray (which may change under us), handling retries and dealing with special values. The advantage of ITER_XARRAY is that when we're dealing with the pagecache directly, we don't need any allocation - but if we're doing encrypted comms, there's a good chance we'd be using a bounce buffer anyway. This will require afs, erofs, cifs, orangefs and fscache to be converted to not use this. afs still uses it for dirs and symlinks; some of erofs usages should be easy to change, but there's one which won't be so easy; ceph's use via fscache can be fixed by porting ceph to netfslib; cifs is using xarray as a bounce buffer - that can be moved to use sheaves instead; and orangefs has a similar problem to erofs - maybe orangefs could use netfslib? Signed-off-by: David Howells <dhowells@redhat.com> cc: Matthew Wilcox <willy@infradead.org> cc: Jeff Layton <jlayton@kernel.org> cc: Steve French <sfrench@samba.org> cc: Ilya Dryomov <idryomov@gmail.com> cc: Gao Xiang <xiang@kernel.org> cc: Mike Marshall <hubcap@omnibond.com> cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org cc: linux-afs@lists.infradead.org cc: linux-cifs@vger.kernel.org cc: ceph-devel@vger.kernel.org cc: linux-erofs@lists.ozlabs.org cc: devel@lists.orangefs.org Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner <brauner@kernel.org>
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/folio_queue.h138
-rw-r--r--include/linux/iov_iter.h57
-rw-r--r--include/linux/uio.h12
3 files changed, 207 insertions, 0 deletions
diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h
new file mode 100644
index 000000000000..52773613bf23
--- /dev/null
+++ b/include/linux/folio_queue.h
@@ -0,0 +1,138 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Queue of folios definitions
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#ifndef _LINUX_FOLIO_QUEUE_H
+#define _LINUX_FOLIO_QUEUE_H
+
+#include <linux/pagevec.h>
+
+/*
+ * Segment in a queue of running buffers. Each segment can hold a number of
+ * folios and a portion of the queue can be referenced with the ITER_FOLIOQ
+ * iterator. The possibility exists of inserting non-folio elements into the
+ * queue (such as gaps).
+ *
+ * Explicit prev and next pointers are used instead of a list_head to make it
+ * easier to add segments to tail and remove them from the head without the
+ * need for a lock.
+ */
+struct folio_queue {
+ struct folio_batch vec; /* Folios in the queue segment */
+ u8 orders[PAGEVEC_SIZE]; /* Order of each folio */
+ struct folio_queue *next; /* Next queue segment or NULL */
+ struct folio_queue *prev; /* Previous queue segment of NULL */
+ unsigned long marks; /* 1-bit mark per folio */
+ unsigned long marks2; /* Second 1-bit mark per folio */
+#if PAGEVEC_SIZE > BITS_PER_LONG
+#error marks is not big enough
+#endif
+};
+
+static inline void folioq_init(struct folio_queue *folioq)
+{
+ folio_batch_init(&folioq->vec);
+ folioq->next = NULL;
+ folioq->prev = NULL;
+ folioq->marks = 0;
+ folioq->marks2 = 0;
+}
+
+static inline unsigned int folioq_nr_slots(const struct folio_queue *folioq)
+{
+ return PAGEVEC_SIZE;
+}
+
+static inline unsigned int folioq_count(struct folio_queue *folioq)
+{
+ return folio_batch_count(&folioq->vec);
+}
+
+static inline bool folioq_full(struct folio_queue *folioq)
+{
+ //return !folio_batch_space(&folioq->vec);
+ return folioq_count(folioq) >= folioq_nr_slots(folioq);
+}
+
+static inline bool folioq_is_marked(const struct folio_queue *folioq, unsigned int slot)
+{
+ return test_bit(slot, &folioq->marks);
+}
+
+static inline void folioq_mark(struct folio_queue *folioq, unsigned int slot)
+{
+ set_bit(slot, &folioq->marks);
+}
+
+static inline void folioq_unmark(struct folio_queue *folioq, unsigned int slot)
+{
+ clear_bit(slot, &folioq->marks);
+}
+
+static inline bool folioq_is_marked2(const struct folio_queue *folioq, unsigned int slot)
+{
+ return test_bit(slot, &folioq->marks2);
+}
+
+static inline void folioq_mark2(struct folio_queue *folioq, unsigned int slot)
+{
+ set_bit(slot, &folioq->marks2);
+}
+
+static inline void folioq_unmark2(struct folio_queue *folioq, unsigned int slot)
+{
+ clear_bit(slot, &folioq->marks2);
+}
+
+static inline unsigned int __folio_order(struct folio *folio)
+{
+ if (!folio_test_large(folio))
+ return 0;
+ return folio->_flags_1 & 0xff;
+}
+
+static inline unsigned int folioq_append(struct folio_queue *folioq, struct folio *folio)
+{
+ unsigned int slot = folioq->vec.nr++;
+
+ folioq->vec.folios[slot] = folio;
+ folioq->orders[slot] = __folio_order(folio);
+ return slot;
+}
+
+static inline unsigned int folioq_append_mark(struct folio_queue *folioq, struct folio *folio)
+{
+ unsigned int slot = folioq->vec.nr++;
+
+ folioq->vec.folios[slot] = folio;
+ folioq->orders[slot] = __folio_order(folio);
+ folioq_mark(folioq, slot);
+ return slot;
+}
+
+static inline struct folio *folioq_folio(const struct folio_queue *folioq, unsigned int slot)
+{
+ return folioq->vec.folios[slot];
+}
+
+static inline unsigned int folioq_folio_order(const struct folio_queue *folioq, unsigned int slot)
+{
+ return folioq->orders[slot];
+}
+
+static inline size_t folioq_folio_size(const struct folio_queue *folioq, unsigned int slot)
+{
+ return PAGE_SIZE << folioq_folio_order(folioq, slot);
+}
+
+static inline void folioq_clear(struct folio_queue *folioq, unsigned int slot)
+{
+ folioq->vec.folios[slot] = NULL;
+ folioq_unmark(folioq, slot);
+ folioq_unmark2(folioq, slot);
+}
+
+#endif /* _LINUX_FOLIO_QUEUE_H */
diff --git a/include/linux/iov_iter.h b/include/linux/iov_iter.h
index 270454a6703d..a223370a59a7 100644
--- a/include/linux/iov_iter.h
+++ b/include/linux/iov_iter.h
@@ -10,6 +10,7 @@
#include <linux/uio.h>
#include <linux/bvec.h>
+#include <linux/folio_queue.h>
typedef size_t (*iov_step_f)(void *iter_base, size_t progress, size_t len,
void *priv, void *priv2);
@@ -141,6 +142,60 @@ size_t iterate_bvec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
}
/*
+ * Handle ITER_FOLIOQ.
+ */
+static __always_inline
+size_t iterate_folioq(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+ iov_step_f step)
+{
+ const struct folio_queue *folioq = iter->folioq;
+ unsigned int slot = iter->folioq_slot;
+ size_t progress = 0, skip = iter->iov_offset;
+
+ if (slot == folioq_nr_slots(folioq)) {
+ /* The iterator may have been extended. */
+ folioq = folioq->next;
+ slot = 0;
+ }
+
+ do {
+ struct folio *folio = folioq_folio(folioq, slot);
+ size_t part, remain, consumed;
+ size_t fsize;
+ void *base;
+
+ if (!folio)
+ break;
+
+ fsize = folioq_folio_size(folioq, slot);
+ base = kmap_local_folio(folio, skip);
+ part = umin(len, PAGE_SIZE - skip % PAGE_SIZE);
+ remain = step(base, progress, part, priv, priv2);
+ kunmap_local(base);
+ consumed = part - remain;
+ len -= consumed;
+ progress += consumed;
+ skip += consumed;
+ if (skip >= fsize) {
+ skip = 0;
+ slot++;
+ if (slot == folioq_nr_slots(folioq) && folioq->next) {
+ folioq = folioq->next;
+ slot = 0;
+ }
+ }
+ if (remain)
+ break;
+ } while (len);
+
+ iter->folioq_slot = slot;
+ iter->folioq = folioq;
+ iter->iov_offset = skip;
+ iter->count -= progress;
+ return progress;
+}
+
+/*
* Handle ITER_XARRAY.
*/
static __always_inline
@@ -249,6 +304,8 @@ size_t iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv,
return iterate_bvec(iter, len, priv, priv2, step);
if (iov_iter_is_kvec(iter))
return iterate_kvec(iter, len, priv, priv2, step);
+ if (iov_iter_is_folioq(iter))
+ return iterate_folioq(iter, len, priv, priv2, step);
if (iov_iter_is_xarray(iter))
return iterate_xarray(iter, len, priv, priv2, step);
return iterate_discard(iter, len, priv, priv2, step);
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 7020adedfa08..845d110acadc 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -11,6 +11,7 @@
#include <uapi/linux/uio.h>
struct page;
+struct folio_queue;
typedef unsigned int __bitwise iov_iter_extraction_t;
@@ -25,6 +26,7 @@ enum iter_type {
ITER_IOVEC,
ITER_BVEC,
ITER_KVEC,
+ ITER_FOLIOQ,
ITER_XARRAY,
ITER_DISCARD,
};
@@ -66,6 +68,7 @@ struct iov_iter {
const struct iovec *__iov;
const struct kvec *kvec;
const struct bio_vec *bvec;
+ const struct folio_queue *folioq;
struct xarray *xarray;
void __user *ubuf;
};
@@ -74,6 +77,7 @@ struct iov_iter {
};
union {
unsigned long nr_segs;
+ u8 folioq_slot;
loff_t xarray_start;
};
};
@@ -126,6 +130,11 @@ static inline bool iov_iter_is_discard(const struct iov_iter *i)
return iov_iter_type(i) == ITER_DISCARD;
}
+static inline bool iov_iter_is_folioq(const struct iov_iter *i)
+{
+ return iov_iter_type(i) == ITER_FOLIOQ;
+}
+
static inline bool iov_iter_is_xarray(const struct iov_iter *i)
{
return iov_iter_type(i) == ITER_XARRAY;
@@ -273,6 +282,9 @@ void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec
void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec,
unsigned long nr_segs, size_t count);
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
+void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction,
+ const struct folio_queue *folioq,
+ unsigned int first_slot, unsigned int offset, size_t count);
void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray,
loff_t start, size_t count);
ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,