summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorDarrick J. Wong <djwong@kernel.org>2023-07-25 02:12:29 +0300
committerDarrick J. Wong <djwong@kernel.org>2023-07-25 02:12:29 +0300
commitd42bd17c6a20638ddf96862bfc0c47e481c28392 (patch)
tree97191965a56afcb9f875fcc8dfd44a1bec80136f /include
parent6eaae198076080886b9e7d57f4ae06fa782f90ef (diff)
parent5d8edfb900d55db7dbf2dbf325bfb9fdb863ec72 (diff)
downloadlinux-d42bd17c6a20638ddf96862bfc0c47e481c28392.tar.xz
Merge tag 'large-folio-writes' of git://git.infradead.org/users/willy/pagecache into iomap-6.6-merge
Create large folios in iomap buffered write path Commit ebb7fb1557b1 limited the length of ioend chains to 4096 entries to improve worst-case latency. Unfortunately, this had the effect of limiting the performance of: fio -name write-bandwidth -rw=write -bs=1024Ki -size=32Gi -runtime=30 \ -iodepth 1 -ioengine sync -zero_buffers=1 -direct=0 -end_fsync=1 \ -numjobs=4 -directory=/mnt/test https://lore.kernel.org/linux-xfs/20230508172406.1CF3.409509F4@e16-tech.com/ The problem ends up being lock contention on the i_pages spinlock as we clear the writeback bit on each folio (and propagate that up through the tree). By using larger folios, we decrease the number of folios to be processed by a factor of 256 for this benchmark, eliminating the lock contention. Creating large folios in the buffered write path is also the right thing to do. It's a project that has been on the back burner for years, it just hasn't been important enough to do before now. * tag 'large-folio-writes' of git://git.infradead.org/users/willy/pagecache: iomap: Copy larger chunks from userspace iomap: Create large folios in the buffered write path filemap: Allow __filemap_get_folio to allocate large folios filemap: Add fgf_t typedef iomap: Remove unnecessary test from iomap_release_folio() doc: Correct the description of ->release_folio iomap: Remove large folio handling in iomap_invalidate_folio() iov_iter: Add copy_folio_from_iter_atomic() iov_iter: Handle compound highmem pages in copy_page_from_iter_atomic() iov_iter: Map the page later in copy_page_from_iter_atomic() [djwong: yay amortizations!] Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Diffstat (limited to 'include')
-rw-r--r--include/linux/iomap.h2
-rw-r--r--include/linux/pagemap.h82
-rw-r--r--include/linux/uio.h9
3 files changed, 80 insertions, 13 deletions
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index e2b836c2e119..80facb9c9e5b 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -261,7 +261,7 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
-struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos);
+struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 716953ee1ebd..d87840acbfb2 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -470,6 +470,19 @@ static inline void *detach_page_private(struct page *page)
return folio_detach_private(page_folio(page));
}
+/*
+ * There are some parts of the kernel which assume that PMD entries
+ * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then,
+ * limit the maximum allocation order to PMD size. I'm not aware of any
+ * assumptions about maximum order if THP are disabled, but 8 seems like
+ * a good order (that's 1MB if you're using 4kB pages)
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER
+#else
+#define MAX_PAGECACHE_ORDER 8
+#endif
+
#ifdef CONFIG_NUMA
struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order);
#else
@@ -501,22 +514,69 @@ pgoff_t page_cache_next_miss(struct address_space *mapping,
pgoff_t page_cache_prev_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan);
-#define FGP_ACCESSED 0x00000001
-#define FGP_LOCK 0x00000002
-#define FGP_CREAT 0x00000004
-#define FGP_WRITE 0x00000008
-#define FGP_NOFS 0x00000010
-#define FGP_NOWAIT 0x00000020
-#define FGP_FOR_MMAP 0x00000040
-#define FGP_STABLE 0x00000080
+/**
+ * typedef fgf_t - Flags for getting folios from the page cache.
+ *
+ * Most users of the page cache will not need to use these flags;
+ * there are convenience functions such as filemap_get_folio() and
+ * filemap_lock_folio(). For users which need more control over exactly
+ * what is done with the folios, these flags to __filemap_get_folio()
+ * are available.
+ *
+ * * %FGP_ACCESSED - The folio will be marked accessed.
+ * * %FGP_LOCK - The folio is returned locked.
+ * * %FGP_CREAT - If no folio is present then a new folio is allocated,
+ * added to the page cache and the VM's LRU list. The folio is
+ * returned locked.
+ * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
+ * folio is already in cache. If the folio was allocated, unlock it
+ * before returning so the caller can do the same dance.
+ * * %FGP_WRITE - The folio will be written to by the caller.
+ * * %FGP_NOFS - __GFP_FS will get cleared in gfp.
+ * * %FGP_NOWAIT - Don't block on the folio lock.
+ * * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
+ * * %FGP_WRITEBEGIN - The flags to use in a filesystem write_begin()
+ * implementation.
+ */
+typedef unsigned int __bitwise fgf_t;
+
+#define FGP_ACCESSED ((__force fgf_t)0x00000001)
+#define FGP_LOCK ((__force fgf_t)0x00000002)
+#define FGP_CREAT ((__force fgf_t)0x00000004)
+#define FGP_WRITE ((__force fgf_t)0x00000008)
+#define FGP_NOFS ((__force fgf_t)0x00000010)
+#define FGP_NOWAIT ((__force fgf_t)0x00000020)
+#define FGP_FOR_MMAP ((__force fgf_t)0x00000040)
+#define FGP_STABLE ((__force fgf_t)0x00000080)
+#define FGF_GET_ORDER(fgf) (((__force unsigned)fgf) >> 26) /* top 6 bits */
#define FGP_WRITEBEGIN (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)
+/**
+ * fgf_set_order - Encode a length in the fgf_t flags.
+ * @size: The suggested size of the folio to create.
+ *
+ * The caller of __filemap_get_folio() can use this to suggest a preferred
+ * size for the folio that is created. If there is already a folio at
+ * the index, it will be returned, no matter what its size. If a folio
+ * is freshly created, it may be of a different size than requested
+ * due to alignment constraints, memory pressure, or the presence of
+ * other folios at nearby indices.
+ */
+static inline fgf_t fgf_set_order(size_t size)
+{
+ unsigned int shift = ilog2(size);
+
+ if (shift <= PAGE_SHIFT)
+ return 0;
+ return (__force fgf_t)((shift - PAGE_SHIFT) << 26);
+}
+
void *filemap_get_entry(struct address_space *mapping, pgoff_t index);
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
- int fgp_flags, gfp_t gfp);
+ fgf_t fgp_flags, gfp_t gfp);
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
- int fgp_flags, gfp_t gfp);
+ fgf_t fgp_flags, gfp_t gfp);
/**
* filemap_get_folio - Find and get a folio.
@@ -590,7 +650,7 @@ static inline struct page *find_get_page(struct address_space *mapping,
}
static inline struct page *find_get_page_flags(struct address_space *mapping,
- pgoff_t offset, int fgp_flags)
+ pgoff_t offset, fgf_t fgp_flags)
{
return pagecache_get_page(mapping, offset, fgp_flags, 0);
}
diff --git a/include/linux/uio.h b/include/linux/uio.h
index ff81e5ccaef2..42bce38a8e87 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -163,7 +163,7 @@ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs)
return ret;
}
-size_t copy_page_from_iter_atomic(struct page *page, unsigned offset,
+size_t copy_page_from_iter_atomic(struct page *page, size_t offset,
size_t bytes, struct iov_iter *i);
void iov_iter_advance(struct iov_iter *i, size_t bytes);
void iov_iter_revert(struct iov_iter *i, size_t bytes);
@@ -184,6 +184,13 @@ static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset,
{
return copy_page_to_iter(&folio->page, offset, bytes, i);
}
+
+static inline size_t copy_folio_from_iter_atomic(struct folio *folio,
+ size_t offset, size_t bytes, struct iov_iter *i)
+{
+ return copy_page_from_iter_atomic(&folio->page, offset, bytes, i);
+}
+
size_t copy_page_to_iter_nofault(struct page *page, unsigned offset,
size_t bytes, struct iov_iter *i);