-rw-r--r--  Documentation/filesystems/iomap/design.rst   |   4
-rw-r--r--  block/fops.c                                 |   2
-rw-r--r--  fs/gfs2/file.c                               |   2
-rw-r--r--  fs/iomap/buffered-io.c                       | 199
-rw-r--r--  fs/iomap/direct-io.c                         |  42
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.c                |  15
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c                   |   5
-rw-r--r--  fs/xfs/libxfs/xfs_shared.h                   |   3
-rw-r--r--  fs/xfs/xfs_file.c                            |   2
-rw-r--r--  fs/xfs/xfs_icache.c                          |   6
-rw-r--r--  fs/xfs/xfs_iomap.c                           |  19
-rw-r--r--  fs/xfs/xfs_iops.c                            |  12
-rw-r--r--  fs/xfs/xfs_mount.c                           |   8
-rw-r--r--  fs/xfs/xfs_super.c                           |  28
-rw-r--r--  fs/zonefs/file.c                             |   2
-rw-r--r--  include/linux/huge_mm.h                      |  28
-rw-r--r--  include/linux/iomap.h                        |  13
-rw-r--r--  include/linux/pagemap.h                      | 124
-rw-r--r--  mm/filemap.c                                 |  36
-rw-r--r--  mm/huge_memory.c                             |  65
-rw-r--r--  mm/readahead.c                               |  83
21 files changed, 504 insertions(+), 194 deletions(-)
diff --git a/Documentation/filesystems/iomap/design.rst b/Documentation/filesystems/iomap/design.rst
index 37594e1c5914..b0d0188a095e 100644
--- a/Documentation/filesystems/iomap/design.rst
+++ b/Documentation/filesystems/iomap/design.rst
@@ -165,7 +165,7 @@ structure below:
u16 flags;
struct block_device *bdev;
struct dax_device *dax_dev;
- voidw *inline_data;
+ void *inline_data;
void *private;
const struct iomap_folio_ops *folio_ops;
u64 validity_cookie;
@@ -426,7 +426,7 @@ iomap is concerned:
The exact locking requirements are specific to the filesystem; for
certain operations, some of these locks can be elided.
-All further mention of locking are *recommendations*, not mandates.
+All further mentions of locking are *recommendations*, not mandates.
Each filesystem author must figure out the locking for themself.
Bugs and Limitations
diff --git a/block/fops.c b/block/fops.c
index e69deb6d8635..e696ae53bf1e 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -666,7 +666,7 @@ blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from)
{
- return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops);
+ return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL);
}
/*
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 08982937b5df..f7dd64856c9b 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1057,7 +1057,7 @@ retry:
}
pagefault_disable();
- ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+ ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops, NULL);
pagefault_enable();
if (ret > 0)
written += ret;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 9b4ca3811a24..11ea747228ae 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -23,7 +23,6 @@
#define IOEND_BATCH_SIZE 4096
-typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
/*
* Structure allocated for each folio to track per-block uptodate, dirty state
* and I/O completions.
@@ -1022,13 +1021,14 @@ retry:
ssize_t
iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
- const struct iomap_ops *ops)
+ const struct iomap_ops *ops, void *private)
{
struct iomap_iter iter = {
.inode = iocb->ki_filp->f_mapping->host,
.pos = iocb->ki_pos,
.len = iov_iter_count(i),
.flags = IOMAP_WRITE,
+ .private = private,
};
ssize_t ret;
@@ -1046,15 +1046,14 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
-static int iomap_write_delalloc_ifs_punch(struct inode *inode,
+static void iomap_write_delalloc_ifs_punch(struct inode *inode,
struct folio *folio, loff_t start_byte, loff_t end_byte,
- iomap_punch_t punch)
+ struct iomap *iomap, iomap_punch_t punch)
{
unsigned int first_blk, last_blk, i;
loff_t last_byte;
u8 blkbits = inode->i_blkbits;
struct iomap_folio_state *ifs;
- int ret = 0;
/*
* When we have per-block dirty tracking, there can be
@@ -1064,47 +1063,35 @@ static int iomap_write_delalloc_ifs_punch(struct inode *inode,
*/
ifs = folio->private;
if (!ifs)
- return ret;
+ return;
last_byte = min_t(loff_t, end_byte - 1,
folio_pos(folio) + folio_size(folio) - 1);
first_blk = offset_in_folio(folio, start_byte) >> blkbits;
last_blk = offset_in_folio(folio, last_byte) >> blkbits;
for (i = first_blk; i <= last_blk; i++) {
- if (!ifs_block_is_dirty(folio, ifs, i)) {
- ret = punch(inode, folio_pos(folio) + (i << blkbits),
- 1 << blkbits);
- if (ret)
- return ret;
- }
+ if (!ifs_block_is_dirty(folio, ifs, i))
+ punch(inode, folio_pos(folio) + (i << blkbits),
+ 1 << blkbits, iomap);
}
-
- return ret;
}
-
-static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
+static void iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
- iomap_punch_t punch)
+ struct iomap *iomap, iomap_punch_t punch)
{
- int ret = 0;
-
if (!folio_test_dirty(folio))
- return ret;
+ return;
/* if dirty, punch up to offset */
if (start_byte > *punch_start_byte) {
- ret = punch(inode, *punch_start_byte,
- start_byte - *punch_start_byte);
- if (ret)
- return ret;
+ punch(inode, *punch_start_byte, start_byte - *punch_start_byte,
+ iomap);
}
/* Punch non-dirty blocks within folio */
- ret = iomap_write_delalloc_ifs_punch(inode, folio, start_byte,
- end_byte, punch);
- if (ret)
- return ret;
+ iomap_write_delalloc_ifs_punch(inode, folio, start_byte, end_byte,
+ iomap, punch);
/*
* Make sure the next punch start is correctly bound to
@@ -1112,8 +1099,6 @@ static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
*/
*punch_start_byte = min_t(loff_t, end_byte,
folio_pos(folio) + folio_size(folio));
-
- return ret;
}
/*
@@ -1133,13 +1118,12 @@ static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
* This function uses [start_byte, end_byte) intervals (i.e. open ended) to
* simplify range iterations.
*/
-static int iomap_write_delalloc_scan(struct inode *inode,
+static void iomap_write_delalloc_scan(struct inode *inode,
loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
- iomap_punch_t punch)
+ struct iomap *iomap, iomap_punch_t punch)
{
while (start_byte < end_byte) {
struct folio *folio;
- int ret;
/* grab locked page */
folio = filemap_lock_folio(inode->i_mapping,
@@ -1150,20 +1134,14 @@ static int iomap_write_delalloc_scan(struct inode *inode,
continue;
}
- ret = iomap_write_delalloc_punch(inode, folio, punch_start_byte,
- start_byte, end_byte, punch);
- if (ret) {
- folio_unlock(folio);
- folio_put(folio);
- return ret;
- }
+ iomap_write_delalloc_punch(inode, folio, punch_start_byte,
+ start_byte, end_byte, iomap, punch);
/* move offset to start of next folio in range */
start_byte = folio_next_index(folio) << PAGE_SHIFT;
folio_unlock(folio);
folio_put(folio);
}
- return 0;
}
/*
@@ -1199,12 +1177,12 @@ static int iomap_write_delalloc_scan(struct inode *inode,
* require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose
* the code to subtle off-by-one bugs....
*/
-static int iomap_write_delalloc_release(struct inode *inode,
- loff_t start_byte, loff_t end_byte, iomap_punch_t punch)
+static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
+ loff_t end_byte, unsigned flags, struct iomap *iomap,
+ iomap_punch_t punch)
{
loff_t punch_start_byte = start_byte;
loff_t scan_end_byte = min(i_size_read(inode), end_byte);
- int error = 0;
/*
* Lock the mapping to avoid races with page faults re-instantiating
@@ -1221,13 +1199,15 @@ static int iomap_write_delalloc_release(struct inode *inode,
/*
* If there is no more data to scan, all that is left is to
* punch out the remaining range.
+ *
+ * Note that mapping_seek_hole_data is only supposed to return
+ * either an offset or -ENXIO, so WARN on any other error as
+ * that would be an API change without updating the callers.
*/
if (start_byte == -ENXIO || start_byte == scan_end_byte)
break;
- if (start_byte < 0) {
- error = start_byte;
+ if (WARN_ON_ONCE(start_byte < 0))
goto out_unlock;
- }
WARN_ON_ONCE(start_byte < punch_start_byte);
WARN_ON_ONCE(start_byte > scan_end_byte);
@@ -1237,28 +1217,31 @@ static int iomap_write_delalloc_release(struct inode *inode,
*/
data_end = mapping_seek_hole_data(inode->i_mapping, start_byte,
scan_end_byte, SEEK_HOLE);
- if (data_end < 0) {
- error = data_end;
+ if (WARN_ON_ONCE(data_end < 0))
goto out_unlock;
- }
- WARN_ON_ONCE(data_end <= start_byte);
+
+ /*
+ * If we race with post-direct I/O invalidation of the page cache,
+ * there might be no data left at start_byte.
+ */
+ if (data_end == start_byte)
+ continue;
+
+ WARN_ON_ONCE(data_end < start_byte);
WARN_ON_ONCE(data_end > scan_end_byte);
- error = iomap_write_delalloc_scan(inode, &punch_start_byte,
- start_byte, data_end, punch);
- if (error)
- goto out_unlock;
+ iomap_write_delalloc_scan(inode, &punch_start_byte, start_byte,
+ data_end, iomap, punch);
/* The next data search starts at the end of this one. */
start_byte = data_end;
}
if (punch_start_byte < end_byte)
- error = punch(inode, punch_start_byte,
- end_byte - punch_start_byte);
+ punch(inode, punch_start_byte, end_byte - punch_start_byte,
+ iomap);
out_unlock:
filemap_invalidate_unlock(inode->i_mapping);
- return error;
}
/*
@@ -1291,20 +1274,20 @@ out_unlock:
* ->punch
* internal filesystem allocation lock
*/
-int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
- struct iomap *iomap, loff_t pos, loff_t length,
- ssize_t written, iomap_punch_t punch)
+void iomap_file_buffered_write_punch_delalloc(struct inode *inode,
+ loff_t pos, loff_t length, ssize_t written, unsigned flags,
+ struct iomap *iomap, iomap_punch_t punch)
{
loff_t start_byte;
loff_t end_byte;
unsigned int blocksize = i_blocksize(inode);
if (iomap->type != IOMAP_DELALLOC)
- return 0;
+ return;
/* If we didn't reserve the blocks, we're not allowed to punch them. */
if (!(iomap->flags & IOMAP_F_NEW))
- return 0;
+ return;
/*
* start_byte refers to the first unused block after a short write. If
@@ -1319,26 +1302,35 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
/* Nothing to do if we've written the entire delalloc extent */
if (start_byte >= end_byte)
- return 0;
+ return;
- return iomap_write_delalloc_release(inode, start_byte, end_byte,
- punch);
+ iomap_write_delalloc_release(inode, start_byte, end_byte, flags, iomap,
+ punch);
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc);
static loff_t iomap_unshare_iter(struct iomap_iter *iter)
{
struct iomap *iomap = &iter->iomap;
- const struct iomap *srcmap = iomap_iter_srcmap(iter);
loff_t pos = iter->pos;
loff_t length = iomap_length(iter);
loff_t written = 0;
- /* don't bother with blocks that are not shared to start with */
+ /* Don't bother with blocks that are not shared to start with. */
if (!(iomap->flags & IOMAP_F_SHARED))
return length;
- /* don't bother with holes or unwritten extents */
- if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
+
+ /*
+ * Don't bother with holes or unwritten extents.
+ *
+ * Note that we use srcmap directly instead of iomap_iter_srcmap as
+ * unsharing requires providing a separate source map, and the presence
+ * of one is a good indicator that unsharing is needed, unlike
+ * IOMAP_F_SHARED which can be set for any data that goes into the COW
+ * fork for XFS.
+ */
+ if (iter->srcmap.type == IOMAP_HOLE ||
+ iter->srcmap.type == IOMAP_UNWRITTEN)
return length;
do {
@@ -1393,16 +1385,53 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
}
EXPORT_SYMBOL_GPL(iomap_file_unshare);
-static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
+/*
+ * Flush the remaining range of the iter and mark the current mapping stale.
+ * This is used when zero range sees an unwritten mapping that may have had
+ * dirty pagecache over it.
+ */
+static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i)
+{
+ struct address_space *mapping = i->inode->i_mapping;
+ loff_t end = i->pos + i->len - 1;
+
+ i->iomap.flags |= IOMAP_F_STALE;
+ return filemap_write_and_wait_range(mapping, i->pos, end);
+}
+
+static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
+ bool *range_dirty)
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
loff_t pos = iter->pos;
loff_t length = iomap_length(iter);
loff_t written = 0;
- /* already zeroed? we're done. */
- if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
+ /*
+ * We must zero subranges of unwritten mappings that might be dirty in
+ * pagecache from previous writes. We only know whether the entire range
+ * was clean or not, however, and dirty folios may have been written
+ * back or reclaimed at any point after mapping lookup.
+ *
+ * The easiest way to deal with this is to flush pagecache to trigger
+ * any pending unwritten conversions and then grab the updated extents
+ * from the fs. The flush may change the current mapping, so mark it
+ * stale for the iterator to remap it for the next pass to handle
+ * properly.
+ *
+ * Note that holes are treated the same as unwritten because zero range
+ * is (ab)used for partial folio zeroing in some cases. Hole backed
+ * post-eof ranges can be dirtied via mapped write and the flush
+ * triggers writeback time post-eof zeroing.
+ */
+ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) {
+ if (*range_dirty) {
+ *range_dirty = false;
+ return iomap_zero_iter_flush_and_stale(iter);
+ }
+ /* range is clean and already zeroed, nothing to do */
return length;
+ }
do {
struct folio *folio;
@@ -1450,9 +1479,27 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
.flags = IOMAP_ZERO,
};
int ret;
+ bool range_dirty;
+
+ /*
+ * Zero range wants to skip pre-zeroed (i.e. unwritten) mappings, but
+ * pagecache must be flushed to ensure stale data from previous
+ * buffered writes is not exposed. A flush is only required for certain
+ * types of mappings, but checking pagecache after mapping lookup is
+ * racy with writeback and reclaim.
+ *
+ * Therefore, check the entire range first and pass along whether any
+ * part of it is dirty. If so and an underlying mapping warrants it,
+ * flush the cache at that point. This trades off the occasional false
+ * positive (and spurious flush, if the dirty data and mapping don't
+ * happen to overlap) for simplicity in handling a relatively uncommon
+ * situation.
+ */
+ range_dirty = filemap_range_needs_writeback(inode->i_mapping,
+ pos, pos + len - 1);
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_zero_iter(&iter, did_zero);
+ iter.processed = iomap_zero_iter(&iter, did_zero, &range_dirty);
return ret;
}
EXPORT_SYMBOL_GPL(iomap_zero_range);
@@ -2007,10 +2054,10 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
}
EXPORT_SYMBOL_GPL(iomap_writepages);
-static int __init iomap_init(void)
+static int __init iomap_buffered_init(void)
{
return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
offsetof(struct iomap_ioend, io_bio),
BIOSET_NEED_BVECS);
}
-fs_initcall(iomap_init);
+fs_initcall(iomap_buffered_init);
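The ifs punch loop above turns a byte range into a per-folio block range before testing each block's dirty bit. A minimal userspace sketch of that index arithmetic follows; the folio position, folio size, block size and punch range are all illustrative assumptions, not values taken from the patch:

#include <assert.h>
#include <stdio.h>

/*
 * Userspace sketch of the first_blk/last_blk computation in
 * iomap_write_delalloc_ifs_punch().  The folio position/size, block size
 * and byte range below are assumptions for illustration, not kernel state.
 */
int main(void)
{
        const unsigned int blkbits = 12;                /* 4 KiB blocks */
        const long long folio_pos = 16384;              /* second 16 KiB folio */
        const long long folio_size = 16384;
        long long start_byte = 25000;                   /* punch [25000, 65536) */
        long long end_byte = 65536;

        long long last_byte = end_byte - 1;
        if (last_byte > folio_pos + folio_size - 1)
                last_byte = folio_pos + folio_size - 1; /* clamp to this folio */

        unsigned int first_blk = (start_byte - folio_pos) >> blkbits;
        unsigned int last_blk = (last_byte - folio_pos) >> blkbits;

        /* only blocks 2 and 3 of this folio intersect the punch range */
        printf("candidate blocks %u..%u\n", first_blk, last_blk);
        assert(first_blk == 2 && last_blk == 3);
        return 0;
}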
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index f3b43d223a46..f637aa0706a3 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -27,6 +27,13 @@
#define IOMAP_DIO_WRITE (1U << 30)
#define IOMAP_DIO_DIRTY (1U << 31)
+/*
+ * Used for sub block zeroing in iomap_dio_zero()
+ */
+#define IOMAP_ZERO_PAGE_SIZE (SZ_64K)
+#define IOMAP_ZERO_PAGE_ORDER (get_order(IOMAP_ZERO_PAGE_SIZE))
+static struct page *zero_page;
+
struct iomap_dio {
struct kiocb *iocb;
const struct iomap_dio_ops *dops;
@@ -232,13 +239,20 @@ release_bio:
}
EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);
-static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
+static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
loff_t pos, unsigned len)
{
struct inode *inode = file_inode(dio->iocb->ki_filp);
- struct page *page = ZERO_PAGE(0);
struct bio *bio;
+ if (!len)
+ return 0;
+ /*
+ * Max block size supported is 64k
+ */
+ if (WARN_ON_ONCE(len > IOMAP_ZERO_PAGE_SIZE))
+ return -EINVAL;
+
bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
GFP_KERNEL);
@@ -246,8 +260,9 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
- __bio_add_page(bio, page, len, 0);
+ __bio_add_page(bio, zero_page, len, 0);
iomap_dio_submit_bio(iter, dio, bio, pos);
+ return 0;
}
/*
@@ -356,8 +371,10 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
if (need_zeroout) {
/* zero out from the start of the block to the write offset */
pad = pos & (fs_block_size - 1);
- if (pad)
- iomap_dio_zero(iter, dio, pos - pad, pad);
+
+ ret = iomap_dio_zero(iter, dio, pos - pad, pad);
+ if (ret)
+ goto out;
}
/*
@@ -431,7 +448,8 @@ zero_tail:
/* zero out from the end of the write to the end of the block */
pad = pos & (fs_block_size - 1);
if (pad)
- iomap_dio_zero(iter, dio, pos, fs_block_size - pad);
+ ret = iomap_dio_zero(iter, dio, pos,
+ fs_block_size - pad);
}
out:
/* Undo iter limitation to current extent */
@@ -753,3 +771,15 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
return iomap_dio_complete(dio);
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);
+
+static int __init iomap_dio_init(void)
+{
+ zero_page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
+ IOMAP_ZERO_PAGE_ORDER);
+
+ if (!zero_page)
+ return -ENOMEM;
+
+ return 0;
+}
+fs_initcall(iomap_dio_init);
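The direct I/O change above switches sub-block zeroing from ZERO_PAGE(0) to a dedicated 64 KiB zero page, so a single bio can pad a filesystem block that is larger than the CPU page. A rough userspace sketch of the size-to-order relationship behind IOMAP_ZERO_PAGE_ORDER = get_order(IOMAP_ZERO_PAGE_SIZE), assuming 4 KiB pages:

#include <assert.h>
#include <stdio.h>

/*
 * Sketch of get_order(): the smallest order such that
 * (page_size << order) >= size.  The 4096-byte page size is an
 * assumption here; the kernel derives PAGE_SIZE per architecture.
 */
static unsigned int order_for_size(unsigned long size, unsigned long page_size)
{
        unsigned int order = 0;

        while ((page_size << order) < size)
                order++;
        return order;
}

int main(void)
{
        unsigned long zero_page_size = 64 * 1024;       /* IOMAP_ZERO_PAGE_SIZE */

        /* 64 KiB needs a 2^4 = 16 page allocation with 4 KiB pages */
        assert(order_for_size(zero_page_size, 4096) == 4);
        printf("order = %u\n", order_for_size(zero_page_size, 4096));
        return 0;
}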
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 6aaec1246c95..e50d913ad32f 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -1138,10 +1138,7 @@ xfs_attr3_leaf_to_shortform(
trace_xfs_attr_leaf_to_sf(args);
- tmpbuffer = kmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL);
- if (!tmpbuffer)
- return -ENOMEM;
-
+ tmpbuffer = kvmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL);
memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
leaf = (xfs_attr_leafblock_t *)tmpbuffer;
@@ -1205,7 +1202,7 @@ xfs_attr3_leaf_to_shortform(
error = 0;
out:
- kfree(tmpbuffer);
+ kvfree(tmpbuffer);
return error;
}
@@ -1613,7 +1610,7 @@ xfs_attr3_leaf_compact(
trace_xfs_attr_leaf_compact(args);
- tmpbuffer = kmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL);
+ tmpbuffer = kvmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL);
memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
memset(bp->b_addr, 0, args->geo->blksize);
leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
@@ -1651,7 +1648,7 @@ xfs_attr3_leaf_compact(
*/
xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1);
- kfree(tmpbuffer);
+ kvfree(tmpbuffer);
}
/*
@@ -2330,7 +2327,7 @@ xfs_attr3_leaf_unbalance(
struct xfs_attr_leafblock *tmp_leaf;
struct xfs_attr3_icleaf_hdr tmphdr;
- tmp_leaf = kzalloc(state->args->geo->blksize,
+ tmp_leaf = kvzalloc(state->args->geo->blksize,
GFP_KERNEL | __GFP_NOFAIL);
/*
@@ -2371,7 +2368,7 @@ xfs_attr3_leaf_unbalance(
}
memcpy(save_leaf, tmp_leaf, state->args->geo->blksize);
savehdr = tmphdr; /* struct copy */
- kfree(tmp_leaf);
+ kvfree(tmp_leaf);
}
xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr);
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 20bb5ce38134..271855227514 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -3034,6 +3034,11 @@ xfs_ialloc_setup_geometry(
igeo->ialloc_align = mp->m_dalign;
else
igeo->ialloc_align = 0;
+
+ if (mp->m_sb.sb_blocksize > PAGE_SIZE)
+ igeo->min_folio_order = mp->m_sb.sb_blocklog - PAGE_SHIFT;
+ else
+ igeo->min_folio_order = 0;
}
/* Compute the location of the root directory inode that is laid out by mkfs. */
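The new min_folio_order field above is derived directly from the superblock block size: zero when a block fits in a page, otherwise sb_blocklog - PAGE_SHIFT. A small sketch of that rule for a few block sizes, assuming 4 KiB pages (PAGE_SHIFT = 12):

#include <stdio.h>

/*
 * Sketch of the min_folio_order rule set up in xfs_ialloc_setup_geometry():
 * block sizes at or below the page size need no minimum folio order.
 * The page shift and block sizes are assumptions for illustration.
 */
int main(void)
{
        const unsigned int page_shift = 12;             /* assumed 4 KiB pages */
        unsigned int blocklogs[] = { 10, 12, 14, 16 };  /* 1k, 4k, 16k, 64k blocks */

        for (unsigned int i = 0; i < 4; i++) {
                unsigned int blocklog = blocklogs[i];
                unsigned int min_order =
                        blocklog > page_shift ? blocklog - page_shift : 0;

                printf("block size %6u -> min_folio_order %u\n",
                       1u << blocklog, min_order);
        }
        return 0;
}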
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 2f7413afbf46..33b84a3a83ff 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -224,6 +224,9 @@ struct xfs_ino_geometry {
/* precomputed value for di_flags2 */
uint64_t new_diflags2;
+ /* minimum folio order of a page cache allocation */
+ unsigned int min_folio_order;
+
};
#endif /* __XFS_SHARED_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e97d789495a5..412b1d71b52b 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -760,7 +760,7 @@ write_retry:
trace_xfs_file_buffered_write(iocb, from);
ret = iomap_file_buffered_write(iocb, from,
- &xfs_buffered_write_iomap_ops);
+ &xfs_buffered_write_iomap_ops, NULL);
/*
* If we hit a space limit, try to free up some lingering preallocated
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 20d9924f28c2..a680e5b82672 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -100,7 +100,8 @@ xfs_inode_alloc(
/* VFS doesn't initialise i_mode! */
VFS_I(ip)->i_mode = 0;
- mapping_set_large_folios(VFS_I(ip)->i_mapping);
+ mapping_set_folio_min_order(VFS_I(ip)->i_mapping,
+ M_IGEO(mp)->min_folio_order);
XFS_STATS_INC(mp, vn_active);
ASSERT(atomic_read(&ip->i_pincount) == 0);
@@ -360,7 +361,8 @@ xfs_reinit_inode(
inode->i_uid = uid;
inode->i_gid = gid;
inode->i_state = state;
- mapping_set_large_folios(inode->i_mapping);
+ mapping_set_folio_min_order(inode->i_mapping,
+ M_IGEO(mp)->min_folio_order);
return error;
}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 72c981e3dc92..1e11f48814c0 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1208,14 +1208,14 @@ out_unlock:
return error;
}
-static int
+static void
xfs_buffered_write_delalloc_punch(
struct inode *inode,
loff_t offset,
- loff_t length)
+ loff_t length,
+ struct iomap *iomap)
{
xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, offset + length);
- return 0;
}
static int
@@ -1227,17 +1227,8 @@ xfs_buffered_write_iomap_end(
unsigned flags,
struct iomap *iomap)
{
-
- struct xfs_mount *mp = XFS_M(inode->i_sb);
- int error;
-
- error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset,
- length, written, &xfs_buffered_write_delalloc_punch);
- if (error && !xfs_is_shutdown(mp)) {
- xfs_alert(mp, "%s: unable to clean up ino 0x%llx",
- __func__, XFS_I(inode)->i_ino);
- return error;
- }
+ iomap_file_buffered_write_punch_delalloc(inode, offset, length, written,
+ flags, iomap, &xfs_buffered_write_delalloc_punch);
return 0;
}
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 1cdc8034f54d..ee79cf161312 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -567,7 +567,7 @@ xfs_stat_blksize(
return 1U << mp->m_allocsize_log;
}
- return PAGE_SIZE;
+ return max_t(uint32_t, PAGE_SIZE, mp->m_sb.sb_blocksize);
}
STATIC int
@@ -870,16 +870,6 @@ xfs_setattr_size(
error = xfs_zero_range(ip, oldsize, newsize - oldsize,
&did_zeroing);
} else {
- /*
- * iomap won't detect a dirty page over an unwritten block (or a
- * cow block over a hole) and subsequently skips zeroing the
- * newly post-EOF portion of the page. Flush the new EOF to
- * convert the block before the pagecache truncate.
- */
- error = filemap_write_and_wait_range(inode->i_mapping, newsize,
- newsize);
- if (error)
- return error;
error = xfs_truncate_page(ip, newsize, &did_zeroing);
}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 460f93a9ce00..1fdd79c5bfa0 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -132,11 +132,15 @@ xfs_sb_validate_fsb_count(
xfs_sb_t *sbp,
uint64_t nblocks)
{
- ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
+ uint64_t max_bytes;
+
ASSERT(sbp->sb_blocklog >= BBSHIFT);
+ if (check_shl_overflow(nblocks, sbp->sb_blocklog, &max_bytes))
+ return -EFBIG;
+
/* Limited by ULONG_MAX of page cache index */
- if (nblocks >> (PAGE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
+ if (max_bytes >> PAGE_SHIFT > ULONG_MAX)
return -EFBIG;
return 0;
}
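The rewritten check above first guards the nblocks << sb_blocklog shift against 64-bit overflow and only then applies the page cache index limit. A hedged userspace sketch of the same two-step validation, using a compiler builtin in place of the kernel's check_shl_overflow() and an assumed PAGE_SHIFT of 12:

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Sketch of the validation order in xfs_sb_validate_fsb_count(): reject if
 * nblocks shifted by blocklog overflows 64 bits, then reject if the byte
 * count exceeds what a PAGE_SHIFT-indexed page cache can address.
 */
static int validate_fsb_count(uint64_t nblocks, unsigned int blocklog)
{
        const unsigned int page_shift = 12;     /* assumed 4 KiB pages */
        uint64_t max_bytes;

        if (__builtin_mul_overflow(nblocks, (uint64_t)1 << blocklog, &max_bytes))
                return -1;                      /* -EFBIG in the kernel */

        if (max_bytes >> page_shift > ULONG_MAX)
                return -1;
        return 0;
}

int main(void)
{
        printf("%d\n", validate_fsb_count(1ULL << 40, 16));    /* 64k blocks, fits */
        printf("%d\n", validate_fsb_count(UINT64_MAX, 16));    /* shift overflows */
        return 0;
}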
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 26767745d9fd..fbb3a1594c0d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1638,16 +1638,28 @@ xfs_fs_fill_super(
goto out_free_sb;
}
- /*
- * Until this is fixed only page-sized or smaller data blocks work.
- */
if (mp->m_sb.sb_blocksize > PAGE_SIZE) {
- xfs_warn(mp,
- "File system with blocksize %d bytes. "
- "Only pagesize (%ld) or less will currently work.",
+ size_t max_folio_size = mapping_max_folio_size_supported();
+
+ if (!xfs_has_crc(mp)) {
+ xfs_warn(mp,
+"V4 Filesystem with blocksize %d bytes. Only pagesize (%ld) or less is supported.",
mp->m_sb.sb_blocksize, PAGE_SIZE);
- error = -ENOSYS;
- goto out_free_sb;
+ error = -ENOSYS;
+ goto out_free_sb;
+ }
+
+ if (mp->m_sb.sb_blocksize > max_folio_size) {
+ xfs_warn(mp,
+"block size (%u bytes) not supported; Only block size (%zu) or less is supported",
+ mp->m_sb.sb_blocksize, max_folio_size);
+ error = -ENOSYS;
+ goto out_free_sb;
+ }
+
+ xfs_warn(mp,
+"EXPERIMENTAL: V5 Filesystem with Large Block Size (%d bytes) enabled.",
+ mp->m_sb.sb_blocksize);
}
/* Ensure this filesystem fits in the page cache limits */
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 3b103715acc9..35166c92420c 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -563,7 +563,7 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
if (ret <= 0)
goto inode_unlock;
- ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops);
+ ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops, NULL);
if (ret == -EIO)
zonefs_io_error(inode, true);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e25d9ebfdf89..7a570e0437c9 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -96,6 +96,8 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
#define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \
(!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order)))
+#define split_folio(f) split_folio_to_list(f, NULL)
+
#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
#define HPAGE_PMD_SHIFT PMD_SHIFT
#define HPAGE_PUD_SHIFT PUD_SHIFT
@@ -317,9 +319,24 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add
bool can_split_folio(struct folio *folio, int *pextra_pins);
int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
unsigned int new_order);
+int min_order_for_split(struct folio *folio);
+int split_folio_to_list(struct folio *folio, struct list_head *list);
static inline int split_huge_page(struct page *page)
{
- return split_huge_page_to_list_to_order(page, NULL, 0);
+ struct folio *folio = page_folio(page);
+ int ret = min_order_for_split(folio);
+
+ if (ret < 0)
+ return ret;
+
+ /*
+ * split_huge_page() locks the page before splitting and
+ * expects the same page that has been split to be locked when
+ * returned. split_folio(page_folio(page)) cannot be used here
+ * because it converts the page to folio and passes the head
+ * page to be split.
+ */
+ return split_huge_page_to_list_to_order(page, NULL, ret);
}
void deferred_split_folio(struct folio *folio);
@@ -484,6 +501,12 @@ static inline int split_huge_page(struct page *page)
{
return 0;
}
+
+static inline int split_folio_to_list(struct folio *folio, struct list_head *list)
+{
+ return 0;
+}
+
static inline void deferred_split_folio(struct folio *folio) {}
#define split_huge_pmd(__vma, __pmd, __address) \
do { } while (0)
@@ -598,7 +621,4 @@ static inline int split_folio_to_order(struct folio *folio, int new_order)
return split_folio_to_list_to_order(folio, NULL, new_order);
}
-#define split_folio_to_list(f, l) split_folio_to_list_to_order(f, l, 0)
-#define split_folio(f) split_folio_to_order(f, 0)
-
#endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 6fc1c858013d..4ad12a3c8bae 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -257,11 +257,7 @@ static inline const struct iomap *iomap_iter_srcmap(const struct iomap_iter *i)
}
ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
- const struct iomap_ops *ops);
-int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
- struct iomap *iomap, loff_t pos, loff_t length, ssize_t written,
- int (*punch)(struct inode *inode, loff_t pos, loff_t length));
-
+ const struct iomap_ops *ops, void *private);
int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
@@ -277,6 +273,13 @@ int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
const struct iomap_ops *ops);
vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf,
const struct iomap_ops *ops);
+
+typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length,
+ struct iomap *iomap);
+void iomap_file_buffered_write_punch_delalloc(struct inode *inode, loff_t pos,
+ loff_t length, ssize_t written, unsigned flag,
+ struct iomap *iomap, iomap_punch_t punch);
+
int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len, const struct iomap_ops *ops);
loff_t iomap_seek_hole(struct inode *inode, loff_t offset,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e39c3a7ce33c..68a5f1ff3301 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -206,14 +206,21 @@ enum mapping_flags {
AS_EXITING = 4, /* final truncate in progress */
/* writeback related tags are not used */
AS_NO_WRITEBACK_TAGS = 5,
- AS_LARGE_FOLIO_SUPPORT = 6,
- AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */
- AS_STABLE_WRITES, /* must wait for writeback before modifying
+ AS_RELEASE_ALWAYS = 6, /* Call ->release_folio(), even if no private data */
+ AS_STABLE_WRITES = 7, /* must wait for writeback before modifying
folio contents */
- AS_INACCESSIBLE, /* Do not attempt direct R/W access to the mapping,
- including to move the mapping */
+ AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */
+ /* Bits 16-25 are used for FOLIO_ORDER */
+ AS_FOLIO_ORDER_BITS = 5,
+ AS_FOLIO_ORDER_MIN = 16,
+ AS_FOLIO_ORDER_MAX = AS_FOLIO_ORDER_MIN + AS_FOLIO_ORDER_BITS,
};
+#define AS_FOLIO_ORDER_BITS_MASK ((1u << AS_FOLIO_ORDER_BITS) - 1)
+#define AS_FOLIO_ORDER_MIN_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MIN)
+#define AS_FOLIO_ORDER_MAX_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MAX)
+#define AS_FOLIO_ORDER_MASK (AS_FOLIO_ORDER_MIN_MASK | AS_FOLIO_ORDER_MAX_MASK)
+
/**
* mapping_set_error - record a writeback error in the address_space
* @mapping: the mapping in which an error should be set
@@ -369,9 +376,64 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
#define MAX_XAS_ORDER (XA_CHUNK_SHIFT * 2 - 1)
#define MAX_PAGECACHE_ORDER min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER)
+/*
+ * mapping_max_folio_size_supported() - Check the max folio size supported
+ *
+ * The filesystem should call this function at mount time if there is a
+ * requirement on the folio mapping size in the page cache.
+ */
+static inline size_t mapping_max_folio_size_supported(void)
+{
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ return 1U << (PAGE_SHIFT + MAX_PAGECACHE_ORDER);
+ return PAGE_SIZE;
+}
+
+/*
+ * mapping_set_folio_order_range() - Set the orders supported by a file.
+ * @mapping: The address space of the file.
+ * @min: Minimum folio order (between 0-MAX_PAGECACHE_ORDER inclusive).
+ * @max: Maximum folio order (between @min-MAX_PAGECACHE_ORDER inclusive).
+ *
+ * The filesystem should call this function in its inode constructor to
+ * indicate which base size (min) and maximum size (max) of folio the VFS
+ * can use to cache the contents of the file. This should only be used
+ * if the filesystem needs special handling of folio sizes (ie there is
+ * something the core cannot know).
+ * Do not tune it based on, eg, i_size.
+ *
+ * Context: This should not be called while the inode is active as it
+ * is non-atomic.
+ */
+static inline void mapping_set_folio_order_range(struct address_space *mapping,
+ unsigned int min,
+ unsigned int max)
+{
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ return;
+
+ if (min > MAX_PAGECACHE_ORDER)
+ min = MAX_PAGECACHE_ORDER;
+
+ if (max > MAX_PAGECACHE_ORDER)
+ max = MAX_PAGECACHE_ORDER;
+
+ if (max < min)
+ max = min;
+
+ mapping->flags = (mapping->flags & ~AS_FOLIO_ORDER_MASK) |
+ (min << AS_FOLIO_ORDER_MIN) | (max << AS_FOLIO_ORDER_MAX);
+}
+
+static inline void mapping_set_folio_min_order(struct address_space *mapping,
+ unsigned int min)
+{
+ mapping_set_folio_order_range(mapping, min, MAX_PAGECACHE_ORDER);
+}
+
/**
* mapping_set_large_folios() - Indicate the file supports large folios.
- * @mapping: The file.
+ * @mapping: The address space of the file.
*
* The filesystem should call this function in its inode constructor to
* indicate that the VFS can use large folios to cache the contents of
@@ -382,7 +444,44 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
*/
static inline void mapping_set_large_folios(struct address_space *mapping)
{
- __set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags);
+ mapping_set_folio_order_range(mapping, 0, MAX_PAGECACHE_ORDER);
+}
+
+static inline unsigned int
+mapping_max_folio_order(const struct address_space *mapping)
+{
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ return 0;
+ return (mapping->flags & AS_FOLIO_ORDER_MAX_MASK) >> AS_FOLIO_ORDER_MAX;
+}
+
+static inline unsigned int
+mapping_min_folio_order(const struct address_space *mapping)
+{
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ return 0;
+ return (mapping->flags & AS_FOLIO_ORDER_MIN_MASK) >> AS_FOLIO_ORDER_MIN;
+}
+
+static inline unsigned long
+mapping_min_folio_nrpages(struct address_space *mapping)
+{
+ return 1UL << mapping_min_folio_order(mapping);
+}
+
+/**
+ * mapping_align_index() - Align index for this mapping.
+ * @mapping: The address_space.
+ * @index: The page index.
+ *
+ * The index of a folio must be naturally aligned. If you are adding a
+ * new folio to the page cache and need to know what index to give it,
+ * call this function.
+ */
+static inline pgoff_t mapping_align_index(struct address_space *mapping,
+ pgoff_t index)
+{
+ return round_down(index, mapping_min_folio_nrpages(mapping));
}
/*
@@ -391,20 +490,17 @@ static inline void mapping_set_large_folios(struct address_space *mapping)
*/
static inline bool mapping_large_folio_support(struct address_space *mapping)
{
- /* AS_LARGE_FOLIO_SUPPORT is only reasonable for pagecache folios */
+ /* AS_FOLIO_ORDER is only reasonable for pagecache folios */
VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON,
"Anonymous mapping always supports large folio");
- return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
- test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags);
+ return mapping_max_folio_order(mapping) > 0;
}
/* Return the maximum folio size for this pagecache mapping, in bytes. */
-static inline size_t mapping_max_folio_size(struct address_space *mapping)
+static inline size_t mapping_max_folio_size(const struct address_space *mapping)
{
- if (mapping_large_folio_support(mapping))
- return PAGE_SIZE << MAX_PAGECACHE_ORDER;
- return PAGE_SIZE;
+ return PAGE_SIZE << mapping_max_folio_order(mapping);
}
static inline int filemap_nr_thps(struct address_space *mapping)
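The new flag layout above packs a minimum and a maximum folio order into bits 16-25 of mapping->flags, and mapping_align_index() then rounds page indices down to the minimum-order boundary. A userspace sketch of the packing, unpacking and alignment arithmetic follows; the MAX_PAGECACHE_ORDER clamping and the real address_space structure are left out, and the example orders are assumptions:

#include <assert.h>
#include <stdio.h>

#define FOLIO_ORDER_BITS        5
#define FOLIO_ORDER_MIN_SHIFT   16
#define FOLIO_ORDER_MAX_SHIFT   (FOLIO_ORDER_MIN_SHIFT + FOLIO_ORDER_BITS)
#define FOLIO_ORDER_BITS_MASK   ((1ul << FOLIO_ORDER_BITS) - 1)
#define FOLIO_ORDER_MIN_MASK    (FOLIO_ORDER_BITS_MASK << FOLIO_ORDER_MIN_SHIFT)
#define FOLIO_ORDER_MAX_MASK    (FOLIO_ORDER_BITS_MASK << FOLIO_ORDER_MAX_SHIFT)
#define FOLIO_ORDER_MASK        (FOLIO_ORDER_MIN_MASK | FOLIO_ORDER_MAX_MASK)

/* Pack a [min, max] order range into a flags word, roughly as
 * mapping_set_folio_order_range() does (clamping omitted). */
static unsigned long set_order_range(unsigned long flags,
                                     unsigned int min, unsigned int max)
{
        return (flags & ~FOLIO_ORDER_MASK) |
               ((unsigned long)min << FOLIO_ORDER_MIN_SHIFT) |
               ((unsigned long)max << FOLIO_ORDER_MAX_SHIFT);
}

static unsigned int min_order(unsigned long flags)
{
        return (flags & FOLIO_ORDER_MIN_MASK) >> FOLIO_ORDER_MIN_SHIFT;
}

static unsigned int max_order(unsigned long flags)
{
        return (flags & FOLIO_ORDER_MAX_MASK) >> FOLIO_ORDER_MAX_SHIFT;
}

/* Round an index down to the min-order boundary, as mapping_align_index(). */
static unsigned long align_index(unsigned long flags, unsigned long index)
{
        unsigned long nrpages = 1ul << min_order(flags);

        return index & ~(nrpages - 1);
}

int main(void)
{
        /* assumed orders: min 4 (e.g. 64k blocks on 4k pages), max 9 */
        unsigned long flags = set_order_range(0, 4, 9);

        assert(min_order(flags) == 4 && max_order(flags) == 9);
        /* index 19 falls inside the 16-page folio that starts at index 16 */
        assert(align_index(flags, 19) == 16);
        printf("min %u max %u aligned %lu\n",
               min_order(flags), max_order(flags), align_index(flags, 19));
        return 0;
}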
diff --git a/mm/filemap.c b/mm/filemap.c
index 60a9cc593e9b..65c515e7bbf0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -859,6 +859,8 @@ noinline int __filemap_add_folio(struct address_space *mapping,
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
+ VM_BUG_ON_FOLIO(folio_order(folio) < mapping_min_folio_order(mapping),
+ folio);
mapping_set_update(&xas, mapping);
VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
@@ -1919,8 +1921,10 @@ repeat:
folio_wait_stable(folio);
no_page:
if (!folio && (fgp_flags & FGP_CREAT)) {
- unsigned order = FGF_GET_ORDER(fgp_flags);
+ unsigned int min_order = mapping_min_folio_order(mapping);
+ unsigned int order = max(min_order, FGF_GET_ORDER(fgp_flags));
int err;
+ index = mapping_align_index(mapping, index);
if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
gfp |= __GFP_WRITE;
@@ -1933,10 +1937,8 @@ no_page:
if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
fgp_flags |= FGP_LOCK;
- if (!mapping_large_folio_support(mapping))
- order = 0;
- if (order > MAX_PAGECACHE_ORDER)
- order = MAX_PAGECACHE_ORDER;
+ if (order > mapping_max_folio_order(mapping))
+ order = mapping_max_folio_order(mapping);
/* If we're not aligned, allocate a smaller folio */
if (index & ((1UL << order) - 1))
order = __ffs(index);
@@ -1945,7 +1947,7 @@ no_page:
gfp_t alloc_gfp = gfp;
err = -ENOMEM;
- if (order > 0)
+ if (order > min_order)
alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
folio = filemap_alloc_folio(alloc_gfp, order);
if (!folio)
@@ -1960,7 +1962,7 @@ no_page:
break;
folio_put(folio);
folio = NULL;
- } while (order-- > 0);
+ } while (order-- > min_order);
if (err == -EEXIST)
goto repeat;
@@ -2449,13 +2451,15 @@ unlock_mapping:
}
static int filemap_create_folio(struct file *file,
- struct address_space *mapping, pgoff_t index,
+ struct address_space *mapping, loff_t pos,
struct folio_batch *fbatch)
{
struct folio *folio;
int error;
+ unsigned int min_order = mapping_min_folio_order(mapping);
+ pgoff_t index;
- folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0);
+ folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order);
if (!folio)
return -ENOMEM;
@@ -2473,6 +2477,7 @@ static int filemap_create_folio(struct file *file,
* well to keep locking rules simple.
*/
filemap_invalidate_lock_shared(mapping);
+ index = (pos >> (PAGE_SHIFT + min_order)) << min_order;
error = filemap_add_folio(mapping, folio, index,
mapping_gfp_constraint(mapping, GFP_KERNEL));
if (error == -EEXIST)
@@ -2533,8 +2538,7 @@ retry:
if (!folio_batch_count(fbatch)) {
if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
return -EAGAIN;
- err = filemap_create_folio(filp, mapping,
- iocb->ki_pos >> PAGE_SHIFT, fbatch);
+ err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch);
if (err == AOP_TRUNCATED_PAGE)
goto retry;
return err;
@@ -3611,7 +3615,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
struct vm_area_struct *vma = vmf->vma;
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
- pgoff_t last_pgoff = start_pgoff;
+ pgoff_t file_end, last_pgoff = start_pgoff;
unsigned long addr;
XA_STATE(xas, &mapping->i_pages, start_pgoff);
struct folio *folio;
@@ -3637,6 +3641,10 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
goto out;
}
+ file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1;
+ if (end_pgoff > file_end)
+ end_pgoff = file_end;
+
folio_type = mm_counter_file(folio);
do {
unsigned long end;
@@ -3757,9 +3765,11 @@ static struct folio *do_read_cache_folio(struct address_space *mapping,
repeat:
folio = filemap_get_folio(mapping, index);
if (IS_ERR(folio)) {
- folio = filemap_alloc_folio(gfp, 0);
+ folio = filemap_alloc_folio(gfp,
+ mapping_min_folio_order(mapping));
if (!folio)
return ERR_PTR(-ENOMEM);
+ index = mapping_align_index(mapping, index);
err = filemap_add_folio(mapping, folio, index, gfp);
if (unlikely(err)) {
folio_put(folio);
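filemap_create_folio() above now receives the byte position and computes an index aligned to the mapping's minimum folio order via (pos >> (PAGE_SHIFT + min_order)) << min_order. A quick sketch of what that produces, assuming 4 KiB pages and a minimum order of 2 (16 KiB folios):

#include <assert.h>
#include <stdio.h>

/*
 * Sketch of the aligned-index computation used by filemap_create_folio();
 * PAGE_SHIFT = 12 and min_order = 2 are assumptions for illustration.
 */
int main(void)
{
        const unsigned int page_shift = 12;
        const unsigned int min_order = 2;
        long long pos = 0x7400;         /* byte 29696, i.e. page index 7 */

        unsigned long index = (pos >> (page_shift + min_order)) << min_order;

        /* page index 7 lives in the 4-page folio that starts at index 4 */
        assert(index == 4);
        printf("pos %lld -> aligned index %lu\n", pos, index);
        return 0;
}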
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 99b146d16a18..b71f744d360c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3081,6 +3081,9 @@ bool can_split_folio(struct folio *folio, int *pextra_pins)
* released, or if some unexpected race happened (e.g., anon VMA disappeared,
* truncation).
*
+ * Callers should ensure that the order respects the address space mapping
+ * min-order if one is set for non-anonymous folios.
+ *
* Returns -EINVAL when trying to split to an order that is incompatible
* with the folio. Splitting to order 0 is compatible with all folios.
*/
@@ -3162,6 +3165,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
mapping = NULL;
anon_vma_lock_write(anon_vma);
} else {
+ unsigned int min_order;
gfp_t gfp;
mapping = folio->mapping;
@@ -3172,6 +3176,14 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
goto out;
}
+ min_order = mapping_min_folio_order(folio->mapping);
+ if (new_order < min_order) {
+ VM_WARN_ONCE(1, "Cannot split mapped folio below min-order: %u",
+ min_order);
+ ret = -EINVAL;
+ goto out;
+ }
+
gfp = current_gfp_context(mapping_gfp_mask(mapping) &
GFP_RECLAIM_MASK);
@@ -3284,6 +3296,30 @@ out:
return ret;
}
+int min_order_for_split(struct folio *folio)
+{
+ if (folio_test_anon(folio))
+ return 0;
+
+ if (!folio->mapping) {
+ if (folio_test_pmd_mappable(folio))
+ count_vm_event(THP_SPLIT_PAGE_FAILED);
+ return -EBUSY;
+ }
+
+ return mapping_min_folio_order(folio->mapping);
+}
+
+int split_folio_to_list(struct folio *folio, struct list_head *list)
+{
+ int ret = min_order_for_split(folio);
+
+ if (ret < 0)
+ return ret;
+
+ return split_huge_page_to_list_to_order(&folio->page, list, ret);
+}
+
void __folio_undo_large_rmappable(struct folio *folio)
{
struct deferred_split *ds_queue;
@@ -3514,6 +3550,8 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
struct vm_area_struct *vma = vma_lookup(mm, addr);
struct page *page;
struct folio *folio;
+ struct address_space *mapping;
+ unsigned int target_order = new_order;
if (!vma)
break;
@@ -3534,7 +3572,13 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
if (!is_transparent_hugepage(folio))
goto next;
- if (new_order >= folio_order(folio))
+ if (!folio_test_anon(folio)) {
+ mapping = folio->mapping;
+ target_order = max(new_order,
+ mapping_min_folio_order(mapping));
+ }
+
+ if (target_order >= folio_order(folio))
goto next;
total++;
@@ -3550,9 +3594,14 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
if (!folio_trylock(folio))
goto next;
- if (!split_folio_to_order(folio, new_order))
+ if (!folio_test_anon(folio) && folio->mapping != mapping)
+ goto unlock;
+
+ if (!split_folio_to_order(folio, target_order))
split++;
+unlock:
+
folio_unlock(folio);
next:
folio_put(folio);
@@ -3577,6 +3626,8 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
pgoff_t index;
int nr_pages = 1;
unsigned long total = 0, split = 0;
+ unsigned int min_order;
+ unsigned int target_order;
file = getname_kernel(file_path);
if (IS_ERR(file))
@@ -3590,6 +3641,8 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
file_path, off_start, off_end);
mapping = candidate->f_mapping;
+ min_order = mapping_min_folio_order(mapping);
+ target_order = max(new_order, min_order);
for (index = off_start; index < off_end; index += nr_pages) {
struct folio *folio = filemap_get_folio(mapping, index);
@@ -3604,15 +3657,19 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
total++;
nr_pages = folio_nr_pages(folio);
- if (new_order >= folio_order(folio))
+ if (target_order >= folio_order(folio))
goto next;
if (!folio_trylock(folio))
goto next;
- if (!split_folio_to_order(folio, new_order))
+ if (folio->mapping != mapping)
+ goto unlock;
+
+ if (!split_folio_to_order(folio, target_order))
split++;
+unlock:
folio_unlock(folio);
next:
folio_put(folio);
diff --git a/mm/readahead.c b/mm/readahead.c
index 517c0be7ce66..2078c42777a6 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -206,9 +206,10 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
unsigned long nr_to_read, unsigned long lookahead_size)
{
struct address_space *mapping = ractl->mapping;
- unsigned long index = readahead_index(ractl);
+ unsigned long ra_folio_index, index = readahead_index(ractl);
gfp_t gfp_mask = readahead_gfp_mask(mapping);
- unsigned long i;
+ unsigned long mark, i = 0;
+ unsigned int min_nrpages = mapping_min_folio_nrpages(mapping);
/*
* Partway through the readahead operation, we will have added
@@ -223,10 +224,24 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
unsigned int nofs = memalloc_nofs_save();
filemap_invalidate_lock_shared(mapping);
+ index = mapping_align_index(mapping, index);
+
+ /*
+ * As iterator `i` is aligned to min_nrpages, round_up the
+ * difference between nr_to_read and lookahead_size to mark the
+ * index that only has lookahead or "async_region" to set the
+ * readahead flag.
+ */
+ ra_folio_index = round_up(readahead_index(ractl) + nr_to_read - lookahead_size,
+ min_nrpages);
+ mark = ra_folio_index - index;
+ nr_to_read += readahead_index(ractl) - index;
+ ractl->_index = index;
+
/*
* Preallocate as many pages as we will need.
*/
- for (i = 0; i < nr_to_read; i++) {
+ while (i < nr_to_read) {
struct folio *folio = xa_load(&mapping->i_pages, index + i);
int ret;
@@ -240,12 +255,13 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
* not worth getting one just for that.
*/
read_pages(ractl);
- ractl->_index++;
- i = ractl->_index + ractl->_nr_pages - index - 1;
+ ractl->_index += min_nrpages;
+ i = ractl->_index + ractl->_nr_pages - index;
continue;
}
- folio = filemap_alloc_folio(gfp_mask, 0);
+ folio = filemap_alloc_folio(gfp_mask,
+ mapping_min_folio_order(mapping));
if (!folio)
break;
@@ -255,14 +271,15 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
if (ret == -ENOMEM)
break;
read_pages(ractl);
- ractl->_index++;
- i = ractl->_index + ractl->_nr_pages - index - 1;
+ ractl->_index += min_nrpages;
+ i = ractl->_index + ractl->_nr_pages - index;
continue;
}
- if (i == nr_to_read - lookahead_size)
+ if (i == mark)
folio_set_readahead(folio);
ractl->_workingset |= folio_test_workingset(folio);
- ractl->_nr_pages++;
+ ractl->_nr_pages += min_nrpages;
+ i += min_nrpages;
}
/*
@@ -438,26 +455,41 @@ void page_cache_ra_order(struct readahead_control *ractl,
struct address_space *mapping = ractl->mapping;
pgoff_t start = readahead_index(ractl);
pgoff_t index = start;
+ unsigned int min_order = mapping_min_folio_order(mapping);
pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
pgoff_t mark = index + ra->size - ra->async_size;
unsigned int nofs;
int err = 0;
gfp_t gfp = readahead_gfp_mask(mapping);
+ unsigned int min_ra_size = max(4, mapping_min_folio_nrpages(mapping));
- if (!mapping_large_folio_support(mapping) || ra->size < 4)
+ /*
+ * Fallback when size < min_nrpages as each folio should be
+ * at least min_nrpages anyway.
+ */
+ if (!mapping_large_folio_support(mapping) || ra->size < min_ra_size)
goto fallback;
limit = min(limit, index + ra->size - 1);
- if (new_order < MAX_PAGECACHE_ORDER)
+ if (new_order < mapping_max_folio_order(mapping))
new_order += 2;
- new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order);
+ new_order = min(mapping_max_folio_order(mapping), new_order);
new_order = min_t(unsigned int, new_order, ilog2(ra->size));
+ new_order = max(new_order, min_order);
/* See comment in page_cache_ra_unbounded() */
nofs = memalloc_nofs_save();
filemap_invalidate_lock_shared(mapping);
+ /*
+ * If the new_order is greater than min_order and index is
+ * already aligned to new_order, then this will be noop as index
+ * aligned to new_order should also be aligned to min_order.
+ */
+ ractl->_index = mapping_align_index(mapping, index);
+ index = readahead_index(ractl);
+
while (index <= limit) {
unsigned int order = new_order;
@@ -465,7 +497,7 @@ void page_cache_ra_order(struct readahead_control *ractl,
if (index & ((1UL << order) - 1))
order = __ffs(index);
/* Don't allocate pages past EOF */
- while (index + (1UL << order) - 1 > limit)
+ while (order > min_order && index + (1UL << order) - 1 > limit)
order--;
err = ra_alloc_folio(ractl, index, mark, order, gfp);
if (err)
@@ -703,8 +735,15 @@ void readahead_expand(struct readahead_control *ractl,
struct file_ra_state *ra = ractl->ra;
pgoff_t new_index, new_nr_pages;
gfp_t gfp_mask = readahead_gfp_mask(mapping);
+ unsigned long min_nrpages = mapping_min_folio_nrpages(mapping);
+ unsigned int min_order = mapping_min_folio_order(mapping);
new_index = new_start / PAGE_SIZE;
+ /*
+ * Readahead code should have aligned the ractl->_index to
+ * min_nrpages before calling readahead aops.
+ */
+ VM_BUG_ON(!IS_ALIGNED(ractl->_index, min_nrpages));
/* Expand the leading edge downwards */
while (ractl->_index > new_index) {
@@ -714,9 +753,11 @@ void readahead_expand(struct readahead_control *ractl,
if (folio && !xa_is_value(folio))
return; /* Folio apparently present */
- folio = filemap_alloc_folio(gfp_mask, 0);
+ folio = filemap_alloc_folio(gfp_mask, min_order);
if (!folio)
return;
+
+ index = mapping_align_index(mapping, index);
if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) {
folio_put(folio);
return;
@@ -726,7 +767,7 @@ void readahead_expand(struct readahead_control *ractl,
ractl->_workingset = true;
psi_memstall_enter(&ractl->_pflags);
}
- ractl->_nr_pages++;
+ ractl->_nr_pages += min_nrpages;
ractl->_index = folio->index;
}
@@ -741,9 +782,11 @@ void readahead_expand(struct readahead_control *ractl,
if (folio && !xa_is_value(folio))
return; /* Folio apparently present */
- folio = filemap_alloc_folio(gfp_mask, 0);
+ folio = filemap_alloc_folio(gfp_mask, min_order);
if (!folio)
return;
+
+ index = mapping_align_index(mapping, index);
if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) {
folio_put(folio);
return;
@@ -753,10 +796,10 @@ void readahead_expand(struct readahead_control *ractl,
ractl->_workingset = true;
psi_memstall_enter(&ractl->_pflags);
}
- ractl->_nr_pages++;
+ ractl->_nr_pages += min_nrpages;
if (ra) {
- ra->size++;
- ra->async_size++;
+ ra->size += min_nrpages;
+ ra->async_size += min_nrpages;
}
}
}
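page_cache_ra_unbounded() above rounds the start index down and the readahead mark up to min_nrpages, so both the window and the async trigger stay aligned to minimum-order folios. A userspace sketch of that index arithmetic, with min_nrpages, the start index, nr_to_read and lookahead_size all chosen purely for illustration:

#include <assert.h>
#include <stdio.h>

static unsigned long round_up_to(unsigned long x, unsigned long step)
{
        return (x + step - 1) / step * step;
}

static unsigned long round_down_to(unsigned long x, unsigned long step)
{
        return x / step * step;
}

/* Sketch of the alignment done at the top of page_cache_ra_unbounded(). */
int main(void)
{
        const unsigned long min_nrpages = 4;    /* e.g. 16 KiB folios on 4 KiB pages */
        unsigned long ra_index = 18;            /* requested readahead start */
        unsigned long nr_to_read = 32;
        unsigned long lookahead_size = 8;

        unsigned long index = round_down_to(ra_index, min_nrpages);
        unsigned long ra_folio_index =
                round_up_to(ra_index + nr_to_read - lookahead_size, min_nrpages);
        unsigned long mark = ra_folio_index - index;

        /* widen nr_to_read so the aligned window still covers the request */
        nr_to_read += ra_index - index;

        printf("index %lu nr_to_read %lu mark %lu\n", index, nr_to_read, mark);
        assert(index == 16 && nr_to_read == 34 && mark == 28);
        return 0;
}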