summaryrefslogtreecommitdiff
path: root/fs/erofs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/erofs')
-rw-r--r--fs/erofs/Kconfig53
-rw-r--r--fs/erofs/Makefile6
-rw-r--r--fs/erofs/compress.h67
-rw-r--r--fs/erofs/data.c179
-rw-r--r--fs/erofs/decompressor.c229
-rw-r--r--fs/erofs/decompressor_deflate.c146
-rw-r--r--fs/erofs/decompressor_lzma.c165
-rw-r--r--fs/erofs/decompressor_zstd.c225
-rw-r--r--fs/erofs/dir.c37
-rw-r--r--fs/erofs/erofs_fs.h20
-rw-r--r--fs/erofs/fileio.c197
-rw-r--r--fs/erofs/fscache.c308
-rw-r--r--fs/erofs/inode.c274
-rw-r--r--fs/erofs/internal.h141
-rw-r--r--fs/erofs/namei.c6
-rw-r--r--fs/erofs/pcpubuf.c148
-rw-r--r--fs/erofs/super.c369
-rw-r--r--fs/erofs/sysfs.c30
-rw-r--r--fs/erofs/utils.c282
-rw-r--r--fs/erofs/xattr.c41
-rw-r--r--fs/erofs/xattr.h4
-rw-r--r--fs/erofs/zdata.c1232
-rw-r--r--fs/erofs/zmap.c358
-rw-r--r--fs/erofs/zutil.c317
24 files changed, 2592 insertions, 2242 deletions
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index f6dc961e6c2b..6ea60661fa55 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -21,7 +21,7 @@ config EROFS_FS
performance under extremely memory pressure without extra cost.
See the documentation at <file:Documentation/filesystems/erofs.rst>
- for more details.
+ and the web pages at <https://erofs.docs.kernel.org> for more details.
If unsure, say N.
@@ -74,6 +74,23 @@ config EROFS_FS_SECURITY
If you are not using a security module, say N.
+config EROFS_FS_BACKED_BY_FILE
+ bool "File-backed EROFS filesystem support"
+ depends on EROFS_FS
+ default y
+ help
+ This allows EROFS to use filesystem image files directly, without
+ the intercession of loopback block devices or likewise. It is
+ particularly useful for container images with numerous blobs and
+ other sandboxes, where loop devices behave intricately. It can also
+ be used to simplify error-prone lifetime management of unnecessary
+ virtual block devices.
+
+ Note that this feature, along with ongoing fanotify pre-content
+ hooks, will eventually replace "EROFS over fscache."
+
+ If you don't want to enable this feature, say N.
+
config EROFS_FS_ZIP
bool "EROFS Data Compression Support"
depends on EROFS_FS
@@ -91,13 +108,10 @@ config EROFS_FS_ZIP_LZMA
select XZ_DEC_MICROLZMA
help
Saying Y here includes support for reading EROFS file systems
- containing LZMA compressed data, specifically called microLZMA. it
- gives better compression ratios than the LZ4 algorithm, at the
+ containing LZMA compressed data, specifically called microLZMA. It
+ gives better compression ratios than the default LZ4 format, at the
expense of more CPU overhead.
- LZMA support is an experimental feature for now and so most file
- systems will be readable without selecting this option.
-
If unsure, say N.
config EROFS_FS_ZIP_DEFLATE
@@ -115,14 +129,35 @@ config EROFS_FS_ZIP_DEFLATE
If unsure, say N.
+config EROFS_FS_ZIP_ZSTD
+ bool "EROFS Zstandard compressed data support"
+ depends on EROFS_FS_ZIP
+ select ZSTD_DECOMPRESS
+ help
+ Saying Y here includes support for reading EROFS file systems
+ containing Zstandard compressed data. It gives better compression
+ ratios than the default LZ4 format, while it costs more CPU
+ overhead.
+
+ Zstandard support is an experimental feature for now and so most
+ file systems will be readable without selecting this option.
+
+ If unsure, say N.
+
config EROFS_FS_ONDEMAND
- bool "EROFS fscache-based on-demand read support"
- depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y)
- default n
+ bool "EROFS fscache-based on-demand read support (deprecated)"
+ depends on EROFS_FS
+ select NETFS_SUPPORT
+ select FSCACHE
+ select CACHEFILES
+ select CACHEFILES_ONDEMAND
help
This permits EROFS to use fscache-backed data blobs with on-demand
read support.
+ It is now deprecated and scheduled to be removed from the kernel
+ after fanotify pre-content hooks are landed.
+
If unsure, say N.
config EROFS_FS_PCPU_KTHREAD
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 994d0b9deddf..4331d53c7109 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -1,9 +1,11 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_EROFS_FS) += erofs.o
-erofs-objs := super.o inode.o data.o namei.o dir.o utils.o sysfs.o
+erofs-objs := super.o inode.o data.o namei.o dir.o sysfs.o
erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
-erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o pcpubuf.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o zutil.o
erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o
+erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o
+erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o
erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 279933e007d2..7bfe251680ec 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -11,13 +11,12 @@
struct z_erofs_decompress_req {
struct super_block *sb;
struct page **in, **out;
-
unsigned short pageofs_in, pageofs_out;
unsigned int inputsize, outputsize;
- /* indicate the algorithm will be used for decompression */
- unsigned int alg;
+ unsigned int alg; /* the algorithm for decompression */
bool inplace_io, partial_decoding, fillgaps;
+ gfp_t gfp; /* allocation flags for extra temporary buffers */
};
struct z_erofs_decompressor {
@@ -25,6 +24,8 @@ struct z_erofs_decompressor {
void *data, int size);
int (*decompress)(struct z_erofs_decompress_req *rq,
struct page **pagepool);
+ int (*init)(void);
+ void (*exit)(void);
char *name;
};
@@ -53,17 +54,14 @@ struct z_erofs_decompressor {
*/
/*
- * short-lived pages are pages directly from buddy system with specific
- * page->private (no need to set PagePrivate since these are non-LRU /
- * non-movable pages and bypass reclaim / migration code).
+ * Currently, short-lived pages are pages directly from buddy system
+ * with specific page->private (Z_EROFS_SHORTLIVED_PAGE).
+ * In the future world of Memdescs, it should be type 0 (Misc) memory
+ * which type can be checked with a new helper.
*/
static inline bool z_erofs_is_shortlived_page(struct page *page)
{
- if (page->private != Z_EROFS_SHORTLIVED_PAGE)
- return false;
-
- DBG_BUGON(page->mapping);
- return true;
+ return page->private == Z_EROFS_SHORTLIVED_PAGE;
}
static inline bool z_erofs_put_shortlivedpage(struct page **pagepool,
@@ -71,35 +69,32 @@ static inline bool z_erofs_put_shortlivedpage(struct page **pagepool,
{
if (!z_erofs_is_shortlived_page(page))
return false;
-
- /* short-lived pages should not be used by others at the same time */
- if (page_ref_count(page) > 1) {
- put_page(page);
- } else {
- /* follow the pcluster rule above. */
- erofs_pagepool_add(pagepool, page);
- }
+ erofs_pagepool_add(pagepool, page);
return true;
}
-#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
-static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
- struct page *page)
-{
- return page->mapping == MNGD_MAPPING(sbi);
-}
+extern const struct z_erofs_decompressor z_erofs_lzma_decomp;
+extern const struct z_erofs_decompressor z_erofs_deflate_decomp;
+extern const struct z_erofs_decompressor z_erofs_zstd_decomp;
+extern const struct z_erofs_decompressor *z_erofs_decomp[];
+
+struct z_erofs_stream_dctx {
+ struct z_erofs_decompress_req *rq;
+ unsigned int inpages, outpages; /* # of {en,de}coded pages */
+ int no, ni; /* the current {en,de}coded page # */
+ unsigned int avail_out; /* remaining bytes in the decoded buffer */
+ unsigned int inbuf_pos, inbuf_sz;
+ /* current status of the encoded buffer */
+ u8 *kin, *kout; /* buffer mapped pointers */
+ void *bounce; /* bounce buffer for inplace I/Os */
+ bool bounced; /* is the bounce buffer used now? */
+};
+
+int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst,
+ void **src, struct page **pgpl);
int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
unsigned int padbufsize);
-extern const struct z_erofs_decompressor erofs_decompressors[];
-
-/* prototypes for specific algorithms */
-int z_erofs_load_lzma_config(struct super_block *sb,
- struct erofs_super_block *dsb, void *data, int size);
-int z_erofs_load_deflate_config(struct super_block *sb,
- struct erofs_super_block *dsb, void *data, int size);
-int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
- struct page **pagepool);
-int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
- struct page **pagepool);
+int __init z_erofs_init_decompressor(void);
+void z_erofs_exit_decompressor(void);
#endif
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 19ab9bb3a9a0..91182d5e3a66 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -5,9 +5,7 @@
* Copyright (C) 2021, Alibaba Cloud
*/
#include "internal.h"
-#include <linux/prefetch.h>
#include <linux/sched/mm.h>
-#include <linux/dax.h>
#include <trace/events/erofs.h>
void erofs_unmap_metabuf(struct erofs_buf *buf)
@@ -23,40 +21,32 @@ void erofs_put_metabuf(struct erofs_buf *buf)
if (!buf->page)
return;
erofs_unmap_metabuf(buf);
- put_page(buf->page);
+ folio_put(page_folio(buf->page));
buf->page = NULL;
}
-/*
- * Derive the block size from inode->i_blkbits to make compatible with
- * anonymous inode in fscache mode.
- */
-void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr,
+void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,
enum erofs_kmap_type type)
{
- struct inode *inode = buf->inode;
- erofs_off_t offset = (erofs_off_t)blkaddr << inode->i_blkbits;
pgoff_t index = offset >> PAGE_SHIFT;
- struct page *page = buf->page;
- struct folio *folio;
- unsigned int nofs_flag;
+ struct folio *folio = NULL;
- if (!page || page->index != index) {
+ if (buf->page) {
+ folio = page_folio(buf->page);
+ if (folio_file_page(folio, index) != buf->page)
+ erofs_unmap_metabuf(buf);
+ }
+ if (!folio || !folio_contains(folio, index)) {
erofs_put_metabuf(buf);
-
- nofs_flag = memalloc_nofs_save();
- folio = read_cache_folio(inode->i_mapping, index, NULL, NULL);
- memalloc_nofs_restore(nofs_flag);
+ folio = read_mapping_folio(buf->mapping, index, buf->file);
if (IS_ERR(folio))
return folio;
-
- /* should already be PageUptodate, no need to lock page */
- page = folio_file_page(folio, index);
- buf->page = page;
}
+ buf->page = folio_file_page(folio, index);
+
if (buf->kmap_type == EROFS_NO_KMAP) {
if (type == EROFS_KMAP)
- buf->base = kmap_local_page(page);
+ buf->base = kmap_local_page(buf->page);
buf->kmap_type = type;
} else if (buf->kmap_type != type) {
DBG_BUGON(1);
@@ -69,54 +59,50 @@ void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr,
void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb)
{
- if (erofs_is_fscache_mode(sb))
- buf->inode = EROFS_SB(sb)->s_fscache->inode;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+ buf->file = NULL;
+ if (erofs_is_fileio_mode(sbi)) {
+ buf->file = sbi->dif0.file; /* some fs like FUSE needs it */
+ buf->mapping = buf->file->f_mapping;
+ } else if (erofs_is_fscache_mode(sb))
+ buf->mapping = sbi->dif0.fscache->inode->i_mapping;
else
- buf->inode = sb->s_bdev->bd_inode;
+ buf->mapping = sb->s_bdev->bd_mapping;
}
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
- erofs_blk_t blkaddr, enum erofs_kmap_type type)
+ erofs_off_t offset, enum erofs_kmap_type type)
{
erofs_init_metabuf(buf, sb);
- return erofs_bread(buf, blkaddr, type);
+ return erofs_bread(buf, offset, type);
}
static int erofs_map_blocks_flatmode(struct inode *inode,
struct erofs_map_blocks *map)
{
- erofs_blk_t nblocks, lastblk;
- u64 offset = map->m_la;
struct erofs_inode *vi = EROFS_I(inode);
struct super_block *sb = inode->i_sb;
bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE);
+ erofs_blk_t lastblk = erofs_iblks(inode) - tailendpacking;
- nblocks = erofs_iblks(inode);
- lastblk = nblocks - tailendpacking;
-
- /* there is no hole in flatmode */
- map->m_flags = EROFS_MAP_MAPPED;
- if (offset < erofs_pos(sb, lastblk)) {
+ map->m_flags = EROFS_MAP_MAPPED; /* no hole in flat inodes */
+ if (map->m_la < erofs_pos(sb, lastblk)) {
map->m_pa = erofs_pos(sb, vi->raw_blkaddr) + map->m_la;
- map->m_plen = erofs_pos(sb, lastblk) - offset;
- } else if (tailendpacking) {
+ map->m_plen = erofs_pos(sb, lastblk) - map->m_la;
+ } else {
+ DBG_BUGON(!tailendpacking);
map->m_pa = erofs_iloc(inode) + vi->inode_isize +
- vi->xattr_isize + erofs_blkoff(sb, offset);
- map->m_plen = inode->i_size - offset;
+ vi->xattr_isize + erofs_blkoff(sb, map->m_la);
+ map->m_plen = inode->i_size - map->m_la;
/* inline data should be located in the same meta block */
if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) {
- erofs_err(sb, "inline data cross block boundary @ nid %llu",
- vi->nid);
+ erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid);
DBG_BUGON(1);
return -EFSCORRUPTED;
}
map->m_flags |= EROFS_MAP_META;
- } else {
- erofs_err(sb, "internal error @ nid: %llu (size %llu), m_la 0x%llx",
- vi->nid, inode->i_size, map->m_la);
- DBG_BUGON(1);
- return -EIO;
}
return 0;
}
@@ -138,7 +124,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
if (map->m_la >= inode->i_size) {
/* leave out-of-bound access unmapped */
map->m_flags = 0;
- map->m_plen = 0;
+ map->m_plen = map->m_llen;
goto out;
}
@@ -156,7 +142,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
pos = ALIGN(erofs_iloc(inode) + vi->inode_isize +
vi->xattr_isize, unit) + unit * chunknr;
- kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(sb, pos), EROFS_KMAP);
+ kaddr = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP);
if (IS_ERR(kaddr)) {
err = PTR_ERR(kaddr);
goto out;
@@ -167,7 +153,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
/* handle block map */
if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) {
- __le32 *blkaddr = kaddr + erofs_blkoff(sb, pos);
+ __le32 *blkaddr = kaddr;
if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) {
map->m_flags = 0;
@@ -178,7 +164,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
goto out_unlock;
}
/* parse chunk indexes */
- idx = kaddr + erofs_blkoff(sb, pos);
+ idx = kaddr;
switch (le32_to_cpu(idx->blkaddr)) {
case EROFS_NULL_ADDR:
map->m_flags = 0;
@@ -199,17 +185,25 @@ out:
return err;
}
+static void erofs_fill_from_devinfo(struct erofs_map_dev *map,
+ struct super_block *sb, struct erofs_device_info *dif)
+{
+ map->m_sb = sb;
+ map->m_dif = dif;
+ map->m_bdev = NULL;
+ if (dif->file && S_ISBLK(file_inode(dif->file)->i_mode))
+ map->m_bdev = file_bdev(dif->file);
+}
+
int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
{
struct erofs_dev_context *devs = EROFS_SB(sb)->devs;
struct erofs_device_info *dif;
+ erofs_off_t startoff, length;
int id;
- map->m_bdev = sb->s_bdev;
- map->m_daxdev = EROFS_SB(sb)->dax_dev;
- map->m_dax_part_off = EROFS_SB(sb)->dax_part_off;
- map->m_fscache = EROFS_SB(sb)->s_fscache;
-
+ erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0);
+ map->m_bdev = sb->s_bdev; /* use s_bdev for the primary device */
if (map->m_deviceid) {
down_read(&devs->rwsem);
dif = idr_find(&devs->tree, map->m_deviceid - 1);
@@ -222,29 +216,20 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
up_read(&devs->rwsem);
return 0;
}
- map->m_bdev = dif->bdev_handle ? dif->bdev_handle->bdev : NULL;
- map->m_daxdev = dif->dax_dev;
- map->m_dax_part_off = dif->dax_part_off;
- map->m_fscache = dif->fscache;
+ erofs_fill_from_devinfo(map, sb, dif);
up_read(&devs->rwsem);
} else if (devs->extra_devices && !devs->flatdev) {
down_read(&devs->rwsem);
idr_for_each_entry(&devs->tree, dif, id) {
- erofs_off_t startoff, length;
-
if (!dif->mapped_blkaddr)
continue;
+
startoff = erofs_pos(sb, dif->mapped_blkaddr);
length = erofs_pos(sb, dif->blocks);
-
if (map->m_pa >= startoff &&
map->m_pa < startoff + length) {
map->m_pa -= startoff;
- map->m_bdev = dif->bdev_handle ?
- dif->bdev_handle->bdev : NULL;
- map->m_daxdev = dif->dax_dev;
- map->m_dax_part_off = dif->dax_part_off;
- map->m_fscache = dif->fscache;
+ erofs_fill_from_devinfo(map, sb, dif);
break;
}
}
@@ -253,6 +238,48 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
return 0;
}
+/*
+ * bit 30: I/O error occurred on this folio
+ * bit 29: CPU has dirty data in D-cache (needs aliasing handling);
+ * bit 0 - 29: remaining parts to complete this folio
+ */
+#define EROFS_ONLINEFOLIO_EIO 30
+#define EROFS_ONLINEFOLIO_DIRTY 29
+
+void erofs_onlinefolio_init(struct folio *folio)
+{
+ union {
+ atomic_t o;
+ void *v;
+ } u = { .o = ATOMIC_INIT(1) };
+
+ folio->private = u.v; /* valid only if file-backed folio is locked */
+}
+
+void erofs_onlinefolio_split(struct folio *folio)
+{
+ atomic_inc((atomic_t *)&folio->private);
+}
+
+void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty)
+{
+ int orig, v;
+
+ do {
+ orig = atomic_read((atomic_t *)&folio->private);
+ DBG_BUGON(orig <= 0);
+ v = dirty << EROFS_ONLINEFOLIO_DIRTY;
+ v |= (orig - 1) | (!!err << EROFS_ONLINEFOLIO_EIO);
+ } while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig);
+
+ if (v & (BIT(EROFS_ONLINEFOLIO_DIRTY) - 1))
+ return;
+ folio->private = 0;
+ if (v & BIT(EROFS_ONLINEFOLIO_DIRTY))
+ flush_dcache_folio(folio);
+ folio_end_read(folio, !(v & BIT(EROFS_ONLINEFOLIO_EIO)));
+}
+
static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
{
@@ -278,7 +305,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
iomap->offset = map.m_la;
if (flags & IOMAP_DAX)
- iomap->dax_dev = mdev.m_daxdev;
+ iomap->dax_dev = mdev.m_dif->dax_dev;
else
iomap->bdev = mdev.m_bdev;
iomap->length = map.m_llen;
@@ -298,17 +325,16 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
iomap->type = IOMAP_INLINE;
- ptr = erofs_read_metabuf(&buf, sb,
- erofs_blknr(sb, mdev.m_pa), EROFS_KMAP);
+ ptr = erofs_read_metabuf(&buf, sb, mdev.m_pa, EROFS_KMAP);
if (IS_ERR(ptr))
return PTR_ERR(ptr);
- iomap->inline_data = ptr + erofs_blkoff(sb, mdev.m_pa);
+ iomap->inline_data = ptr;
iomap->private = buf.base;
} else {
iomap->type = IOMAP_MAPPED;
iomap->addr = mdev.m_pa;
if (flags & IOMAP_DAX)
- iomap->addr += mdev.m_dax_part_off;
+ iomap->addr += mdev.m_dif->dax_part_off;
}
return 0;
}
@@ -358,11 +384,16 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
*/
static int erofs_read_folio(struct file *file, struct folio *folio)
{
+ trace_erofs_read_folio(folio, true);
+
return iomap_read_folio(folio, &erofs_iomap_ops);
}
static void erofs_readahead(struct readahead_control *rac)
{
+ trace_erofs_readahead(rac->mapping->host, readahead_index(rac),
+ readahead_count(rac), true);
+
return iomap_readahead(rac, &erofs_iomap_ops);
}
@@ -403,7 +434,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
/* for uncompressed (aligned) files and raw access for other files */
-const struct address_space_operations erofs_raw_access_aops = {
+const struct address_space_operations erofs_aops = {
.read_folio = erofs_read_folio,
.readahead = erofs_readahead,
.bmap = erofs_bmap,
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index aa59788a61e6..dc61a6a8f696 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -2,9 +2,9 @@
/*
* Copyright (C) 2019 HUAWEI, Inc.
* https://www.huawei.com/
+ * Copyright (C) 2024 Alibaba Cloud
*/
#include "compress.h"
-#include <linux/module.h>
#include <linux/lz4.h>
#ifndef LZ4_DISTANCE_MAX /* history window size */
@@ -55,7 +55,7 @@ static int z_erofs_load_lz4_config(struct super_block *sb,
sbi->lz4.max_distance_pages = distance ?
DIV_ROUND_UP(distance, PAGE_SIZE) + 1 :
LZ4_MAX_DISTANCE_PAGES;
- return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks);
+ return z_erofs_gbuf_growsize(sbi->lz4.max_pclusterblks);
}
/*
@@ -110,10 +110,10 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
if (top) {
victim = availables[--top];
- get_page(victim);
} else {
- victim = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
+ victim = __erofs_allocpage(pagepool, rq->gfp, true);
+ if (!victim)
+ return -ENOMEM;
set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE);
}
rq->out[i] = victim;
@@ -159,7 +159,7 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx,
docopy:
/* Or copy compressed data which can be overlapped to per-CPU buffer */
in = rq->in;
- src = erofs_get_pcpubuf(ctx->inpages);
+ src = z_erofs_get_gbuf(ctx->inpages);
if (!src) {
DBG_BUGON(1);
kunmap_local(inpage);
@@ -260,7 +260,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
} else if (maptype == 1) {
vm_unmap_ram(src, ctx->inpages);
} else if (maptype == 2) {
- erofs_put_pcpubuf(src);
+ z_erofs_put_gbuf(src);
} else if (maptype != 3) {
DBG_BUGON(1);
return -EFAULT;
@@ -315,73 +315,165 @@ dstmap_out:
static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
struct page **pagepool)
{
- const unsigned int inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
- const unsigned int outpages =
+ const unsigned int nrpages_in =
+ PAGE_ALIGN(rq->pageofs_in + rq->inputsize) >> PAGE_SHIFT;
+ const unsigned int nrpages_out =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
- const unsigned int righthalf = min_t(unsigned int, rq->outputsize,
- PAGE_SIZE - rq->pageofs_out);
- const unsigned int lefthalf = rq->outputsize - righthalf;
- const unsigned int interlaced_offset =
- rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out;
- u8 *src;
-
- if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) {
- DBG_BUGON(1);
- return -EFSCORRUPTED;
+ const unsigned int bs = rq->sb->s_blocksize;
+ unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt;
+ u8 *kin;
+
+ if (rq->outputsize > rq->inputsize)
+ return -EOPNOTSUPP;
+ if (rq->alg == Z_EROFS_COMPRESSION_INTERLACED) {
+ cur = bs - (rq->pageofs_out & (bs - 1));
+ pi = (rq->pageofs_in + rq->inputsize - cur) & ~PAGE_MASK;
+ cur = min(cur, rq->outputsize);
+ if (cur && rq->out[0]) {
+ kin = kmap_local_page(rq->in[nrpages_in - 1]);
+ if (rq->out[0] == rq->in[nrpages_in - 1])
+ memmove(kin + rq->pageofs_out, kin + pi, cur);
+ else
+ memcpy_to_page(rq->out[0], rq->pageofs_out,
+ kin + pi, cur);
+ kunmap_local(kin);
+ }
+ rq->outputsize -= cur;
}
- if (rq->out[0] == *rq->in) {
- DBG_BUGON(rq->pageofs_out);
- return 0;
+ for (; rq->outputsize; rq->pageofs_in = 0, cur += PAGE_SIZE, ni++) {
+ insz = min(PAGE_SIZE - rq->pageofs_in, rq->outputsize);
+ rq->outputsize -= insz;
+ if (!rq->in[ni])
+ continue;
+ kin = kmap_local_page(rq->in[ni]);
+ pi = 0;
+ do {
+ no = (rq->pageofs_out + cur + pi) >> PAGE_SHIFT;
+ po = (rq->pageofs_out + cur + pi) & ~PAGE_MASK;
+ DBG_BUGON(no >= nrpages_out);
+ cnt = min(insz - pi, PAGE_SIZE - po);
+ if (rq->out[no] == rq->in[ni])
+ memmove(kin + po,
+ kin + rq->pageofs_in + pi, cnt);
+ else if (rq->out[no])
+ memcpy_to_page(rq->out[no], po,
+ kin + rq->pageofs_in + pi, cnt);
+ pi += cnt;
+ } while (pi < insz);
+ kunmap_local(kin);
+ }
+ DBG_BUGON(ni > nrpages_in);
+ return 0;
+}
+
+int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst,
+ void **src, struct page **pgpl)
+{
+ struct z_erofs_decompress_req *rq = dctx->rq;
+ struct super_block *sb = rq->sb;
+ struct page **pgo, *tmppage;
+ unsigned int j;
+
+ if (!dctx->avail_out) {
+ if (++dctx->no >= dctx->outpages || !rq->outputsize) {
+ erofs_err(sb, "insufficient space for decompressed data");
+ return -EFSCORRUPTED;
+ }
+
+ if (dctx->kout)
+ kunmap_local(dctx->kout);
+ dctx->avail_out = min(rq->outputsize, PAGE_SIZE - rq->pageofs_out);
+ rq->outputsize -= dctx->avail_out;
+ pgo = &rq->out[dctx->no];
+ if (!*pgo && rq->fillgaps) { /* deduped */
+ *pgo = erofs_allocpage(pgpl, rq->gfp);
+ if (!*pgo) {
+ dctx->kout = NULL;
+ return -ENOMEM;
+ }
+ set_page_private(*pgo, Z_EROFS_SHORTLIVED_PAGE);
+ }
+ if (*pgo) {
+ dctx->kout = kmap_local_page(*pgo);
+ *dst = dctx->kout + rq->pageofs_out;
+ } else {
+ *dst = dctx->kout = NULL;
+ }
+ rq->pageofs_out = 0;
}
- src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in;
- if (rq->out[0])
- memcpy_to_page(rq->out[0], rq->pageofs_out,
- src + interlaced_offset, righthalf);
-
- if (outpages > inpages) {
- DBG_BUGON(!rq->out[outpages - 1]);
- if (rq->out[outpages - 1] != rq->in[inpages - 1]) {
- memcpy_to_page(rq->out[outpages - 1], 0, src +
- (interlaced_offset ? 0 : righthalf),
- lefthalf);
- } else if (!interlaced_offset) {
- memmove(src, src + righthalf, lefthalf);
- flush_dcache_page(rq->in[inpages - 1]);
+ if (dctx->inbuf_pos == dctx->inbuf_sz && rq->inputsize) {
+ if (++dctx->ni >= dctx->inpages) {
+ erofs_err(sb, "invalid compressed data");
+ return -EFSCORRUPTED;
+ }
+ if (dctx->kout) /* unlike kmap(), take care of the orders */
+ kunmap_local(dctx->kout);
+ kunmap_local(dctx->kin);
+
+ dctx->inbuf_sz = min_t(u32, rq->inputsize, PAGE_SIZE);
+ rq->inputsize -= dctx->inbuf_sz;
+ dctx->kin = kmap_local_page(rq->in[dctx->ni]);
+ *src = dctx->kin;
+ dctx->bounced = false;
+ if (dctx->kout) {
+ j = (u8 *)*dst - dctx->kout;
+ dctx->kout = kmap_local_page(rq->out[dctx->no]);
+ *dst = dctx->kout + j;
}
+ dctx->inbuf_pos = 0;
+ }
+
+ /*
+ * Handle overlapping: Use the given bounce buffer if the input data is
+ * under processing; Or utilize short-lived pages from the on-stack page
+ * pool, where pages are shared among the same request. Note that only
+ * a few inplace I/O pages need to be doubled.
+ */
+ if (!dctx->bounced && rq->out[dctx->no] == rq->in[dctx->ni]) {
+ memcpy(dctx->bounce, *src, dctx->inbuf_sz);
+ *src = dctx->bounce;
+ dctx->bounced = true;
+ }
+
+ for (j = dctx->ni + 1; j < dctx->inpages; ++j) {
+ if (rq->out[dctx->no] != rq->in[j])
+ continue;
+ tmppage = erofs_allocpage(pgpl, rq->gfp);
+ if (!tmppage)
+ return -ENOMEM;
+ set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
+ copy_highpage(tmppage, rq->in[j]);
+ rq->in[j] = tmppage;
}
- kunmap_local(src);
return 0;
}
-const struct z_erofs_decompressor erofs_decompressors[] = {
- [Z_EROFS_COMPRESSION_SHIFTED] = {
+const struct z_erofs_decompressor *z_erofs_decomp[] = {
+ [Z_EROFS_COMPRESSION_SHIFTED] = &(const struct z_erofs_decompressor) {
.decompress = z_erofs_transform_plain,
.name = "shifted"
},
- [Z_EROFS_COMPRESSION_INTERLACED] = {
+ [Z_EROFS_COMPRESSION_INTERLACED] = &(const struct z_erofs_decompressor) {
.decompress = z_erofs_transform_plain,
.name = "interlaced"
},
- [Z_EROFS_COMPRESSION_LZ4] = {
+ [Z_EROFS_COMPRESSION_LZ4] = &(const struct z_erofs_decompressor) {
.config = z_erofs_load_lz4_config,
.decompress = z_erofs_lz4_decompress,
+ .init = z_erofs_gbuf_init,
+ .exit = z_erofs_gbuf_exit,
.name = "lz4"
},
#ifdef CONFIG_EROFS_FS_ZIP_LZMA
- [Z_EROFS_COMPRESSION_LZMA] = {
- .config = z_erofs_load_lzma_config,
- .decompress = z_erofs_lzma_decompress,
- .name = "lzma"
- },
+ [Z_EROFS_COMPRESSION_LZMA] = &z_erofs_lzma_decomp,
#endif
#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE
- [Z_EROFS_COMPRESSION_DEFLATE] = {
- .config = z_erofs_load_deflate_config,
- .decompress = z_erofs_deflate_decompress,
- .name = "deflate"
- },
+ [Z_EROFS_COMPRESSION_DEFLATE] = &z_erofs_deflate_decomp,
+#endif
+#ifdef CONFIG_EROFS_FS_ZIP_ZSTD
+ [Z_EROFS_COMPRESSION_ZSTD] = &z_erofs_zstd_decomp,
#endif
};
@@ -409,6 +501,7 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb)
offset = EROFS_SUPER_OFFSET + sbi->sb_size;
alg = 0;
for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) {
+ const struct z_erofs_decompressor *dec = z_erofs_decomp[alg];
void *data;
if (!(algs & 1))
@@ -420,16 +513,13 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb)
break;
}
- if (alg >= ARRAY_SIZE(erofs_decompressors) ||
- !erofs_decompressors[alg].config) {
+ if (alg < Z_EROFS_COMPRESSION_MAX && dec && dec->config) {
+ ret = dec->config(sb, dsb, data, size);
+ } else {
erofs_err(sb, "algorithm %d isn't enabled on this kernel",
alg);
ret = -EOPNOTSUPP;
- } else {
- ret = erofs_decompressors[alg].config(sb,
- dsb, data, size);
}
-
kfree(data);
if (ret)
break;
@@ -437,3 +527,28 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb)
erofs_put_metabuf(&buf);
return ret;
}
+
+int __init z_erofs_init_decompressor(void)
+{
+ int i, err;
+
+ for (i = 0; i < Z_EROFS_COMPRESSION_MAX; ++i) {
+ err = z_erofs_decomp[i] ? z_erofs_decomp[i]->init() : 0;
+ if (err) {
+ while (i--)
+ if (z_erofs_decomp[i])
+ z_erofs_decomp[i]->exit();
+ return err;
+ }
+ }
+ return 0;
+}
+
+void z_erofs_exit_decompressor(void)
+{
+ int i;
+
+ for (i = 0; i < Z_EROFS_COMPRESSION_MAX; ++i)
+ if (z_erofs_decomp[i])
+ z_erofs_decomp[i]->exit();
+}
diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c
index aac2c837ef35..5070d2fcc737 100644
--- a/fs/erofs/decompressor_deflate.c
+++ b/fs/erofs/decompressor_deflate.c
@@ -1,5 +1,4 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-#include <linux/module.h>
#include <linux/zlib.h>
#include "compress.h"
@@ -16,7 +15,7 @@ static DECLARE_WAIT_QUEUE_HEAD(z_erofs_deflate_wq);
module_param_named(deflate_streams, z_erofs_deflate_nstrms, uint, 0444);
-void z_erofs_deflate_exit(void)
+static void z_erofs_deflate_exit(void)
{
/* there should be no running fs instance */
while (z_erofs_deflate_avail_strms) {
@@ -42,7 +41,7 @@ void z_erofs_deflate_exit(void)
}
}
-int __init z_erofs_deflate_init(void)
+static int __init z_erofs_deflate_init(void)
{
/* by default, use # of possible CPUs instead */
if (!z_erofs_deflate_nstrms)
@@ -50,7 +49,7 @@ int __init z_erofs_deflate_init(void)
return 0;
}
-int z_erofs_load_deflate_config(struct super_block *sb,
+static int z_erofs_load_deflate_config(struct super_block *sb,
struct erofs_super_block *dsb, void *data, int size)
{
struct z_erofs_deflate_cfgs *dfl = data;
@@ -98,27 +97,26 @@ failed:
return -ENOMEM;
}
-int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
- struct page **pagepool)
+static int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
{
- const unsigned int nrpages_out =
- PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
- const unsigned int nrpages_in =
- PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
struct super_block *sb = rq->sb;
- unsigned int insz, outsz, pofs;
+ struct z_erofs_stream_dctx dctx = {
+ .rq = rq,
+ .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT,
+ .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize)
+ >> PAGE_SHIFT,
+ .no = -1, .ni = 0,
+ };
struct z_erofs_deflate *strm;
- u8 *kin, *kout = NULL;
- bool bounced = false;
- int no = -1, ni = 0, j = 0, zerr, err;
+ int zerr, err;
/* 1. get the exact DEFLATE compressed size */
- kin = kmap_local_page(*rq->in);
- err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in,
- min_t(unsigned int, rq->inputsize,
- sb->s_blocksize - rq->pageofs_in));
+ dctx.kin = kmap_local_page(*rq->in);
+ err = z_erofs_fixup_insize(rq, dctx.kin + rq->pageofs_in,
+ min(rq->inputsize, sb->s_blocksize - rq->pageofs_in));
if (err) {
- kunmap_local(kin);
+ kunmap_local(dctx.kin);
return err;
}
@@ -135,98 +133,35 @@ again:
spin_unlock(&z_erofs_deflate_lock);
/* 3. multi-call decompress */
- insz = rq->inputsize;
- outsz = rq->outputsize;
zerr = zlib_inflateInit2(&strm->z, -MAX_WBITS);
if (zerr != Z_OK) {
err = -EIO;
goto failed_zinit;
}
- pofs = rq->pageofs_out;
- strm->z.avail_in = min_t(u32, insz, PAGE_SIZE - rq->pageofs_in);
- insz -= strm->z.avail_in;
- strm->z.next_in = kin + rq->pageofs_in;
+ rq->fillgaps = true; /* DEFLATE doesn't support NULL output buffer */
+ strm->z.avail_in = min(rq->inputsize, PAGE_SIZE - rq->pageofs_in);
+ rq->inputsize -= strm->z.avail_in;
+ strm->z.next_in = dctx.kin + rq->pageofs_in;
strm->z.avail_out = 0;
+ dctx.bounce = strm->bounce;
while (1) {
- if (!strm->z.avail_out) {
- if (++no >= nrpages_out || !outsz) {
- erofs_err(sb, "insufficient space for decompressed data");
- err = -EFSCORRUPTED;
- break;
- }
-
- if (kout)
- kunmap_local(kout);
- strm->z.avail_out = min_t(u32, outsz, PAGE_SIZE - pofs);
- outsz -= strm->z.avail_out;
- if (!rq->out[no]) {
- rq->out[no] = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
- set_page_private(rq->out[no],
- Z_EROFS_SHORTLIVED_PAGE);
- }
- kout = kmap_local_page(rq->out[no]);
- strm->z.next_out = kout + pofs;
- pofs = 0;
- }
-
- if (!strm->z.avail_in && insz) {
- if (++ni >= nrpages_in) {
- erofs_err(sb, "invalid compressed data");
- err = -EFSCORRUPTED;
- break;
- }
-
- if (kout) { /* unlike kmap(), take care of the orders */
- j = strm->z.next_out - kout;
- kunmap_local(kout);
- }
- kunmap_local(kin);
- strm->z.avail_in = min_t(u32, insz, PAGE_SIZE);
- insz -= strm->z.avail_in;
- kin = kmap_local_page(rq->in[ni]);
- strm->z.next_in = kin;
- bounced = false;
- if (kout) {
- kout = kmap_local_page(rq->out[no]);
- strm->z.next_out = kout + j;
- }
- }
-
- /*
- * Handle overlapping: Use bounced buffer if the compressed
- * data is under processing; Or use short-lived pages from the
- * on-stack pagepool where pages share among the same request
- * and not _all_ inplace I/O pages are needed to be doubled.
- */
- if (!bounced && rq->out[no] == rq->in[ni]) {
- memcpy(strm->bounce, strm->z.next_in, strm->z.avail_in);
- strm->z.next_in = strm->bounce;
- bounced = true;
- }
-
- for (j = ni + 1; j < nrpages_in; ++j) {
- struct page *tmppage;
-
- if (rq->out[no] != rq->in[j])
- continue;
-
- DBG_BUGON(erofs_page_is_managed(EROFS_SB(sb),
- rq->in[j]));
- tmppage = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
- set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
- copy_highpage(tmppage, rq->in[j]);
- rq->in[j] = tmppage;
- }
+ dctx.avail_out = strm->z.avail_out;
+ dctx.inbuf_sz = strm->z.avail_in;
+ err = z_erofs_stream_switch_bufs(&dctx,
+ (void **)&strm->z.next_out,
+ (void **)&strm->z.next_in, pgpl);
+ if (err)
+ break;
+ strm->z.avail_out = dctx.avail_out;
+ strm->z.avail_in = dctx.inbuf_sz;
zerr = zlib_inflate(&strm->z, Z_SYNC_FLUSH);
- if (zerr != Z_OK || !(outsz + strm->z.avail_out)) {
+ if (zerr != Z_OK || !(rq->outputsize + strm->z.avail_out)) {
if (zerr == Z_OK && rq->partial_decoding)
break;
- if (zerr == Z_STREAM_END && !outsz)
+ if (zerr == Z_STREAM_END && !rq->outputsize)
break;
erofs_err(sb, "failed to decompress %d in[%u] out[%u]",
zerr, rq->inputsize, rq->outputsize);
@@ -234,13 +169,12 @@ again:
break;
}
}
-
if (zlib_inflateEnd(&strm->z) != Z_OK && !err)
err = -EIO;
- if (kout)
- kunmap_local(kout);
+ if (dctx.kout)
+ kunmap_local(dctx.kout);
failed_zinit:
- kunmap_local(kin);
+ kunmap_local(dctx.kin);
/* 4. push back DEFLATE stream context to the global list */
spin_lock(&z_erofs_deflate_lock);
strm->next = z_erofs_deflate_head;
@@ -249,3 +183,11 @@ failed_zinit:
wake_up(&z_erofs_deflate_wq);
return err;
}
+
+const struct z_erofs_decompressor z_erofs_deflate_decomp = {
+ .config = z_erofs_load_deflate_config,
+ .decompress = z_erofs_deflate_decompress,
+ .init = z_erofs_deflate_init,
+ .exit = z_erofs_deflate_exit,
+ .name = "deflate",
+};
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index ba4ec73f4aae..40666815046f 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -1,12 +1,10 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/xz.h>
-#include <linux/module.h>
#include "compress.h"
struct z_erofs_lzma {
struct z_erofs_lzma *next;
struct xz_dec_microlzma *state;
- struct xz_buf buf;
u8 bounce[PAGE_SIZE];
};
@@ -19,7 +17,7 @@ static DECLARE_WAIT_QUEUE_HEAD(z_erofs_lzma_wq);
module_param_named(lzma_streams, z_erofs_lzma_nstrms, uint, 0444);
-void z_erofs_lzma_exit(void)
+static void z_erofs_lzma_exit(void)
{
/* there should be no running fs instance */
while (z_erofs_lzma_avail_strms) {
@@ -47,7 +45,7 @@ void z_erofs_lzma_exit(void)
}
}
-int __init z_erofs_lzma_init(void)
+static int __init z_erofs_lzma_init(void)
{
unsigned int i;
@@ -71,7 +69,7 @@ int __init z_erofs_lzma_init(void)
return 0;
}
-int z_erofs_load_lzma_config(struct super_block *sb,
+static int z_erofs_load_lzma_config(struct super_block *sb,
struct erofs_super_block *dsb, void *data, int size)
{
static DEFINE_MUTEX(lzma_resize_mutex);
@@ -96,8 +94,6 @@ int z_erofs_load_lzma_config(struct super_block *sb,
return -EINVAL;
}
- erofs_info(sb, "EXPERIMENTAL MicroLZMA in use. Use at your own risk!");
-
/* in case 2 z_erofs_load_lzma_config() race to avoid deadlock */
mutex_lock(&lzma_resize_mutex);
@@ -150,26 +146,28 @@ again:
return err;
}
-int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
- struct page **pagepool)
+static int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
{
- const unsigned int nrpages_out =
- PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
- const unsigned int nrpages_in =
- PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
- unsigned int inlen, outlen, pageofs;
+ struct super_block *sb = rq->sb;
+ struct z_erofs_stream_dctx dctx = {
+ .rq = rq,
+ .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT,
+ .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize)
+ >> PAGE_SHIFT,
+ .no = -1, .ni = 0,
+ };
+ struct xz_buf buf = {};
struct z_erofs_lzma *strm;
- u8 *kin;
- bool bounced = false;
- int no, ni, j, err = 0;
+ enum xz_ret xz_err;
+ int err;
/* 1. get the exact LZMA compressed size */
- kin = kmap(*rq->in);
- err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in,
- min_t(unsigned int, rq->inputsize,
- rq->sb->s_blocksize - rq->pageofs_in));
+ dctx.kin = kmap_local_page(*rq->in);
+ err = z_erofs_fixup_insize(rq, dctx.kin + rq->pageofs_in,
+ min(rq->inputsize, sb->s_blocksize - rq->pageofs_in));
if (err) {
- kunmap(*rq->in);
+ kunmap_local(dctx.kin);
return err;
}
@@ -186,104 +184,45 @@ again:
spin_unlock(&z_erofs_lzma_lock);
/* 3. multi-call decompress */
- inlen = rq->inputsize;
- outlen = rq->outputsize;
- xz_dec_microlzma_reset(strm->state, inlen, outlen,
+ xz_dec_microlzma_reset(strm->state, rq->inputsize, rq->outputsize,
!rq->partial_decoding);
- pageofs = rq->pageofs_out;
- strm->buf.in = kin + rq->pageofs_in;
- strm->buf.in_pos = 0;
- strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - rq->pageofs_in);
- inlen -= strm->buf.in_size;
- strm->buf.out = NULL;
- strm->buf.out_pos = 0;
- strm->buf.out_size = 0;
-
- for (ni = 0, no = -1;;) {
- enum xz_ret xz_err;
-
- if (strm->buf.out_pos == strm->buf.out_size) {
- if (strm->buf.out) {
- kunmap(rq->out[no]);
- strm->buf.out = NULL;
- }
-
- if (++no >= nrpages_out || !outlen) {
- erofs_err(rq->sb, "decompressed buf out of bound");
- err = -EFSCORRUPTED;
- break;
- }
- strm->buf.out_pos = 0;
- strm->buf.out_size = min_t(u32, outlen,
- PAGE_SIZE - pageofs);
- outlen -= strm->buf.out_size;
- if (!rq->out[no] && rq->fillgaps) { /* deduped */
- rq->out[no] = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
- set_page_private(rq->out[no],
- Z_EROFS_SHORTLIVED_PAGE);
- }
- if (rq->out[no])
- strm->buf.out = kmap(rq->out[no]) + pageofs;
- pageofs = 0;
- } else if (strm->buf.in_pos == strm->buf.in_size) {
- kunmap(rq->in[ni]);
-
- if (++ni >= nrpages_in || !inlen) {
- erofs_err(rq->sb, "compressed buf out of bound");
- err = -EFSCORRUPTED;
- break;
- }
- strm->buf.in_pos = 0;
- strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE);
- inlen -= strm->buf.in_size;
- kin = kmap(rq->in[ni]);
- strm->buf.in = kin;
- bounced = false;
- }
+ buf.in_size = min(rq->inputsize, PAGE_SIZE - rq->pageofs_in);
+ rq->inputsize -= buf.in_size;
+ buf.in = dctx.kin + rq->pageofs_in;
+ dctx.bounce = strm->bounce;
+ do {
+ dctx.avail_out = buf.out_size - buf.out_pos;
+ dctx.inbuf_sz = buf.in_size;
+ dctx.inbuf_pos = buf.in_pos;
+ err = z_erofs_stream_switch_bufs(&dctx, (void **)&buf.out,
+ (void **)&buf.in, pgpl);
+ if (err)
+ break;
- /*
- * Handle overlapping: Use bounced buffer if the compressed
- * data is under processing; Otherwise, Use short-lived pages
- * from the on-stack pagepool where pages share with the same
- * request.
- */
- if (!bounced && rq->out[no] == rq->in[ni]) {
- memcpy(strm->bounce, strm->buf.in, strm->buf.in_size);
- strm->buf.in = strm->bounce;
- bounced = true;
+ if (buf.out_size == buf.out_pos) {
+ buf.out_size = dctx.avail_out;
+ buf.out_pos = 0;
}
- for (j = ni + 1; j < nrpages_in; ++j) {
- struct page *tmppage;
+ buf.in_size = dctx.inbuf_sz;
+ buf.in_pos = dctx.inbuf_pos;
- if (rq->out[no] != rq->in[j])
- continue;
-
- DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb),
- rq->in[j]));
- tmppage = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
- set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
- copy_highpage(tmppage, rq->in[j]);
- rq->in[j] = tmppage;
- }
- xz_err = xz_dec_microlzma_run(strm->state, &strm->buf);
- DBG_BUGON(strm->buf.out_pos > strm->buf.out_size);
- DBG_BUGON(strm->buf.in_pos > strm->buf.in_size);
+ xz_err = xz_dec_microlzma_run(strm->state, &buf);
+ DBG_BUGON(buf.out_pos > buf.out_size);
+ DBG_BUGON(buf.in_pos > buf.in_size);
if (xz_err != XZ_OK) {
- if (xz_err == XZ_STREAM_END && !outlen)
+ if (xz_err == XZ_STREAM_END && !rq->outputsize)
break;
- erofs_err(rq->sb, "failed to decompress %d in[%u] out[%u]",
+ erofs_err(sb, "failed to decompress %d in[%u] out[%u]",
xz_err, rq->inputsize, rq->outputsize);
err = -EFSCORRUPTED;
break;
}
- }
- if (no < nrpages_out && strm->buf.out)
- kunmap(rq->out[no]);
- if (ni < nrpages_in)
- kunmap(rq->in[ni]);
+ } while (1);
+
+ if (dctx.kout)
+ kunmap_local(dctx.kout);
+ kunmap_local(dctx.kin);
/* 4. push back LZMA stream context to the global list */
spin_lock(&z_erofs_lzma_lock);
strm->next = z_erofs_lzma_head;
@@ -292,3 +231,11 @@ again:
wake_up(&z_erofs_lzma_wq);
return err;
}
+
+const struct z_erofs_decompressor z_erofs_lzma_decomp = {
+ .config = z_erofs_load_lzma_config,
+ .decompress = z_erofs_lzma_decompress,
+ .init = z_erofs_lzma_init,
+ .exit = z_erofs_lzma_exit,
+ .name = "lzma"
+};
diff --git a/fs/erofs/decompressor_zstd.c b/fs/erofs/decompressor_zstd.c
new file mode 100644
index 000000000000..7e177304967e
--- /dev/null
+++ b/fs/erofs/decompressor_zstd.c
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/zstd.h>
+#include "compress.h"
+
+struct z_erofs_zstd {
+ struct z_erofs_zstd *next;
+ u8 bounce[PAGE_SIZE];
+ void *wksp;
+ unsigned int wkspsz;
+};
+
+static DEFINE_SPINLOCK(z_erofs_zstd_lock);
+static unsigned int z_erofs_zstd_max_dictsize;
+static unsigned int z_erofs_zstd_nstrms, z_erofs_zstd_avail_strms;
+static struct z_erofs_zstd *z_erofs_zstd_head;
+static DECLARE_WAIT_QUEUE_HEAD(z_erofs_zstd_wq);
+
+module_param_named(zstd_streams, z_erofs_zstd_nstrms, uint, 0444);
+
+static struct z_erofs_zstd *z_erofs_isolate_strms(bool all)
+{
+ struct z_erofs_zstd *strm;
+
+again:
+ spin_lock(&z_erofs_zstd_lock);
+ strm = z_erofs_zstd_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_zstd_lock);
+ wait_event(z_erofs_zstd_wq, READ_ONCE(z_erofs_zstd_head));
+ goto again;
+ }
+ z_erofs_zstd_head = all ? NULL : strm->next;
+ spin_unlock(&z_erofs_zstd_lock);
+ return strm;
+}
+
+static void z_erofs_zstd_exit(void)
+{
+ while (z_erofs_zstd_avail_strms) {
+ struct z_erofs_zstd *strm, *n;
+
+ for (strm = z_erofs_isolate_strms(true); strm; strm = n) {
+ n = strm->next;
+
+ kvfree(strm->wksp);
+ kfree(strm);
+ --z_erofs_zstd_avail_strms;
+ }
+ }
+}
+
+static int __init z_erofs_zstd_init(void)
+{
+ /* by default, use # of possible CPUs instead */
+ if (!z_erofs_zstd_nstrms)
+ z_erofs_zstd_nstrms = num_possible_cpus();
+
+ for (; z_erofs_zstd_avail_strms < z_erofs_zstd_nstrms;
+ ++z_erofs_zstd_avail_strms) {
+ struct z_erofs_zstd *strm;
+
+ strm = kzalloc(sizeof(*strm), GFP_KERNEL);
+ if (!strm) {
+ z_erofs_zstd_exit();
+ return -ENOMEM;
+ }
+ spin_lock(&z_erofs_zstd_lock);
+ strm->next = z_erofs_zstd_head;
+ z_erofs_zstd_head = strm;
+ spin_unlock(&z_erofs_zstd_lock);
+ }
+ return 0;
+}
+
+static int z_erofs_load_zstd_config(struct super_block *sb,
+ struct erofs_super_block *dsb, void *data, int size)
+{
+ static DEFINE_MUTEX(zstd_resize_mutex);
+ struct z_erofs_zstd_cfgs *zstd = data;
+ unsigned int dict_size, wkspsz;
+ struct z_erofs_zstd *strm, *head = NULL;
+ void *wksp;
+
+ if (!zstd || size < sizeof(struct z_erofs_zstd_cfgs) || zstd->format) {
+ erofs_err(sb, "unsupported zstd format, size=%u", size);
+ return -EINVAL;
+ }
+
+ if (zstd->windowlog > ilog2(Z_EROFS_ZSTD_MAX_DICT_SIZE) - 10) {
+ erofs_err(sb, "unsupported zstd window log %u", zstd->windowlog);
+ return -EINVAL;
+ }
+ dict_size = 1U << (zstd->windowlog + 10);
+
+ /* in case 2 z_erofs_load_zstd_config() race to avoid deadlock */
+ mutex_lock(&zstd_resize_mutex);
+ if (z_erofs_zstd_max_dictsize >= dict_size) {
+ mutex_unlock(&zstd_resize_mutex);
+ return 0;
+ }
+
+ /* 1. collect/isolate all streams for the following check */
+ while (z_erofs_zstd_avail_strms) {
+ struct z_erofs_zstd *n;
+
+ for (strm = z_erofs_isolate_strms(true); strm; strm = n) {
+ n = strm->next;
+ strm->next = head;
+ head = strm;
+ --z_erofs_zstd_avail_strms;
+ }
+ }
+
+ /* 2. walk each isolated stream and grow max dict_size if needed */
+ wkspsz = zstd_dstream_workspace_bound(dict_size);
+ for (strm = head; strm; strm = strm->next) {
+ wksp = kvmalloc(wkspsz, GFP_KERNEL);
+ if (!wksp)
+ break;
+ kvfree(strm->wksp);
+ strm->wksp = wksp;
+ strm->wkspsz = wkspsz;
+ }
+
+ /* 3. push back all to the global list and update max dict_size */
+ spin_lock(&z_erofs_zstd_lock);
+ DBG_BUGON(z_erofs_zstd_head);
+ z_erofs_zstd_head = head;
+ spin_unlock(&z_erofs_zstd_lock);
+ z_erofs_zstd_avail_strms = z_erofs_zstd_nstrms;
+ wake_up_all(&z_erofs_zstd_wq);
+ if (!strm)
+ z_erofs_zstd_max_dictsize = dict_size;
+ mutex_unlock(&zstd_resize_mutex);
+ return strm ? -ENOMEM : 0;
+}
+
+static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
+{
+ struct super_block *sb = rq->sb;
+ struct z_erofs_stream_dctx dctx = {
+ .rq = rq,
+ .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT,
+ .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize)
+ >> PAGE_SHIFT,
+ .no = -1, .ni = 0,
+ };
+ zstd_in_buffer in_buf = { NULL, 0, 0 };
+ zstd_out_buffer out_buf = { NULL, 0, 0 };
+ struct z_erofs_zstd *strm;
+ zstd_dstream *stream;
+ int zerr, err;
+
+ /* 1. get the exact compressed size */
+ dctx.kin = kmap_local_page(*rq->in);
+ err = z_erofs_fixup_insize(rq, dctx.kin + rq->pageofs_in,
+ min(rq->inputsize, sb->s_blocksize - rq->pageofs_in));
+ if (err) {
+ kunmap_local(dctx.kin);
+ return err;
+ }
+
+ /* 2. get an available ZSTD context */
+ strm = z_erofs_isolate_strms(false);
+
+ /* 3. multi-call decompress */
+ stream = zstd_init_dstream(z_erofs_zstd_max_dictsize, strm->wksp, strm->wkspsz);
+ if (!stream) {
+ err = -EIO;
+ goto failed_zinit;
+ }
+
+ rq->fillgaps = true; /* ZSTD doesn't support NULL output buffer */
+ in_buf.size = min_t(u32, rq->inputsize, PAGE_SIZE - rq->pageofs_in);
+ rq->inputsize -= in_buf.size;
+ in_buf.src = dctx.kin + rq->pageofs_in;
+ dctx.bounce = strm->bounce;
+
+ do {
+ dctx.avail_out = out_buf.size - out_buf.pos;
+ dctx.inbuf_sz = in_buf.size;
+ dctx.inbuf_pos = in_buf.pos;
+ err = z_erofs_stream_switch_bufs(&dctx, &out_buf.dst,
+ (void **)&in_buf.src, pgpl);
+ if (err)
+ break;
+
+ if (out_buf.size == out_buf.pos) {
+ out_buf.size = dctx.avail_out;
+ out_buf.pos = 0;
+ }
+ in_buf.size = dctx.inbuf_sz;
+ in_buf.pos = dctx.inbuf_pos;
+
+ zerr = zstd_decompress_stream(stream, &out_buf, &in_buf);
+ if (zstd_is_error(zerr) || (!zerr && rq->outputsize)) {
+ erofs_err(sb, "failed to decompress in[%u] out[%u]: %s",
+ rq->inputsize, rq->outputsize,
+ zerr ? zstd_get_error_name(zerr) : "unexpected end of stream");
+ err = -EFSCORRUPTED;
+ break;
+ }
+ } while (rq->outputsize || out_buf.pos < out_buf.size);
+
+ if (dctx.kout)
+ kunmap_local(dctx.kout);
+failed_zinit:
+ kunmap_local(dctx.kin);
+ /* 4. push back ZSTD stream context to the global list */
+ spin_lock(&z_erofs_zstd_lock);
+ strm->next = z_erofs_zstd_head;
+ z_erofs_zstd_head = strm;
+ spin_unlock(&z_erofs_zstd_lock);
+ wake_up(&z_erofs_zstd_wq);
+ return err;
+}
+
+const struct z_erofs_decompressor z_erofs_zstd_decomp = {
+ .config = z_erofs_load_zstd_config,
+ .decompress = z_erofs_zstd_decompress,
+ .init = z_erofs_zstd_init,
+ .exit = z_erofs_zstd_exit,
+ .name = "zstd",
+};
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
index b80abec0531a..c3b90abdee37 100644
--- a/fs/erofs/dir.c
+++ b/fs/erofs/dir.c
@@ -8,19 +8,15 @@
static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
void *dentry_blk, struct erofs_dirent *de,
- unsigned int nameoff, unsigned int maxsize)
+ unsigned int nameoff0, unsigned int maxsize)
{
- const struct erofs_dirent *end = dentry_blk + nameoff;
+ const struct erofs_dirent *end = dentry_blk + nameoff0;
while (de < end) {
- const char *de_name;
+ unsigned char d_type = fs_ftype_to_dtype(de->file_type);
+ unsigned int nameoff = le16_to_cpu(de->nameoff);
+ const char *de_name = (char *)dentry_blk + nameoff;
unsigned int de_namelen;
- unsigned char d_type;
-
- d_type = fs_ftype_to_dtype(de->file_type);
-
- nameoff = le16_to_cpu(de->nameoff);
- de_name = (char *)dentry_blk + nameoff;
/* the last dirent in the block? */
if (de + 1 >= end)
@@ -52,21 +48,20 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct super_block *sb = dir->i_sb;
unsigned long bsz = sb->s_blocksize;
- const size_t dirsize = i_size_read(dir);
- unsigned int i = erofs_blknr(sb, ctx->pos);
unsigned int ofs = erofs_blkoff(sb, ctx->pos);
int err = 0;
bool initial = true;
- buf.inode = dir;
- while (ctx->pos < dirsize) {
+ buf.mapping = dir->i_mapping;
+ while (ctx->pos < dir->i_size) {
+ erofs_off_t dbstart = ctx->pos - ofs;
struct erofs_dirent *de;
unsigned int nameoff, maxsize;
- de = erofs_bread(&buf, i, EROFS_KMAP);
+ de = erofs_bread(&buf, dbstart, EROFS_KMAP);
if (IS_ERR(de)) {
erofs_err(sb, "fail to readdir of logical block %u of nid %llu",
- i, EROFS_I(dir)->nid);
+ erofs_blknr(sb, dbstart), EROFS_I(dir)->nid);
err = PTR_ERR(de);
break;
}
@@ -79,25 +74,19 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
break;
}
- maxsize = min_t(unsigned int, dirsize - ctx->pos + ofs, bsz);
-
+ maxsize = min_t(unsigned int, dir->i_size - dbstart, bsz);
/* search dirents at the arbitrary position */
if (initial) {
initial = false;
-
ofs = roundup(ofs, sizeof(struct erofs_dirent));
- ctx->pos = erofs_pos(sb, i) + ofs;
- if (ofs >= nameoff)
- goto skip_this;
+ ctx->pos = dbstart + ofs;
}
err = erofs_fill_dentries(dir, ctx, de, (void *)de + ofs,
nameoff, maxsize);
if (err)
break;
-skip_this:
- ctx->pos = erofs_pos(sb, i) + maxsize;
- ++i;
+ ctx->pos = dbstart + maxsize;
ofs = 0;
}
erofs_put_metabuf(&buf);
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index a03ec70ba6f2..c8f2ae845bd2 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -288,14 +288,18 @@ struct erofs_dirent {
#define EROFS_NAME_LEN 255
-/* maximum supported size of a physical compression cluster */
+/* maximum supported encoded size of a physical compressed cluster */
#define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024)
+/* maximum supported decoded size of a physical compressed cluster */
+#define Z_EROFS_PCLUSTER_MAX_DSIZE (12 * 1024 * 1024)
+
/* available compression algorithm types (for h_algorithmtype) */
enum {
Z_EROFS_COMPRESSION_LZ4 = 0,
Z_EROFS_COMPRESSION_LZMA = 1,
Z_EROFS_COMPRESSION_DEFLATE = 2,
+ Z_EROFS_COMPRESSION_ZSTD = 3,
Z_EROFS_COMPRESSION_MAX
};
#define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1)
@@ -322,6 +326,15 @@ struct z_erofs_deflate_cfgs {
u8 reserved[5];
} __packed;
+/* 6 bytes (+ length field = 8 bytes) */
+struct z_erofs_zstd_cfgs {
+ u8 format;
+ u8 windowlog; /* windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN(10) */
+ u8 reserved[4];
+} __packed;
+
+#define Z_EROFS_ZSTD_MAX_DICT_SIZE Z_EROFS_PCLUSTER_MAX_SIZE
+
/*
* bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
* e.g. for 4k logical cluster size, 4B if compacted 2B is off;
@@ -396,8 +409,7 @@ enum {
Z_EROFS_LCLUSTER_TYPE_MAX
};
-#define Z_EROFS_LI_LCLUSTER_TYPE_BITS 2
-#define Z_EROFS_LI_LCLUSTER_TYPE_BIT 0
+#define Z_EROFS_LI_LCLUSTER_TYPE_MASK (Z_EROFS_LCLUSTER_TYPE_MAX - 1)
/* (noncompact only, HEAD) This pcluster refers to partial decompressed data */
#define Z_EROFS_LI_PARTIAL_REF (1 << 15)
@@ -451,8 +463,6 @@ static inline void erofs_check_ondisk_layout_definitions(void)
sizeof(struct z_erofs_lcluster_index));
BUILD_BUG_ON(sizeof(struct erofs_deviceslot) != 128);
- BUILD_BUG_ON(BIT(Z_EROFS_LI_LCLUSTER_TYPE_BITS) <
- Z_EROFS_LCLUSTER_TYPE_MAX - 1);
/* exclude old compiler versions like gcc 7.5.0 */
BUILD_BUG_ON(__builtin_constant_p(fmh) ?
fmh != cpu_to_le64(1ULL << 63) : 0);
diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c
new file mode 100644
index 000000000000..c865a7a61030
--- /dev/null
+++ b/fs/erofs/fileio.c
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2024, Alibaba Cloud
+ */
+#include "internal.h"
+#include <trace/events/erofs.h>
+
+struct erofs_fileio_rq {
+ struct bio_vec bvecs[BIO_MAX_VECS];
+ struct bio bio;
+ struct kiocb iocb;
+ struct super_block *sb;
+};
+
+struct erofs_fileio {
+ struct erofs_map_blocks map;
+ struct erofs_map_dev dev;
+ struct erofs_fileio_rq *rq;
+};
+
+static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret)
+{
+ struct erofs_fileio_rq *rq =
+ container_of(iocb, struct erofs_fileio_rq, iocb);
+ struct folio_iter fi;
+
+ if (ret > 0) {
+ if (ret != rq->bio.bi_iter.bi_size) {
+ bio_advance(&rq->bio, ret);
+ zero_fill_bio(&rq->bio);
+ }
+ ret = 0;
+ }
+ if (rq->bio.bi_end_io) {
+ if (ret < 0 && !rq->bio.bi_status)
+ rq->bio.bi_status = errno_to_blk_status(ret);
+ rq->bio.bi_end_io(&rq->bio);
+ } else {
+ bio_for_each_folio_all(fi, &rq->bio) {
+ DBG_BUGON(folio_test_uptodate(fi.folio));
+ erofs_onlinefolio_end(fi.folio, ret, false);
+ }
+ }
+ bio_uninit(&rq->bio);
+ kfree(rq);
+}
+
+static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq)
+{
+ struct iov_iter iter;
+ int ret;
+
+ if (!rq)
+ return;
+ rq->iocb.ki_pos = rq->bio.bi_iter.bi_sector << SECTOR_SHIFT;
+ rq->iocb.ki_ioprio = get_current_ioprio();
+ rq->iocb.ki_complete = erofs_fileio_ki_complete;
+ if (test_opt(&EROFS_SB(rq->sb)->opt, DIRECT_IO) &&
+ rq->iocb.ki_filp->f_mode & FMODE_CAN_ODIRECT)
+ rq->iocb.ki_flags = IOCB_DIRECT;
+ iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt,
+ rq->bio.bi_iter.bi_size);
+ ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter);
+ if (ret != -EIOCBQUEUED)
+ erofs_fileio_ki_complete(&rq->iocb, ret);
+}
+
+static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev)
+{
+ struct erofs_fileio_rq *rq = kzalloc(sizeof(*rq),
+ GFP_KERNEL | __GFP_NOFAIL);
+
+ bio_init(&rq->bio, NULL, rq->bvecs, BIO_MAX_VECS, REQ_OP_READ);
+ rq->iocb.ki_filp = mdev->m_dif->file;
+ rq->sb = mdev->m_sb;
+ return rq;
+}
+
+struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev)
+{
+ return &erofs_fileio_rq_alloc(mdev)->bio;
+}
+
+void erofs_fileio_submit_bio(struct bio *bio)
+{
+ return erofs_fileio_rq_submit(container_of(bio, struct erofs_fileio_rq,
+ bio));
+}
+
+static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio)
+{
+ struct inode *inode = folio_inode(folio);
+ struct erofs_map_blocks *map = &io->map;
+ unsigned int cur = 0, end = folio_size(folio), len, attached = 0;
+ loff_t pos = folio_pos(folio), ofs;
+ struct iov_iter iter;
+ struct bio_vec bv;
+ int err = 0;
+
+ erofs_onlinefolio_init(folio);
+ while (cur < end) {
+ if (!in_range(pos + cur, map->m_la, map->m_llen)) {
+ map->m_la = pos + cur;
+ map->m_llen = end - cur;
+ err = erofs_map_blocks(inode, map);
+ if (err)
+ break;
+ }
+
+ ofs = folio_pos(folio) + cur - map->m_la;
+ len = min_t(loff_t, map->m_llen - ofs, end - cur);
+ if (map->m_flags & EROFS_MAP_META) {
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ void *src;
+
+ src = erofs_read_metabuf(&buf, inode->i_sb,
+ map->m_pa + ofs, EROFS_KMAP);
+ if (IS_ERR(src)) {
+ err = PTR_ERR(src);
+ break;
+ }
+ bvec_set_folio(&bv, folio, len, cur);
+ iov_iter_bvec(&iter, ITER_DEST, &bv, 1, len);
+ if (copy_to_iter(src, len, &iter) != len) {
+ erofs_put_metabuf(&buf);
+ err = -EIO;
+ break;
+ }
+ erofs_put_metabuf(&buf);
+ } else if (!(map->m_flags & EROFS_MAP_MAPPED)) {
+ folio_zero_segment(folio, cur, cur + len);
+ attached = 0;
+ } else {
+ if (io->rq && (map->m_pa + ofs != io->dev.m_pa ||
+ map->m_deviceid != io->dev.m_deviceid)) {
+io_retry:
+ erofs_fileio_rq_submit(io->rq);
+ io->rq = NULL;
+ }
+
+ if (!io->rq) {
+ io->dev = (struct erofs_map_dev) {
+ .m_pa = io->map.m_pa + ofs,
+ .m_deviceid = io->map.m_deviceid,
+ };
+ err = erofs_map_dev(inode->i_sb, &io->dev);
+ if (err)
+ break;
+ io->rq = erofs_fileio_rq_alloc(&io->dev);
+ io->rq->bio.bi_iter.bi_sector = io->dev.m_pa >> 9;
+ attached = 0;
+ }
+ if (!bio_add_folio(&io->rq->bio, folio, len, cur))
+ goto io_retry;
+ if (!attached++)
+ erofs_onlinefolio_split(folio);
+ io->dev.m_pa += len;
+ }
+ cur += len;
+ }
+ erofs_onlinefolio_end(folio, err, false);
+ return err;
+}
+
+static int erofs_fileio_read_folio(struct file *file, struct folio *folio)
+{
+ struct erofs_fileio io = {};
+ int err;
+
+ trace_erofs_read_folio(folio, true);
+ err = erofs_fileio_scan_folio(&io, folio);
+ erofs_fileio_rq_submit(io.rq);
+ return err;
+}
+
+static void erofs_fileio_readahead(struct readahead_control *rac)
+{
+ struct inode *inode = rac->mapping->host;
+ struct erofs_fileio io = {};
+ struct folio *folio;
+ int err;
+
+ trace_erofs_readahead(inode, readahead_index(rac),
+ readahead_count(rac), true);
+ while ((folio = readahead_folio(rac))) {
+ err = erofs_fileio_scan_folio(&io, folio);
+ if (err && err != -EINTR)
+ erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
+ folio->index, EROFS_I(inode)->nid);
+ }
+ erofs_fileio_rq_submit(io.rq);
+}
+
+const struct address_space_operations erofs_fileio_aops = {
+ .read_folio = erofs_fileio_read_folio,
+ .readahead = erofs_fileio_readahead,
+};
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index afc37c9029ce..ce3d8737df85 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -25,9 +25,15 @@ static struct file_system_type erofs_anon_fs_type = {
.kill_sb = kill_anon_super,
};
-struct erofs_fscache_request {
- struct erofs_fscache_request *primary;
- struct netfs_cache_resources cache_resources;
+struct erofs_fscache_io {
+ struct netfs_cache_resources cres;
+ struct iov_iter iter;
+ netfs_io_terminated_t end_io;
+ void *private;
+ refcount_t ref;
+};
+
+struct erofs_fscache_rq {
struct address_space *mapping; /* The mapping being accessed */
loff_t start; /* Start position */
size_t len; /* Length of the request */
@@ -36,44 +42,17 @@ struct erofs_fscache_request {
refcount_t ref;
};
-static struct erofs_fscache_request *erofs_fscache_req_alloc(struct address_space *mapping,
- loff_t start, size_t len)
+static bool erofs_fscache_io_put(struct erofs_fscache_io *io)
{
- struct erofs_fscache_request *req;
-
- req = kzalloc(sizeof(struct erofs_fscache_request), GFP_KERNEL);
- if (!req)
- return ERR_PTR(-ENOMEM);
-
- req->mapping = mapping;
- req->start = start;
- req->len = len;
- refcount_set(&req->ref, 1);
-
- return req;
-}
-
-static struct erofs_fscache_request *erofs_fscache_req_chain(struct erofs_fscache_request *primary,
- size_t len)
-{
- struct erofs_fscache_request *req;
-
- /* use primary request for the first submission */
- if (!primary->submitted) {
- refcount_inc(&primary->ref);
- return primary;
- }
-
- req = erofs_fscache_req_alloc(primary->mapping,
- primary->start + primary->submitted, len);
- if (!IS_ERR(req)) {
- req->primary = primary;
- refcount_inc(&primary->ref);
- }
- return req;
+ if (!refcount_dec_and_test(&io->ref))
+ return false;
+ if (io->cres.ops)
+ io->cres.ops->end_operation(&io->cres);
+ kfree(io);
+ return true;
}
-static void erofs_fscache_req_complete(struct erofs_fscache_request *req)
+static void erofs_fscache_req_complete(struct erofs_fscache_rq *req)
{
struct folio *folio;
bool failed = req->error;
@@ -93,120 +72,196 @@ static void erofs_fscache_req_complete(struct erofs_fscache_request *req)
rcu_read_unlock();
}
-static void erofs_fscache_req_put(struct erofs_fscache_request *req)
+static void erofs_fscache_req_put(struct erofs_fscache_rq *req)
{
- if (refcount_dec_and_test(&req->ref)) {
- if (req->cache_resources.ops)
- req->cache_resources.ops->end_operation(&req->cache_resources);
- if (!req->primary)
- erofs_fscache_req_complete(req);
- else
- erofs_fscache_req_put(req->primary);
- kfree(req);
- }
+ if (!refcount_dec_and_test(&req->ref))
+ return;
+ erofs_fscache_req_complete(req);
+ kfree(req);
+}
+
+static struct erofs_fscache_rq *erofs_fscache_req_alloc(struct address_space *mapping,
+ loff_t start, size_t len)
+{
+ struct erofs_fscache_rq *req = kzalloc(sizeof(*req), GFP_KERNEL);
+
+ if (!req)
+ return NULL;
+ req->mapping = mapping;
+ req->start = start;
+ req->len = len;
+ refcount_set(&req->ref, 1);
+ return req;
+}
+
+static void erofs_fscache_req_io_put(struct erofs_fscache_io *io)
+{
+ struct erofs_fscache_rq *req = io->private;
+
+ if (erofs_fscache_io_put(io))
+ erofs_fscache_req_put(req);
}
-static void erofs_fscache_subreq_complete(void *priv,
+static void erofs_fscache_req_end_io(void *priv,
ssize_t transferred_or_error, bool was_async)
{
- struct erofs_fscache_request *req = priv;
+ struct erofs_fscache_io *io = priv;
+ struct erofs_fscache_rq *req = io->private;
- if (IS_ERR_VALUE(transferred_or_error)) {
- if (req->primary)
- req->primary->error = transferred_or_error;
- else
- req->error = transferred_or_error;
- }
- erofs_fscache_req_put(req);
+ if (IS_ERR_VALUE(transferred_or_error))
+ req->error = transferred_or_error;
+ erofs_fscache_req_io_put(io);
+}
+
+static struct erofs_fscache_io *erofs_fscache_req_io_alloc(struct erofs_fscache_rq *req)
+{
+ struct erofs_fscache_io *io = kzalloc(sizeof(*io), GFP_KERNEL);
+
+ if (!io)
+ return NULL;
+ io->end_io = erofs_fscache_req_end_io;
+ io->private = req;
+ refcount_inc(&req->ref);
+ refcount_set(&io->ref, 1);
+ return io;
}
/*
- * Read data from fscache (cookie, pstart, len), and fill the read data into
- * page cache described by (req->mapping, lstart, len). @pstart describeis the
- * start physical address in the cache file.
+ * Read data from fscache described by cookie at pstart physical address
+ * offset, and fill the read data into buffer described by io->iter.
*/
-static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie,
- struct erofs_fscache_request *req, loff_t pstart, size_t len)
+static int erofs_fscache_read_io_async(struct fscache_cookie *cookie,
+ loff_t pstart, struct erofs_fscache_io *io)
{
enum netfs_io_source source;
- struct super_block *sb = req->mapping->host->i_sb;
- struct netfs_cache_resources *cres = &req->cache_resources;
- struct iov_iter iter;
- loff_t lstart = req->start + req->submitted;
- size_t done = 0;
+ struct netfs_cache_resources *cres = &io->cres;
+ struct iov_iter *iter = &io->iter;
int ret;
- DBG_BUGON(len > req->len - req->submitted);
-
ret = fscache_begin_read_operation(cres, cookie);
if (ret)
return ret;
- while (done < len) {
- loff_t sstart = pstart + done;
- size_t slen = len - done;
+ while (iov_iter_count(iter)) {
+ size_t orig_count = iov_iter_count(iter), len = orig_count;
unsigned long flags = 1 << NETFS_SREQ_ONDEMAND;
source = cres->ops->prepare_ondemand_read(cres,
- sstart, &slen, LLONG_MAX, &flags, 0);
- if (WARN_ON(slen == 0))
+ pstart, &len, LLONG_MAX, &flags, 0);
+ if (WARN_ON(len == 0))
source = NETFS_INVALID_READ;
if (source != NETFS_READ_FROM_CACHE) {
- erofs_err(sb, "failed to fscache prepare_read (source %d)", source);
+ erofs_err(NULL, "prepare_ondemand_read failed (source %d)", source);
return -EIO;
}
- refcount_inc(&req->ref);
- iov_iter_xarray(&iter, ITER_DEST, &req->mapping->i_pages,
- lstart + done, slen);
-
- ret = fscache_read(cres, sstart, &iter, NETFS_READ_HOLE_FAIL,
- erofs_fscache_subreq_complete, req);
+ iov_iter_truncate(iter, len);
+ refcount_inc(&io->ref);
+ ret = fscache_read(cres, pstart, iter, NETFS_READ_HOLE_FAIL,
+ io->end_io, io);
if (ret == -EIOCBQUEUED)
ret = 0;
if (ret) {
- erofs_err(sb, "failed to fscache_read (ret %d)", ret);
+ erofs_err(NULL, "fscache_read failed (ret %d)", ret);
return ret;
}
+ if (WARN_ON(iov_iter_count(iter)))
+ return -EIO;
- done += slen;
+ iov_iter_reexpand(iter, orig_count - len);
+ pstart += len;
}
- DBG_BUGON(done != len);
return 0;
}
-static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
+struct erofs_fscache_bio {
+ struct erofs_fscache_io io;
+ struct bio bio; /* w/o bdev to share bio_add_page/endio() */
+ struct bio_vec bvecs[BIO_MAX_VECS];
+};
+
+static void erofs_fscache_bio_endio(void *priv,
+ ssize_t transferred_or_error, bool was_async)
+{
+ struct erofs_fscache_bio *io = priv;
+
+ if (IS_ERR_VALUE(transferred_or_error))
+ io->bio.bi_status = errno_to_blk_status(transferred_or_error);
+ io->bio.bi_end_io(&io->bio);
+ BUILD_BUG_ON(offsetof(struct erofs_fscache_bio, io) != 0);
+ erofs_fscache_io_put(&io->io);
+}
+
+struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev)
{
+ struct erofs_fscache_bio *io;
+
+ io = kmalloc(sizeof(*io), GFP_KERNEL | __GFP_NOFAIL);
+ bio_init(&io->bio, NULL, io->bvecs, BIO_MAX_VECS, REQ_OP_READ);
+ io->io.private = mdev->m_dif->fscache->cookie;
+ io->io.end_io = erofs_fscache_bio_endio;
+ refcount_set(&io->io.ref, 1);
+ return &io->bio;
+}
+
+void erofs_fscache_submit_bio(struct bio *bio)
+{
+ struct erofs_fscache_bio *io = container_of(bio,
+ struct erofs_fscache_bio, bio);
int ret;
- struct erofs_fscache *ctx = folio_mapping(folio)->host->i_private;
- struct erofs_fscache_request *req;
- req = erofs_fscache_req_alloc(folio_mapping(folio),
+ iov_iter_bvec(&io->io.iter, ITER_DEST, io->bvecs, bio->bi_vcnt,
+ bio->bi_iter.bi_size);
+ ret = erofs_fscache_read_io_async(io->io.private,
+ bio->bi_iter.bi_sector << 9, &io->io);
+ erofs_fscache_io_put(&io->io);
+ if (!ret)
+ return;
+ bio->bi_status = errno_to_blk_status(ret);
+ bio->bi_end_io(bio);
+}
+
+static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
+{
+ struct erofs_fscache *ctx = folio->mapping->host->i_private;
+ int ret = -ENOMEM;
+ struct erofs_fscache_rq *req;
+ struct erofs_fscache_io *io;
+
+ req = erofs_fscache_req_alloc(folio->mapping,
folio_pos(folio), folio_size(folio));
- if (IS_ERR(req)) {
+ if (!req) {
folio_unlock(folio);
- return PTR_ERR(req);
+ return ret;
}
- ret = erofs_fscache_read_folios_async(ctx->cookie, req,
- folio_pos(folio), folio_size(folio));
+ io = erofs_fscache_req_io_alloc(req);
+ if (!io) {
+ req->error = ret;
+ goto out;
+ }
+ iov_iter_xarray(&io->iter, ITER_DEST, &folio->mapping->i_pages,
+ folio_pos(folio), folio_size(folio));
+
+ ret = erofs_fscache_read_io_async(ctx->cookie, folio_pos(folio), io);
if (ret)
req->error = ret;
+ erofs_fscache_req_io_put(io);
+out:
erofs_fscache_req_put(req);
return ret;
}
-static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary)
+static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req)
{
- struct address_space *mapping = primary->mapping;
+ struct address_space *mapping = req->mapping;
struct inode *inode = mapping->host;
struct super_block *sb = inode->i_sb;
- struct erofs_fscache_request *req;
+ struct erofs_fscache_io *io;
struct erofs_map_blocks map;
struct erofs_map_dev mdev;
- struct iov_iter iter;
- loff_t pos = primary->start + primary->submitted;
+ loff_t pos = req->start + req->submitted;
size_t count;
int ret;
@@ -217,35 +272,32 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary)
if (map.m_flags & EROFS_MAP_META) {
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- erofs_blk_t blknr;
- size_t offset, size;
+ struct iov_iter iter;
+ size_t size = map.m_llen;
void *src;
- /* For tail packing layout, the offset may be non-zero. */
- offset = erofs_blkoff(sb, map.m_pa);
- blknr = erofs_blknr(sb, map.m_pa);
- size = map.m_llen;
-
- src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP);
+ src = erofs_read_metabuf(&buf, sb, map.m_pa, EROFS_KMAP);
if (IS_ERR(src))
return PTR_ERR(src);
iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, PAGE_SIZE);
- if (copy_to_iter(src + offset, size, &iter) != size) {
+ if (copy_to_iter(src, size, &iter) != size) {
erofs_put_metabuf(&buf);
return -EFAULT;
}
iov_iter_zero(PAGE_SIZE - size, &iter);
erofs_put_metabuf(&buf);
- primary->submitted += PAGE_SIZE;
+ req->submitted += PAGE_SIZE;
return 0;
}
- count = primary->len - primary->submitted;
+ count = req->len - req->submitted;
if (!(map.m_flags & EROFS_MAP_MAPPED)) {
+ struct iov_iter iter;
+
iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, count);
iov_iter_zero(count, &iter);
- primary->submitted += count;
+ req->submitted += count;
return 0;
}
@@ -260,18 +312,19 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary)
if (ret)
return ret;
- req = erofs_fscache_req_chain(primary, count);
- if (IS_ERR(req))
- return PTR_ERR(req);
+ io = erofs_fscache_req_io_alloc(req);
+ if (!io)
+ return -ENOMEM;
+ iov_iter_xarray(&io->iter, ITER_DEST, &mapping->i_pages, pos, count);
+ ret = erofs_fscache_read_io_async(mdev.m_dif->fscache->cookie,
+ mdev.m_pa + (pos - map.m_la), io);
+ erofs_fscache_req_io_put(io);
- ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
- req, mdev.m_pa + (pos - map.m_la), count);
- erofs_fscache_req_put(req);
- primary->submitted += count;
+ req->submitted += count;
return ret;
}
-static int erofs_fscache_data_read(struct erofs_fscache_request *req)
+static int erofs_fscache_data_read(struct erofs_fscache_rq *req)
{
int ret;
@@ -280,20 +333,19 @@ static int erofs_fscache_data_read(struct erofs_fscache_request *req)
if (ret)
req->error = ret;
} while (!ret && req->submitted < req->len);
-
return ret;
}
static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
{
- struct erofs_fscache_request *req;
+ struct erofs_fscache_rq *req;
int ret;
- req = erofs_fscache_req_alloc(folio_mapping(folio),
+ req = erofs_fscache_req_alloc(folio->mapping,
folio_pos(folio), folio_size(folio));
- if (IS_ERR(req)) {
+ if (!req) {
folio_unlock(folio);
- return PTR_ERR(req);
+ return -ENOMEM;
}
ret = erofs_fscache_data_read(req);
@@ -303,14 +355,14 @@ static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
static void erofs_fscache_readahead(struct readahead_control *rac)
{
- struct erofs_fscache_request *req;
+ struct erofs_fscache_rq *req;
if (!readahead_count(rac))
return;
req = erofs_fscache_req_alloc(rac->mapping,
readahead_pos(rac), readahead_length(rac));
- if (IS_ERR(req))
+ if (!req)
return;
/* The request completion will drop refs on the folios. */
@@ -473,7 +525,7 @@ static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb
inode->i_size = OFFSET_MAX;
inode->i_mapping->a_ops = &erofs_fscache_meta_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
inode->i_blkbits = EROFS_SB(sb)->blkszbits;
inode->i_private = ctx;
@@ -605,7 +657,7 @@ int erofs_fscache_register_fs(struct super_block *sb)
if (IS_ERR(fscache))
return PTR_ERR(fscache);
- sbi->s_fscache = fscache;
+ sbi->dif0.fscache = fscache;
return 0;
}
@@ -613,14 +665,14 @@ void erofs_fscache_unregister_fs(struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
- erofs_fscache_unregister_cookie(sbi->s_fscache);
+ erofs_fscache_unregister_cookie(sbi->dif0.fscache);
if (sbi->domain)
erofs_fscache_domain_put(sbi->domain);
else
fscache_relinquish_volume(sbi->volume, NULL, false);
- sbi->s_fscache = NULL;
+ sbi->dif0.fscache = NULL;
sbi->volume = NULL;
sbi->domain = NULL;
}
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 9e40bee3682f..db29190656eb 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -5,39 +5,54 @@
* Copyright (C) 2021, Alibaba Cloud
*/
#include "xattr.h"
-
#include <trace/events/erofs.h>
-static void *erofs_read_inode(struct erofs_buf *buf,
- struct inode *inode, unsigned int *ofs)
+static int erofs_fill_symlink(struct inode *inode, void *kaddr,
+ unsigned int m_pofs)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+ loff_t off;
+
+ m_pofs += vi->xattr_isize;
+ /* check if it cannot be handled with fast symlink scheme */
+ if (vi->datalayout != EROFS_INODE_FLAT_INLINE ||
+ check_add_overflow(m_pofs, inode->i_size, &off) ||
+ off > i_blocksize(inode))
+ return 0;
+
+ inode->i_link = kmemdup_nul(kaddr + m_pofs, inode->i_size, GFP_KERNEL);
+ return inode->i_link ? 0 : -ENOMEM;
+}
+
+static int erofs_read_inode(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
struct erofs_sb_info *sbi = EROFS_SB(sb);
struct erofs_inode *vi = EROFS_I(inode);
const erofs_off_t inode_loc = erofs_iloc(inode);
-
erofs_blk_t blkaddr, nblks = 0;
void *kaddr;
struct erofs_inode_compact *dic;
struct erofs_inode_extended *die, *copied = NULL;
- unsigned int ifmt;
- int err;
+ union erofs_inode_i_u iu;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ unsigned int ifmt, ofs;
+ int err = 0;
blkaddr = erofs_blknr(sb, inode_loc);
- *ofs = erofs_blkoff(sb, inode_loc);
+ ofs = erofs_blkoff(sb, inode_loc);
- kaddr = erofs_read_metabuf(buf, sb, blkaddr, EROFS_KMAP);
+ kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), EROFS_KMAP);
if (IS_ERR(kaddr)) {
erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld",
vi->nid, PTR_ERR(kaddr));
- return kaddr;
+ return PTR_ERR(kaddr);
}
- dic = kaddr + *ofs;
+ dic = kaddr + ofs;
ifmt = le16_to_cpu(dic->i_format);
-
if (ifmt & ~EROFS_I_ALL) {
- erofs_err(inode->i_sb, "unsupported i_format %u of nid %llu",
+ erofs_err(sb, "unsupported i_format %u of nid %llu",
ifmt, vi->nid);
err = -EOPNOTSUPP;
goto err_out;
@@ -45,7 +60,7 @@ static void *erofs_read_inode(struct erofs_buf *buf,
vi->datalayout = erofs_inode_datalayout(ifmt);
if (vi->datalayout >= EROFS_INODE_DATALAYOUT_MAX) {
- erofs_err(inode->i_sb, "unsupported datalayout %u of nid %llu",
+ erofs_err(sb, "unsupported datalayout %u of nid %llu",
vi->datalayout, vi->nid);
err = -EOPNOTSUPP;
goto err_out;
@@ -55,119 +70,105 @@ static void *erofs_read_inode(struct erofs_buf *buf,
case EROFS_INODE_LAYOUT_EXTENDED:
vi->inode_isize = sizeof(struct erofs_inode_extended);
/* check if the extended inode acrosses block boundary */
- if (*ofs + vi->inode_isize <= sb->s_blocksize) {
- *ofs += vi->inode_isize;
+ if (ofs + vi->inode_isize <= sb->s_blocksize) {
+ ofs += vi->inode_isize;
die = (struct erofs_inode_extended *)dic;
} else {
- const unsigned int gotten = sb->s_blocksize - *ofs;
+ const unsigned int gotten = sb->s_blocksize - ofs;
- copied = kmalloc(vi->inode_isize, GFP_NOFS);
+ copied = kmalloc(vi->inode_isize, GFP_KERNEL);
if (!copied) {
err = -ENOMEM;
goto err_out;
}
memcpy(copied, dic, gotten);
- kaddr = erofs_read_metabuf(buf, sb, blkaddr + 1,
+ kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr + 1),
EROFS_KMAP);
if (IS_ERR(kaddr)) {
erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld",
vi->nid, PTR_ERR(kaddr));
kfree(copied);
- return kaddr;
+ return PTR_ERR(kaddr);
}
- *ofs = vi->inode_isize - gotten;
- memcpy((u8 *)copied + gotten, kaddr, *ofs);
+ ofs = vi->inode_isize - gotten;
+ memcpy((u8 *)copied + gotten, kaddr, ofs);
die = copied;
}
vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount);
inode->i_mode = le16_to_cpu(die->i_mode);
- switch (inode->i_mode & S_IFMT) {
- case S_IFREG:
- case S_IFDIR:
- case S_IFLNK:
- vi->raw_blkaddr = le32_to_cpu(die->i_u.raw_blkaddr);
- break;
- case S_IFCHR:
- case S_IFBLK:
- inode->i_rdev =
- new_decode_dev(le32_to_cpu(die->i_u.rdev));
- break;
- case S_IFIFO:
- case S_IFSOCK:
- inode->i_rdev = 0;
- break;
- default:
- goto bogusimode;
- }
+ iu = die->i_u;
i_uid_write(inode, le32_to_cpu(die->i_uid));
i_gid_write(inode, le32_to_cpu(die->i_gid));
set_nlink(inode, le32_to_cpu(die->i_nlink));
-
- /* extended inode has its own timestamp */
+ /* each extended inode has its own timestamp */
inode_set_ctime(inode, le64_to_cpu(die->i_mtime),
le32_to_cpu(die->i_mtime_nsec));
inode->i_size = le64_to_cpu(die->i_size);
-
- /* total blocks for compressed files */
- if (erofs_inode_is_data_compressed(vi->datalayout))
- nblks = le32_to_cpu(die->i_u.compressed_blocks);
- else if (vi->datalayout == EROFS_INODE_CHUNK_BASED)
- /* fill chunked inode summary info */
- vi->chunkformat = le16_to_cpu(die->i_u.c.format);
kfree(copied);
- copied = NULL;
break;
case EROFS_INODE_LAYOUT_COMPACT:
vi->inode_isize = sizeof(struct erofs_inode_compact);
- *ofs += vi->inode_isize;
+ ofs += vi->inode_isize;
vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount);
inode->i_mode = le16_to_cpu(dic->i_mode);
- switch (inode->i_mode & S_IFMT) {
- case S_IFREG:
- case S_IFDIR:
- case S_IFLNK:
- vi->raw_blkaddr = le32_to_cpu(dic->i_u.raw_blkaddr);
- break;
- case S_IFCHR:
- case S_IFBLK:
- inode->i_rdev =
- new_decode_dev(le32_to_cpu(dic->i_u.rdev));
- break;
- case S_IFIFO:
- case S_IFSOCK:
- inode->i_rdev = 0;
- break;
- default:
- goto bogusimode;
- }
+ iu = dic->i_u;
i_uid_write(inode, le16_to_cpu(dic->i_uid));
i_gid_write(inode, le16_to_cpu(dic->i_gid));
set_nlink(inode, le16_to_cpu(dic->i_nlink));
-
/* use build time for compact inodes */
inode_set_ctime(inode, sbi->build_time, sbi->build_time_nsec);
inode->i_size = le32_to_cpu(dic->i_size);
- if (erofs_inode_is_data_compressed(vi->datalayout))
- nblks = le32_to_cpu(dic->i_u.compressed_blocks);
- else if (vi->datalayout == EROFS_INODE_CHUNK_BASED)
- vi->chunkformat = le16_to_cpu(dic->i_u.c.format);
break;
default:
- erofs_err(inode->i_sb,
- "unsupported on-disk inode version %u of nid %llu",
+ erofs_err(sb, "unsupported on-disk inode version %u of nid %llu",
erofs_inode_version(ifmt), vi->nid);
err = -EOPNOTSUPP;
goto err_out;
}
- if (vi->datalayout == EROFS_INODE_CHUNK_BASED) {
+ if (unlikely(inode->i_size < 0)) {
+ erofs_err(sb, "negative i_size @ nid %llu", vi->nid);
+ err = -EFSCORRUPTED;
+ goto err_out;
+ }
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFDIR:
+ case S_IFLNK:
+ vi->raw_blkaddr = le32_to_cpu(iu.raw_blkaddr);
+ if(S_ISLNK(inode->i_mode)) {
+ err = erofs_fill_symlink(inode, kaddr, ofs);
+ if (err)
+ goto err_out;
+ }
+ break;
+ case S_IFCHR:
+ case S_IFBLK:
+ inode->i_rdev = new_decode_dev(le32_to_cpu(iu.rdev));
+ break;
+ case S_IFIFO:
+ case S_IFSOCK:
+ inode->i_rdev = 0;
+ break;
+ default:
+ erofs_err(sb, "bogus i_mode (%o) @ nid %llu", inode->i_mode,
+ vi->nid);
+ err = -EFSCORRUPTED;
+ goto err_out;
+ }
+
+ /* total blocks for compressed files */
+ if (erofs_inode_is_data_compressed(vi->datalayout)) {
+ nblks = le32_to_cpu(iu.compressed_blocks);
+ } else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) {
+ /* fill chunked inode summary info */
+ vi->chunkformat = le16_to_cpu(iu.c.format);
if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL) {
- erofs_err(inode->i_sb,
- "unsupported chunk format %x of nid %llu",
+ erofs_err(sb, "unsupported chunk format %x of nid %llu",
vi->chunkformat, vi->nid);
err = -EOPNOTSUPP;
goto err_out;
@@ -175,7 +176,8 @@ static void *erofs_read_inode(struct erofs_buf *buf,
vi->chunkbits = sb->s_blocksize_bits +
(vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK);
}
- inode->i_mtime = inode->i_atime = inode_get_ctime(inode);
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_get_ctime(inode)));
inode->i_flags &= ~S_DAX;
if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
@@ -188,61 +190,23 @@ static void *erofs_read_inode(struct erofs_buf *buf,
inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9;
else
inode->i_blocks = nblks << (sb->s_blocksize_bits - 9);
- return kaddr;
-
-bogusimode:
- erofs_err(inode->i_sb, "bogus i_mode (%o) @ nid %llu",
- inode->i_mode, vi->nid);
- err = -EFSCORRUPTED;
err_out:
- DBG_BUGON(1);
- kfree(copied);
- erofs_put_metabuf(buf);
- return ERR_PTR(err);
-}
-
-static int erofs_fill_symlink(struct inode *inode, void *kaddr,
- unsigned int m_pofs)
-{
- struct erofs_inode *vi = EROFS_I(inode);
- loff_t off;
- char *lnk;
-
- m_pofs += vi->xattr_isize;
- /* check if it cannot be handled with fast symlink scheme */
- if (vi->datalayout != EROFS_INODE_FLAT_INLINE || inode->i_size < 0 ||
- check_add_overflow(m_pofs, inode->i_size, &off) ||
- off > i_blocksize(inode)) {
- inode->i_op = &erofs_symlink_iops;
- return 0;
- }
-
- lnk = kmalloc(inode->i_size + 1, GFP_KERNEL);
- if (!lnk)
- return -ENOMEM;
-
- memcpy(lnk, kaddr + m_pofs, inode->i_size);
- lnk[inode->i_size] = '\0';
-
- inode->i_link = lnk;
- inode->i_op = &erofs_fast_symlink_iops;
- return 0;
+ DBG_BUGON(err);
+ erofs_put_metabuf(&buf);
+ return err;
}
static int erofs_fill_inode(struct inode *inode)
{
struct erofs_inode *vi = EROFS_I(inode);
- struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- void *kaddr;
- unsigned int ofs;
- int err = 0;
+ int err;
trace_erofs_fill_inode(inode);
/* read inode base data from disk */
- kaddr = erofs_read_inode(&buf, inode, &ofs);
- if (IS_ERR(kaddr))
- return PTR_ERR(kaddr);
+ err = erofs_read_inode(inode);
+ if (err)
+ return err;
/* setup the new inode */
switch (inode->i_mode & S_IFMT) {
@@ -259,9 +223,10 @@ static int erofs_fill_inode(struct inode *inode)
inode_nohighmem(inode);
break;
case S_IFLNK:
- err = erofs_fill_symlink(inode, kaddr, ofs);
- if (err)
- goto out_unlock;
+ if (inode->i_link)
+ inode->i_op = &erofs_fast_symlink_iops;
+ else
+ inode->i_op = &erofs_symlink_iops;
inode_nohighmem(inode);
break;
case S_IFCHR:
@@ -270,33 +235,33 @@ static int erofs_fill_inode(struct inode *inode)
case S_IFSOCK:
inode->i_op = &erofs_generic_iops;
init_special_inode(inode, inode->i_mode, inode->i_rdev);
- goto out_unlock;
+ return 0;
default:
- err = -EFSCORRUPTED;
- goto out_unlock;
+ return -EFSCORRUPTED;
}
+ mapping_set_large_folios(inode->i_mapping);
if (erofs_inode_is_data_compressed(vi->datalayout)) {
#ifdef CONFIG_EROFS_FS_ZIP
- if (!erofs_is_fscache_mode(inode->i_sb) &&
- inode->i_sb->s_blocksize_bits == PAGE_SHIFT) {
- inode->i_mapping->a_ops = &z_erofs_aops;
- err = 0;
- goto out_unlock;
- }
-#endif
+ DO_ONCE_LITE_IF(inode->i_blkbits != PAGE_SHIFT,
+ erofs_info, inode->i_sb,
+ "EXPERIMENTAL EROFS subpage compressed block support in use. Use at your own risk!");
+ inode->i_mapping->a_ops = &z_erofs_aops;
+#else
err = -EOPNOTSUPP;
- goto out_unlock;
- }
- inode->i_mapping->a_ops = &erofs_raw_access_aops;
- mapping_set_large_folios(inode->i_mapping);
+#endif
+ } else {
+ inode->i_mapping->a_ops = &erofs_aops;
#ifdef CONFIG_EROFS_FS_ONDEMAND
- if (erofs_is_fscache_mode(inode->i_sb))
- inode->i_mapping->a_ops = &erofs_fscache_access_aops;
+ if (erofs_is_fscache_mode(inode->i_sb))
+ inode->i_mapping->a_ops = &erofs_fscache_access_aops;
+#endif
+#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
+ if (erofs_is_fileio_mode(EROFS_SB(inode->i_sb)))
+ inode->i_mapping->a_ops = &erofs_fileio_aops;
#endif
+ }
-out_unlock:
- erofs_put_metabuf(&buf);
return err;
}
@@ -353,14 +318,29 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
unsigned int query_flags)
{
struct inode *const inode = d_inode(path->dentry);
+ bool compressed =
+ erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout);
- if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout))
+ if (compressed)
stat->attributes |= STATX_ATTR_COMPRESSED;
-
stat->attributes |= STATX_ATTR_IMMUTABLE;
stat->attributes_mask |= (STATX_ATTR_COMPRESSED |
STATX_ATTR_IMMUTABLE);
+ /*
+ * Return the DIO alignment restrictions if requested.
+ *
+ * In EROFS, STATX_DIOALIGN is not supported in ondemand mode and
+ * compressed files, so in these cases we report no DIO support.
+ */
+ if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
+ stat->result_mask |= STATX_DIOALIGN;
+ if (!erofs_is_fscache_mode(inode->i_sb) && !compressed) {
+ stat->dio_mem_align =
+ bdev_logical_block_size(inode->i_sb->s_bdev);
+ stat->dio_offset_align = stat->dio_mem_align;
+ }
+ }
generic_fillattr(idmap, request_mask, inode, stat);
return 0;
}
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 787cc9ff9029..856463a702b2 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -8,8 +8,10 @@
#define __EROFS_INTERNAL_H
#include <linux/fs.h>
+#include <linux/dax.h>
#include <linux/dcache.h>
#include <linux/mm.h>
+#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/bio.h>
#include <linux/magic.h>
@@ -47,7 +49,7 @@ typedef u32 erofs_blk_t;
struct erofs_device_info {
char *path;
struct erofs_fscache *fscache;
- struct bdev_handle *bdev_handle;
+ struct file *file;
struct dax_device *dax_dev;
u64 dax_part_off;
@@ -62,15 +64,12 @@ enum {
};
struct erofs_mount_opts {
-#ifdef CONFIG_EROFS_FS_ZIP
/* current strategy of how to use managed cache */
unsigned char cache_strategy;
/* strategy of sync decompression (0 - auto, 1 - force on, 2 - force off) */
unsigned int sync_decompress;
-
/* threshold for decompression synchronously */
unsigned int max_sync_decompress_pages;
-#endif
unsigned int mount_opt;
};
@@ -114,6 +113,7 @@ struct erofs_xattr_prefix_item {
};
struct erofs_sb_info {
+ struct erofs_device_info dif0;
struct erofs_mount_opts opt; /* options */
#ifdef CONFIG_EROFS_FS_ZIP
/* list for all registered superblocks, mainly for shrinker */
@@ -133,10 +133,7 @@ struct erofs_sb_info {
#endif /* CONFIG_EROFS_FS_ZIP */
struct inode *packed_inode;
struct erofs_dev_context *devs;
- struct dax_device *dax_dev;
- u64 dax_part_off;
u64 total_blocks;
- u32 primarydevice_blocks;
u32 meta_blkaddr;
#ifdef CONFIG_EROFS_FS_XATTR
@@ -172,7 +169,6 @@ struct erofs_sb_info {
/* fscache support */
struct fscache_volume *volume;
- struct erofs_fscache *s_fscache;
struct erofs_domain *domain;
char *fsid;
char *domain_id;
@@ -186,14 +182,21 @@ struct erofs_sb_info {
#define EROFS_MOUNT_POSIX_ACL 0x00000020
#define EROFS_MOUNT_DAX_ALWAYS 0x00000040
#define EROFS_MOUNT_DAX_NEVER 0x00000080
+#define EROFS_MOUNT_DIRECT_IO 0x00000100
#define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option)
#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option)
#define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option)
+static inline bool erofs_is_fileio_mode(struct erofs_sb_info *sbi)
+{
+ return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->dif0.file;
+}
+
static inline bool erofs_is_fscache_mode(struct super_block *sb)
{
- return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && !sb->s_bdev;
+ return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) &&
+ !erofs_is_fileio_mode(EROFS_SB(sb)) && !sb->s_bdev;
}
enum {
@@ -202,28 +205,21 @@ enum {
EROFS_ZIP_CACHE_READAROUND
};
-/* basic unit of the workstation of a super_block */
-struct erofs_workgroup {
- pgoff_t index;
- struct lockref lockref;
-};
-
enum erofs_kmap_type {
EROFS_NO_KMAP, /* don't map the buffer */
EROFS_KMAP, /* use kmap_local_page() to map the buffer */
};
struct erofs_buf {
- struct inode *inode;
+ struct address_space *mapping;
+ struct file *file;
struct page *page;
void *base;
enum erofs_kmap_type kmap_type;
};
#define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL })
-#define ROOT_NID(sb) ((sb)->root_nid)
-
-#define erofs_blknr(sb, addr) ((addr) >> (sb)->s_blocksize_bits)
+#define erofs_blknr(sb, addr) ((erofs_blk_t)((addr) >> (sb)->s_blocksize_bits))
#define erofs_blkoff(sb, addr) ((addr) & ((sb)->s_blocksize - 1))
#define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits)
#define erofs_iblks(i) (round_up((i)->i_size, i_blocksize(i)) >> (i)->i_blkbits)
@@ -281,13 +277,8 @@ struct erofs_inode {
unsigned char z_algorithmtype[2];
unsigned char z_logical_clusterbits;
unsigned long z_tailextent_headlcn;
- union {
- struct {
- erofs_off_t z_idataoff;
- unsigned short z_idata_size;
- };
- erofs_off_t z_fragmentoff;
- };
+ erofs_off_t z_fragmentoff;
+ unsigned short z_idata_size;
};
#endif /* CONFIG_EROFS_FS_ZIP */
};
@@ -315,17 +306,13 @@ static inline unsigned int erofs_inode_datalayout(unsigned int ifmt)
return (ifmt >> EROFS_I_DATALAYOUT_BIT) & EROFS_I_DATALAYOUT_MASK;
}
-/*
- * Different from grab_cache_page_nowait(), reclaiming is never triggered
- * when allocating new pages.
- */
-static inline
-struct page *erofs_grab_cache_page_nowait(struct address_space *mapping,
- pgoff_t index)
+/* reclaiming is never triggered when allocating new folios. */
+static inline struct folio *erofs_grab_folio_nowait(struct address_space *as,
+ pgoff_t index)
{
- return pagecache_get_page(mapping, index,
+ return __filemap_get_folio(as, index,
FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
- readahead_gfp_mask(mapping) & ~__GFP_RECLAIM);
+ readahead_gfp_mask(as) & ~__GFP_RECLAIM);
}
/* Has a disk mapping */
@@ -337,10 +324,12 @@ struct page *erofs_grab_cache_page_nowait(struct address_space *mapping,
/* The length of extent is full */
#define EROFS_MAP_FULL_MAPPED 0x0008
/* Located in the special packed inode */
-#define EROFS_MAP_FRAGMENT 0x0010
+#define __EROFS_MAP_FRAGMENT 0x0010
/* The extent refers to partial decompressed data */
#define EROFS_MAP_PARTIAL_REF 0x0020
+#define EROFS_MAP_FRAGMENT (EROFS_MAP_MAPPED | __EROFS_MAP_FRAGMENT)
+
struct erofs_map_blocks {
struct erofs_buf buf;
@@ -369,10 +358,9 @@ enum {
};
struct erofs_map_dev {
- struct erofs_fscache *m_fscache;
+ struct super_block *m_sb;
+ struct erofs_device_info *m_dif;
struct block_device *m_bdev;
- struct dax_device *m_daxdev;
- u64 m_dax_part_off;
erofs_off_t m_pa;
unsigned int m_deviceid;
@@ -380,7 +368,8 @@ struct erofs_map_dev {
extern const struct super_operations erofs_sops;
-extern const struct address_space_operations erofs_raw_access_aops;
+extern const struct address_space_operations erofs_aops;
+extern const struct address_space_operations erofs_fileio_aops;
extern const struct address_space_operations z_erofs_aops;
extern const struct address_space_operations erofs_fscache_access_aops;
@@ -402,15 +391,18 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
erofs_off_t *offset, int *lengthp);
void erofs_unmap_metabuf(struct erofs_buf *buf);
void erofs_put_metabuf(struct erofs_buf *buf);
-void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr,
+void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,
enum erofs_kmap_type type);
void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb);
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
- erofs_blk_t blkaddr, enum erofs_kmap_type type);
+ erofs_off_t offset, enum erofs_kmap_type type);
int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map);
+void erofs_onlinefolio_init(struct folio *folio);
+void erofs_onlinefolio_split(struct folio *folio);
+void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty);
struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid);
int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask,
@@ -438,7 +430,11 @@ void erofs_unregister_sysfs(struct super_block *sb);
int __init erofs_init_sysfs(void);
void erofs_exit_sysfs(void);
-struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp);
+struct page *__erofs_allocpage(struct page **pagepool, gfp_t gfp, bool tryrsv);
+static inline struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp)
+{
+ return __erofs_allocpage(pagepool, gfp, false);
+}
static inline void erofs_pagepool_add(struct page **pagepool, struct page *page)
{
set_page_private(page, (unsigned long)*pagepool);
@@ -447,56 +443,41 @@ static inline void erofs_pagepool_add(struct page **pagepool, struct page *page)
void erofs_release_pages(struct page **pagepool);
#ifdef CONFIG_EROFS_FS_ZIP
-void erofs_workgroup_put(struct erofs_workgroup *grp);
-struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
- pgoff_t index);
-struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
- struct erofs_workgroup *grp);
-void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
+extern atomic_long_t erofs_global_shrink_cnt;
void erofs_shrinker_register(struct super_block *sb);
void erofs_shrinker_unregister(struct super_block *sb);
int __init erofs_init_shrinker(void);
void erofs_exit_shrinker(void);
-int __init z_erofs_init_zip_subsystem(void);
-void z_erofs_exit_zip_subsystem(void);
-int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
- struct erofs_workgroup *egrp);
+int __init z_erofs_init_subsystem(void);
+void z_erofs_exit_subsystem(void);
+int z_erofs_init_super(struct super_block *sb);
+unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi,
+ unsigned long nr_shrink);
int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
int flags);
-void *erofs_get_pcpubuf(unsigned int requiredpages);
-void erofs_put_pcpubuf(void *ptr);
-int erofs_pcpubuf_growsize(unsigned int nrpages);
-void __init erofs_pcpubuf_init(void);
-void erofs_pcpubuf_exit(void);
-int erofs_init_managed_cache(struct super_block *sb);
+void *z_erofs_get_gbuf(unsigned int requiredpages);
+void z_erofs_put_gbuf(void *ptr);
+int z_erofs_gbuf_growsize(unsigned int nrpages);
+int __init z_erofs_gbuf_init(void);
+void z_erofs_gbuf_exit(void);
int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb);
#else
static inline void erofs_shrinker_register(struct super_block *sb) {}
static inline void erofs_shrinker_unregister(struct super_block *sb) {}
static inline int erofs_init_shrinker(void) { return 0; }
static inline void erofs_exit_shrinker(void) {}
-static inline int z_erofs_init_zip_subsystem(void) { return 0; }
-static inline void z_erofs_exit_zip_subsystem(void) {}
-static inline void erofs_pcpubuf_init(void) {}
-static inline void erofs_pcpubuf_exit(void) {}
-static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; }
+static inline int z_erofs_init_subsystem(void) { return 0; }
+static inline void z_erofs_exit_subsystem(void) {}
+static inline int z_erofs_init_super(struct super_block *sb) { return 0; }
#endif /* !CONFIG_EROFS_FS_ZIP */
-#ifdef CONFIG_EROFS_FS_ZIP_LZMA
-int __init z_erofs_lzma_init(void);
-void z_erofs_lzma_exit(void);
+#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
+struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev);
+void erofs_fileio_submit_bio(struct bio *bio);
#else
-static inline int z_erofs_lzma_init(void) { return 0; }
-static inline int z_erofs_lzma_exit(void) { return 0; }
-#endif /* !CONFIG_EROFS_FS_ZIP_LZMA */
-
-#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE
-int __init z_erofs_deflate_init(void);
-void z_erofs_deflate_exit(void);
-#else
-static inline int z_erofs_deflate_init(void) { return 0; }
-static inline int z_erofs_deflate_exit(void) { return 0; }
-#endif /* !CONFIG_EROFS_FS_ZIP_DEFLATE */
+static inline struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev) { return NULL; }
+static inline void erofs_fileio_submit_bio(struct bio *bio) {}
+#endif
#ifdef CONFIG_EROFS_FS_ONDEMAND
int erofs_fscache_register_fs(struct super_block *sb);
@@ -505,6 +486,8 @@ void erofs_fscache_unregister_fs(struct super_block *sb);
struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
char *name, unsigned int flags);
void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache);
+struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev);
+void erofs_fscache_submit_bio(struct bio *bio);
#else
static inline int erofs_fscache_register_fs(struct super_block *sb)
{
@@ -522,6 +505,8 @@ struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
static inline void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache)
{
}
+static inline struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) { return NULL; }
+static inline void erofs_fscache_submit_bio(struct bio *bio) {}
#endif
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index f0110a78acb2..c94d0c1608a8 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -99,8 +99,8 @@ static void *erofs_find_target_block(struct erofs_buf *target,
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_dirent *de;
- buf.inode = dir;
- de = erofs_bread(&buf, mid, EROFS_KMAP);
+ buf.mapping = dir->i_mapping;
+ de = erofs_bread(&buf, erofs_pos(dir->i_sb, mid), EROFS_KMAP);
if (!IS_ERR(de)) {
const int nameoff = nameoff_from_disk(de->nameoff, bsz);
const int ndirents = nameoff / sizeof(*de);
@@ -171,7 +171,7 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid,
qn.name = name->name;
qn.end = name->name + name->len;
- buf.inode = dir;
+ buf.mapping = dir->i_mapping;
ndirents = 0;
de = erofs_find_target_block(&buf, dir, &qn, &ndirents);
diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c
deleted file mode 100644
index c7a4b1d77069..000000000000
--- a/fs/erofs/pcpubuf.c
+++ /dev/null
@@ -1,148 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) Gao Xiang <xiang@kernel.org>
- *
- * For low-latency decompression algorithms (e.g. lz4), reserve consecutive
- * per-CPU virtual memory (in pages) in advance to store such inplace I/O
- * data if inplace decompression is failed (due to unmet inplace margin for
- * example).
- */
-#include "internal.h"
-
-struct erofs_pcpubuf {
- raw_spinlock_t lock;
- void *ptr;
- struct page **pages;
- unsigned int nrpages;
-};
-
-static DEFINE_PER_CPU(struct erofs_pcpubuf, erofs_pcb);
-
-void *erofs_get_pcpubuf(unsigned int requiredpages)
- __acquires(pcb->lock)
-{
- struct erofs_pcpubuf *pcb = &get_cpu_var(erofs_pcb);
-
- raw_spin_lock(&pcb->lock);
- /* check if the per-CPU buffer is too small */
- if (requiredpages > pcb->nrpages) {
- raw_spin_unlock(&pcb->lock);
- put_cpu_var(erofs_pcb);
- /* (for sparse checker) pretend pcb->lock is still taken */
- __acquire(pcb->lock);
- return NULL;
- }
- return pcb->ptr;
-}
-
-void erofs_put_pcpubuf(void *ptr) __releases(pcb->lock)
-{
- struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, smp_processor_id());
-
- DBG_BUGON(pcb->ptr != ptr);
- raw_spin_unlock(&pcb->lock);
- put_cpu_var(erofs_pcb);
-}
-
-/* the next step: support per-CPU page buffers hotplug */
-int erofs_pcpubuf_growsize(unsigned int nrpages)
-{
- static DEFINE_MUTEX(pcb_resize_mutex);
- static unsigned int pcb_nrpages;
- struct page *pagepool = NULL;
- int delta, cpu, ret, i;
-
- mutex_lock(&pcb_resize_mutex);
- delta = nrpages - pcb_nrpages;
- ret = 0;
- /* avoid shrinking pcpubuf, since no idea how many fses rely on */
- if (delta <= 0)
- goto out;
-
- for_each_possible_cpu(cpu) {
- struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
- struct page **pages, **oldpages;
- void *ptr, *old_ptr;
-
- pages = kmalloc_array(nrpages, sizeof(*pages), GFP_KERNEL);
- if (!pages) {
- ret = -ENOMEM;
- break;
- }
-
- for (i = 0; i < nrpages; ++i) {
- pages[i] = erofs_allocpage(&pagepool, GFP_KERNEL);
- if (!pages[i]) {
- ret = -ENOMEM;
- oldpages = pages;
- goto free_pagearray;
- }
- }
- ptr = vmap(pages, nrpages, VM_MAP, PAGE_KERNEL);
- if (!ptr) {
- ret = -ENOMEM;
- oldpages = pages;
- goto free_pagearray;
- }
- raw_spin_lock(&pcb->lock);
- old_ptr = pcb->ptr;
- pcb->ptr = ptr;
- oldpages = pcb->pages;
- pcb->pages = pages;
- i = pcb->nrpages;
- pcb->nrpages = nrpages;
- raw_spin_unlock(&pcb->lock);
-
- if (!oldpages) {
- DBG_BUGON(old_ptr);
- continue;
- }
-
- if (old_ptr)
- vunmap(old_ptr);
-free_pagearray:
- while (i)
- erofs_pagepool_add(&pagepool, oldpages[--i]);
- kfree(oldpages);
- if (ret)
- break;
- }
- pcb_nrpages = nrpages;
- erofs_release_pages(&pagepool);
-out:
- mutex_unlock(&pcb_resize_mutex);
- return ret;
-}
-
-void __init erofs_pcpubuf_init(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
-
- raw_spin_lock_init(&pcb->lock);
- }
-}
-
-void erofs_pcpubuf_exit(void)
-{
- int cpu, i;
-
- for_each_possible_cpu(cpu) {
- struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
-
- if (pcb->ptr) {
- vunmap(pcb->ptr);
- pcb->ptr = NULL;
- }
- if (!pcb->pages)
- continue;
-
- for (i = 0; i < pcb->nrpages; ++i)
- if (pcb->pages[i])
- put_page(pcb->pages[i]);
- kfree(pcb->pages);
- pcb->pages = NULL;
- }
-}
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 113414e6f35b..5fcdab614517 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -4,15 +4,13 @@
* https://www.huawei.com/
* Copyright (C) 2021, Alibaba Cloud
*/
-#include <linux/module.h>
#include <linux/statfs.h>
-#include <linux/parser.h>
#include <linux/seq_file.h>
#include <linux/crc32c.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
-#include <linux/dax.h>
#include <linux/exportfs.h>
+#include <linux/backing-dev.h>
#include "xattr.h"
#define CREATE_TRACE_POINTS
@@ -30,7 +28,10 @@ void _erofs_err(struct super_block *sb, const char *func, const char *fmt, ...)
vaf.fmt = fmt;
vaf.va = &args;
- pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf);
+ if (sb)
+ pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf);
+ else
+ pr_err("%s: %pV", func, &vaf);
va_end(args);
}
@@ -44,7 +45,10 @@ void _erofs_info(struct super_block *sb, const char *func, const char *fmt, ...)
vaf.fmt = fmt;
vaf.va = &args;
- pr_info("(device %s): %pV", sb->s_id, &vaf);
+ if (sb)
+ pr_info("(device %s): %pV", sb->s_id, &vaf);
+ else
+ pr_info("%pV", &vaf);
va_end(args);
}
@@ -105,22 +109,6 @@ static void erofs_free_inode(struct inode *inode)
kmem_cache_free(erofs_inode_cachep, vi);
}
-static bool check_layout_compatibility(struct super_block *sb,
- struct erofs_super_block *dsb)
-{
- const unsigned int feature = le32_to_cpu(dsb->feature_incompat);
-
- EROFS_SB(sb)->feature_incompat = feature;
-
- /* check if current kernel meets all mandatory requirements */
- if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) {
- erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel",
- feature & ~EROFS_ALL_FEATURE_INCOMPAT);
- return false;
- }
- return true;
-}
-
/* read variable-sized metadata, offset will be aligned by 4-byte */
void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
erofs_off_t *offset, int *lengthp)
@@ -129,11 +117,11 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
int len, i, cnt;
*offset = round_up(*offset, 4);
- ptr = erofs_bread(buf, erofs_blknr(sb, *offset), EROFS_KMAP);
+ ptr = erofs_bread(buf, *offset, EROFS_KMAP);
if (IS_ERR(ptr))
return ptr;
- len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(sb, *offset)]);
+ len = le16_to_cpu(*(__le16 *)ptr);
if (!len)
len = U16_MAX + 1;
buffer = kmalloc(len, GFP_KERNEL);
@@ -145,12 +133,12 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
for (i = 0; i < len; i += cnt) {
cnt = min_t(int, sb->s_blocksize - erofs_blkoff(sb, *offset),
len - i);
- ptr = erofs_bread(buf, erofs_blknr(sb, *offset), EROFS_KMAP);
+ ptr = erofs_bread(buf, *offset, EROFS_KMAP);
if (IS_ERR(ptr)) {
kfree(buffer);
return ptr;
}
- memcpy(buffer + i, ptr + erofs_blkoff(sb, *offset), cnt);
+ memcpy(buffer + i, ptr, cnt);
*offset += cnt;
}
return buffer;
@@ -174,13 +162,11 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
struct erofs_sb_info *sbi = EROFS_SB(sb);
struct erofs_fscache *fscache;
struct erofs_deviceslot *dis;
- struct bdev_handle *bdev_handle;
- void *ptr;
+ struct file *file;
- ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *pos), EROFS_KMAP);
- if (IS_ERR(ptr))
- return PTR_ERR(ptr);
- dis = ptr + erofs_blkoff(sb, *pos);
+ dis = erofs_read_metabuf(buf, sb, *pos, EROFS_KMAP);
+ if (IS_ERR(dis))
+ return PTR_ERR(dis);
if (!sbi->devs->flatdev && !dif->path) {
if (!dis->tag[0]) {
@@ -198,13 +184,24 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
return PTR_ERR(fscache);
dif->fscache = fscache;
} else if (!sbi->devs->flatdev) {
- bdev_handle = bdev_open_by_path(dif->path, BLK_OPEN_READ,
- sb->s_type, NULL);
- if (IS_ERR(bdev_handle))
- return PTR_ERR(bdev_handle);
- dif->bdev_handle = bdev_handle;
- dif->dax_dev = fs_dax_get_by_bdev(bdev_handle->bdev,
- &dif->dax_part_off, NULL, NULL);
+ file = erofs_is_fileio_mode(sbi) ?
+ filp_open(dif->path, O_RDONLY | O_LARGEFILE, 0) :
+ bdev_file_open_by_path(dif->path,
+ BLK_OPEN_READ, sb->s_type, NULL);
+ if (IS_ERR(file)) {
+ if (file == ERR_PTR(-ENOTBLK))
+ return -EINVAL;
+ return PTR_ERR(file);
+ }
+
+ if (!erofs_is_fileio_mode(sbi)) {
+ dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file),
+ &dif->dax_part_off, NULL, NULL);
+ } else if (!S_ISREG(file_inode(file)->i_mode)) {
+ fput(file);
+ return -EINVAL;
+ }
+ dif->file = file;
}
dif->blocks = le32_to_cpu(dis->blocks);
@@ -224,7 +221,7 @@ static int erofs_scan_devices(struct super_block *sb,
struct erofs_device_info *dif;
int id, err = 0;
- sbi->total_blocks = sbi->primarydevice_blocks;
+ sbi->total_blocks = sbi->dif0.blocks;
if (!erofs_sb_has_device_table(sbi))
ondisk_extradevs = 0;
else
@@ -278,7 +275,7 @@ static int erofs_scan_devices(struct super_block *sb,
static int erofs_read_superblock(struct super_block *sb)
{
- struct erofs_sb_info *sbi;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_super_block *dsb;
void *data;
@@ -290,9 +287,7 @@ static int erofs_read_superblock(struct super_block *sb)
return PTR_ERR(data);
}
- sbi = EROFS_SB(sb);
dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET);
-
ret = -EINVAL;
if (le32_to_cpu(dsb->magic) != EROFS_SUPER_MAGIC_V1) {
erofs_err(sb, "cannot find valid erofs superblock");
@@ -317,8 +312,12 @@ static int erofs_read_superblock(struct super_block *sb)
}
ret = -EINVAL;
- if (!check_layout_compatibility(sb, dsb))
+ sbi->feature_incompat = le32_to_cpu(dsb->feature_incompat);
+ if (sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT) {
+ erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel",
+ sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT);
goto out;
+ }
sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE;
if (sbi->sb_size > PAGE_SIZE - EROFS_SUPER_OFFSET) {
@@ -326,7 +325,7 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->sb_size);
goto out;
}
- sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks);
+ sbi->dif0.blocks = le32_to_cpu(dsb->blocks);
sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr);
#ifdef CONFIG_EROFS_FS_XATTR
sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr);
@@ -342,7 +341,7 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->build_time = le64_to_cpu(dsb->build_time);
sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec);
- memcpy(&sb->s_uuid, dsb->uuid, sizeof(dsb->uuid));
+ super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid));
ret = strscpy(sbi->volume_name, dsb->volume_name,
sizeof(dsb->volume_name));
@@ -361,7 +360,7 @@ static int erofs_read_superblock(struct super_block *sb)
ret = erofs_scan_devices(sb, dsb);
if (erofs_is_fscache_mode(sb))
- erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!");
+ erofs_info(sb, "[deprecated] fscache-based on-demand read feature in use. Use at your own risk!");
out:
erofs_put_metabuf(&buf);
return ret;
@@ -383,14 +382,8 @@ static void erofs_default_options(struct erofs_sb_info *sbi)
}
enum {
- Opt_user_xattr,
- Opt_acl,
- Opt_cache_strategy,
- Opt_dax,
- Opt_dax_enum,
- Opt_device,
- Opt_fsid,
- Opt_domain_id,
+ Opt_user_xattr, Opt_acl, Opt_cache_strategy, Opt_dax, Opt_dax_enum,
+ Opt_device, Opt_fsid, Opt_domain_id, Opt_directio,
Opt_err
};
@@ -417,6 +410,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
fsparam_string("device", Opt_device),
fsparam_string("fsid", Opt_fsid),
fsparam_string("domain_id", Opt_domain_id),
+ fsparam_flag_no("directio", Opt_directio),
{}
};
@@ -427,7 +421,6 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
switch (mode) {
case EROFS_MOUNT_DAX_ALWAYS:
- warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
set_opt(&sbi->opt, DAX_ALWAYS);
clear_opt(&sbi->opt, DAX_NEVER);
return true;
@@ -531,30 +524,68 @@ static int erofs_fc_parse_param(struct fs_context *fc,
errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name);
break;
#endif
+ case Opt_directio:
+#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
+ if (result.boolean)
+ set_opt(&sbi->opt, DIRECT_IO);
+ else
+ clear_opt(&sbi->opt, DIRECT_IO);
+#else
+ errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name);
+#endif
+ break;
default:
return -ENOPARAM;
}
return 0;
}
-static struct inode *erofs_nfs_get_inode(struct super_block *sb,
- u64 ino, u32 generation)
+static int erofs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+ struct inode *parent)
{
- return erofs_iget(sb, ino);
+ erofs_nid_t nid = EROFS_I(inode)->nid;
+ int len = parent ? 6 : 3;
+
+ if (*max_len < len) {
+ *max_len = len;
+ return FILEID_INVALID;
+ }
+
+ fh[0] = (u32)(nid >> 32);
+ fh[1] = (u32)(nid & 0xffffffff);
+ fh[2] = inode->i_generation;
+
+ if (parent) {
+ nid = EROFS_I(parent)->nid;
+
+ fh[3] = (u32)(nid >> 32);
+ fh[4] = (u32)(nid & 0xffffffff);
+ fh[5] = parent->i_generation;
+ }
+
+ *max_len = len;
+ return parent ? FILEID_INO64_GEN_PARENT : FILEID_INO64_GEN;
}
static struct dentry *erofs_fh_to_dentry(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type)
{
- return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
- erofs_nfs_get_inode);
+ if ((fh_type != FILEID_INO64_GEN &&
+ fh_type != FILEID_INO64_GEN_PARENT) || fh_len < 3)
+ return NULL;
+
+ return d_obtain_alias(erofs_iget(sb,
+ ((u64)fid->raw[0] << 32) | fid->raw[1]));
}
static struct dentry *erofs_fh_to_parent(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type)
{
- return generic_fh_to_parent(sb, fid, fh_len, fh_type,
- erofs_nfs_get_inode);
+ if (fh_type != FILEID_INO64_GEN_PARENT || fh_len < 6)
+ return NULL;
+
+ return d_obtain_alias(erofs_iget(sb,
+ ((u64)fid->raw[3] << 32) | fid->raw[4]));
}
static struct dentry *erofs_get_parent(struct dentry *child)
@@ -570,11 +601,28 @@ static struct dentry *erofs_get_parent(struct dentry *child)
}
static const struct export_operations erofs_export_ops = {
+ .encode_fh = erofs_encode_fh,
.fh_to_dentry = erofs_fh_to_dentry,
.fh_to_parent = erofs_fh_to_parent,
.get_parent = erofs_get_parent,
};
+static void erofs_set_sysfs_name(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+ if (sbi->domain_id)
+ super_set_sysfs_name_generic(sb, "%s,%s", sbi->domain_id,
+ sbi->fsid);
+ else if (sbi->fsid)
+ super_set_sysfs_name_generic(sb, "%s", sbi->fsid);
+ else if (erofs_is_fileio_mode(sbi))
+ super_set_sysfs_name_generic(sb, "%s",
+ bdi_dev_name(sb->s_bdi));
+ else
+ super_set_sysfs_name_id(sb);
+}
+
static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct inode *inode;
@@ -587,14 +635,15 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_op = &erofs_sops;
sbi->blkszbits = PAGE_SHIFT;
- if (erofs_is_fscache_mode(sb)) {
+ if (!sb->s_bdev) {
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
- err = erofs_fscache_register_fs(sb);
- if (err)
- return err;
-
+ if (erofs_is_fscache_mode(sb)) {
+ err = erofs_fscache_register_fs(sb);
+ if (err)
+ return err;
+ }
err = super_setup_bdi(sb);
if (err)
return err;
@@ -604,9 +653,8 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
return -EINVAL;
}
- sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev,
- &sbi->dax_part_off,
- NULL, NULL);
+ sbi->dif0.dax_dev = fs_dax_get_by_bdev(sb->s_bdev,
+ &sbi->dif0.dax_part_off, NULL, NULL);
}
err = erofs_read_superblock(sb);
@@ -618,14 +666,18 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
errorfc(fc, "unsupported blksize for fscache mode");
return -EINVAL;
}
- if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) {
+
+ if (erofs_is_fileio_mode(sbi)) {
+ sb->s_blocksize = 1 << sbi->blkszbits;
+ sb->s_blocksize_bits = sbi->blkszbits;
+ } else if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) {
errorfc(fc, "failed to set erofs blksize");
return -EINVAL;
}
}
if (test_opt(&sbi->opt, DAX_ALWAYS)) {
- if (!sbi->dax_dev) {
+ if (!sbi->dif0.dax_dev) {
errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
clear_opt(&sbi->opt, DAX_ALWAYS);
} else if (sbi->blkszbits != PAGE_SHIFT) {
@@ -643,58 +695,73 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
else
sb->s_flags &= ~SB_POSIXACL;
-#ifdef CONFIG_EROFS_FS_ZIP
- xa_init(&sbi->managed_pslots);
-#endif
+ err = z_erofs_init_super(sb);
+ if (err)
+ return err;
- inode = erofs_iget(sb, ROOT_NID(sbi));
+ if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) {
+ inode = erofs_iget(sb, sbi->packed_nid);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ sbi->packed_inode = inode;
+ }
+
+ inode = erofs_iget(sb, sbi->root_nid);
if (IS_ERR(inode))
return PTR_ERR(inode);
if (!S_ISDIR(inode->i_mode)) {
erofs_err(sb, "rootino(nid %llu) is not a directory(i_mode %o)",
- ROOT_NID(sbi), inode->i_mode);
+ sbi->root_nid, inode->i_mode);
iput(inode);
return -EINVAL;
}
-
sb->s_root = d_make_root(inode);
if (!sb->s_root)
return -ENOMEM;
erofs_shrinker_register(sb);
- if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) {
- sbi->packed_inode = erofs_iget(sb, sbi->packed_nid);
- if (IS_ERR(sbi->packed_inode)) {
- err = PTR_ERR(sbi->packed_inode);
- sbi->packed_inode = NULL;
- return err;
- }
- }
- err = erofs_init_managed_cache(sb);
- if (err)
- return err;
-
err = erofs_xattr_prefixes_init(sb);
if (err)
return err;
+ erofs_set_sysfs_name(sb);
err = erofs_register_sysfs(sb);
if (err)
return err;
- erofs_info(sb, "mounted with root inode @ nid %llu.", ROOT_NID(sbi));
+ erofs_info(sb, "mounted with root inode @ nid %llu.", sbi->root_nid);
return 0;
}
static int erofs_fc_get_tree(struct fs_context *fc)
{
struct erofs_sb_info *sbi = fc->s_fs_info;
+ int ret;
if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid)
return get_tree_nodev(fc, erofs_fc_fill_super);
- return get_tree_bdev(fc, erofs_fc_fill_super);
+ ret = get_tree_bdev_flags(fc, erofs_fc_fill_super,
+ IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) ?
+ GET_TREE_BDEV_QUIET_LOOKUP : 0);
+#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
+ if (ret == -ENOTBLK) {
+ struct file *file;
+
+ if (!fc->source)
+ return invalf(fc, "No source specified");
+ file = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+ sbi->dif0.file = file;
+
+ if (S_ISREG(file_inode(sbi->dif0.file)->i_mode) &&
+ sbi->dif0.file->f_mapping->a_ops->read_folio)
+ return get_tree_nodev(fc, erofs_fc_fill_super);
+ }
+#endif
+ return ret;
}
static int erofs_fc_reconfigure(struct fs_context *fc)
@@ -724,8 +791,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
struct erofs_device_info *dif = ptr;
fs_put_dax(dif->dax_dev, NULL);
- if (dif->bdev_handle)
- bdev_release(dif->bdev_handle);
+ if (dif->file)
+ fput(dif->file);
erofs_fscache_unregister_cookie(dif->fscache);
dif->fscache = NULL;
kfree(dif->path);
@@ -742,19 +809,24 @@ static void erofs_free_dev_context(struct erofs_dev_context *devs)
kfree(devs);
}
-static void erofs_fc_free(struct fs_context *fc)
+static void erofs_sb_free(struct erofs_sb_info *sbi)
{
- struct erofs_sb_info *sbi = fc->s_fs_info;
-
- if (!sbi)
- return;
-
erofs_free_dev_context(sbi->devs);
kfree(sbi->fsid);
kfree(sbi->domain_id);
+ if (sbi->dif0.file)
+ fput(sbi->dif0.file);
kfree(sbi);
}
+static void erofs_fc_free(struct fs_context *fc)
+{
+ struct erofs_sb_info *sbi = fc->s_fs_info;
+
+ if (sbi) /* free here if an error occurs before transferring to sb */
+ erofs_sb_free(sbi);
+}
+
static const struct fs_context_operations erofs_context_ops = {
.parse_param = erofs_fc_parse_param,
.get_tree = erofs_fc_get_tree,
@@ -784,21 +856,29 @@ static int erofs_init_fs_context(struct fs_context *fc)
return 0;
}
+static void erofs_drop_internal_inodes(struct erofs_sb_info *sbi)
+{
+ iput(sbi->packed_inode);
+ sbi->packed_inode = NULL;
+#ifdef CONFIG_EROFS_FS_ZIP
+ iput(sbi->managed_cache);
+ sbi->managed_cache = NULL;
+#endif
+}
+
static void erofs_kill_sb(struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
- if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid)
+ if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) ||
+ sbi->dif0.file)
kill_anon_super(sb);
else
kill_block_super(sb);
-
- erofs_free_dev_context(sbi->devs);
- fs_put_dax(sbi->dax_dev, NULL);
+ erofs_drop_internal_inodes(sbi);
+ fs_put_dax(sbi->dif0.dax_dev, NULL);
erofs_fscache_unregister_fs(sb);
- kfree(sbi->fsid);
- kfree(sbi->domain_id);
- kfree(sbi);
+ erofs_sb_free(sbi);
sb->s_fs_info = NULL;
}
@@ -806,17 +886,10 @@ static void erofs_put_super(struct super_block *sb)
{
struct erofs_sb_info *const sbi = EROFS_SB(sb);
- DBG_BUGON(!sbi);
-
erofs_unregister_sysfs(sb);
erofs_shrinker_unregister(sb);
erofs_xattr_prefixes_cleanup(sb);
-#ifdef CONFIG_EROFS_FS_ZIP
- iput(sbi->managed_cache);
- sbi->managed_cache = NULL;
-#endif
- iput(sbi->packed_inode);
- sbi->packed_inode = NULL;
+ erofs_drop_internal_inodes(sbi);
erofs_free_dev_context(sbi->devs);
sbi->devs = NULL;
erofs_fscache_unregister_fs(sb);
@@ -839,7 +912,7 @@ static int __init erofs_module_init(void)
erofs_inode_cachep = kmem_cache_create("erofs_inode",
sizeof(struct erofs_inode), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
erofs_inode_init_once);
if (!erofs_inode_cachep)
return -ENOMEM;
@@ -848,16 +921,7 @@ static int __init erofs_module_init(void)
if (err)
goto shrinker_err;
- err = z_erofs_lzma_init();
- if (err)
- goto lzma_err;
-
- err = z_erofs_deflate_init();
- if (err)
- goto deflate_err;
-
- erofs_pcpubuf_init();
- err = z_erofs_init_zip_subsystem();
+ err = z_erofs_init_subsystem();
if (err)
goto zip_err;
@@ -874,12 +938,8 @@ static int __init erofs_module_init(void)
fs_err:
erofs_exit_sysfs();
sysfs_err:
- z_erofs_exit_zip_subsystem();
+ z_erofs_exit_subsystem();
zip_err:
- z_erofs_deflate_exit();
-deflate_err:
- z_erofs_lzma_exit();
-lzma_err:
erofs_exit_shrinker();
shrinker_err:
kmem_cache_destroy(erofs_inode_cachep);
@@ -894,34 +954,29 @@ static void __exit erofs_module_exit(void)
rcu_barrier();
erofs_exit_sysfs();
- z_erofs_exit_zip_subsystem();
- z_erofs_deflate_exit();
- z_erofs_lzma_exit();
+ z_erofs_exit_subsystem();
erofs_exit_shrinker();
kmem_cache_destroy(erofs_inode_cachep);
- erofs_pcpubuf_exit();
}
static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
struct erofs_sb_info *sbi = EROFS_SB(sb);
- u64 id = 0;
-
- if (!erofs_is_fscache_mode(sb))
- id = huge_encode_dev(sb->s_bdev->bd_dev);
buf->f_type = sb->s_magic;
buf->f_bsize = sb->s_blocksize;
buf->f_blocks = sbi->total_blocks;
buf->f_bfree = buf->f_bavail = 0;
-
buf->f_files = ULLONG_MAX;
buf->f_ffree = ULLONG_MAX - sbi->inos;
-
buf->f_namelen = EROFS_NAME_LEN;
- buf->f_fsid = u64_to_fsid(id);
+ if (uuid_is_null(&sb->s_uuid))
+ buf->f_fsid = u64_to_fsid(!sb->s_bdev ? 0 :
+ huge_encode_dev(sb->s_bdev->bd_dev));
+ else
+ buf->f_fsid = uuid_to_fsid(sb->s_uuid.b);
return 0;
}
@@ -930,30 +985,20 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
struct erofs_sb_info *sbi = EROFS_SB(root->d_sb);
struct erofs_mount_opts *opt = &sbi->opt;
-#ifdef CONFIG_EROFS_FS_XATTR
- if (test_opt(opt, XATTR_USER))
- seq_puts(seq, ",user_xattr");
- else
- seq_puts(seq, ",nouser_xattr");
-#endif
-#ifdef CONFIG_EROFS_FS_POSIX_ACL
- if (test_opt(opt, POSIX_ACL))
- seq_puts(seq, ",acl");
- else
- seq_puts(seq, ",noacl");
-#endif
-#ifdef CONFIG_EROFS_FS_ZIP
- if (opt->cache_strategy == EROFS_ZIP_CACHE_DISABLED)
- seq_puts(seq, ",cache_strategy=disabled");
- else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAHEAD)
- seq_puts(seq, ",cache_strategy=readahead");
- else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
- seq_puts(seq, ",cache_strategy=readaround");
-#endif
+ if (IS_ENABLED(CONFIG_EROFS_FS_XATTR))
+ seq_puts(seq, test_opt(opt, XATTR_USER) ?
+ ",user_xattr" : ",nouser_xattr");
+ if (IS_ENABLED(CONFIG_EROFS_FS_POSIX_ACL))
+ seq_puts(seq, test_opt(opt, POSIX_ACL) ? ",acl" : ",noacl");
+ if (IS_ENABLED(CONFIG_EROFS_FS_ZIP))
+ seq_printf(seq, ",cache_strategy=%s",
+ erofs_param_cache_strategy[opt->cache_strategy].name);
if (test_opt(opt, DAX_ALWAYS))
seq_puts(seq, ",dax=always");
if (test_opt(opt, DAX_NEVER))
seq_puts(seq, ",dax=never");
+ if (erofs_is_fileio_mode(sbi) && test_opt(opt, DIRECT_IO))
+ seq_puts(seq, ",directio");
#ifdef CONFIG_EROFS_FS_ONDEMAND
if (sbi->fsid)
seq_printf(seq, ",fsid=%s", sbi->fsid);
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
index 435e515c0792..63cffd0fd261 100644
--- a/fs/erofs/sysfs.c
+++ b/fs/erofs/sysfs.c
@@ -205,34 +205,16 @@ static struct kobject erofs_feat = {
int erofs_register_sysfs(struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
- char *name;
- char *str = NULL;
int err;
- if (erofs_is_fscache_mode(sb)) {
- if (sbi->domain_id) {
- str = kasprintf(GFP_KERNEL, "%s,%s", sbi->domain_id,
- sbi->fsid);
- if (!str)
- return -ENOMEM;
- name = str;
- } else {
- name = sbi->fsid;
- }
- } else {
- name = sb->s_id;
- }
sbi->s_kobj.kset = &erofs_root;
init_completion(&sbi->s_kobj_unregister);
- err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", name);
- kfree(str);
- if (err)
- goto put_sb_kobj;
- return 0;
-
-put_sb_kobj:
- kobject_put(&sbi->s_kobj);
- wait_for_completion(&sbi->s_kobj_unregister);
+ err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s",
+ sb->s_sysfs_name);
+ if (err) {
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
+ }
return err;
}
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
deleted file mode 100644
index 4256a85719a1..000000000000
--- a/fs/erofs/utils.c
+++ /dev/null
@@ -1,282 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2018 HUAWEI, Inc.
- * https://www.huawei.com/
- */
-#include "internal.h"
-
-struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp)
-{
- struct page *page = *pagepool;
-
- if (page) {
- DBG_BUGON(page_ref_count(page) != 1);
- *pagepool = (struct page *)page_private(page);
- } else {
- page = alloc_page(gfp);
- }
- return page;
-}
-
-void erofs_release_pages(struct page **pagepool)
-{
- while (*pagepool) {
- struct page *page = *pagepool;
-
- *pagepool = (struct page *)page_private(page);
- put_page(page);
- }
-}
-
-#ifdef CONFIG_EROFS_FS_ZIP
-/* global shrink count (for all mounted EROFS instances) */
-static atomic_long_t erofs_global_shrink_cnt;
-
-static bool erofs_workgroup_get(struct erofs_workgroup *grp)
-{
- if (lockref_get_not_zero(&grp->lockref))
- return true;
-
- spin_lock(&grp->lockref.lock);
- if (__lockref_is_dead(&grp->lockref)) {
- spin_unlock(&grp->lockref.lock);
- return false;
- }
-
- if (!grp->lockref.count++)
- atomic_long_dec(&erofs_global_shrink_cnt);
- spin_unlock(&grp->lockref.lock);
- return true;
-}
-
-struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
- pgoff_t index)
-{
- struct erofs_sb_info *sbi = EROFS_SB(sb);
- struct erofs_workgroup *grp;
-
-repeat:
- rcu_read_lock();
- grp = xa_load(&sbi->managed_pslots, index);
- if (grp) {
- if (!erofs_workgroup_get(grp)) {
- /* prefer to relax rcu read side */
- rcu_read_unlock();
- goto repeat;
- }
-
- DBG_BUGON(index != grp->index);
- }
- rcu_read_unlock();
- return grp;
-}
-
-struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
- struct erofs_workgroup *grp)
-{
- struct erofs_sb_info *const sbi = EROFS_SB(sb);
- struct erofs_workgroup *pre;
-
- DBG_BUGON(grp->lockref.count < 1);
-repeat:
- xa_lock(&sbi->managed_pslots);
- pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
- NULL, grp, GFP_NOFS);
- if (pre) {
- if (xa_is_err(pre)) {
- pre = ERR_PTR(xa_err(pre));
- } else if (!erofs_workgroup_get(pre)) {
- /* try to legitimize the current in-tree one */
- xa_unlock(&sbi->managed_pslots);
- cond_resched();
- goto repeat;
- }
- grp = pre;
- }
- xa_unlock(&sbi->managed_pslots);
- return grp;
-}
-
-static void __erofs_workgroup_free(struct erofs_workgroup *grp)
-{
- atomic_long_dec(&erofs_global_shrink_cnt);
- erofs_workgroup_free_rcu(grp);
-}
-
-void erofs_workgroup_put(struct erofs_workgroup *grp)
-{
- if (lockref_put_or_lock(&grp->lockref))
- return;
-
- DBG_BUGON(__lockref_is_dead(&grp->lockref));
- if (grp->lockref.count == 1)
- atomic_long_inc(&erofs_global_shrink_cnt);
- --grp->lockref.count;
- spin_unlock(&grp->lockref.lock);
-}
-
-static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
- struct erofs_workgroup *grp)
-{
- int free = false;
-
- spin_lock(&grp->lockref.lock);
- if (grp->lockref.count)
- goto out;
-
- /*
- * Note that all cached pages should be detached before deleted from
- * the XArray. Otherwise some cached pages could be still attached to
- * the orphan old workgroup when the new one is available in the tree.
- */
- if (erofs_try_to_free_all_cached_pages(sbi, grp))
- goto out;
-
- /*
- * It's impossible to fail after the workgroup is freezed,
- * however in order to avoid some race conditions, add a
- * DBG_BUGON to observe this in advance.
- */
- DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp);
-
- lockref_mark_dead(&grp->lockref);
- free = true;
-out:
- spin_unlock(&grp->lockref.lock);
- if (free)
- __erofs_workgroup_free(grp);
- return free;
-}
-
-static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
- unsigned long nr_shrink)
-{
- struct erofs_workgroup *grp;
- unsigned int freed = 0;
- unsigned long index;
-
- xa_lock(&sbi->managed_pslots);
- xa_for_each(&sbi->managed_pslots, index, grp) {
- /* try to shrink each valid workgroup */
- if (!erofs_try_to_release_workgroup(sbi, grp))
- continue;
- xa_unlock(&sbi->managed_pslots);
-
- ++freed;
- if (!--nr_shrink)
- return freed;
- xa_lock(&sbi->managed_pslots);
- }
- xa_unlock(&sbi->managed_pslots);
- return freed;
-}
-
-/* protected by 'erofs_sb_list_lock' */
-static unsigned int shrinker_run_no;
-
-/* protects the mounted 'erofs_sb_list' */
-static DEFINE_SPINLOCK(erofs_sb_list_lock);
-static LIST_HEAD(erofs_sb_list);
-
-void erofs_shrinker_register(struct super_block *sb)
-{
- struct erofs_sb_info *sbi = EROFS_SB(sb);
-
- mutex_init(&sbi->umount_mutex);
-
- spin_lock(&erofs_sb_list_lock);
- list_add(&sbi->list, &erofs_sb_list);
- spin_unlock(&erofs_sb_list_lock);
-}
-
-void erofs_shrinker_unregister(struct super_block *sb)
-{
- struct erofs_sb_info *const sbi = EROFS_SB(sb);
-
- mutex_lock(&sbi->umount_mutex);
- /* clean up all remaining workgroups in memory */
- erofs_shrink_workstation(sbi, ~0UL);
-
- spin_lock(&erofs_sb_list_lock);
- list_del(&sbi->list);
- spin_unlock(&erofs_sb_list_lock);
- mutex_unlock(&sbi->umount_mutex);
-}
-
-static unsigned long erofs_shrink_count(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- return atomic_long_read(&erofs_global_shrink_cnt);
-}
-
-static unsigned long erofs_shrink_scan(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- struct erofs_sb_info *sbi;
- struct list_head *p;
-
- unsigned long nr = sc->nr_to_scan;
- unsigned int run_no;
- unsigned long freed = 0;
-
- spin_lock(&erofs_sb_list_lock);
- do {
- run_no = ++shrinker_run_no;
- } while (run_no == 0);
-
- /* Iterate over all mounted superblocks and try to shrink them */
- p = erofs_sb_list.next;
- while (p != &erofs_sb_list) {
- sbi = list_entry(p, struct erofs_sb_info, list);
-
- /*
- * We move the ones we do to the end of the list, so we stop
- * when we see one we have already done.
- */
- if (sbi->shrinker_run_no == run_no)
- break;
-
- if (!mutex_trylock(&sbi->umount_mutex)) {
- p = p->next;
- continue;
- }
-
- spin_unlock(&erofs_sb_list_lock);
- sbi->shrinker_run_no = run_no;
-
- freed += erofs_shrink_workstation(sbi, nr - freed);
-
- spin_lock(&erofs_sb_list_lock);
- /* Get the next list element before we move this one */
- p = p->next;
-
- /*
- * Move this one to the end of the list to provide some
- * fairness.
- */
- list_move_tail(&sbi->list, &erofs_sb_list);
- mutex_unlock(&sbi->umount_mutex);
-
- if (freed >= nr)
- break;
- }
- spin_unlock(&erofs_sb_list_lock);
- return freed;
-}
-
-static struct shrinker erofs_shrinker_info = {
- .scan_objects = erofs_shrink_scan,
- .count_objects = erofs_shrink_count,
- .seeks = DEFAULT_SEEKS,
-};
-
-int __init erofs_init_shrinker(void)
-{
- return register_shrinker(&erofs_shrinker_info, "erofs-shrinker");
-}
-
-void erofs_exit_shrinker(void)
-{
- unregister_shrinker(&erofs_shrinker_info);
-}
-#endif /* !CONFIG_EROFS_FS_ZIP */
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 09d341675e89..60d2cf26e837 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -81,13 +81,13 @@ static int erofs_init_inode_xattrs(struct inode *inode)
it.pos = erofs_iloc(inode) + vi->inode_isize;
/* read in shared xattr array (non-atomic, see kmalloc below) */
- it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos), EROFS_KMAP);
+ it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP);
if (IS_ERR(it.kaddr)) {
ret = PTR_ERR(it.kaddr);
goto out_unlock;
}
- ih = it.kaddr + erofs_blkoff(sb, it.pos);
+ ih = it.kaddr;
vi->xattr_name_filter = le32_to_cpu(ih->h_name_filter);
vi->xattr_shared_count = ih->h_shared_count;
vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count,
@@ -102,16 +102,14 @@ static int erofs_init_inode_xattrs(struct inode *inode)
it.pos += sizeof(struct erofs_xattr_ibody_header);
for (i = 0; i < vi->xattr_shared_count; ++i) {
- it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos),
- EROFS_KMAP);
+ it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP);
if (IS_ERR(it.kaddr)) {
kfree(vi->xattr_shared_xattrs);
vi->xattr_shared_xattrs = NULL;
ret = PTR_ERR(it.kaddr);
goto out_unlock;
}
- vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *)
- (it.kaddr + erofs_blkoff(sb, it.pos)));
+ vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *)it.kaddr);
it.pos += sizeof(__le32);
}
erofs_put_metabuf(&it.buf);
@@ -168,7 +166,7 @@ const struct xattr_handler __maybe_unused erofs_xattr_security_handler = {
};
#endif
-const struct xattr_handler *erofs_xattr_handlers[] = {
+const struct xattr_handler * const erofs_xattr_handlers[] = {
&erofs_xattr_user_handler,
&erofs_xattr_trusted_handler,
#ifdef CONFIG_EROFS_FS_SECURITY
@@ -185,12 +183,11 @@ static int erofs_xattr_copy_to_buffer(struct erofs_xattr_iter *it,
void *src;
for (processed = 0; processed < len; processed += slice) {
- it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos),
- EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
- src = it->kaddr + erofs_blkoff(sb, it->pos);
+ src = it->kaddr;
slice = min_t(unsigned int, sb->s_blocksize -
erofs_blkoff(sb, it->pos), len - processed);
memcpy(it->buffer + it->buffer_ofs, src, slice);
@@ -208,8 +205,7 @@ static int erofs_listxattr_foreach(struct erofs_xattr_iter *it)
int err;
/* 1. handle xattr entry */
- entry = *(struct erofs_xattr_entry *)
- (it->kaddr + erofs_blkoff(it->sb, it->pos));
+ entry = *(struct erofs_xattr_entry *)it->kaddr;
it->pos += sizeof(struct erofs_xattr_entry);
base_index = entry.e_name_index;
@@ -259,8 +255,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it)
unsigned int slice, processed, value_sz;
/* 1. handle xattr entry */
- entry = *(struct erofs_xattr_entry *)
- (it->kaddr + erofs_blkoff(sb, it->pos));
+ entry = *(struct erofs_xattr_entry *)it->kaddr;
it->pos += sizeof(struct erofs_xattr_entry);
value_sz = le16_to_cpu(entry.e_value_size);
@@ -291,8 +286,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it)
/* 2. handle xattr name */
for (processed = 0; processed < entry.e_name_len; processed += slice) {
- it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos),
- EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
@@ -300,7 +294,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it)
sb->s_blocksize - erofs_blkoff(sb, it->pos),
entry.e_name_len - processed);
if (memcmp(it->name.name + it->infix_len + processed,
- it->kaddr + erofs_blkoff(sb, it->pos), slice))
+ it->kaddr, slice))
return -ENOATTR;
it->pos += slice;
}
@@ -336,13 +330,11 @@ static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it,
it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz;
while (remaining) {
- it->kaddr = erofs_bread(&it->buf, erofs_blknr(it->sb, it->pos),
- EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
- entry_sz = erofs_xattr_entry_size(it->kaddr +
- erofs_blkoff(it->sb, it->pos));
+ entry_sz = erofs_xattr_entry_size(it->kaddr);
/* xattr on-disk corruption: xattr entry beyond xattr_isize */
if (remaining < entry_sz) {
DBG_BUGON(1);
@@ -375,8 +367,7 @@ static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it,
for (i = 0; i < vi->xattr_shared_count; ++i) {
it->pos = erofs_pos(sb, sbi->xattr_blkaddr) +
vi->xattr_shared_xattrs[i] * sizeof(__le32);
- it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos),
- EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
@@ -416,7 +407,7 @@ int erofs_getxattr(struct inode *inode, int index, const char *name,
}
it.index = index;
- it.name = (struct qstr)QSTR_INIT(name, strlen(name));
+ it.name = QSTR(name);
if (it.name.len > EROFS_NAME_LEN)
return -ERANGE;
@@ -492,7 +483,7 @@ int erofs_xattr_prefixes_init(struct super_block *sb)
return -ENOMEM;
if (sbi->packed_inode)
- buf.inode = sbi->packed_inode;
+ buf.mapping = sbi->packed_inode->i_mapping;
else
erofs_init_metabuf(&buf, sb);
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
index f16283cb8c93..b246cd0e135e 100644
--- a/fs/erofs/xattr.h
+++ b/fs/erofs/xattr.h
@@ -23,7 +23,7 @@ static inline const char *erofs_xattr_prefix(unsigned int idx,
{
const struct xattr_handler *handler = NULL;
- static const struct xattr_handler *xattr_handler_map[] = {
+ static const struct xattr_handler * const xattr_handler_map[] = {
[EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler,
#ifdef CONFIG_EROFS_FS_POSIX_ACL
[EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access,
@@ -44,7 +44,7 @@ static inline const char *erofs_xattr_prefix(unsigned int idx,
return xattr_prefix(handler);
}
-extern const struct xattr_handler *erofs_xattr_handlers[];
+extern const struct xattr_handler * const erofs_xattr_handlers[];
int erofs_xattr_prefixes_init(struct super_block *sb);
void erofs_xattr_prefixes_cleanup(struct super_block *sb);
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 1c0e6167d8e7..f35d2eb0ed11 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -12,12 +12,6 @@
#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
#define Z_EROFS_INLINE_BVECS 2
-/*
- * let's leave a type here in case of introducing
- * another tagged pointer later.
- */
-typedef void *z_erofs_next_pcluster_t;
-
struct z_erofs_bvec {
struct page *page;
int offset;
@@ -44,11 +38,14 @@ __Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);
* A: Field should be accessed / updated in atomic for parallelized code.
*/
struct z_erofs_pcluster {
- struct erofs_workgroup obj;
struct mutex lock;
+ struct lockref lockref;
/* A: point to next chained pcluster or TAILs */
- z_erofs_next_pcluster_t next;
+ struct z_erofs_pcluster *next;
+
+ /* I: start block address of this pcluster */
+ erofs_off_t index;
/* L: the maximum decompression size of this round */
unsigned int length;
@@ -56,6 +53,9 @@ struct z_erofs_pcluster {
/* L: total number of bvecs */
unsigned int vcnt;
+ /* I: pcluster size (compressed size) in bytes */
+ unsigned int pclustersize;
+
/* I: page offset of start position of decompression */
unsigned short pageofs_out;
@@ -70,22 +70,14 @@ struct z_erofs_pcluster {
struct rcu_head rcu;
};
- union {
- /* I: physical cluster size in pages */
- unsigned short pclusterpages;
-
- /* I: tailpacking inline compressed size */
- unsigned short tailpacking_size;
- };
-
/* I: compression algorithm format */
unsigned char algorithmformat;
/* L: whether partial decompression or not */
bool partial;
- /* L: indicate several pageofs_outs or not */
- bool multibases;
+ /* L: whether extra buffer allocations are best-effort */
+ bool besteffort;
/* A: compressed bvecs (can be cached or inplaced pages) */
struct z_erofs_bvec compressed_bvecs[];
@@ -93,12 +85,11 @@ struct z_erofs_pcluster {
/* the end of a chain of pclusters */
#define Z_EROFS_PCLUSTER_TAIL ((void *) 0x700 + POISON_POINTER_DELTA)
-#define Z_EROFS_PCLUSTER_NIL (NULL)
struct z_erofs_decompressqueue {
struct super_block *sb;
+ struct z_erofs_pcluster *head;
atomic_t pending_bios;
- z_erofs_next_pcluster_t head;
union {
struct completion done;
@@ -110,57 +101,18 @@ struct z_erofs_decompressqueue {
static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
{
- return !pcl->obj.index;
+ return !pcl->index;
}
static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
{
- if (z_erofs_is_inline_pcluster(pcl))
- return 1;
- return pcl->pclusterpages;
+ return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT;
}
-/*
- * bit 30: I/O error occurred on this page
- * bit 0 - 29: remaining parts to complete this page
- */
-#define Z_EROFS_PAGE_EIO (1 << 30)
-
-static inline void z_erofs_onlinepage_init(struct page *page)
+#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
+static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo)
{
- union {
- atomic_t o;
- unsigned long v;
- } u = { .o = ATOMIC_INIT(1) };
-
- set_page_private(page, u.v);
- smp_wmb();
- SetPagePrivate(page);
-}
-
-static inline void z_erofs_onlinepage_split(struct page *page)
-{
- atomic_inc((atomic_t *)&page->private);
-}
-
-static void z_erofs_onlinepage_endio(struct page *page, int err)
-{
- int orig, v;
-
- DBG_BUGON(!PagePrivate(page));
-
- do {
- orig = atomic_read((atomic_t *)&page->private);
- v = (orig - 1) | (err ? Z_EROFS_PAGE_EIO : 0);
- } while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig);
-
- if (!(v & ~Z_EROFS_PAGE_EIO)) {
- set_page_private(page, 0);
- ClearPagePrivate(page);
- if (!(v & Z_EROFS_PAGE_EIO))
- SetPageUptodate(page);
- unlock_page(page);
- }
+ return fo->mapping == MNGD_MAPPING(sbi);
}
#define Z_EROFS_ONSTACK_PAGES 32
@@ -237,7 +189,8 @@ static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
struct page *nextpage = *candidate_bvpage;
if (!nextpage) {
- nextpage = erofs_allocpage(pagepool, GFP_NOFS);
+ nextpage = __erofs_allocpage(pagepool, GFP_KERNEL,
+ true);
if (!nextpage)
return -ENOMEM;
set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE);
@@ -298,21 +251,21 @@ static int z_erofs_create_pcluster_pool(void)
return 0;
}
-static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages)
+static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size)
{
- int i;
+ unsigned int nrpages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ struct z_erofs_pcluster_slab *pcs = pcluster_pool;
- for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
- struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
+ for (; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
struct z_erofs_pcluster *pcl;
if (nrpages > pcs->maxpages)
continue;
- pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS);
+ pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL);
if (!pcl)
return ERR_PTR(-ENOMEM);
- pcl->pclusterpages = nrpages;
+ pcl->pclustersize = size;
return pcl;
}
return ERR_PTR(-EINVAL);
@@ -451,81 +404,81 @@ static inline int erofs_cpu_hotplug_init(void) { return 0; }
static inline void erofs_cpu_hotplug_destroy(void) {}
#endif
-void z_erofs_exit_zip_subsystem(void)
+void z_erofs_exit_subsystem(void)
{
erofs_cpu_hotplug_destroy();
erofs_destroy_percpu_workers();
destroy_workqueue(z_erofs_workqueue);
z_erofs_destroy_pcluster_pool();
+ z_erofs_exit_decompressor();
}
-int __init z_erofs_init_zip_subsystem(void)
+int __init z_erofs_init_subsystem(void)
{
- int err = z_erofs_create_pcluster_pool();
+ int err = z_erofs_init_decompressor();
if (err)
- goto out_error_pcluster_pool;
+ goto err_decompressor;
+
+ err = z_erofs_create_pcluster_pool();
+ if (err)
+ goto err_pcluster_pool;
z_erofs_workqueue = alloc_workqueue("erofs_worker",
WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus());
if (!z_erofs_workqueue) {
err = -ENOMEM;
- goto out_error_workqueue_init;
+ goto err_workqueue_init;
}
err = erofs_init_percpu_workers();
if (err)
- goto out_error_pcpu_worker;
+ goto err_pcpu_worker;
err = erofs_cpu_hotplug_init();
if (err < 0)
- goto out_error_cpuhp_init;
+ goto err_cpuhp_init;
return err;
-out_error_cpuhp_init:
+err_cpuhp_init:
erofs_destroy_percpu_workers();
-out_error_pcpu_worker:
+err_pcpu_worker:
destroy_workqueue(z_erofs_workqueue);
-out_error_workqueue_init:
+err_workqueue_init:
z_erofs_destroy_pcluster_pool();
-out_error_pcluster_pool:
+err_pcluster_pool:
+ z_erofs_exit_decompressor();
+err_decompressor:
return err;
}
enum z_erofs_pclustermode {
+ /* It has previously been linked into another processing chain */
Z_EROFS_PCLUSTER_INFLIGHT,
/*
- * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it
- * could be dispatched into bypass queue later due to uptodated managed
- * pages. All related online pages cannot be reused for inplace I/O (or
- * bvpage) since it can be directly decoded without I/O submission.
+ * A weaker form of Z_EROFS_PCLUSTER_FOLLOWED; the difference is that it
+ * may be dispatched to the bypass queue later due to uptodated managed
+ * folios. All file-backed folios related to this pcluster cannot be
+ * reused for in-place I/O (or bvpage) since the pcluster may be decoded
+ * in a separate queue (and thus out of order).
*/
Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE,
/*
- * The pcluster was just linked to a decompression chain by us. It can
- * also be linked with the remaining pclusters, which means if the
- * processing page is the tail page of a pcluster, this pcluster can
- * safely use the whole page (since the previous pcluster is within the
- * same chain) for in-place I/O, as illustrated below:
- * ___________________________________________________
- * | tail (partial) page | head (partial) page |
- * | (of the current pcl) | (of the previous pcl) |
- * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____|
- *
- * [ (*) the page above can be used as inplace I/O. ]
+ * The pcluster has just been linked to our processing chain.
+ * File-backed folios (except for the head page) related to it can be
+ * used for in-place I/O (or bvpage).
*/
Z_EROFS_PCLUSTER_FOLLOWED,
};
-struct z_erofs_decompress_frontend {
+struct z_erofs_frontend {
struct inode *const inode;
struct erofs_map_blocks map;
struct z_erofs_bvec_iter biter;
struct page *pagepool;
struct page *candidate_bvpage;
- struct z_erofs_pcluster *pcl;
- z_erofs_next_pcluster_t owned_head;
+ struct z_erofs_pcluster *pcl, *head;
enum z_erofs_pclustermode mode;
erofs_off_t headoffset;
@@ -534,11 +487,11 @@ struct z_erofs_decompress_frontend {
unsigned int icur;
};
-#define DECOMPRESS_FRONTEND_INIT(__i) { \
- .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
- .mode = Z_EROFS_PCLUSTER_FOLLOWED }
+#define Z_EROFS_DEFINE_FRONTEND(fe, i, ho) struct z_erofs_frontend fe = { \
+ .inode = i, .head = Z_EROFS_PCLUSTER_TAIL, \
+ .mode = Z_EROFS_PCLUSTER_FOLLOWED, .headoffset = ho }
-static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
+static bool z_erofs_should_alloc_cache(struct z_erofs_frontend *fe)
{
unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy;
@@ -555,10 +508,11 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
return false;
}
-static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
+static void z_erofs_bind_cache(struct z_erofs_frontend *fe)
{
struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
struct z_erofs_pcluster *pcl = fe->pcl;
+ unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
bool shouldalloc = z_erofs_should_alloc_cache(fe);
bool standalone = true;
/*
@@ -569,42 +523,40 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
unsigned int i;
- if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
+ if (i_blocksize(fe->inode) != PAGE_SIZE ||
+ fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
return;
- for (i = 0; i < pcl->pclusterpages; ++i) {
- struct page *page;
- void *t; /* mark pages just found for debugging */
- struct page *newpage = NULL;
+ for (i = 0; i < pclusterpages; ++i) {
+ struct page *page, *newpage;
- /* the compressed page was loaded before */
+ /* Inaccurate check w/o locking to avoid unneeded lookups */
if (READ_ONCE(pcl->compressed_bvecs[i].page))
continue;
- page = find_get_page(mc, pcl->obj.index + i);
-
- if (page) {
- t = (void *)((unsigned long)page | 1);
- } else {
+ page = find_get_page(mc, pcl->index + i);
+ if (!page) {
/* I/O is needed, no possible to decompress directly */
standalone = false;
if (!shouldalloc)
continue;
/*
- * try to use cached I/O if page allocation
- * succeeds or fallback to in-place I/O instead
- * to avoid any direct reclaim.
+ * Try cached I/O if allocation succeeds or fallback to
+ * in-place I/O instead to avoid any direct reclaim.
*/
newpage = erofs_allocpage(&fe->pagepool, gfp);
if (!newpage)
continue;
set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
- t = (void *)((unsigned long)newpage | 1);
}
-
- if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
+ spin_lock(&pcl->lockref.lock);
+ if (!pcl->compressed_bvecs[i].page) {
+ pcl->compressed_bvecs[i].page = page ? page : newpage;
+ spin_unlock(&pcl->lockref.lock);
continue;
+ }
+ spin_unlock(&pcl->lockref.lock);
if (page)
put_page(page);
@@ -620,36 +572,29 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
}
-/* called by erofs_shrinker to get rid of all compressed_pages */
-int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
- struct erofs_workgroup *grp)
+/* (erofs_shrinker) disconnect cached encoded data with pclusters */
+static int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
+ struct z_erofs_pcluster *pcl)
{
- struct z_erofs_pcluster *const pcl =
- container_of(grp, struct z_erofs_pcluster, obj);
+ unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
+ struct folio *folio;
int i;
DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
- /*
- * refcount of workgroup is now freezed as 0,
- * therefore no need to worry about available decompression users.
- */
- for (i = 0; i < pcl->pclusterpages; ++i) {
- struct page *page = pcl->compressed_bvecs[i].page;
-
- if (!page)
- continue;
-
- /* block other users from reclaiming or migrating the page */
- if (!trylock_page(page))
- return -EBUSY;
-
- if (!erofs_page_is_managed(sbi, page))
- continue;
+ /* Each cached folio contains one page unless bs > ps is supported */
+ for (i = 0; i < pclusterpages; ++i) {
+ if (pcl->compressed_bvecs[i].page) {
+ folio = page_folio(pcl->compressed_bvecs[i].page);
+ /* Avoid reclaiming or migrating this folio */
+ if (!folio_trylock(folio))
+ return -EBUSY;
- /* barrier is implied in the following 'unlock_page' */
- WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
- detach_page_private(page);
- unlock_page(page);
+ if (!erofs_folio_is_managed(sbi, folio))
+ continue;
+ pcl->compressed_bvecs[i].page = NULL;
+ folio_detach_private(folio);
+ folio_unlock(folio);
+ }
}
return 0;
}
@@ -657,29 +602,27 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
{
struct z_erofs_pcluster *pcl = folio_get_private(folio);
+ struct z_erofs_bvec *bvec = pcl->compressed_bvecs;
+ struct z_erofs_bvec *end = bvec + z_erofs_pclusterpages(pcl);
bool ret;
- int i;
if (!folio_test_private(folio))
return true;
ret = false;
- spin_lock(&pcl->obj.lockref.lock);
- if (pcl->obj.lockref.count > 0)
- goto out;
-
- DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
- for (i = 0; i < pcl->pclusterpages; ++i) {
- if (pcl->compressed_bvecs[i].page == &folio->page) {
- WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
- ret = true;
- break;
+ spin_lock(&pcl->lockref.lock);
+ if (pcl->lockref.count <= 0) {
+ DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
+ for (; bvec < end; ++bvec) {
+ if (bvec->page && page_folio(bvec->page) == folio) {
+ bvec->page = NULL;
+ folio_detach_private(folio);
+ ret = true;
+ break;
+ }
}
}
- if (ret)
- folio_detach_private(folio);
-out:
- spin_unlock(&pcl->obj.lockref.lock);
+ spin_unlock(&pcl->lockref.lock);
return ret;
}
@@ -697,7 +640,7 @@ static void z_erofs_cache_invalidate_folio(struct folio *folio,
DBG_BUGON(stop > folio_size(folio) || stop < length);
if (offset == 0 && stop == folio_size(folio))
- while (!z_erofs_cache_release_folio(folio, GFP_NOFS))
+ while (!z_erofs_cache_release_folio(folio, 0))
cond_resched();
}
@@ -706,46 +649,40 @@ static const struct address_space_operations z_erofs_cache_aops = {
.invalidate_folio = z_erofs_cache_invalidate_folio,
};
-int erofs_init_managed_cache(struct super_block *sb)
+int z_erofs_init_super(struct super_block *sb)
{
struct inode *const inode = new_inode(sb);
if (!inode)
return -ENOMEM;
-
set_nlink(inode, 1);
inode->i_size = OFFSET_MAX;
inode->i_mapping->a_ops = &z_erofs_cache_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
EROFS_SB(sb)->managed_cache = inode;
+ xa_init(&EROFS_SB(sb)->managed_pslots);
return 0;
}
-static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
- struct z_erofs_bvec *bvec)
-{
- struct z_erofs_pcluster *const pcl = fe->pcl;
-
- while (fe->icur > 0) {
- if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
- NULL, bvec->page)) {
- pcl->compressed_bvecs[fe->icur] = *bvec;
- return true;
- }
- }
- return false;
-}
-
/* callers must be with pcluster lock held */
-static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
+static int z_erofs_attach_page(struct z_erofs_frontend *fe,
struct z_erofs_bvec *bvec, bool exclusive)
{
+ struct z_erofs_pcluster *pcl = fe->pcl;
int ret;
if (exclusive) {
/* give priority for inplaceio to use file pages first */
- if (z_erofs_try_inplace_io(fe, bvec))
+ spin_lock(&pcl->lockref.lock);
+ while (fe->icur > 0) {
+ if (pcl->compressed_bvecs[--fe->icur].page)
+ continue;
+ pcl->compressed_bvecs[fe->icur] = *bvec;
+ spin_unlock(&pcl->lockref.lock);
return 0;
+ }
+ spin_unlock(&pcl->lockref.lock);
+
/* otherwise, check if it can be used as a bvpage */
if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
!fe->candidate_bvpage)
@@ -757,52 +694,49 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
return ret;
}
-static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
+static bool z_erofs_get_pcluster(struct z_erofs_pcluster *pcl)
{
- struct z_erofs_pcluster *pcl = f->pcl;
- z_erofs_next_pcluster_t *owned_head = &f->owned_head;
-
- /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */
- if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
- *owned_head) == Z_EROFS_PCLUSTER_NIL) {
- *owned_head = &pcl->next;
- /* so we can attach this pcluster to our submission chain. */
- f->mode = Z_EROFS_PCLUSTER_FOLLOWED;
- return;
+ if (lockref_get_not_zero(&pcl->lockref))
+ return true;
+
+ spin_lock(&pcl->lockref.lock);
+ if (__lockref_is_dead(&pcl->lockref)) {
+ spin_unlock(&pcl->lockref.lock);
+ return false;
}
- /* type 2, it belongs to an ongoing chain */
- f->mode = Z_EROFS_PCLUSTER_INFLIGHT;
+ if (!pcl->lockref.count++)
+ atomic_long_dec(&erofs_global_shrink_cnt);
+ spin_unlock(&pcl->lockref.lock);
+ return true;
}
-static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
+static int z_erofs_register_pcluster(struct z_erofs_frontend *fe)
{
struct erofs_map_blocks *map = &fe->map;
+ struct super_block *sb = fe->inode->i_sb;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
bool ztailpacking = map->m_flags & EROFS_MAP_META;
- struct z_erofs_pcluster *pcl;
- struct erofs_workgroup *grp;
+ struct z_erofs_pcluster *pcl, *pre;
int err;
if (!(map->m_flags & EROFS_MAP_ENCODED) ||
- (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) {
+ (!ztailpacking && !erofs_blknr(sb, map->m_pa))) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
/* no available pcluster, let's allocate one */
- pcl = z_erofs_alloc_pcluster(ztailpacking ? 1 :
- map->m_plen >> PAGE_SHIFT);
+ pcl = z_erofs_alloc_pcluster(map->m_plen);
if (IS_ERR(pcl))
return PTR_ERR(pcl);
- spin_lock_init(&pcl->obj.lockref.lock);
- pcl->obj.lockref.count = 1; /* one ref for this request */
+ spin_lock_init(&pcl->lockref.lock);
+ pcl->lockref.count = 1; /* one ref for this request */
pcl->algorithmformat = map->m_algorithmformat;
pcl->length = 0;
pcl->partial = true;
-
- /* new pclusters should be claimed as type 1, primary and followed */
- pcl->next = fe->owned_head;
+ pcl->next = fe->head;
pcl->pageofs_out = map->m_la & ~PAGE_MASK;
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;
@@ -814,26 +748,31 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
DBG_BUGON(!mutex_trylock(&pcl->lock));
if (ztailpacking) {
- pcl->obj.index = 0; /* which indicates ztailpacking */
- pcl->tailpacking_size = map->m_plen;
+ pcl->index = 0; /* which indicates ztailpacking */
} else {
- pcl->obj.index = map->m_pa >> PAGE_SHIFT;
-
- grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj);
- if (IS_ERR(grp)) {
- err = PTR_ERR(grp);
- goto err_out;
+ pcl->index = erofs_blknr(sb, map->m_pa);
+ while (1) {
+ xa_lock(&sbi->managed_pslots);
+ pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->index,
+ NULL, pcl, GFP_KERNEL);
+ if (!pre || xa_is_err(pre) || z_erofs_get_pcluster(pre)) {
+ xa_unlock(&sbi->managed_pslots);
+ break;
+ }
+ /* try to legitimize the current in-tree one */
+ xa_unlock(&sbi->managed_pslots);
+ cond_resched();
}
-
- if (grp != &pcl->obj) {
- fe->pcl = container_of(grp,
- struct z_erofs_pcluster, obj);
+ if (xa_is_err(pre)) {
+ err = xa_err(pre);
+ goto err_out;
+ } else if (pre) {
+ fe->pcl = pre;
err = -EEXIST;
goto err_out;
}
}
- fe->owned_head = &pcl->next;
- fe->pcl = pcl;
+ fe->head = fe->pcl = pcl;
return 0;
err_out:
@@ -842,28 +781,36 @@ err_out:
return err;
}
-static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
+static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe)
{
struct erofs_map_blocks *map = &fe->map;
struct super_block *sb = fe->inode->i_sb;
erofs_blk_t blknr = erofs_blknr(sb, map->m_pa);
- struct erofs_workgroup *grp = NULL;
+ struct z_erofs_pcluster *pcl = NULL;
int ret;
DBG_BUGON(fe->pcl);
-
/* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
- DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
+ DBG_BUGON(!fe->head);
if (!(map->m_flags & EROFS_MAP_META)) {
- grp = erofs_find_workgroup(sb, blknr);
+ while (1) {
+ rcu_read_lock();
+ pcl = xa_load(&EROFS_SB(sb)->managed_pslots, blknr);
+ if (!pcl || z_erofs_get_pcluster(pcl)) {
+ DBG_BUGON(pcl && blknr != pcl->index);
+ rcu_read_unlock();
+ break;
+ }
+ rcu_read_unlock();
+ }
} else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
- if (grp) {
- fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
+ if (pcl) {
+ fe->pcl = pcl;
ret = -EEXIST;
} else {
ret = z_erofs_register_pcluster(fe);
@@ -871,7 +818,14 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
if (ret == -EEXIST) {
mutex_lock(&fe->pcl->lock);
- z_erofs_try_to_claim_pcluster(fe);
+ /* check if this pcluster hasn't been linked into any chain. */
+ if (!cmpxchg(&fe->pcl->next, NULL, fe->head)) {
+ /* .. so it can be attached to our submission chain */
+ fe->head = fe->pcl;
+ fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;
+ } else { /* otherwise, it belongs to an inflight chain */
+ fe->mode = Z_EROFS_PCLUSTER_INFLIGHT;
+ }
} else if (ret) {
return ret;
}
@@ -884,7 +838,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
} else {
void *mptr;
- mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP);
+ mptr = erofs_read_metabuf(&map->buf, sb, map->m_pa, EROFS_NO_KMAP);
if (IS_ERR(mptr)) {
ret = PTR_ERR(mptr);
erofs_err(sb, "failed to get inline data %d", ret);
@@ -900,25 +854,93 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
return 0;
}
-/*
- * keep in mind that no referenced pclusters will be freed
- * only after a RCU grace period.
- */
static void z_erofs_rcu_callback(struct rcu_head *head)
{
- z_erofs_free_pcluster(container_of(head,
- struct z_erofs_pcluster, rcu));
+ z_erofs_free_pcluster(container_of(head, struct z_erofs_pcluster, rcu));
+}
+
+static bool __erofs_try_to_release_pcluster(struct erofs_sb_info *sbi,
+ struct z_erofs_pcluster *pcl)
+{
+ if (pcl->lockref.count)
+ return false;
+
+ /*
+ * Note that all cached folios should be detached before deleted from
+ * the XArray. Otherwise some folios could be still attached to the
+ * orphan old pcluster when the new one is available in the tree.
+ */
+ if (erofs_try_to_free_all_cached_folios(sbi, pcl))
+ return false;
+
+ /*
+ * It's impossible to fail after the pcluster is freezed, but in order
+ * to avoid some race conditions, add a DBG_BUGON to observe this.
+ */
+ DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->index) != pcl);
+
+ lockref_mark_dead(&pcl->lockref);
+ return true;
+}
+
+static bool erofs_try_to_release_pcluster(struct erofs_sb_info *sbi,
+ struct z_erofs_pcluster *pcl)
+{
+ bool free;
+
+ spin_lock(&pcl->lockref.lock);
+ free = __erofs_try_to_release_pcluster(sbi, pcl);
+ spin_unlock(&pcl->lockref.lock);
+ if (free) {
+ atomic_long_dec(&erofs_global_shrink_cnt);
+ call_rcu(&pcl->rcu, z_erofs_rcu_callback);
+ }
+ return free;
+}
+
+unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, unsigned long nr)
+{
+ struct z_erofs_pcluster *pcl;
+ unsigned long index, freed = 0;
+
+ xa_lock(&sbi->managed_pslots);
+ xa_for_each(&sbi->managed_pslots, index, pcl) {
+ /* try to shrink each valid pcluster */
+ if (!erofs_try_to_release_pcluster(sbi, pcl))
+ continue;
+ xa_unlock(&sbi->managed_pslots);
+
+ ++freed;
+ if (!--nr)
+ return freed;
+ xa_lock(&sbi->managed_pslots);
+ }
+ xa_unlock(&sbi->managed_pslots);
+ return freed;
}
-void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
+static void z_erofs_put_pcluster(struct erofs_sb_info *sbi,
+ struct z_erofs_pcluster *pcl, bool try_free)
{
- struct z_erofs_pcluster *const pcl =
- container_of(grp, struct z_erofs_pcluster, obj);
+ bool free = false;
- call_rcu(&pcl->rcu, z_erofs_rcu_callback);
+ if (lockref_put_or_lock(&pcl->lockref))
+ return;
+
+ DBG_BUGON(__lockref_is_dead(&pcl->lockref));
+ if (!--pcl->lockref.count) {
+ if (try_free && xa_trylock(&sbi->managed_pslots)) {
+ free = __erofs_try_to_release_pcluster(sbi, pcl);
+ xa_unlock(&sbi->managed_pslots);
+ }
+ atomic_long_add(!free, &erofs_global_shrink_cnt);
+ }
+ spin_unlock(&pcl->lockref.lock);
+ if (free)
+ call_rcu(&pcl->rcu, z_erofs_rcu_callback);
}
-static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
+static void z_erofs_pcluster_end(struct z_erofs_frontend *fe)
{
struct z_erofs_pcluster *pcl = fe->pcl;
@@ -931,17 +953,13 @@ static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
if (fe->candidate_bvpage)
fe->candidate_bvpage = NULL;
- /*
- * if all pending pages are added, don't hold its reference
- * any longer if the pcluster isn't hosted by ourselves.
- */
+ /* Drop refcount if it doesn't belong to our processing chain */
if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE)
- erofs_workgroup_put(&pcl->obj);
-
+ z_erofs_put_pcluster(EROFS_I_SB(fe->inode), pcl, false);
fe->pcl = NULL;
}
-static int z_erofs_read_fragment(struct super_block *sb, struct page *page,
+static int z_erofs_read_fragment(struct super_block *sb, struct folio *folio,
unsigned int cur, unsigned int end, erofs_off_t pos)
{
struct inode *packed_inode = EROFS_SB(sb)->packed_inode;
@@ -952,115 +970,110 @@ static int z_erofs_read_fragment(struct super_block *sb, struct page *page,
if (!packed_inode)
return -EFSCORRUPTED;
- buf.inode = packed_inode;
+ buf.mapping = packed_inode->i_mapping;
for (; cur < end; cur += cnt, pos += cnt) {
- cnt = min_t(unsigned int, end - cur,
- sb->s_blocksize - erofs_blkoff(sb, pos));
- src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP);
+ cnt = min(end - cur, sb->s_blocksize - erofs_blkoff(sb, pos));
+ src = erofs_bread(&buf, pos, EROFS_KMAP);
if (IS_ERR(src)) {
erofs_put_metabuf(&buf);
return PTR_ERR(src);
}
- memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt);
+ memcpy_to_folio(folio, cur, src, cnt);
}
erofs_put_metabuf(&buf);
return 0;
}
-static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
- struct page *page)
+static int z_erofs_scan_folio(struct z_erofs_frontend *f,
+ struct folio *folio, bool ra)
{
- struct inode *const inode = fe->inode;
- struct erofs_map_blocks *const map = &fe->map;
- const loff_t offset = page_offset(page);
- bool tight = true, exclusive;
- unsigned int cur, end, len, split;
+ struct inode *const inode = f->inode;
+ struct erofs_map_blocks *const map = &f->map;
+ const loff_t offset = folio_pos(folio);
+ const unsigned int bs = i_blocksize(inode);
+ unsigned int end = folio_size(folio), split = 0, cur, pgs;
+ bool tight, excl;
int err = 0;
- z_erofs_onlinepage_init(page);
-
- split = 0;
- end = PAGE_SIZE;
-repeat:
- if (offset + end - 1 < map->m_la ||
- offset + end - 1 >= map->m_la + map->m_llen) {
- z_erofs_pcluster_end(fe);
- map->m_la = offset + end - 1;
- map->m_llen = 0;
- err = z_erofs_map_blocks_iter(inode, map, 0);
- if (err)
- goto out;
- }
-
- cur = offset > map->m_la ? 0 : map->m_la - offset;
- /* bump split parts first to avoid several separate cases */
- ++split;
-
- if (!(map->m_flags & EROFS_MAP_MAPPED)) {
- zero_user_segment(page, cur, end);
- tight = false;
- goto next_part;
- }
-
- if (map->m_flags & EROFS_MAP_FRAGMENT) {
- erofs_off_t fpos = offset + cur - map->m_la;
-
- len = min_t(unsigned int, map->m_llen - fpos, end - cur);
- err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len,
- EROFS_I(inode)->z_fragmentoff + fpos);
- if (err)
- goto out;
- tight = false;
- goto next_part;
- }
+ tight = (bs == PAGE_SIZE);
+ erofs_onlinefolio_init(folio);
+ do {
+ if (offset + end - 1 < map->m_la ||
+ offset + end - 1 >= map->m_la + map->m_llen) {
+ z_erofs_pcluster_end(f);
+ map->m_la = offset + end - 1;
+ map->m_llen = 0;
+ err = z_erofs_map_blocks_iter(inode, map, 0);
+ if (err)
+ break;
+ }
- if (!fe->pcl) {
- err = z_erofs_pcluster_begin(fe);
- if (err)
- goto out;
- }
+ cur = offset > map->m_la ? 0 : map->m_la - offset;
+ pgs = round_down(cur, PAGE_SIZE);
+ /* bump split parts first to avoid several separate cases */
+ ++split;
+
+ if (!(map->m_flags & EROFS_MAP_MAPPED)) {
+ folio_zero_segment(folio, cur, end);
+ tight = false;
+ } else if (map->m_flags & __EROFS_MAP_FRAGMENT) {
+ erofs_off_t fpos = offset + cur - map->m_la;
+
+ err = z_erofs_read_fragment(inode->i_sb, folio, cur,
+ cur + min(map->m_llen - fpos, end - cur),
+ EROFS_I(inode)->z_fragmentoff + fpos);
+ if (err)
+ break;
+ tight = false;
+ } else {
+ if (!f->pcl) {
+ err = z_erofs_pcluster_begin(f);
+ if (err)
+ break;
+ f->pcl->besteffort |= !ra;
+ }
- /*
- * Ensure the current partial page belongs to this submit chain rather
- * than other concurrent submit chains or the noio(bypass) chain since
- * those chains are handled asynchronously thus the page cannot be used
- * for inplace I/O or bvpage (should be processed in a strict order.)
- */
- tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
- exclusive = (!cur && ((split <= 1) || tight));
- if (cur)
- tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
-
- err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) {
- .page = page,
- .offset = offset - map->m_la,
- .end = end,
- }), exclusive);
- if (err)
- goto out;
-
- z_erofs_onlinepage_split(page);
- if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
- fe->pcl->multibases = true;
- if (fe->pcl->length < offset + end - map->m_la) {
- fe->pcl->length = offset + end - map->m_la;
- fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
- }
- if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
- !(map->m_flags & EROFS_MAP_PARTIAL_REF) &&
- fe->pcl->length == map->m_llen)
- fe->pcl->partial = false;
-next_part:
- /* shorten the remaining extent to update progress */
- map->m_llen = offset + cur - map->m_la;
- map->m_flags &= ~EROFS_MAP_FULL_MAPPED;
-
- end = cur;
- if (end > 0)
- goto repeat;
+ pgs = round_down(end - 1, PAGE_SIZE);
+ /*
+ * Ensure this partial page belongs to this submit chain
+ * rather than other concurrent submit chains or
+ * noio(bypass) chains since those chains are handled
+ * asynchronously thus it cannot be used for inplace I/O
+ * or bvpage (should be processed in the strict order.)
+ */
+ tight &= (f->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
+ excl = false;
+ if (cur <= pgs) {
+ excl = (split <= 1) || tight;
+ cur = pgs;
+ }
-out:
- z_erofs_onlinepage_endio(page, err);
+ err = z_erofs_attach_page(f, &((struct z_erofs_bvec) {
+ .page = folio_page(folio, pgs >> PAGE_SHIFT),
+ .offset = offset + pgs - map->m_la,
+ .end = end - pgs, }), excl);
+ if (err)
+ break;
+
+ erofs_onlinefolio_split(folio);
+ if (f->pcl->length < offset + end - map->m_la) {
+ f->pcl->length = offset + end - map->m_la;
+ f->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
+ }
+ if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
+ !(map->m_flags & EROFS_MAP_PARTIAL_REF) &&
+ f->pcl->length == map->m_llen)
+ f->pcl->partial = false;
+ }
+ /* shorten the remaining extent to update progress */
+ map->m_llen = offset + cur - map->m_la;
+ map->m_flags &= ~EROFS_MAP_FULL_MAPPED;
+ if (cur <= pgs) {
+ split = cur < pgs;
+ tight = (bs == PAGE_SIZE);
+ }
+ } while ((end = cur) > 0);
+ erofs_onlinefolio_end(folio, err, false);
return err;
}
@@ -1081,14 +1094,13 @@ static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi,
static bool z_erofs_page_is_invalidated(struct page *page)
{
- return !page->mapping && !z_erofs_is_shortlived_page(page);
+ return !page_folio(page)->mapping && !z_erofs_is_shortlived_page(page);
}
-struct z_erofs_decompress_backend {
+struct z_erofs_backend {
struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES];
struct super_block *sb;
struct z_erofs_pcluster *pcl;
-
/* pages with the longest decompressed length for deduplication */
struct page **decompressed_pages;
/* pages to keep the compressed data */
@@ -1097,6 +1109,8 @@ struct z_erofs_decompress_backend {
struct list_head decompressed_secondary_bvecs;
struct page **pagepool;
unsigned int onstack_used, nr_pages;
+ /* indicate if temporary copies should be preserved for later use */
+ bool keepxcpy;
};
struct z_erofs_bvec_item {
@@ -1104,21 +1118,23 @@ struct z_erofs_bvec_item {
struct list_head list;
};
-static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be,
+static void z_erofs_do_decompressed_bvec(struct z_erofs_backend *be,
struct z_erofs_bvec *bvec)
{
+ int poff = bvec->offset + be->pcl->pageofs_out;
struct z_erofs_bvec_item *item;
- unsigned int pgnr;
-
- if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK) &&
- (bvec->end == PAGE_SIZE ||
- bvec->offset + bvec->end == be->pcl->length)) {
- pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT;
- DBG_BUGON(pgnr >= be->nr_pages);
- if (!be->decompressed_pages[pgnr]) {
- be->decompressed_pages[pgnr] = bvec->page;
+ struct page **page;
+
+ if (!(poff & ~PAGE_MASK) && (bvec->end == PAGE_SIZE ||
+ bvec->offset + bvec->end == be->pcl->length)) {
+ DBG_BUGON((poff >> PAGE_SHIFT) >= be->nr_pages);
+ page = be->decompressed_pages + (poff >> PAGE_SHIFT);
+ if (!*page) {
+ *page = bvec->page;
return;
}
+ } else {
+ be->keepxcpy = true;
}
/* (cold path) one pcluster is requested multiple times */
@@ -1127,8 +1143,7 @@ static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be,
list_add(&item->list, &be->decompressed_secondary_bvecs);
}
-static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
- int err)
+static void z_erofs_fill_other_copies(struct z_erofs_backend *be, int err)
{
unsigned int off0 = be->pcl->pageofs_out;
struct list_head *p, *n;
@@ -1163,13 +1178,13 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
cur += len;
}
kunmap_local(dst);
- z_erofs_onlinepage_endio(bvi->bvec.page, err);
+ erofs_onlinefolio_end(page_folio(bvi->bvec.page), err, true);
list_del(p);
kfree(bvi);
}
}
-static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be)
+static void z_erofs_parse_out_bvecs(struct z_erofs_backend *be)
{
struct z_erofs_pcluster *pcl = be->pcl;
struct z_erofs_bvec_iter biter;
@@ -1194,8 +1209,7 @@ static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be)
z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
}
-static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be,
- bool *overlapped)
+static int z_erofs_parse_in_bvecs(struct z_erofs_backend *be, bool *overlapped)
{
struct z_erofs_pcluster *pcl = be->pcl;
unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
@@ -1206,48 +1220,41 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be,
struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i];
struct page *page = bvec->page;
- /* compressed pages ought to be present before decompressing */
- if (!page) {
- DBG_BUGON(1);
+ /* compressed data ought to be valid when decompressing */
+ if (IS_ERR(page) || !page) {
+ bvec->page = NULL; /* clear the failure reason */
+ err = page ? PTR_ERR(page) : -EIO;
continue;
}
be->compressed_pages[i] = page;
- if (z_erofs_is_inline_pcluster(pcl)) {
+ if (z_erofs_is_inline_pcluster(pcl) ||
+ erofs_folio_is_managed(EROFS_SB(be->sb), page_folio(page))) {
if (!PageUptodate(page))
err = -EIO;
continue;
}
DBG_BUGON(z_erofs_page_is_invalidated(page));
- if (!z_erofs_is_shortlived_page(page)) {
- if (erofs_page_is_managed(EROFS_SB(be->sb), page)) {
- if (!PageUptodate(page))
- err = -EIO;
- continue;
- }
- z_erofs_do_decompressed_bvec(be, bvec);
- *overlapped = true;
- }
+ if (z_erofs_is_shortlived_page(page))
+ continue;
+ z_erofs_do_decompressed_bvec(be, bvec);
+ *overlapped = true;
}
-
- if (err)
- return err;
- return 0;
+ return err;
}
-static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
- int err)
+static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)
{
struct erofs_sb_info *const sbi = EROFS_SB(be->sb);
struct z_erofs_pcluster *pcl = be->pcl;
unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
- const struct z_erofs_decompressor *decompressor =
- &erofs_decompressors[pcl->algorithmformat];
- unsigned int i, inputsize;
- int err2;
+ const struct z_erofs_decompressor *decomp =
+ z_erofs_decomp[pcl->algorithmformat];
+ int i, j, jtop, err2;
struct page *page;
bool overlapped;
+ bool try_free = true;
mutex_lock(&pcl->lock);
be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT;
@@ -1279,41 +1286,38 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
err2 = z_erofs_parse_in_bvecs(be, &overlapped);
if (err2)
err = err2;
- if (err)
- goto out;
-
- if (z_erofs_is_inline_pcluster(pcl))
- inputsize = pcl->tailpacking_size;
- else
- inputsize = pclusterpages * PAGE_SIZE;
-
- err = decompressor->decompress(&(struct z_erofs_decompress_req) {
+ if (!err)
+ err = decomp->decompress(&(struct z_erofs_decompress_req) {
.sb = be->sb,
.in = be->compressed_pages,
.out = be->decompressed_pages,
.pageofs_in = pcl->pageofs_in,
.pageofs_out = pcl->pageofs_out,
- .inputsize = inputsize,
+ .inputsize = pcl->pclustersize,
.outputsize = pcl->length,
.alg = pcl->algorithmformat,
.inplace_io = overlapped,
.partial_decoding = pcl->partial,
- .fillgaps = pcl->multibases,
+ .fillgaps = be->keepxcpy,
+ .gfp = pcl->besteffort ? GFP_KERNEL :
+ GFP_NOWAIT | __GFP_NORETRY
}, be->pagepool);
-out:
/* must handle all compressed pages before actual file pages */
if (z_erofs_is_inline_pcluster(pcl)) {
page = pcl->compressed_bvecs[0].page;
WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL);
put_page(page);
} else {
+ /* managed folios are still left in compressed_bvecs[] */
for (i = 0; i < pclusterpages; ++i) {
- /* consider shortlived pages added when decompressing */
page = be->compressed_pages[i];
-
- if (erofs_page_is_managed(sbi, page))
+ if (!page)
continue;
+ if (erofs_folio_is_managed(sbi, page_folio(page))) {
+ try_free = false;
+ continue;
+ }
(void)z_erofs_put_shortlivedpage(be->pagepool, page);
WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
}
@@ -1321,59 +1325,70 @@ out:
if (be->compressed_pages < be->onstack_pages ||
be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES)
kvfree(be->compressed_pages);
- z_erofs_fill_other_copies(be, err);
+ jtop = 0;
+ z_erofs_fill_other_copies(be, err);
for (i = 0; i < be->nr_pages; ++i) {
page = be->decompressed_pages[i];
if (!page)
continue;
DBG_BUGON(z_erofs_page_is_invalidated(page));
-
- /* recycle all individual short-lived pages */
- if (z_erofs_put_shortlivedpage(be->pagepool, page))
+ if (!z_erofs_is_shortlived_page(page)) {
+ erofs_onlinefolio_end(page_folio(page), err, true);
continue;
- z_erofs_onlinepage_endio(page, err);
+ }
+ if (pcl->algorithmformat != Z_EROFS_COMPRESSION_LZ4) {
+ erofs_pagepool_add(be->pagepool, page);
+ continue;
+ }
+ for (j = 0; j < jtop && be->decompressed_pages[j] != page; ++j)
+ ;
+ if (j >= jtop) /* this bounce page is newly detected */
+ be->decompressed_pages[jtop++] = page;
}
-
+ while (jtop)
+ erofs_pagepool_add(be->pagepool,
+ be->decompressed_pages[--jtop]);
if (be->decompressed_pages != be->onstack_pages)
kvfree(be->decompressed_pages);
pcl->length = 0;
pcl->partial = true;
- pcl->multibases = false;
+ pcl->besteffort = false;
pcl->bvset.nextpage = NULL;
pcl->vcnt = 0;
/* pcluster lock MUST be taken before the following line */
- WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
+ WRITE_ONCE(pcl->next, NULL);
mutex_unlock(&pcl->lock);
+
+ if (z_erofs_is_inline_pcluster(pcl))
+ z_erofs_free_pcluster(pcl);
+ else
+ z_erofs_put_pcluster(sbi, pcl, try_free);
return err;
}
-static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
- struct page **pagepool)
+static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
+ struct page **pagepool)
{
- struct z_erofs_decompress_backend be = {
+ struct z_erofs_backend be = {
.sb = io->sb,
.pagepool = pagepool,
.decompressed_secondary_bvecs =
LIST_HEAD_INIT(be.decompressed_secondary_bvecs),
+ .pcl = io->head,
};
- z_erofs_next_pcluster_t owned = io->head;
+ struct z_erofs_pcluster *next;
+ int err = io->eio ? -EIO : 0;
- while (owned != Z_EROFS_PCLUSTER_TAIL) {
- DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);
-
- be.pcl = container_of(owned, struct z_erofs_pcluster, next);
- owned = READ_ONCE(be.pcl->next);
-
- z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0);
- if (z_erofs_is_inline_pcluster(be.pcl))
- z_erofs_free_pcluster(be.pcl);
- else
- erofs_workgroup_put(&be.pcl->obj);
+ for (; be.pcl != Z_EROFS_PCLUSTER_TAIL; be.pcl = next) {
+ DBG_BUGON(!be.pcl);
+ next = READ_ONCE(be.pcl->next);
+ err = z_erofs_decompress_pcluster(&be, err) ?: err;
}
+ return err;
}
static void z_erofs_decompressqueue_work(struct work_struct *work)
@@ -1435,113 +1450,112 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
z_erofs_decompressqueue_work(&io->u.work);
}
-static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
- unsigned int nr,
- struct page **pagepool,
- struct address_space *mc)
+static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
+ struct z_erofs_frontend *f,
+ struct z_erofs_pcluster *pcl,
+ unsigned int nr,
+ struct address_space *mc)
{
- const pgoff_t index = pcl->obj.index;
gfp_t gfp = mapping_gfp_mask(mc);
bool tocache = false;
-
+ struct z_erofs_bvec zbv;
struct address_space *mapping;
- struct page *oldpage, *page;
- int justfound;
+ struct folio *folio;
+ struct page *page;
+ int bs = i_blocksize(f->inode);
+ /* Except for inplace folios, the entire folio can be used for I/Os */
+ bvec->bv_offset = 0;
+ bvec->bv_len = PAGE_SIZE;
repeat:
- page = READ_ONCE(pcl->compressed_bvecs[nr].page);
- oldpage = page;
-
- if (!page)
- goto out_allocpage;
+ spin_lock(&pcl->lockref.lock);
+ zbv = pcl->compressed_bvecs[nr];
+ spin_unlock(&pcl->lockref.lock);
+ if (!zbv.page)
+ goto out_allocfolio;
- justfound = (unsigned long)page & 1UL;
- page = (struct page *)((unsigned long)page & ~1UL);
+ bvec->bv_page = zbv.page;
+ DBG_BUGON(z_erofs_is_shortlived_page(bvec->bv_page));
+ folio = page_folio(zbv.page);
/*
- * preallocated cached pages, which is used to avoid direct reclaim
- * otherwise, it will go inplace I/O path instead.
+ * Handle preallocated cached folios. We tried to allocate such folios
+ * without triggering direct reclaim. If allocation failed, inplace
+ * file-backed folios will be used instead.
*/
- if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
- WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);
- set_page_private(page, 0);
+ if (folio->private == (void *)Z_EROFS_PREALLOCATED_PAGE) {
tocache = true;
goto out_tocache;
}
- mapping = READ_ONCE(page->mapping);
+ mapping = READ_ONCE(folio->mapping);
/*
- * file-backed online pages in plcuster are all locked steady,
- * therefore it is impossible for `mapping' to be NULL.
+ * File-backed folios for inplace I/Os are all locked steady,
+ * therefore it is impossible for `mapping` to be NULL.
*/
- if (mapping && mapping != mc)
- /* ought to be unmanaged pages */
- goto out;
-
- /* directly return for shortlived page as well */
- if (z_erofs_is_shortlived_page(page))
- goto out;
-
- lock_page(page);
-
- /* only true if page reclaim goes wrong, should never happen */
- DBG_BUGON(justfound && PagePrivate(page));
-
- /* the page is still in manage cache */
- if (page->mapping == mc) {
- WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);
-
- if (!PagePrivate(page)) {
- /*
- * impossible to be !PagePrivate(page) for
- * the current restriction as well if
- * the page is already in compressed_bvecs[].
- */
- DBG_BUGON(!justfound);
+ if (mapping && mapping != mc) {
+ if (zbv.offset < 0)
+ bvec->bv_offset = round_up(-zbv.offset, bs);
+ bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
+ return;
+ }
- justfound = 0;
- set_page_private(page, (unsigned long)pcl);
- SetPagePrivate(page);
+ folio_lock(folio);
+ if (likely(folio->mapping == mc)) {
+ /*
+ * The cached folio is still in managed cache but without
+ * a valid `->private` pcluster hint. Let's reconnect them.
+ */
+ if (!folio_test_private(folio)) {
+ folio_attach_private(folio, pcl);
+ /* compressed_bvecs[] already takes a ref before */
+ folio_put(folio);
}
-
- /* no need to submit io if it is already up-to-date */
- if (PageUptodate(page)) {
- unlock_page(page);
- page = NULL;
+ if (likely(folio->private == pcl)) {
+ /* don't submit cache I/Os again if already uptodate */
+ if (folio_test_uptodate(folio)) {
+ folio_unlock(folio);
+ bvec->bv_page = NULL;
+ }
+ return;
}
- goto out;
+ /*
+ * Already linked with another pcluster, which only appears in
+ * crafted images by fuzzers for now. But handle this anyway.
+ */
+ tocache = false; /* use temporary short-lived pages */
+ } else {
+ DBG_BUGON(1); /* referenced managed folios can't be truncated */
+ tocache = true;
}
-
- /*
- * the managed page has been truncated, it's unsafe to
- * reuse this one, let's allocate a new cache-managed page.
- */
- DBG_BUGON(page->mapping);
- DBG_BUGON(!justfound);
-
- tocache = true;
- unlock_page(page);
- put_page(page);
-out_allocpage:
- page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
- if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page,
- oldpage, page)) {
- erofs_pagepool_add(pagepool, page);
+ folio_unlock(folio);
+ folio_put(folio);
+out_allocfolio:
+ page = __erofs_allocpage(&f->pagepool, gfp, true);
+ spin_lock(&pcl->lockref.lock);
+ if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) {
+ if (page)
+ erofs_pagepool_add(&f->pagepool, page);
+ spin_unlock(&pcl->lockref.lock);
cond_resched();
goto repeat;
}
+ pcl->compressed_bvecs[nr].page = page ? page : ERR_PTR(-ENOMEM);
+ spin_unlock(&pcl->lockref.lock);
+ bvec->bv_page = page;
+ if (!page)
+ return;
+ folio = page_folio(page);
out_tocache:
- if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) {
- /* turn into temporary page if fails (1 ref) */
- set_page_private(page, Z_EROFS_SHORTLIVED_PAGE);
- goto out;
+ if (!tocache || bs != PAGE_SIZE ||
+ filemap_add_folio(mc, folio, pcl->index + nr, gfp)) {
+ /* turn into a temporary shortlived folio (1 ref) */
+ folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
+ return;
}
- attach_page_private(page, pcl);
- /* drop a refcount added by allocpage (then we have 2 refs here) */
- put_page(page);
-
-out: /* the only exit (for tracing and debugging) */
- return page;
+ folio_attach_private(folio, pcl);
+ /* drop a refcount added by allocpage (then 2 refs in total here) */
+ folio_put(folio);
}
static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb,
@@ -1581,67 +1595,58 @@ enum {
NR_JOBQUEUES,
};
-static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
- z_erofs_next_pcluster_t qtail[],
- z_erofs_next_pcluster_t owned_head)
+static void z_erofs_move_to_bypass_queue(struct z_erofs_pcluster *pcl,
+ struct z_erofs_pcluster *next,
+ struct z_erofs_pcluster **qtail[])
{
- z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
- z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];
-
WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL);
-
- WRITE_ONCE(*submit_qtail, owned_head);
- WRITE_ONCE(*bypass_qtail, &pcl->next);
-
+ WRITE_ONCE(*qtail[JQ_SUBMIT], next);
+ WRITE_ONCE(*qtail[JQ_BYPASS], pcl);
qtail[JQ_BYPASS] = &pcl->next;
}
-static void z_erofs_decompressqueue_endio(struct bio *bio)
+static void z_erofs_endio(struct bio *bio)
{
struct z_erofs_decompressqueue *q = bio->bi_private;
blk_status_t err = bio->bi_status;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct page *page = bvec->bv_page;
+ bio_for_each_folio_all(fi, bio) {
+ struct folio *folio = fi.folio;
- DBG_BUGON(PageUptodate(page));
- DBG_BUGON(z_erofs_page_is_invalidated(page));
+ DBG_BUGON(folio_test_uptodate(folio));
+ DBG_BUGON(z_erofs_page_is_invalidated(&folio->page));
+ if (!erofs_folio_is_managed(EROFS_SB(q->sb), folio))
+ continue;
- if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
- if (!err)
- SetPageUptodate(page);
- unlock_page(page);
- }
+ if (!err)
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
}
if (err)
q->eio = true;
z_erofs_decompress_kickoff(q, -1);
- bio_put(bio);
+ if (bio->bi_bdev)
+ bio_put(bio);
}
-static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
+static void z_erofs_submit_queue(struct z_erofs_frontend *f,
struct z_erofs_decompressqueue *fgq,
bool *force_fg, bool readahead)
{
struct super_block *sb = f->inode->i_sb;
struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb));
- z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
+ struct z_erofs_pcluster **qtail[NR_JOBQUEUES];
struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
- z_erofs_next_pcluster_t owned_head = f->owned_head;
+ struct z_erofs_pcluster *pcl, *next;
/* bio is NULL initially, so no need to initialize last_{index,bdev} */
- pgoff_t last_index;
- struct block_device *last_bdev;
+ erofs_off_t last_pa;
unsigned int nr_bios = 0;
struct bio *bio = NULL;
unsigned long pflags;
int memstall = 0;
- /*
- * if managed cache is enabled, bypass jobqueue is needed,
- * no need to read from device for all pclusters in this queue.
- */
+ /* No need to read from device for pclusters in the bypass queue. */
q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL);
q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg);
@@ -1649,45 +1654,42 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;
/* by default, all need io submission */
- q[JQ_SUBMIT]->head = owned_head;
+ q[JQ_SUBMIT]->head = next = f->head;
do {
struct erofs_map_dev mdev;
- struct z_erofs_pcluster *pcl;
- pgoff_t cur, end;
+ erofs_off_t cur, end;
+ struct bio_vec bvec;
unsigned int i = 0;
bool bypass = true;
- DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);
- pcl = container_of(owned_head, struct z_erofs_pcluster, next);
- owned_head = READ_ONCE(pcl->next);
-
+ pcl = next;
+ next = READ_ONCE(pcl->next);
if (z_erofs_is_inline_pcluster(pcl)) {
- move_to_bypass_jobqueue(pcl, qtail, owned_head);
+ z_erofs_move_to_bypass_queue(pcl, next, qtail);
continue;
}
/* no device id here, thus it will always succeed */
mdev = (struct erofs_map_dev) {
- .m_pa = erofs_pos(sb, pcl->obj.index),
+ .m_pa = erofs_pos(sb, pcl->index),
};
(void)erofs_map_dev(sb, &mdev);
- cur = erofs_blknr(sb, mdev.m_pa);
- end = cur + pcl->pclusterpages;
-
+ cur = mdev.m_pa;
+ end = cur + pcl->pclustersize;
do {
- struct page *page;
+ bvec.bv_page = NULL;
+ if (bio && (cur != last_pa ||
+ bio->bi_bdev != mdev.m_bdev)) {
+drain_io:
+ if (erofs_is_fileio_mode(EROFS_SB(sb)))
+ erofs_fileio_submit_bio(bio);
+ else if (erofs_is_fscache_mode(sb))
+ erofs_fscache_submit_bio(bio);
+ else
+ submit_bio(bio);
- page = pickup_page_for_submission(pcl, i++,
- &f->pagepool, mc);
- if (!page)
- continue;
-
- if (bio && (cur != last_index + 1 ||
- last_bdev != mdev.m_bdev)) {
-submit_bio_retry:
- submit_bio(bio);
if (memstall) {
psi_memstall_leave(&pflags);
memstall = 0;
@@ -1695,43 +1697,60 @@ submit_bio_retry:
bio = NULL;
}
- if (unlikely(PageWorkingset(page)) && !memstall) {
+ if (!bvec.bv_page) {
+ z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc);
+ if (!bvec.bv_page)
+ continue;
+ if (cur + bvec.bv_len > end)
+ bvec.bv_len = end - cur;
+ DBG_BUGON(bvec.bv_len < sb->s_blocksize);
+ }
+
+ if (unlikely(PageWorkingset(bvec.bv_page)) &&
+ !memstall) {
psi_memstall_enter(&pflags);
memstall = 1;
}
if (!bio) {
- bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
- REQ_OP_READ, GFP_NOIO);
- bio->bi_end_io = z_erofs_decompressqueue_endio;
-
- last_bdev = mdev.m_bdev;
- bio->bi_iter.bi_sector = (sector_t)cur <<
- (sb->s_blocksize_bits - 9);
+ if (erofs_is_fileio_mode(EROFS_SB(sb)))
+ bio = erofs_fileio_bio_alloc(&mdev);
+ else if (erofs_is_fscache_mode(sb))
+ bio = erofs_fscache_bio_alloc(&mdev);
+ else
+ bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
+ REQ_OP_READ, GFP_NOIO);
+ bio->bi_end_io = z_erofs_endio;
+ bio->bi_iter.bi_sector = cur >> 9;
bio->bi_private = q[JQ_SUBMIT];
if (readahead)
bio->bi_opf |= REQ_RAHEAD;
++nr_bios;
}
- if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
- goto submit_bio_retry;
-
- last_index = cur;
+ if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
+ bvec.bv_offset))
+ goto drain_io;
+ last_pa = cur + bvec.bv_len;
bypass = false;
- } while (++cur < end);
+ } while ((cur += bvec.bv_len) < end);
if (!bypass)
qtail[JQ_SUBMIT] = &pcl->next;
else
- move_to_bypass_jobqueue(pcl, qtail, owned_head);
- } while (owned_head != Z_EROFS_PCLUSTER_TAIL);
+ z_erofs_move_to_bypass_queue(pcl, next, qtail);
+ } while (next != Z_EROFS_PCLUSTER_TAIL);
if (bio) {
- submit_bio(bio);
- if (memstall)
- psi_memstall_leave(&pflags);
+ if (erofs_is_fileio_mode(EROFS_SB(sb)))
+ erofs_fileio_submit_bio(bio);
+ else if (erofs_is_fscache_mode(sb))
+ erofs_fscache_submit_bio(bio);
+ else
+ submit_bio(bio);
}
+ if (memstall)
+ psi_memstall_leave(&pflags);
/*
* although background is preferred, no one is pending for submission.
@@ -1744,33 +1763,34 @@ submit_bio_retry:
z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios);
}
-static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
- bool force_fg, bool ra)
+static int z_erofs_runqueue(struct z_erofs_frontend *f, unsigned int rapages)
{
struct z_erofs_decompressqueue io[NR_JOBQUEUES];
+ struct erofs_sb_info *sbi = EROFS_I_SB(f->inode);
+ bool force_fg = z_erofs_is_sync_decompress(sbi, rapages);
+ int err;
- if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
- return;
- z_erofs_submit_queue(f, io, &force_fg, ra);
+ if (f->head == Z_EROFS_PCLUSTER_TAIL)
+ return 0;
+ z_erofs_submit_queue(f, io, &force_fg, !!rapages);
/* handle bypass queue (no i/o pclusters) immediately */
- z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool);
-
+ err = z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool);
if (!force_fg)
- return;
+ return err;
/* wait until all bios are completed */
wait_for_completion_io(&io[JQ_SUBMIT].u.done);
/* handle synchronous decompress queue in the caller context */
- z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool);
+ return z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool) ?: err;
}
/*
* Since partial uptodate is still unimplemented for now, we have to use
* approximate readmore strategies as a start.
*/
-static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
+static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f,
struct readahead_control *rac, bool backmost)
{
struct inode *inode = f->inode;
@@ -1798,7 +1818,6 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
end = round_up(end, PAGE_SIZE);
} else {
end = round_up(map->m_la, PAGE_SIZE);
-
if (!map->m_llen)
return;
}
@@ -1806,15 +1825,15 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
cur = map->m_la + map->m_llen - 1;
while ((cur >= end) && (cur < i_size_read(inode))) {
pgoff_t index = cur >> PAGE_SHIFT;
- struct page *page;
+ struct folio *folio;
- page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
- if (page) {
- if (PageUptodate(page))
- unlock_page(page);
+ folio = erofs_grab_folio_nowait(inode->i_mapping, index);
+ if (!IS_ERR_OR_NULL(folio)) {
+ if (folio_test_uptodate(folio))
+ folio_unlock(folio);
else
- (void)z_erofs_do_read_page(f, page);
- put_page(page);
+ z_erofs_scan_folio(f, folio, !!rac);
+ folio_put(folio);
}
if (cur < PAGE_SIZE)
@@ -1826,21 +1845,17 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
static int z_erofs_read_folio(struct file *file, struct folio *folio)
{
struct inode *const inode = folio->mapping->host;
- struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
- struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
+ Z_EROFS_DEFINE_FRONTEND(f, inode, folio_pos(folio));
int err;
trace_erofs_read_folio(folio, false);
- f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
-
z_erofs_pcluster_readmore(&f, NULL, true);
- err = z_erofs_do_read_page(&f, &folio->page);
+ err = z_erofs_scan_folio(&f, folio, false);
z_erofs_pcluster_readmore(&f, NULL, false);
z_erofs_pcluster_end(&f);
- /* if some compressed cluster ready, need submit them anyway */
- z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false);
-
+ /* if some pclusters are ready, need submit them anyway */
+ err = z_erofs_runqueue(&f, 0) ?: err;
if (err && err != -EINTR)
erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu",
err, folio->index, EROFS_I(inode)->nid);
@@ -1853,18 +1868,13 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
static void z_erofs_readahead(struct readahead_control *rac)
{
struct inode *const inode = rac->mapping->host;
- struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
- struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
+ Z_EROFS_DEFINE_FRONTEND(f, inode, readahead_pos(rac));
+ unsigned int nrpages = readahead_count(rac);
struct folio *head = NULL, *folio;
- unsigned int nr_folios;
int err;
- f.headoffset = readahead_pos(rac);
-
+ trace_erofs_readahead(inode, readahead_index(rac), nrpages, false);
z_erofs_pcluster_readmore(&f, rac, true);
- nr_folios = readahead_count(rac);
- trace_erofs_readpages(inode, readahead_index(rac), nr_folios, false);
-
while ((folio = readahead_folio(rac))) {
folio->private = head;
head = folio;
@@ -1875,7 +1885,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
folio = head;
head = folio_get_private(folio);
- err = z_erofs_do_read_page(&f, &folio->page);
+ err = z_erofs_scan_folio(&f, folio, true);
if (err && err != -EINTR)
erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
folio->index, EROFS_I(inode)->nid);
@@ -1883,7 +1893,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
z_erofs_pcluster_readmore(&f, rac, false);
z_erofs_pcluster_end(&f);
- z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_folios), true);
+ (void)z_erofs_runqueue(&f, nrpages);
erofs_put_metabuf(&f.map.buf);
erofs_release_pages(&f.pagepool);
}
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 76566c2cbf63..25a4b82c183c 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -4,14 +4,12 @@
* https://www.huawei.com/
*/
#include "internal.h"
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <trace/events/erofs.h>
struct z_erofs_maprecorder {
struct inode *inode;
struct erofs_map_blocks *map;
- void *kaddr;
-
unsigned long lcn;
/* compression extent information gathered */
u8 type, headtype;
@@ -31,22 +29,17 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
vi->inode_isize + vi->xattr_isize) +
lcn * sizeof(struct z_erofs_lcluster_index);
struct z_erofs_lcluster_index *di;
- unsigned int advise, type;
-
- m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
- erofs_blknr(inode->i_sb, pos), EROFS_KMAP);
- if (IS_ERR(m->kaddr))
- return PTR_ERR(m->kaddr);
+ unsigned int advise;
- m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index);
+ di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, EROFS_KMAP);
+ if (IS_ERR(di))
+ return PTR_ERR(di);
m->lcn = lcn;
- di = m->kaddr + erofs_blkoff(inode->i_sb, pos);
+ m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index);
advise = le16_to_cpu(di->di_advise);
- type = (advise >> Z_EROFS_LI_LCLUSTER_TYPE_BIT) &
- ((1 << Z_EROFS_LI_LCLUSTER_TYPE_BITS) - 1);
- switch (type) {
- case Z_EROFS_LCLUSTER_TYPE_NONHEAD:
+ m->type = advise & Z_EROFS_LI_LCLUSTER_TYPE_MASK;
+ if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
m->clusterofs = 1 << vi->z_logical_clusterbits;
m->delta[0] = le16_to_cpu(di->di_u.delta[0]);
if (m->delta[0] & Z_EROFS_LI_D0_CBLKCNT) {
@@ -55,29 +48,19 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
DBG_BUGON(1);
return -EFSCORRUPTED;
}
- m->compressedblks = m->delta[0] &
- ~Z_EROFS_LI_D0_CBLKCNT;
+ m->compressedblks = m->delta[0] & ~Z_EROFS_LI_D0_CBLKCNT;
m->delta[0] = 1;
}
m->delta[1] = le16_to_cpu(di->di_u.delta[1]);
- break;
- case Z_EROFS_LCLUSTER_TYPE_PLAIN:
- case Z_EROFS_LCLUSTER_TYPE_HEAD1:
- case Z_EROFS_LCLUSTER_TYPE_HEAD2:
- if (advise & Z_EROFS_LI_PARTIAL_REF)
- m->partialref = true;
+ } else {
+ m->partialref = !!(advise & Z_EROFS_LI_PARTIAL_REF);
m->clusterofs = le16_to_cpu(di->di_clusterofs);
if (m->clusterofs >= 1 << vi->z_logical_clusterbits) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
m->pblk = le32_to_cpu(di->di_u.blkaddr);
- break;
- default:
- DBG_BUGON(1);
- return -EOPNOTSUPP;
}
- m->type = type;
return 0;
}
@@ -114,17 +97,48 @@ static int get_compacted_la_distance(unsigned int lobits,
return d1;
}
-static int unpack_compacted_index(struct z_erofs_maprecorder *m,
- unsigned int amortizedshift,
- erofs_off_t pos, bool lookahead)
+static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
+ unsigned long lcn, bool lookahead)
{
- struct erofs_inode *const vi = EROFS_I(m->inode);
+ struct inode *const inode = m->inode;
+ struct erofs_inode *const vi = EROFS_I(inode);
+ const erofs_off_t ebase = sizeof(struct z_erofs_map_header) +
+ ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8);
const unsigned int lclusterbits = vi->z_logical_clusterbits;
- unsigned int vcnt, base, lo, lobits, encodebits, nblk, eofs;
- int i;
+ const unsigned int totalidx = erofs_iblks(inode);
+ unsigned int compacted_4b_initial, compacted_2b, amortizedshift;
+ unsigned int vcnt, lo, lobits, encodebits, nblk, bytes;
+ bool big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1;
+ erofs_off_t pos;
u8 *in, type;
- bool big_pcluster;
+ int i;
+
+ if (lcn >= totalidx || lclusterbits > 14)
+ return -EINVAL;
+
+ m->lcn = lcn;
+ /* used to align to 32-byte (compacted_2b) alignment */
+ compacted_4b_initial = ((32 - ebase % 32) / 4) & 7;
+ compacted_2b = 0;
+ if ((vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) &&
+ compacted_4b_initial < totalidx)
+ compacted_2b = rounddown(totalidx - compacted_4b_initial, 16);
+
+ pos = ebase;
+ amortizedshift = 2; /* compact_4b */
+ if (lcn >= compacted_4b_initial) {
+ pos += compacted_4b_initial * 4;
+ lcn -= compacted_4b_initial;
+ if (lcn < compacted_2b) {
+ amortizedshift = 1;
+ } else {
+ pos += compacted_2b * 2;
+ lcn -= compacted_2b;
+ }
+ }
+ pos += lcn * (1 << amortizedshift);
+ /* figure out the lcluster count in this pack */
if (1 << amortizedshift == 4 && lclusterbits <= 14)
vcnt = 2;
else if (1 << amortizedshift == 2 && lclusterbits <= 12)
@@ -132,17 +146,18 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
else
return -EOPNOTSUPP;
+ in = erofs_read_metabuf(&m->map->buf, m->inode->i_sb, pos, EROFS_KMAP);
+ if (IS_ERR(in))
+ return PTR_ERR(in);
+
/* it doesn't equal to round_up(..) */
m->nextpackoff = round_down(pos, vcnt << amortizedshift) +
(vcnt << amortizedshift);
- big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1;
lobits = max(lclusterbits, ilog2(Z_EROFS_LI_D0_CBLKCNT) + 1U);
encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt;
- eofs = erofs_blkoff(m->inode->i_sb, pos);
- base = round_down(eofs, vcnt << amortizedshift);
- in = m->kaddr + base;
-
- i = (eofs - base) >> amortizedshift;
+ bytes = pos & ((vcnt << amortizedshift) - 1);
+ in -= bytes;
+ i = bytes >> amortizedshift;
lo = decode_compactedbits(lobits, in, encodebits * i, &type);
m->type = type;
@@ -222,57 +237,6 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
return 0;
}
-static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
- unsigned long lcn, bool lookahead)
-{
- struct inode *const inode = m->inode;
- struct erofs_inode *const vi = EROFS_I(inode);
- const erofs_off_t ebase = sizeof(struct z_erofs_map_header) +
- ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8);
- unsigned int totalidx = erofs_iblks(inode);
- unsigned int compacted_4b_initial, compacted_2b;
- unsigned int amortizedshift;
- erofs_off_t pos;
-
- if (lcn >= totalidx || vi->z_logical_clusterbits > 14)
- return -EINVAL;
-
- m->lcn = lcn;
- /* used to align to 32-byte (compacted_2b) alignment */
- compacted_4b_initial = (32 - ebase % 32) / 4;
- if (compacted_4b_initial == 32 / 4)
- compacted_4b_initial = 0;
-
- if ((vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) &&
- compacted_4b_initial < totalidx)
- compacted_2b = rounddown(totalidx - compacted_4b_initial, 16);
- else
- compacted_2b = 0;
-
- pos = ebase;
- if (lcn < compacted_4b_initial) {
- amortizedshift = 2;
- goto out;
- }
- pos += compacted_4b_initial * 4;
- lcn -= compacted_4b_initial;
-
- if (lcn < compacted_2b) {
- amortizedshift = 1;
- goto out;
- }
- pos += compacted_2b * 2;
- lcn -= compacted_2b;
- amortizedshift = 2;
-out:
- pos += lcn * (1 << amortizedshift);
- m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
- erofs_blknr(inode->i_sb, pos), EROFS_KMAP);
- if (IS_ERR(m->kaddr))
- return PTR_ERR(m->kaddr);
- return unpack_compacted_index(m, amortizedshift, pos, lookahead);
-}
-
static int z_erofs_load_lcluster_from_disk(struct z_erofs_maprecorder *m,
unsigned int lcn, bool lookahead)
{
@@ -301,26 +265,22 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
if (err)
return err;
- switch (m->type) {
- case Z_EROFS_LCLUSTER_TYPE_NONHEAD:
+ if (m->type >= Z_EROFS_LCLUSTER_TYPE_MAX) {
+ erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu",
+ m->type, lcn, vi->nid);
+ DBG_BUGON(1);
+ return -EOPNOTSUPP;
+ } else if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
lookback_distance = m->delta[0];
if (!lookback_distance)
- goto err_bogus;
+ break;
continue;
- case Z_EROFS_LCLUSTER_TYPE_PLAIN:
- case Z_EROFS_LCLUSTER_TYPE_HEAD1:
- case Z_EROFS_LCLUSTER_TYPE_HEAD2:
+ } else {
m->headtype = m->type;
m->map->m_la = (lcn << lclusterbits) | m->clusterofs;
return 0;
- default:
- erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu",
- m->type, lcn, vi->nid);
- DBG_BUGON(1);
- return -EOPNOTSUPP;
}
}
-err_bogus:
erofs_err(sb, "bogus lookback distance %u @ lcn %lu of nid %llu",
lookback_distance, m->lcn, vi->nid);
DBG_BUGON(1);
@@ -330,27 +290,23 @@ err_bogus:
static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
unsigned int initial_lcn)
{
- struct super_block *sb = m->inode->i_sb;
- struct erofs_inode *const vi = EROFS_I(m->inode);
- struct erofs_map_blocks *const map = m->map;
- const unsigned int lclusterbits = vi->z_logical_clusterbits;
- unsigned long lcn;
+ struct inode *inode = m->inode;
+ struct super_block *sb = inode->i_sb;
+ struct erofs_inode *vi = EROFS_I(inode);
+ bool bigpcl1 = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1;
+ bool bigpcl2 = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2;
+ unsigned long lcn = m->lcn + 1;
int err;
- DBG_BUGON(m->type != Z_EROFS_LCLUSTER_TYPE_PLAIN &&
- m->type != Z_EROFS_LCLUSTER_TYPE_HEAD1 &&
- m->type != Z_EROFS_LCLUSTER_TYPE_HEAD2);
+ DBG_BUGON(m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD);
DBG_BUGON(m->type != m->headtype);
- if (m->headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN ||
- ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD1) &&
- !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) ||
- ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) &&
- !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
- map->m_plen = 1ULL << lclusterbits;
- return 0;
- }
- lcn = m->lcn + 1;
+ if ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD1 && !bigpcl1) ||
+ ((m->headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN ||
+ m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) && !bigpcl2) ||
+ (lcn << vi->z_logical_clusterbits) >= inode->i_size)
+ m->compressedblks = 1;
+
if (m->compressedblks)
goto out;
@@ -369,35 +325,28 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
DBG_BUGON(lcn == initial_lcn &&
m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD);
- switch (m->type) {
- case Z_EROFS_LCLUSTER_TYPE_PLAIN:
- case Z_EROFS_LCLUSTER_TYPE_HEAD1:
- case Z_EROFS_LCLUSTER_TYPE_HEAD2:
+ if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
+ if (m->delta[0] != 1) {
+ erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ if (m->compressedblks)
+ goto out;
+ } else if (m->type < Z_EROFS_LCLUSTER_TYPE_MAX) {
/*
* if the 1st NONHEAD lcluster is actually PLAIN or HEAD type
- * rather than CBLKCNT, it's a 1 lcluster-sized pcluster.
+ * rather than CBLKCNT, it's a 1 block-sized pcluster.
*/
- m->compressedblks = 1 << (lclusterbits - sb->s_blocksize_bits);
- break;
- case Z_EROFS_LCLUSTER_TYPE_NONHEAD:
- if (m->delta[0] != 1)
- goto err_bonus_cblkcnt;
- if (m->compressedblks)
- break;
- fallthrough;
- default:
- erofs_err(sb, "cannot found CBLKCNT @ lcn %lu of nid %llu", lcn,
- vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
+ m->compressedblks = 1;
+ goto out;
}
-out:
- map->m_plen = erofs_pos(sb, m->compressedblks);
- return 0;
-err_bonus_cblkcnt:
- erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid);
+ erofs_err(sb, "cannot found CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid);
DBG_BUGON(1);
return -EFSCORRUPTED;
+out:
+ m->map->m_plen = erofs_pos(sb, m->compressedblks);
+ return 0;
}
static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
@@ -426,9 +375,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
m->delta[1] = 1;
DBG_BUGON(1);
}
- } else if (m->type == Z_EROFS_LCLUSTER_TYPE_PLAIN ||
- m->type == Z_EROFS_LCLUSTER_TYPE_HEAD1 ||
- m->type == Z_EROFS_LCLUSTER_TYPE_HEAD2) {
+ } else if (m->type < Z_EROFS_LCLUSTER_TYPE_MAX) {
if (lcn != headlcn)
break; /* ends at the next HEAD lcluster */
m->delta[1] = 1;
@@ -447,9 +394,10 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
static int z_erofs_do_map_blocks(struct inode *inode,
struct erofs_map_blocks *map, int flags)
{
- struct erofs_inode *const vi = EROFS_I(inode);
- bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER;
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct super_block *sb = inode->i_sb;
bool fragment = vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER;
+ bool ztailpacking = vi->z_idata_size;
struct z_erofs_maprecorder m = {
.inode = inode,
.map = map,
@@ -468,9 +416,8 @@ static int z_erofs_do_map_blocks(struct inode *inode,
if (err)
goto unmap_out;
- if (ztailpacking && (flags & EROFS_GET_BLOCKS_FINDTAIL))
- vi->z_idataoff = m.nextpackoff;
-
+ if ((flags & EROFS_GET_BLOCKS_FINDTAIL) && ztailpacking)
+ vi->z_fragmentoff = m.nextpackoff;
map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED;
end = (m.lcn + 1ULL) << lclusterbits;
@@ -492,8 +439,7 @@ static int z_erofs_do_map_blocks(struct inode *inode,
}
/* m.lcn should be >= 1 if endoff < m.clusterofs */
if (!m.lcn) {
- erofs_err(inode->i_sb,
- "invalid logical cluster 0 at nid %llu",
+ erofs_err(sb, "invalid logical cluster 0 at nid %llu",
vi->nid);
err = -EFSCORRUPTED;
goto unmap_out;
@@ -509,8 +455,7 @@ static int z_erofs_do_map_blocks(struct inode *inode,
goto unmap_out;
break;
default:
- erofs_err(inode->i_sb,
- "unknown type %u @ offset %llu of nid %llu",
+ erofs_err(sb, "unknown type %u @ offset %llu of nid %llu",
m.type, ofs, vi->nid);
err = -EOPNOTSUPP;
goto unmap_out;
@@ -527,12 +472,18 @@ static int z_erofs_do_map_blocks(struct inode *inode,
}
if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) {
map->m_flags |= EROFS_MAP_META;
- map->m_pa = vi->z_idataoff;
+ map->m_pa = vi->z_fragmentoff;
map->m_plen = vi->z_idata_size;
+ if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) {
+ erofs_err(sb, "invalid tail-packing pclustersize %llu",
+ map->m_plen);
+ err = -EFSCORRUPTED;
+ goto unmap_out;
+ }
} else if (fragment && m.lcn == vi->z_tailextent_headlcn) {
- map->m_flags |= EROFS_MAP_FRAGMENT;
+ map->m_flags = EROFS_MAP_FRAGMENT;
} else {
- map->m_pa = erofs_pos(inode->i_sb, m.pblk);
+ map->m_pa = erofs_pos(sb, m.pblk);
err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
if (err)
goto unmap_out;
@@ -551,7 +502,7 @@ static int z_erofs_do_map_blocks(struct inode *inode,
afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ?
vi->z_algorithmtype[1] : vi->z_algorithmtype[0];
if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) {
- erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu",
+ erofs_err(sb, "inconsistent algorithmtype %u for nid %llu",
afmt, vi->nid);
err = -EFSCORRUPTED;
goto unmap_out;
@@ -562,7 +513,8 @@ static int z_erofs_do_map_blocks(struct inode *inode,
if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
((flags & EROFS_GET_BLOCKS_READMORE) &&
(map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA ||
- map->m_algorithmformat == Z_EROFS_COMPRESSION_DEFLATE) &&
+ map->m_algorithmformat == Z_EROFS_COMPRESSION_DEFLATE ||
+ map->m_algorithmformat == Z_EROFS_COMPRESSION_ZSTD) &&
map->m_llen >= i_blocksize(inode))) {
err = z_erofs_get_extent_decompressedlen(&m);
if (!err)
@@ -581,7 +533,6 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
int err, headnr;
erofs_off_t pos;
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- void *kaddr;
struct z_erofs_map_header *h;
if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) {
@@ -601,13 +552,12 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
goto out_unlock;
pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8);
- kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(sb, pos), EROFS_KMAP);
- if (IS_ERR(kaddr)) {
- err = PTR_ERR(kaddr);
+ h = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP);
+ if (IS_ERR(h)) {
+ err = PTR_ERR(h);
goto out_unlock;
}
- h = kaddr + erofs_blkoff(sb, pos);
/*
* if the highest bit of the 8-byte map header is set, the whole file
* is stored in the packed inode. The rest bits keeps z_fragmentoff.
@@ -621,6 +571,10 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
vi->z_advise = le16_to_cpu(h->h_advise);
vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
+ if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)
+ vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff);
+ else if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER)
+ vi->z_idata_size = le16_to_cpu(h->h_idata_size);
headnr = 0;
if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX ||
@@ -649,33 +603,12 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
goto out_put_metabuf;
}
- if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) {
- struct erofs_map_blocks map = {
- .buf = __EROFS_BUF_INITIALIZER
- };
-
- vi->z_idata_size = le16_to_cpu(h->h_idata_size);
- err = z_erofs_do_map_blocks(inode, &map,
- EROFS_GET_BLOCKS_FINDTAIL);
- erofs_put_metabuf(&map.buf);
-
- if (!map.m_plen ||
- erofs_blkoff(sb, map.m_pa) + map.m_plen > sb->s_blocksize) {
- erofs_err(sb, "invalid tail-packing pclustersize %llu",
- map.m_plen);
- err = -EFSCORRUPTED;
- }
- if (err < 0)
- goto out_put_metabuf;
- }
-
- if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER &&
- !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) {
+ if (vi->z_idata_size ||
+ (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) {
struct erofs_map_blocks map = {
.buf = __EROFS_BUF_INITIALIZER
};
- vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff);
err = z_erofs_do_map_blocks(inode, &map,
EROFS_GET_BLOCKS_FINDTAIL);
erofs_put_metabuf(&map.buf);
@@ -699,34 +632,31 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
struct erofs_inode *const vi = EROFS_I(inode);
int err = 0;
- trace_z_erofs_map_blocks_iter_enter(inode, map, flags);
-
- /* when trying to read beyond EOF, leave it unmapped */
- if (map->m_la >= inode->i_size) {
+ trace_erofs_map_blocks_enter(inode, map, flags);
+ if (map->m_la >= inode->i_size) { /* post-EOF unmapped extent */
map->m_llen = map->m_la + 1 - inode->i_size;
map->m_la = inode->i_size;
map->m_flags = 0;
- goto out;
- }
-
- err = z_erofs_fill_inode_lazy(inode);
- if (err)
- goto out;
-
- if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) &&
- !vi->z_tailextent_headlcn) {
- map->m_la = 0;
- map->m_llen = inode->i_size;
- map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_FULL_MAPPED |
- EROFS_MAP_FRAGMENT;
- goto out;
+ } else {
+ err = z_erofs_fill_inode_lazy(inode);
+ if (!err) {
+ if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) &&
+ !vi->z_tailextent_headlcn) {
+ map->m_la = 0;
+ map->m_llen = inode->i_size;
+ map->m_flags = EROFS_MAP_FRAGMENT;
+ } else {
+ err = z_erofs_do_map_blocks(inode, map, flags);
+ }
+ }
+ if (!err && (map->m_flags & EROFS_MAP_ENCODED) &&
+ unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE ||
+ map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE))
+ err = -EOPNOTSUPP;
+ if (err)
+ map->m_llen = 0;
}
-
- err = z_erofs_do_map_blocks(inode, map, flags);
-out:
- if (err)
- map->m_llen = 0;
- trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err);
+ trace_erofs_map_blocks_exit(inode, map, flags, err);
return err;
}
@@ -747,7 +677,7 @@ static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset,
iomap->length = map.m_llen;
if (map.m_flags & EROFS_MAP_MAPPED) {
iomap->type = IOMAP_MAPPED;
- iomap->addr = map.m_flags & EROFS_MAP_FRAGMENT ?
+ iomap->addr = map.m_flags & __EROFS_MAP_FRAGMENT ?
IOMAP_NULL_ADDR : map.m_pa;
} else {
iomap->type = IOMAP_HOLE;
diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c
new file mode 100644
index 000000000000..0dd65cefce33
--- /dev/null
+++ b/fs/erofs/zutil.c
@@ -0,0 +1,317 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2018 HUAWEI, Inc.
+ * https://www.huawei.com/
+ * Copyright (C) 2024 Alibaba Cloud
+ */
+#include "internal.h"
+
+struct z_erofs_gbuf {
+ spinlock_t lock;
+ void *ptr;
+ struct page **pages;
+ unsigned int nrpages;
+};
+
+static struct z_erofs_gbuf *z_erofs_gbufpool, *z_erofs_rsvbuf;
+static unsigned int z_erofs_gbuf_count, z_erofs_gbuf_nrpages,
+ z_erofs_rsv_nrpages;
+
+module_param_named(global_buffers, z_erofs_gbuf_count, uint, 0444);
+module_param_named(reserved_pages, z_erofs_rsv_nrpages, uint, 0444);
+
+atomic_long_t erofs_global_shrink_cnt; /* for all mounted instances */
+
+/* protects `erofs_sb_list_lock` and the mounted `erofs_sb_list` */
+static DEFINE_SPINLOCK(erofs_sb_list_lock);
+static LIST_HEAD(erofs_sb_list);
+static unsigned int shrinker_run_no;
+static struct shrinker *erofs_shrinker_info;
+
+static unsigned int z_erofs_gbuf_id(void)
+{
+ return raw_smp_processor_id() % z_erofs_gbuf_count;
+}
+
+void *z_erofs_get_gbuf(unsigned int requiredpages)
+ __acquires(gbuf->lock)
+{
+ struct z_erofs_gbuf *gbuf;
+
+ migrate_disable();
+ gbuf = &z_erofs_gbufpool[z_erofs_gbuf_id()];
+ spin_lock(&gbuf->lock);
+ /* check if the buffer is too small */
+ if (requiredpages > gbuf->nrpages) {
+ spin_unlock(&gbuf->lock);
+ migrate_enable();
+ /* (for sparse checker) pretend gbuf->lock is still taken */
+ __acquire(gbuf->lock);
+ return NULL;
+ }
+ return gbuf->ptr;
+}
+
+void z_erofs_put_gbuf(void *ptr) __releases(gbuf->lock)
+{
+ struct z_erofs_gbuf *gbuf;
+
+ gbuf = &z_erofs_gbufpool[z_erofs_gbuf_id()];
+ DBG_BUGON(gbuf->ptr != ptr);
+ spin_unlock(&gbuf->lock);
+ migrate_enable();
+}
+
+int z_erofs_gbuf_growsize(unsigned int nrpages)
+{
+ static DEFINE_MUTEX(gbuf_resize_mutex);
+ struct page **tmp_pages = NULL;
+ struct z_erofs_gbuf *gbuf;
+ void *ptr, *old_ptr;
+ int last, i, j;
+
+ mutex_lock(&gbuf_resize_mutex);
+ /* avoid shrinking gbufs, since no idea how many fses rely on */
+ if (nrpages <= z_erofs_gbuf_nrpages) {
+ mutex_unlock(&gbuf_resize_mutex);
+ return 0;
+ }
+
+ for (i = 0; i < z_erofs_gbuf_count; ++i) {
+ gbuf = &z_erofs_gbufpool[i];
+ tmp_pages = kcalloc(nrpages, sizeof(*tmp_pages), GFP_KERNEL);
+ if (!tmp_pages)
+ goto out;
+
+ for (j = 0; j < gbuf->nrpages; ++j)
+ tmp_pages[j] = gbuf->pages[j];
+ do {
+ last = j;
+ j = alloc_pages_bulk_array(GFP_KERNEL, nrpages,
+ tmp_pages);
+ if (last == j)
+ goto out;
+ } while (j != nrpages);
+
+ ptr = vmap(tmp_pages, nrpages, VM_MAP, PAGE_KERNEL);
+ if (!ptr)
+ goto out;
+
+ spin_lock(&gbuf->lock);
+ kfree(gbuf->pages);
+ gbuf->pages = tmp_pages;
+ old_ptr = gbuf->ptr;
+ gbuf->ptr = ptr;
+ gbuf->nrpages = nrpages;
+ spin_unlock(&gbuf->lock);
+ if (old_ptr)
+ vunmap(old_ptr);
+ }
+ z_erofs_gbuf_nrpages = nrpages;
+out:
+ if (i < z_erofs_gbuf_count && tmp_pages) {
+ for (j = 0; j < nrpages; ++j)
+ if (tmp_pages[j] && (j >= gbuf->nrpages ||
+ tmp_pages[j] != gbuf->pages[j]))
+ __free_page(tmp_pages[j]);
+ kfree(tmp_pages);
+ }
+ mutex_unlock(&gbuf_resize_mutex);
+ return i < z_erofs_gbuf_count ? -ENOMEM : 0;
+}
+
+int __init z_erofs_gbuf_init(void)
+{
+ unsigned int i, total = num_possible_cpus();
+
+ if (z_erofs_gbuf_count)
+ total = min(z_erofs_gbuf_count, total);
+ z_erofs_gbuf_count = total;
+
+ /* The last (special) global buffer is the reserved buffer */
+ total += !!z_erofs_rsv_nrpages;
+
+ z_erofs_gbufpool = kcalloc(total, sizeof(*z_erofs_gbufpool),
+ GFP_KERNEL);
+ if (!z_erofs_gbufpool)
+ return -ENOMEM;
+
+ if (z_erofs_rsv_nrpages) {
+ z_erofs_rsvbuf = &z_erofs_gbufpool[total - 1];
+ z_erofs_rsvbuf->pages = kcalloc(z_erofs_rsv_nrpages,
+ sizeof(*z_erofs_rsvbuf->pages), GFP_KERNEL);
+ if (!z_erofs_rsvbuf->pages) {
+ z_erofs_rsvbuf = NULL;
+ z_erofs_rsv_nrpages = 0;
+ }
+ }
+ for (i = 0; i < total; ++i)
+ spin_lock_init(&z_erofs_gbufpool[i].lock);
+ return 0;
+}
+
+void z_erofs_gbuf_exit(void)
+{
+ int i, j;
+
+ for (i = 0; i < z_erofs_gbuf_count + (!!z_erofs_rsvbuf); ++i) {
+ struct z_erofs_gbuf *gbuf = &z_erofs_gbufpool[i];
+
+ if (gbuf->ptr) {
+ vunmap(gbuf->ptr);
+ gbuf->ptr = NULL;
+ }
+
+ if (!gbuf->pages)
+ continue;
+
+ for (j = 0; j < gbuf->nrpages; ++j)
+ if (gbuf->pages[j])
+ put_page(gbuf->pages[j]);
+ kfree(gbuf->pages);
+ gbuf->pages = NULL;
+ }
+ kfree(z_erofs_gbufpool);
+}
+
+struct page *__erofs_allocpage(struct page **pagepool, gfp_t gfp, bool tryrsv)
+{
+ struct page *page = *pagepool;
+
+ if (page) {
+ *pagepool = (struct page *)page_private(page);
+ } else if (tryrsv && z_erofs_rsvbuf && z_erofs_rsvbuf->nrpages) {
+ spin_lock(&z_erofs_rsvbuf->lock);
+ if (z_erofs_rsvbuf->nrpages)
+ page = z_erofs_rsvbuf->pages[--z_erofs_rsvbuf->nrpages];
+ spin_unlock(&z_erofs_rsvbuf->lock);
+ }
+ if (!page)
+ page = alloc_page(gfp);
+ DBG_BUGON(page && page_ref_count(page) != 1);
+ return page;
+}
+
+void erofs_release_pages(struct page **pagepool)
+{
+ while (*pagepool) {
+ struct page *page = *pagepool;
+
+ *pagepool = (struct page *)page_private(page);
+ /* try to fill reserved global pool first */
+ if (z_erofs_rsvbuf && z_erofs_rsvbuf->nrpages <
+ z_erofs_rsv_nrpages) {
+ spin_lock(&z_erofs_rsvbuf->lock);
+ if (z_erofs_rsvbuf->nrpages < z_erofs_rsv_nrpages) {
+ z_erofs_rsvbuf->pages[z_erofs_rsvbuf->nrpages++]
+ = page;
+ spin_unlock(&z_erofs_rsvbuf->lock);
+ continue;
+ }
+ spin_unlock(&z_erofs_rsvbuf->lock);
+ }
+ put_page(page);
+ }
+}
+
+void erofs_shrinker_register(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+ mutex_init(&sbi->umount_mutex);
+
+ spin_lock(&erofs_sb_list_lock);
+ list_add(&sbi->list, &erofs_sb_list);
+ spin_unlock(&erofs_sb_list_lock);
+}
+
+void erofs_shrinker_unregister(struct super_block *sb)
+{
+ struct erofs_sb_info *const sbi = EROFS_SB(sb);
+
+ mutex_lock(&sbi->umount_mutex);
+ while (!xa_empty(&sbi->managed_pslots)) {
+ z_erofs_shrink_scan(sbi, ~0UL);
+ cond_resched();
+ }
+ spin_lock(&erofs_sb_list_lock);
+ list_del(&sbi->list);
+ spin_unlock(&erofs_sb_list_lock);
+ mutex_unlock(&sbi->umount_mutex);
+}
+
+static unsigned long erofs_shrink_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ return atomic_long_read(&erofs_global_shrink_cnt);
+}
+
+static unsigned long erofs_shrink_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct erofs_sb_info *sbi;
+ struct list_head *p;
+
+ unsigned long nr = sc->nr_to_scan;
+ unsigned int run_no;
+ unsigned long freed = 0;
+
+ spin_lock(&erofs_sb_list_lock);
+ do {
+ run_no = ++shrinker_run_no;
+ } while (run_no == 0);
+
+ /* Iterate over all mounted superblocks and try to shrink them */
+ p = erofs_sb_list.next;
+ while (p != &erofs_sb_list) {
+ sbi = list_entry(p, struct erofs_sb_info, list);
+
+ /*
+ * We move the ones we do to the end of the list, so we stop
+ * when we see one we have already done.
+ */
+ if (sbi->shrinker_run_no == run_no)
+ break;
+
+ if (!mutex_trylock(&sbi->umount_mutex)) {
+ p = p->next;
+ continue;
+ }
+
+ spin_unlock(&erofs_sb_list_lock);
+ sbi->shrinker_run_no = run_no;
+ freed += z_erofs_shrink_scan(sbi, nr - freed);
+ spin_lock(&erofs_sb_list_lock);
+ /* Get the next list element before we move this one */
+ p = p->next;
+
+ /*
+ * Move this one to the end of the list to provide some
+ * fairness.
+ */
+ list_move_tail(&sbi->list, &erofs_sb_list);
+ mutex_unlock(&sbi->umount_mutex);
+
+ if (freed >= nr)
+ break;
+ }
+ spin_unlock(&erofs_sb_list_lock);
+ return freed;
+}
+
+int __init erofs_init_shrinker(void)
+{
+ erofs_shrinker_info = shrinker_alloc(0, "erofs-shrinker");
+ if (!erofs_shrinker_info)
+ return -ENOMEM;
+
+ erofs_shrinker_info->count_objects = erofs_shrink_count;
+ erofs_shrinker_info->scan_objects = erofs_shrink_scan;
+ shrinker_register(erofs_shrinker_info);
+ return 0;
+}
+
+void erofs_exit_shrinker(void)
+{
+ shrinker_free(erofs_shrinker_info);
+}