diff options
Diffstat (limited to 'fs/erofs')
-rw-r--r-- | fs/erofs/Kconfig | 53 | ||||
-rw-r--r-- | fs/erofs/Makefile | 6 | ||||
-rw-r--r-- | fs/erofs/compress.h | 67 | ||||
-rw-r--r-- | fs/erofs/data.c | 179 | ||||
-rw-r--r-- | fs/erofs/decompressor.c | 229 | ||||
-rw-r--r-- | fs/erofs/decompressor_deflate.c | 146 | ||||
-rw-r--r-- | fs/erofs/decompressor_lzma.c | 165 | ||||
-rw-r--r-- | fs/erofs/decompressor_zstd.c | 225 | ||||
-rw-r--r-- | fs/erofs/dir.c | 37 | ||||
-rw-r--r-- | fs/erofs/erofs_fs.h | 20 | ||||
-rw-r--r-- | fs/erofs/fileio.c | 197 | ||||
-rw-r--r-- | fs/erofs/fscache.c | 308 | ||||
-rw-r--r-- | fs/erofs/inode.c | 274 | ||||
-rw-r--r-- | fs/erofs/internal.h | 141 | ||||
-rw-r--r-- | fs/erofs/namei.c | 6 | ||||
-rw-r--r-- | fs/erofs/pcpubuf.c | 148 | ||||
-rw-r--r-- | fs/erofs/super.c | 369 | ||||
-rw-r--r-- | fs/erofs/sysfs.c | 30 | ||||
-rw-r--r-- | fs/erofs/utils.c | 282 | ||||
-rw-r--r-- | fs/erofs/xattr.c | 41 | ||||
-rw-r--r-- | fs/erofs/xattr.h | 4 | ||||
-rw-r--r-- | fs/erofs/zdata.c | 1232 | ||||
-rw-r--r-- | fs/erofs/zmap.c | 358 | ||||
-rw-r--r-- | fs/erofs/zutil.c | 317 |
24 files changed, 2592 insertions, 2242 deletions
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index f6dc961e6c2b..6ea60661fa55 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -21,7 +21,7 @@ config EROFS_FS performance under extremely memory pressure without extra cost. See the documentation at <file:Documentation/filesystems/erofs.rst> - for more details. + and the web pages at <https://erofs.docs.kernel.org> for more details. If unsure, say N. @@ -74,6 +74,23 @@ config EROFS_FS_SECURITY If you are not using a security module, say N. +config EROFS_FS_BACKED_BY_FILE + bool "File-backed EROFS filesystem support" + depends on EROFS_FS + default y + help + This allows EROFS to use filesystem image files directly, without + the intercession of loopback block devices or likewise. It is + particularly useful for container images with numerous blobs and + other sandboxes, where loop devices behave intricately. It can also + be used to simplify error-prone lifetime management of unnecessary + virtual block devices. + + Note that this feature, along with ongoing fanotify pre-content + hooks, will eventually replace "EROFS over fscache." + + If you don't want to enable this feature, say N. + config EROFS_FS_ZIP bool "EROFS Data Compression Support" depends on EROFS_FS @@ -91,13 +108,10 @@ config EROFS_FS_ZIP_LZMA select XZ_DEC_MICROLZMA help Saying Y here includes support for reading EROFS file systems - containing LZMA compressed data, specifically called microLZMA. it - gives better compression ratios than the LZ4 algorithm, at the + containing LZMA compressed data, specifically called microLZMA. It + gives better compression ratios than the default LZ4 format, at the expense of more CPU overhead. - LZMA support is an experimental feature for now and so most file - systems will be readable without selecting this option. - If unsure, say N. config EROFS_FS_ZIP_DEFLATE @@ -115,14 +129,35 @@ config EROFS_FS_ZIP_DEFLATE If unsure, say N. 
+config EROFS_FS_ZIP_ZSTD + bool "EROFS Zstandard compressed data support" + depends on EROFS_FS_ZIP + select ZSTD_DECOMPRESS + help + Saying Y here includes support for reading EROFS file systems + containing Zstandard compressed data. It gives better compression + ratios than the default LZ4 format, while it costs more CPU + overhead. + + Zstandard support is an experimental feature for now and so most + file systems will be readable without selecting this option. + + If unsure, say N. + config EROFS_FS_ONDEMAND - bool "EROFS fscache-based on-demand read support" - depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y) - default n + bool "EROFS fscache-based on-demand read support (deprecated)" + depends on EROFS_FS + select NETFS_SUPPORT + select FSCACHE + select CACHEFILES + select CACHEFILES_ONDEMAND help This permits EROFS to use fscache-backed data blobs with on-demand read support. + It is now deprecated and scheduled to be removed from the kernel + after fanotify pre-content hooks are landed. + If unsure, say N. 
config EROFS_FS_PCPU_KTHREAD diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index 994d0b9deddf..4331d53c7109 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -1,9 +1,11 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_EROFS_FS) += erofs.o -erofs-objs := super.o inode.o data.o namei.o dir.o utils.o sysfs.o +erofs-objs := super.o inode.o data.o namei.o dir.o sysfs.o erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o -erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o pcpubuf.o +erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o zutil.o erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o +erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o +erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 279933e007d2..7bfe251680ec 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -11,13 +11,12 @@ struct z_erofs_decompress_req { struct super_block *sb; struct page **in, **out; - unsigned short pageofs_in, pageofs_out; unsigned int inputsize, outputsize; - /* indicate the algorithm will be used for decompression */ - unsigned int alg; + unsigned int alg; /* the algorithm for decompression */ bool inplace_io, partial_decoding, fillgaps; + gfp_t gfp; /* allocation flags for extra temporary buffers */ }; struct z_erofs_decompressor { @@ -25,6 +24,8 @@ struct z_erofs_decompressor { void *data, int size); int (*decompress)(struct z_erofs_decompress_req *rq, struct page **pagepool); + int (*init)(void); + void (*exit)(void); char *name; }; @@ -53,17 +54,14 @@ struct z_erofs_decompressor { */ /* - * short-lived pages are pages directly from buddy system with specific - * page->private (no need to set PagePrivate since these are non-LRU / - * non-movable pages and bypass reclaim / migration code). 
+ * Currently, short-lived pages are pages directly from buddy system + * with specific page->private (Z_EROFS_SHORTLIVED_PAGE). + * In the future world of Memdescs, it should be type 0 (Misc) memory + * which type can be checked with a new helper. */ static inline bool z_erofs_is_shortlived_page(struct page *page) { - if (page->private != Z_EROFS_SHORTLIVED_PAGE) - return false; - - DBG_BUGON(page->mapping); - return true; + return page->private == Z_EROFS_SHORTLIVED_PAGE; } static inline bool z_erofs_put_shortlivedpage(struct page **pagepool, @@ -71,35 +69,32 @@ static inline bool z_erofs_put_shortlivedpage(struct page **pagepool, { if (!z_erofs_is_shortlived_page(page)) return false; - - /* short-lived pages should not be used by others at the same time */ - if (page_ref_count(page) > 1) { - put_page(page); - } else { - /* follow the pcluster rule above. */ - erofs_pagepool_add(pagepool, page); - } + erofs_pagepool_add(pagepool, page); return true; } -#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) -static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, - struct page *page) -{ - return page->mapping == MNGD_MAPPING(sbi); -} +extern const struct z_erofs_decompressor z_erofs_lzma_decomp; +extern const struct z_erofs_decompressor z_erofs_deflate_decomp; +extern const struct z_erofs_decompressor z_erofs_zstd_decomp; +extern const struct z_erofs_decompressor *z_erofs_decomp[]; + +struct z_erofs_stream_dctx { + struct z_erofs_decompress_req *rq; + unsigned int inpages, outpages; /* # of {en,de}coded pages */ + int no, ni; /* the current {en,de}coded page # */ + unsigned int avail_out; /* remaining bytes in the decoded buffer */ + unsigned int inbuf_pos, inbuf_sz; + /* current status of the encoded buffer */ + u8 *kin, *kout; /* buffer mapped pointers */ + void *bounce; /* bounce buffer for inplace I/Os */ + bool bounced; /* is the bounce buffer used now? 
*/ +}; + +int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst, + void **src, struct page **pgpl); int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, unsigned int padbufsize); -extern const struct z_erofs_decompressor erofs_decompressors[]; - -/* prototypes for specific algorithms */ -int z_erofs_load_lzma_config(struct super_block *sb, - struct erofs_super_block *dsb, void *data, int size); -int z_erofs_load_deflate_config(struct super_block *sb, - struct erofs_super_block *dsb, void *data, int size); -int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool); -int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool); +int __init z_erofs_init_decompressor(void); +void z_erofs_exit_decompressor(void); #endif diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 19ab9bb3a9a0..91182d5e3a66 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -5,9 +5,7 @@ * Copyright (C) 2021, Alibaba Cloud */ #include "internal.h" -#include <linux/prefetch.h> #include <linux/sched/mm.h> -#include <linux/dax.h> #include <trace/events/erofs.h> void erofs_unmap_metabuf(struct erofs_buf *buf) @@ -23,40 +21,32 @@ void erofs_put_metabuf(struct erofs_buf *buf) if (!buf->page) return; erofs_unmap_metabuf(buf); - put_page(buf->page); + folio_put(page_folio(buf->page)); buf->page = NULL; } -/* - * Derive the block size from inode->i_blkbits to make compatible with - * anonymous inode in fscache mode. 
- */ -void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr, +void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, enum erofs_kmap_type type) { - struct inode *inode = buf->inode; - erofs_off_t offset = (erofs_off_t)blkaddr << inode->i_blkbits; pgoff_t index = offset >> PAGE_SHIFT; - struct page *page = buf->page; - struct folio *folio; - unsigned int nofs_flag; + struct folio *folio = NULL; - if (!page || page->index != index) { + if (buf->page) { + folio = page_folio(buf->page); + if (folio_file_page(folio, index) != buf->page) + erofs_unmap_metabuf(buf); + } + if (!folio || !folio_contains(folio, index)) { erofs_put_metabuf(buf); - - nofs_flag = memalloc_nofs_save(); - folio = read_cache_folio(inode->i_mapping, index, NULL, NULL); - memalloc_nofs_restore(nofs_flag); + folio = read_mapping_folio(buf->mapping, index, buf->file); if (IS_ERR(folio)) return folio; - - /* should already be PageUptodate, no need to lock page */ - page = folio_file_page(folio, index); - buf->page = page; } + buf->page = folio_file_page(folio, index); + if (buf->kmap_type == EROFS_NO_KMAP) { if (type == EROFS_KMAP) - buf->base = kmap_local_page(page); + buf->base = kmap_local_page(buf->page); buf->kmap_type = type; } else if (buf->kmap_type != type) { DBG_BUGON(1); @@ -69,54 +59,50 @@ void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr, void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb) { - if (erofs_is_fscache_mode(sb)) - buf->inode = EROFS_SB(sb)->s_fscache->inode; + struct erofs_sb_info *sbi = EROFS_SB(sb); + + buf->file = NULL; + if (erofs_is_fileio_mode(sbi)) { + buf->file = sbi->dif0.file; /* some fs like FUSE needs it */ + buf->mapping = buf->file->f_mapping; + } else if (erofs_is_fscache_mode(sb)) + buf->mapping = sbi->dif0.fscache->inode->i_mapping; else - buf->inode = sb->s_bdev->bd_inode; + buf->mapping = sb->s_bdev->bd_mapping; } void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, - erofs_blk_t blkaddr, enum 
erofs_kmap_type type) + erofs_off_t offset, enum erofs_kmap_type type) { erofs_init_metabuf(buf, sb); - return erofs_bread(buf, blkaddr, type); + return erofs_bread(buf, offset, type); } static int erofs_map_blocks_flatmode(struct inode *inode, struct erofs_map_blocks *map) { - erofs_blk_t nblocks, lastblk; - u64 offset = map->m_la; struct erofs_inode *vi = EROFS_I(inode); struct super_block *sb = inode->i_sb; bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE); + erofs_blk_t lastblk = erofs_iblks(inode) - tailendpacking; - nblocks = erofs_iblks(inode); - lastblk = nblocks - tailendpacking; - - /* there is no hole in flatmode */ - map->m_flags = EROFS_MAP_MAPPED; - if (offset < erofs_pos(sb, lastblk)) { + map->m_flags = EROFS_MAP_MAPPED; /* no hole in flat inodes */ + if (map->m_la < erofs_pos(sb, lastblk)) { map->m_pa = erofs_pos(sb, vi->raw_blkaddr) + map->m_la; - map->m_plen = erofs_pos(sb, lastblk) - offset; - } else if (tailendpacking) { + map->m_plen = erofs_pos(sb, lastblk) - map->m_la; + } else { + DBG_BUGON(!tailendpacking); map->m_pa = erofs_iloc(inode) + vi->inode_isize + - vi->xattr_isize + erofs_blkoff(sb, offset); - map->m_plen = inode->i_size - offset; + vi->xattr_isize + erofs_blkoff(sb, map->m_la); + map->m_plen = inode->i_size - map->m_la; /* inline data should be located in the same meta block */ if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) { - erofs_err(sb, "inline data cross block boundary @ nid %llu", - vi->nid); + erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } map->m_flags |= EROFS_MAP_META; - } else { - erofs_err(sb, "internal error @ nid: %llu (size %llu), m_la 0x%llx", - vi->nid, inode->i_size, map->m_la); - DBG_BUGON(1); - return -EIO; } return 0; } @@ -138,7 +124,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) if (map->m_la >= inode->i_size) { /* leave out-of-bound access unmapped */ map->m_flags = 0; - map->m_plen = 0; + 
map->m_plen = map->m_llen; goto out; } @@ -156,7 +142,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, unit) + unit * chunknr; - kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(sb, pos), EROFS_KMAP); + kaddr = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP); if (IS_ERR(kaddr)) { err = PTR_ERR(kaddr); goto out; @@ -167,7 +153,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) /* handle block map */ if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) { - __le32 *blkaddr = kaddr + erofs_blkoff(sb, pos); + __le32 *blkaddr = kaddr; if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) { map->m_flags = 0; @@ -178,7 +164,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) goto out_unlock; } /* parse chunk indexes */ - idx = kaddr + erofs_blkoff(sb, pos); + idx = kaddr; switch (le32_to_cpu(idx->blkaddr)) { case EROFS_NULL_ADDR: map->m_flags = 0; @@ -199,17 +185,25 @@ out: return err; } +static void erofs_fill_from_devinfo(struct erofs_map_dev *map, + struct super_block *sb, struct erofs_device_info *dif) +{ + map->m_sb = sb; + map->m_dif = dif; + map->m_bdev = NULL; + if (dif->file && S_ISBLK(file_inode(dif->file)->i_mode)) + map->m_bdev = file_bdev(dif->file); +} + int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) { struct erofs_dev_context *devs = EROFS_SB(sb)->devs; struct erofs_device_info *dif; + erofs_off_t startoff, length; int id; - map->m_bdev = sb->s_bdev; - map->m_daxdev = EROFS_SB(sb)->dax_dev; - map->m_dax_part_off = EROFS_SB(sb)->dax_part_off; - map->m_fscache = EROFS_SB(sb)->s_fscache; - + erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0); + map->m_bdev = sb->s_bdev; /* use s_bdev for the primary device */ if (map->m_deviceid) { down_read(&devs->rwsem); dif = idr_find(&devs->tree, map->m_deviceid - 1); @@ -222,29 +216,20 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) 
up_read(&devs->rwsem); return 0; } - map->m_bdev = dif->bdev_handle ? dif->bdev_handle->bdev : NULL; - map->m_daxdev = dif->dax_dev; - map->m_dax_part_off = dif->dax_part_off; - map->m_fscache = dif->fscache; + erofs_fill_from_devinfo(map, sb, dif); up_read(&devs->rwsem); } else if (devs->extra_devices && !devs->flatdev) { down_read(&devs->rwsem); idr_for_each_entry(&devs->tree, dif, id) { - erofs_off_t startoff, length; - if (!dif->mapped_blkaddr) continue; + startoff = erofs_pos(sb, dif->mapped_blkaddr); length = erofs_pos(sb, dif->blocks); - if (map->m_pa >= startoff && map->m_pa < startoff + length) { map->m_pa -= startoff; - map->m_bdev = dif->bdev_handle ? - dif->bdev_handle->bdev : NULL; - map->m_daxdev = dif->dax_dev; - map->m_dax_part_off = dif->dax_part_off; - map->m_fscache = dif->fscache; + erofs_fill_from_devinfo(map, sb, dif); break; } } @@ -253,6 +238,48 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) return 0; } +/* + * bit 30: I/O error occurred on this folio + * bit 29: CPU has dirty data in D-cache (needs aliasing handling); + * bit 0 - 29: remaining parts to complete this folio + */ +#define EROFS_ONLINEFOLIO_EIO 30 +#define EROFS_ONLINEFOLIO_DIRTY 29 + +void erofs_onlinefolio_init(struct folio *folio) +{ + union { + atomic_t o; + void *v; + } u = { .o = ATOMIC_INIT(1) }; + + folio->private = u.v; /* valid only if file-backed folio is locked */ +} + +void erofs_onlinefolio_split(struct folio *folio) +{ + atomic_inc((atomic_t *)&folio->private); +} + +void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty) +{ + int orig, v; + + do { + orig = atomic_read((atomic_t *)&folio->private); + DBG_BUGON(orig <= 0); + v = dirty << EROFS_ONLINEFOLIO_DIRTY; + v |= (orig - 1) | (!!err << EROFS_ONLINEFOLIO_EIO); + } while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig); + + if (v & (BIT(EROFS_ONLINEFOLIO_DIRTY) - 1)) + return; + folio->private = 0; + if (v & BIT(EROFS_ONLINEFOLIO_DIRTY)) + 
flush_dcache_folio(folio); + folio_end_read(folio, !(v & BIT(EROFS_ONLINEFOLIO_EIO))); +} + static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { @@ -278,7 +305,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->offset = map.m_la; if (flags & IOMAP_DAX) - iomap->dax_dev = mdev.m_daxdev; + iomap->dax_dev = mdev.m_dif->dax_dev; else iomap->bdev = mdev.m_bdev; iomap->length = map.m_llen; @@ -298,17 +325,16 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, struct erofs_buf buf = __EROFS_BUF_INITIALIZER; iomap->type = IOMAP_INLINE; - ptr = erofs_read_metabuf(&buf, sb, - erofs_blknr(sb, mdev.m_pa), EROFS_KMAP); + ptr = erofs_read_metabuf(&buf, sb, mdev.m_pa, EROFS_KMAP); if (IS_ERR(ptr)) return PTR_ERR(ptr); - iomap->inline_data = ptr + erofs_blkoff(sb, mdev.m_pa); + iomap->inline_data = ptr; iomap->private = buf.base; } else { iomap->type = IOMAP_MAPPED; iomap->addr = mdev.m_pa; if (flags & IOMAP_DAX) - iomap->addr += mdev.m_dax_part_off; + iomap->addr += mdev.m_dif->dax_part_off; } return 0; } @@ -358,11 +384,16 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, */ static int erofs_read_folio(struct file *file, struct folio *folio) { + trace_erofs_read_folio(folio, true); + return iomap_read_folio(folio, &erofs_iomap_ops); } static void erofs_readahead(struct readahead_control *rac) { + trace_erofs_readahead(rac->mapping->host, readahead_index(rac), + readahead_count(rac), true); + return iomap_readahead(rac, &erofs_iomap_ops); } @@ -403,7 +434,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } /* for uncompressed (aligned) files and raw access for other files */ -const struct address_space_operations erofs_raw_access_aops = { +const struct address_space_operations erofs_aops = { .read_folio = erofs_read_folio, .readahead = erofs_readahead, .bmap = 
erofs_bmap, diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index aa59788a61e6..dc61a6a8f696 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -2,9 +2,9 @@ /* * Copyright (C) 2019 HUAWEI, Inc. * https://www.huawei.com/ + * Copyright (C) 2024 Alibaba Cloud */ #include "compress.h" -#include <linux/module.h> #include <linux/lz4.h> #ifndef LZ4_DISTANCE_MAX /* history window size */ @@ -55,7 +55,7 @@ static int z_erofs_load_lz4_config(struct super_block *sb, sbi->lz4.max_distance_pages = distance ? DIV_ROUND_UP(distance, PAGE_SIZE) + 1 : LZ4_MAX_DISTANCE_PAGES; - return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks); + return z_erofs_gbuf_growsize(sbi->lz4.max_pclusterblks); } /* @@ -110,10 +110,10 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, if (top) { victim = availables[--top]; - get_page(victim); } else { - victim = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + victim = __erofs_allocpage(pagepool, rq->gfp, true); + if (!victim) + return -ENOMEM; set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE); } rq->out[i] = victim; @@ -159,7 +159,7 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, docopy: /* Or copy compressed data which can be overlapped to per-CPU buffer */ in = rq->in; - src = erofs_get_pcpubuf(ctx->inpages); + src = z_erofs_get_gbuf(ctx->inpages); if (!src) { DBG_BUGON(1); kunmap_local(inpage); @@ -260,7 +260,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, } else if (maptype == 1) { vm_unmap_ram(src, ctx->inpages); } else if (maptype == 2) { - erofs_put_pcpubuf(src); + z_erofs_put_gbuf(src); } else if (maptype != 3) { DBG_BUGON(1); return -EFAULT; @@ -315,73 +315,165 @@ dstmap_out: static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, struct page **pagepool) { - const unsigned int inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; - const unsigned int outpages = + const unsigned int 
nrpages_in = + PAGE_ALIGN(rq->pageofs_in + rq->inputsize) >> PAGE_SHIFT; + const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; - const unsigned int righthalf = min_t(unsigned int, rq->outputsize, - PAGE_SIZE - rq->pageofs_out); - const unsigned int lefthalf = rq->outputsize - righthalf; - const unsigned int interlaced_offset = - rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out; - u8 *src; - - if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) { - DBG_BUGON(1); - return -EFSCORRUPTED; + const unsigned int bs = rq->sb->s_blocksize; + unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt; + u8 *kin; + + if (rq->outputsize > rq->inputsize) + return -EOPNOTSUPP; + if (rq->alg == Z_EROFS_COMPRESSION_INTERLACED) { + cur = bs - (rq->pageofs_out & (bs - 1)); + pi = (rq->pageofs_in + rq->inputsize - cur) & ~PAGE_MASK; + cur = min(cur, rq->outputsize); + if (cur && rq->out[0]) { + kin = kmap_local_page(rq->in[nrpages_in - 1]); + if (rq->out[0] == rq->in[nrpages_in - 1]) + memmove(kin + rq->pageofs_out, kin + pi, cur); + else + memcpy_to_page(rq->out[0], rq->pageofs_out, + kin + pi, cur); + kunmap_local(kin); + } + rq->outputsize -= cur; } - if (rq->out[0] == *rq->in) { - DBG_BUGON(rq->pageofs_out); - return 0; + for (; rq->outputsize; rq->pageofs_in = 0, cur += PAGE_SIZE, ni++) { + insz = min(PAGE_SIZE - rq->pageofs_in, rq->outputsize); + rq->outputsize -= insz; + if (!rq->in[ni]) + continue; + kin = kmap_local_page(rq->in[ni]); + pi = 0; + do { + no = (rq->pageofs_out + cur + pi) >> PAGE_SHIFT; + po = (rq->pageofs_out + cur + pi) & ~PAGE_MASK; + DBG_BUGON(no >= nrpages_out); + cnt = min(insz - pi, PAGE_SIZE - po); + if (rq->out[no] == rq->in[ni]) + memmove(kin + po, + kin + rq->pageofs_in + pi, cnt); + else if (rq->out[no]) + memcpy_to_page(rq->out[no], po, + kin + rq->pageofs_in + pi, cnt); + pi += cnt; + } while (pi < insz); + kunmap_local(kin); + } + DBG_BUGON(ni > nrpages_in); + return 0; +} + +int 
z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst, + void **src, struct page **pgpl) +{ + struct z_erofs_decompress_req *rq = dctx->rq; + struct super_block *sb = rq->sb; + struct page **pgo, *tmppage; + unsigned int j; + + if (!dctx->avail_out) { + if (++dctx->no >= dctx->outpages || !rq->outputsize) { + erofs_err(sb, "insufficient space for decompressed data"); + return -EFSCORRUPTED; + } + + if (dctx->kout) + kunmap_local(dctx->kout); + dctx->avail_out = min(rq->outputsize, PAGE_SIZE - rq->pageofs_out); + rq->outputsize -= dctx->avail_out; + pgo = &rq->out[dctx->no]; + if (!*pgo && rq->fillgaps) { /* deduped */ + *pgo = erofs_allocpage(pgpl, rq->gfp); + if (!*pgo) { + dctx->kout = NULL; + return -ENOMEM; + } + set_page_private(*pgo, Z_EROFS_SHORTLIVED_PAGE); + } + if (*pgo) { + dctx->kout = kmap_local_page(*pgo); + *dst = dctx->kout + rq->pageofs_out; + } else { + *dst = dctx->kout = NULL; + } + rq->pageofs_out = 0; } - src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in; - if (rq->out[0]) - memcpy_to_page(rq->out[0], rq->pageofs_out, - src + interlaced_offset, righthalf); - - if (outpages > inpages) { - DBG_BUGON(!rq->out[outpages - 1]); - if (rq->out[outpages - 1] != rq->in[inpages - 1]) { - memcpy_to_page(rq->out[outpages - 1], 0, src + - (interlaced_offset ? 
0 : righthalf), - lefthalf); - } else if (!interlaced_offset) { - memmove(src, src + righthalf, lefthalf); - flush_dcache_page(rq->in[inpages - 1]); + if (dctx->inbuf_pos == dctx->inbuf_sz && rq->inputsize) { + if (++dctx->ni >= dctx->inpages) { + erofs_err(sb, "invalid compressed data"); + return -EFSCORRUPTED; + } + if (dctx->kout) /* unlike kmap(), take care of the orders */ + kunmap_local(dctx->kout); + kunmap_local(dctx->kin); + + dctx->inbuf_sz = min_t(u32, rq->inputsize, PAGE_SIZE); + rq->inputsize -= dctx->inbuf_sz; + dctx->kin = kmap_local_page(rq->in[dctx->ni]); + *src = dctx->kin; + dctx->bounced = false; + if (dctx->kout) { + j = (u8 *)*dst - dctx->kout; + dctx->kout = kmap_local_page(rq->out[dctx->no]); + *dst = dctx->kout + j; } + dctx->inbuf_pos = 0; + } + + /* + * Handle overlapping: Use the given bounce buffer if the input data is + * under processing; Or utilize short-lived pages from the on-stack page + * pool, where pages are shared among the same request. Note that only + * a few inplace I/O pages need to be doubled. 
+ */ + if (!dctx->bounced && rq->out[dctx->no] == rq->in[dctx->ni]) { + memcpy(dctx->bounce, *src, dctx->inbuf_sz); + *src = dctx->bounce; + dctx->bounced = true; + } + + for (j = dctx->ni + 1; j < dctx->inpages; ++j) { + if (rq->out[dctx->no] != rq->in[j]) + continue; + tmppage = erofs_allocpage(pgpl, rq->gfp); + if (!tmppage) + return -ENOMEM; + set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); + copy_highpage(tmppage, rq->in[j]); + rq->in[j] = tmppage; } - kunmap_local(src); return 0; } -const struct z_erofs_decompressor erofs_decompressors[] = { - [Z_EROFS_COMPRESSION_SHIFTED] = { +const struct z_erofs_decompressor *z_erofs_decomp[] = { + [Z_EROFS_COMPRESSION_SHIFTED] = &(const struct z_erofs_decompressor) { .decompress = z_erofs_transform_plain, .name = "shifted" }, - [Z_EROFS_COMPRESSION_INTERLACED] = { + [Z_EROFS_COMPRESSION_INTERLACED] = &(const struct z_erofs_decompressor) { .decompress = z_erofs_transform_plain, .name = "interlaced" }, - [Z_EROFS_COMPRESSION_LZ4] = { + [Z_EROFS_COMPRESSION_LZ4] = &(const struct z_erofs_decompressor) { .config = z_erofs_load_lz4_config, .decompress = z_erofs_lz4_decompress, + .init = z_erofs_gbuf_init, + .exit = z_erofs_gbuf_exit, .name = "lz4" }, #ifdef CONFIG_EROFS_FS_ZIP_LZMA - [Z_EROFS_COMPRESSION_LZMA] = { - .config = z_erofs_load_lzma_config, - .decompress = z_erofs_lzma_decompress, - .name = "lzma" - }, + [Z_EROFS_COMPRESSION_LZMA] = &z_erofs_lzma_decomp, #endif #ifdef CONFIG_EROFS_FS_ZIP_DEFLATE - [Z_EROFS_COMPRESSION_DEFLATE] = { - .config = z_erofs_load_deflate_config, - .decompress = z_erofs_deflate_decompress, - .name = "deflate" - }, + [Z_EROFS_COMPRESSION_DEFLATE] = &z_erofs_deflate_decomp, +#endif +#ifdef CONFIG_EROFS_FS_ZIP_ZSTD + [Z_EROFS_COMPRESSION_ZSTD] = &z_erofs_zstd_decomp, #endif }; @@ -409,6 +501,7 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb) offset = EROFS_SUPER_OFFSET + sbi->sb_size; alg = 0; for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) { + 
const struct z_erofs_decompressor *dec = z_erofs_decomp[alg]; void *data; if (!(algs & 1)) @@ -420,16 +513,13 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb) break; } - if (alg >= ARRAY_SIZE(erofs_decompressors) || - !erofs_decompressors[alg].config) { + if (alg < Z_EROFS_COMPRESSION_MAX && dec && dec->config) { + ret = dec->config(sb, dsb, data, size); + } else { erofs_err(sb, "algorithm %d isn't enabled on this kernel", alg); ret = -EOPNOTSUPP; - } else { - ret = erofs_decompressors[alg].config(sb, - dsb, data, size); } - kfree(data); if (ret) break; @@ -437,3 +527,28 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb) erofs_put_metabuf(&buf); return ret; } + +int __init z_erofs_init_decompressor(void) +{ + int i, err; + + for (i = 0; i < Z_EROFS_COMPRESSION_MAX; ++i) { + err = z_erofs_decomp[i] ? z_erofs_decomp[i]->init() : 0; + if (err) { + while (i--) + if (z_erofs_decomp[i]) + z_erofs_decomp[i]->exit(); + return err; + } + } + return 0; +} + +void z_erofs_exit_decompressor(void) +{ + int i; + + for (i = 0; i < Z_EROFS_COMPRESSION_MAX; ++i) + if (z_erofs_decomp[i]) + z_erofs_decomp[i]->exit(); +} diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c index aac2c837ef35..5070d2fcc737 100644 --- a/fs/erofs/decompressor_deflate.c +++ b/fs/erofs/decompressor_deflate.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-or-later -#include <linux/module.h> #include <linux/zlib.h> #include "compress.h" @@ -16,7 +15,7 @@ static DECLARE_WAIT_QUEUE_HEAD(z_erofs_deflate_wq); module_param_named(deflate_streams, z_erofs_deflate_nstrms, uint, 0444); -void z_erofs_deflate_exit(void) +static void z_erofs_deflate_exit(void) { /* there should be no running fs instance */ while (z_erofs_deflate_avail_strms) { @@ -42,7 +41,7 @@ void z_erofs_deflate_exit(void) } } -int __init z_erofs_deflate_init(void) +static int __init z_erofs_deflate_init(void) { /* by default, use # of possible CPUs instead 
*/ if (!z_erofs_deflate_nstrms) @@ -50,7 +49,7 @@ int __init z_erofs_deflate_init(void) return 0; } -int z_erofs_load_deflate_config(struct super_block *sb, +static int z_erofs_load_deflate_config(struct super_block *sb, struct erofs_super_block *dsb, void *data, int size) { struct z_erofs_deflate_cfgs *dfl = data; @@ -98,27 +97,26 @@ failed: return -ENOMEM; } -int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool) +static int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, + struct page **pgpl) { - const unsigned int nrpages_out = - PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; - const unsigned int nrpages_in = - PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; struct super_block *sb = rq->sb; - unsigned int insz, outsz, pofs; + struct z_erofs_stream_dctx dctx = { + .rq = rq, + .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT, + .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) + >> PAGE_SHIFT, + .no = -1, .ni = 0, + }; struct z_erofs_deflate *strm; - u8 *kin, *kout = NULL; - bool bounced = false; - int no = -1, ni = 0, j = 0, zerr, err; + int zerr, err; /* 1. get the exact DEFLATE compressed size */ - kin = kmap_local_page(*rq->in); - err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in, - min_t(unsigned int, rq->inputsize, - sb->s_blocksize - rq->pageofs_in)); + dctx.kin = kmap_local_page(*rq->in); + err = z_erofs_fixup_insize(rq, dctx.kin + rq->pageofs_in, + min(rq->inputsize, sb->s_blocksize - rq->pageofs_in)); if (err) { - kunmap_local(kin); + kunmap_local(dctx.kin); return err; } @@ -135,98 +133,35 @@ again: spin_unlock(&z_erofs_deflate_lock); /* 3. 
multi-call decompress */ - insz = rq->inputsize; - outsz = rq->outputsize; zerr = zlib_inflateInit2(&strm->z, -MAX_WBITS); if (zerr != Z_OK) { err = -EIO; goto failed_zinit; } - pofs = rq->pageofs_out; - strm->z.avail_in = min_t(u32, insz, PAGE_SIZE - rq->pageofs_in); - insz -= strm->z.avail_in; - strm->z.next_in = kin + rq->pageofs_in; + rq->fillgaps = true; /* DEFLATE doesn't support NULL output buffer */ + strm->z.avail_in = min(rq->inputsize, PAGE_SIZE - rq->pageofs_in); + rq->inputsize -= strm->z.avail_in; + strm->z.next_in = dctx.kin + rq->pageofs_in; strm->z.avail_out = 0; + dctx.bounce = strm->bounce; while (1) { - if (!strm->z.avail_out) { - if (++no >= nrpages_out || !outsz) { - erofs_err(sb, "insufficient space for decompressed data"); - err = -EFSCORRUPTED; - break; - } - - if (kout) - kunmap_local(kout); - strm->z.avail_out = min_t(u32, outsz, PAGE_SIZE - pofs); - outsz -= strm->z.avail_out; - if (!rq->out[no]) { - rq->out[no] = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); - set_page_private(rq->out[no], - Z_EROFS_SHORTLIVED_PAGE); - } - kout = kmap_local_page(rq->out[no]); - strm->z.next_out = kout + pofs; - pofs = 0; - } - - if (!strm->z.avail_in && insz) { - if (++ni >= nrpages_in) { - erofs_err(sb, "invalid compressed data"); - err = -EFSCORRUPTED; - break; - } - - if (kout) { /* unlike kmap(), take care of the orders */ - j = strm->z.next_out - kout; - kunmap_local(kout); - } - kunmap_local(kin); - strm->z.avail_in = min_t(u32, insz, PAGE_SIZE); - insz -= strm->z.avail_in; - kin = kmap_local_page(rq->in[ni]); - strm->z.next_in = kin; - bounced = false; - if (kout) { - kout = kmap_local_page(rq->out[no]); - strm->z.next_out = kout + j; - } - } - - /* - * Handle overlapping: Use bounced buffer if the compressed - * data is under processing; Or use short-lived pages from the - * on-stack pagepool where pages share among the same request - * and not _all_ inplace I/O pages are needed to be doubled. 
- */ - if (!bounced && rq->out[no] == rq->in[ni]) { - memcpy(strm->bounce, strm->z.next_in, strm->z.avail_in); - strm->z.next_in = strm->bounce; - bounced = true; - } - - for (j = ni + 1; j < nrpages_in; ++j) { - struct page *tmppage; - - if (rq->out[no] != rq->in[j]) - continue; - - DBG_BUGON(erofs_page_is_managed(EROFS_SB(sb), - rq->in[j])); - tmppage = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); - set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); - copy_highpage(tmppage, rq->in[j]); - rq->in[j] = tmppage; - } + dctx.avail_out = strm->z.avail_out; + dctx.inbuf_sz = strm->z.avail_in; + err = z_erofs_stream_switch_bufs(&dctx, + (void **)&strm->z.next_out, + (void **)&strm->z.next_in, pgpl); + if (err) + break; + strm->z.avail_out = dctx.avail_out; + strm->z.avail_in = dctx.inbuf_sz; zerr = zlib_inflate(&strm->z, Z_SYNC_FLUSH); - if (zerr != Z_OK || !(outsz + strm->z.avail_out)) { + if (zerr != Z_OK || !(rq->outputsize + strm->z.avail_out)) { if (zerr == Z_OK && rq->partial_decoding) break; - if (zerr == Z_STREAM_END && !outsz) + if (zerr == Z_STREAM_END && !rq->outputsize) break; erofs_err(sb, "failed to decompress %d in[%u] out[%u]", zerr, rq->inputsize, rq->outputsize); @@ -234,13 +169,12 @@ again: break; } } - if (zlib_inflateEnd(&strm->z) != Z_OK && !err) err = -EIO; - if (kout) - kunmap_local(kout); + if (dctx.kout) + kunmap_local(dctx.kout); failed_zinit: - kunmap_local(kin); + kunmap_local(dctx.kin); /* 4. 
push back DEFLATE stream context to the global list */ spin_lock(&z_erofs_deflate_lock); strm->next = z_erofs_deflate_head; @@ -249,3 +183,11 @@ failed_zinit: wake_up(&z_erofs_deflate_wq); return err; } + +const struct z_erofs_decompressor z_erofs_deflate_decomp = { + .config = z_erofs_load_deflate_config, + .decompress = z_erofs_deflate_decompress, + .init = z_erofs_deflate_init, + .exit = z_erofs_deflate_exit, + .name = "deflate", +}; diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index ba4ec73f4aae..40666815046f 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -1,12 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/xz.h> -#include <linux/module.h> #include "compress.h" struct z_erofs_lzma { struct z_erofs_lzma *next; struct xz_dec_microlzma *state; - struct xz_buf buf; u8 bounce[PAGE_SIZE]; }; @@ -19,7 +17,7 @@ static DECLARE_WAIT_QUEUE_HEAD(z_erofs_lzma_wq); module_param_named(lzma_streams, z_erofs_lzma_nstrms, uint, 0444); -void z_erofs_lzma_exit(void) +static void z_erofs_lzma_exit(void) { /* there should be no running fs instance */ while (z_erofs_lzma_avail_strms) { @@ -47,7 +45,7 @@ void z_erofs_lzma_exit(void) } } -int __init z_erofs_lzma_init(void) +static int __init z_erofs_lzma_init(void) { unsigned int i; @@ -71,7 +69,7 @@ int __init z_erofs_lzma_init(void) return 0; } -int z_erofs_load_lzma_config(struct super_block *sb, +static int z_erofs_load_lzma_config(struct super_block *sb, struct erofs_super_block *dsb, void *data, int size) { static DEFINE_MUTEX(lzma_resize_mutex); @@ -96,8 +94,6 @@ int z_erofs_load_lzma_config(struct super_block *sb, return -EINVAL; } - erofs_info(sb, "EXPERIMENTAL MicroLZMA in use. 
Use at your own risk!"); - /* in case 2 z_erofs_load_lzma_config() race to avoid deadlock */ mutex_lock(&lzma_resize_mutex); @@ -150,26 +146,28 @@ again: return err; } -int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool) +static int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, + struct page **pgpl) { - const unsigned int nrpages_out = - PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; - const unsigned int nrpages_in = - PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; - unsigned int inlen, outlen, pageofs; + struct super_block *sb = rq->sb; + struct z_erofs_stream_dctx dctx = { + .rq = rq, + .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT, + .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) + >> PAGE_SHIFT, + .no = -1, .ni = 0, + }; + struct xz_buf buf = {}; struct z_erofs_lzma *strm; - u8 *kin; - bool bounced = false; - int no, ni, j, err = 0; + enum xz_ret xz_err; + int err; /* 1. get the exact LZMA compressed size */ - kin = kmap(*rq->in); - err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in, - min_t(unsigned int, rq->inputsize, - rq->sb->s_blocksize - rq->pageofs_in)); + dctx.kin = kmap_local_page(*rq->in); + err = z_erofs_fixup_insize(rq, dctx.kin + rq->pageofs_in, + min(rq->inputsize, sb->s_blocksize - rq->pageofs_in)); if (err) { - kunmap(*rq->in); + kunmap_local(dctx.kin); return err; } @@ -186,104 +184,45 @@ again: spin_unlock(&z_erofs_lzma_lock); /* 3. 
multi-call decompress */ - inlen = rq->inputsize; - outlen = rq->outputsize; - xz_dec_microlzma_reset(strm->state, inlen, outlen, + xz_dec_microlzma_reset(strm->state, rq->inputsize, rq->outputsize, !rq->partial_decoding); - pageofs = rq->pageofs_out; - strm->buf.in = kin + rq->pageofs_in; - strm->buf.in_pos = 0; - strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - rq->pageofs_in); - inlen -= strm->buf.in_size; - strm->buf.out = NULL; - strm->buf.out_pos = 0; - strm->buf.out_size = 0; - - for (ni = 0, no = -1;;) { - enum xz_ret xz_err; - - if (strm->buf.out_pos == strm->buf.out_size) { - if (strm->buf.out) { - kunmap(rq->out[no]); - strm->buf.out = NULL; - } - - if (++no >= nrpages_out || !outlen) { - erofs_err(rq->sb, "decompressed buf out of bound"); - err = -EFSCORRUPTED; - break; - } - strm->buf.out_pos = 0; - strm->buf.out_size = min_t(u32, outlen, - PAGE_SIZE - pageofs); - outlen -= strm->buf.out_size; - if (!rq->out[no] && rq->fillgaps) { /* deduped */ - rq->out[no] = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); - set_page_private(rq->out[no], - Z_EROFS_SHORTLIVED_PAGE); - } - if (rq->out[no]) - strm->buf.out = kmap(rq->out[no]) + pageofs; - pageofs = 0; - } else if (strm->buf.in_pos == strm->buf.in_size) { - kunmap(rq->in[ni]); - - if (++ni >= nrpages_in || !inlen) { - erofs_err(rq->sb, "compressed buf out of bound"); - err = -EFSCORRUPTED; - break; - } - strm->buf.in_pos = 0; - strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE); - inlen -= strm->buf.in_size; - kin = kmap(rq->in[ni]); - strm->buf.in = kin; - bounced = false; - } + buf.in_size = min(rq->inputsize, PAGE_SIZE - rq->pageofs_in); + rq->inputsize -= buf.in_size; + buf.in = dctx.kin + rq->pageofs_in; + dctx.bounce = strm->bounce; + do { + dctx.avail_out = buf.out_size - buf.out_pos; + dctx.inbuf_sz = buf.in_size; + dctx.inbuf_pos = buf.in_pos; + err = z_erofs_stream_switch_bufs(&dctx, (void **)&buf.out, + (void **)&buf.in, pgpl); + if (err) + break; - /* - * Handle overlapping: Use bounced 
buffer if the compressed - * data is under processing; Otherwise, Use short-lived pages - * from the on-stack pagepool where pages share with the same - * request. - */ - if (!bounced && rq->out[no] == rq->in[ni]) { - memcpy(strm->bounce, strm->buf.in, strm->buf.in_size); - strm->buf.in = strm->bounce; - bounced = true; + if (buf.out_size == buf.out_pos) { + buf.out_size = dctx.avail_out; + buf.out_pos = 0; } - for (j = ni + 1; j < nrpages_in; ++j) { - struct page *tmppage; + buf.in_size = dctx.inbuf_sz; + buf.in_pos = dctx.inbuf_pos; - if (rq->out[no] != rq->in[j]) - continue; - - DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb), - rq->in[j])); - tmppage = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); - set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); - copy_highpage(tmppage, rq->in[j]); - rq->in[j] = tmppage; - } - xz_err = xz_dec_microlzma_run(strm->state, &strm->buf); - DBG_BUGON(strm->buf.out_pos > strm->buf.out_size); - DBG_BUGON(strm->buf.in_pos > strm->buf.in_size); + xz_err = xz_dec_microlzma_run(strm->state, &buf); + DBG_BUGON(buf.out_pos > buf.out_size); + DBG_BUGON(buf.in_pos > buf.in_size); if (xz_err != XZ_OK) { - if (xz_err == XZ_STREAM_END && !outlen) + if (xz_err == XZ_STREAM_END && !rq->outputsize) break; - erofs_err(rq->sb, "failed to decompress %d in[%u] out[%u]", + erofs_err(sb, "failed to decompress %d in[%u] out[%u]", xz_err, rq->inputsize, rq->outputsize); err = -EFSCORRUPTED; break; } - } - if (no < nrpages_out && strm->buf.out) - kunmap(rq->out[no]); - if (ni < nrpages_in) - kunmap(rq->in[ni]); + } while (1); + + if (dctx.kout) + kunmap_local(dctx.kout); + kunmap_local(dctx.kin); /* 4. 
push back LZMA stream context to the global list */ spin_lock(&z_erofs_lzma_lock); strm->next = z_erofs_lzma_head; @@ -292,3 +231,11 @@ again: wake_up(&z_erofs_lzma_wq); return err; } + +const struct z_erofs_decompressor z_erofs_lzma_decomp = { + .config = z_erofs_load_lzma_config, + .decompress = z_erofs_lzma_decompress, + .init = z_erofs_lzma_init, + .exit = z_erofs_lzma_exit, + .name = "lzma" +}; diff --git a/fs/erofs/decompressor_zstd.c b/fs/erofs/decompressor_zstd.c new file mode 100644 index 000000000000..7e177304967e --- /dev/null +++ b/fs/erofs/decompressor_zstd.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/zstd.h> +#include "compress.h" + +struct z_erofs_zstd { + struct z_erofs_zstd *next; + u8 bounce[PAGE_SIZE]; + void *wksp; + unsigned int wkspsz; +}; + +static DEFINE_SPINLOCK(z_erofs_zstd_lock); +static unsigned int z_erofs_zstd_max_dictsize; +static unsigned int z_erofs_zstd_nstrms, z_erofs_zstd_avail_strms; +static struct z_erofs_zstd *z_erofs_zstd_head; +static DECLARE_WAIT_QUEUE_HEAD(z_erofs_zstd_wq); + +module_param_named(zstd_streams, z_erofs_zstd_nstrms, uint, 0444); + +static struct z_erofs_zstd *z_erofs_isolate_strms(bool all) +{ + struct z_erofs_zstd *strm; + +again: + spin_lock(&z_erofs_zstd_lock); + strm = z_erofs_zstd_head; + if (!strm) { + spin_unlock(&z_erofs_zstd_lock); + wait_event(z_erofs_zstd_wq, READ_ONCE(z_erofs_zstd_head)); + goto again; + } + z_erofs_zstd_head = all ? 
NULL : strm->next; + spin_unlock(&z_erofs_zstd_lock); + return strm; +} + +static void z_erofs_zstd_exit(void) +{ + while (z_erofs_zstd_avail_strms) { + struct z_erofs_zstd *strm, *n; + + for (strm = z_erofs_isolate_strms(true); strm; strm = n) { + n = strm->next; + + kvfree(strm->wksp); + kfree(strm); + --z_erofs_zstd_avail_strms; + } + } +} + +static int __init z_erofs_zstd_init(void) +{ + /* by default, use # of possible CPUs instead */ + if (!z_erofs_zstd_nstrms) + z_erofs_zstd_nstrms = num_possible_cpus(); + + for (; z_erofs_zstd_avail_strms < z_erofs_zstd_nstrms; + ++z_erofs_zstd_avail_strms) { + struct z_erofs_zstd *strm; + + strm = kzalloc(sizeof(*strm), GFP_KERNEL); + if (!strm) { + z_erofs_zstd_exit(); + return -ENOMEM; + } + spin_lock(&z_erofs_zstd_lock); + strm->next = z_erofs_zstd_head; + z_erofs_zstd_head = strm; + spin_unlock(&z_erofs_zstd_lock); + } + return 0; +} + +static int z_erofs_load_zstd_config(struct super_block *sb, + struct erofs_super_block *dsb, void *data, int size) +{ + static DEFINE_MUTEX(zstd_resize_mutex); + struct z_erofs_zstd_cfgs *zstd = data; + unsigned int dict_size, wkspsz; + struct z_erofs_zstd *strm, *head = NULL; + void *wksp; + + if (!zstd || size < sizeof(struct z_erofs_zstd_cfgs) || zstd->format) { + erofs_err(sb, "unsupported zstd format, size=%u", size); + return -EINVAL; + } + + if (zstd->windowlog > ilog2(Z_EROFS_ZSTD_MAX_DICT_SIZE) - 10) { + erofs_err(sb, "unsupported zstd window log %u", zstd->windowlog); + return -EINVAL; + } + dict_size = 1U << (zstd->windowlog + 10); + + /* in case 2 z_erofs_load_zstd_config() race to avoid deadlock */ + mutex_lock(&zstd_resize_mutex); + if (z_erofs_zstd_max_dictsize >= dict_size) { + mutex_unlock(&zstd_resize_mutex); + return 0; + } + + /* 1. 
collect/isolate all streams for the following check */ + while (z_erofs_zstd_avail_strms) { + struct z_erofs_zstd *n; + + for (strm = z_erofs_isolate_strms(true); strm; strm = n) { + n = strm->next; + strm->next = head; + head = strm; + --z_erofs_zstd_avail_strms; + } + } + + /* 2. walk each isolated stream and grow max dict_size if needed */ + wkspsz = zstd_dstream_workspace_bound(dict_size); + for (strm = head; strm; strm = strm->next) { + wksp = kvmalloc(wkspsz, GFP_KERNEL); + if (!wksp) + break; + kvfree(strm->wksp); + strm->wksp = wksp; + strm->wkspsz = wkspsz; + } + + /* 3. push back all to the global list and update max dict_size */ + spin_lock(&z_erofs_zstd_lock); + DBG_BUGON(z_erofs_zstd_head); + z_erofs_zstd_head = head; + spin_unlock(&z_erofs_zstd_lock); + z_erofs_zstd_avail_strms = z_erofs_zstd_nstrms; + wake_up_all(&z_erofs_zstd_wq); + if (!strm) + z_erofs_zstd_max_dictsize = dict_size; + mutex_unlock(&zstd_resize_mutex); + return strm ? -ENOMEM : 0; +} + +static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq, + struct page **pgpl) +{ + struct super_block *sb = rq->sb; + struct z_erofs_stream_dctx dctx = { + .rq = rq, + .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT, + .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) + >> PAGE_SHIFT, + .no = -1, .ni = 0, + }; + zstd_in_buffer in_buf = { NULL, 0, 0 }; + zstd_out_buffer out_buf = { NULL, 0, 0 }; + struct z_erofs_zstd *strm; + zstd_dstream *stream; + int zerr, err; + + /* 1. get the exact compressed size */ + dctx.kin = kmap_local_page(*rq->in); + err = z_erofs_fixup_insize(rq, dctx.kin + rq->pageofs_in, + min(rq->inputsize, sb->s_blocksize - rq->pageofs_in)); + if (err) { + kunmap_local(dctx.kin); + return err; + } + + /* 2. get an available ZSTD context */ + strm = z_erofs_isolate_strms(false); + + /* 3. 
multi-call decompress */ + stream = zstd_init_dstream(z_erofs_zstd_max_dictsize, strm->wksp, strm->wkspsz); + if (!stream) { + err = -EIO; + goto failed_zinit; + } + + rq->fillgaps = true; /* ZSTD doesn't support NULL output buffer */ + in_buf.size = min_t(u32, rq->inputsize, PAGE_SIZE - rq->pageofs_in); + rq->inputsize -= in_buf.size; + in_buf.src = dctx.kin + rq->pageofs_in; + dctx.bounce = strm->bounce; + + do { + dctx.avail_out = out_buf.size - out_buf.pos; + dctx.inbuf_sz = in_buf.size; + dctx.inbuf_pos = in_buf.pos; + err = z_erofs_stream_switch_bufs(&dctx, &out_buf.dst, + (void **)&in_buf.src, pgpl); + if (err) + break; + + if (out_buf.size == out_buf.pos) { + out_buf.size = dctx.avail_out; + out_buf.pos = 0; + } + in_buf.size = dctx.inbuf_sz; + in_buf.pos = dctx.inbuf_pos; + + zerr = zstd_decompress_stream(stream, &out_buf, &in_buf); + if (zstd_is_error(zerr) || (!zerr && rq->outputsize)) { + erofs_err(sb, "failed to decompress in[%u] out[%u]: %s", + rq->inputsize, rq->outputsize, + zerr ? zstd_get_error_name(zerr) : "unexpected end of stream"); + err = -EFSCORRUPTED; + break; + } + } while (rq->outputsize || out_buf.pos < out_buf.size); + + if (dctx.kout) + kunmap_local(dctx.kout); +failed_zinit: + kunmap_local(dctx.kin); + /* 4. 
push back ZSTD stream context to the global list */ + spin_lock(&z_erofs_zstd_lock); + strm->next = z_erofs_zstd_head; + z_erofs_zstd_head = strm; + spin_unlock(&z_erofs_zstd_lock); + wake_up(&z_erofs_zstd_wq); + return err; +} + +const struct z_erofs_decompressor z_erofs_zstd_decomp = { + .config = z_erofs_load_zstd_config, + .decompress = z_erofs_zstd_decompress, + .init = z_erofs_zstd_init, + .exit = z_erofs_zstd_exit, + .name = "zstd", +}; diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index b80abec0531a..c3b90abdee37 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -8,19 +8,15 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, void *dentry_blk, struct erofs_dirent *de, - unsigned int nameoff, unsigned int maxsize) + unsigned int nameoff0, unsigned int maxsize) { - const struct erofs_dirent *end = dentry_blk + nameoff; + const struct erofs_dirent *end = dentry_blk + nameoff0; while (de < end) { - const char *de_name; + unsigned char d_type = fs_ftype_to_dtype(de->file_type); + unsigned int nameoff = le16_to_cpu(de->nameoff); + const char *de_name = (char *)dentry_blk + nameoff; unsigned int de_namelen; - unsigned char d_type; - - d_type = fs_ftype_to_dtype(de->file_type); - - nameoff = le16_to_cpu(de->nameoff); - de_name = (char *)dentry_blk + nameoff; /* the last dirent in the block? 
*/ if (de + 1 >= end) @@ -52,21 +48,20 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct super_block *sb = dir->i_sb; unsigned long bsz = sb->s_blocksize; - const size_t dirsize = i_size_read(dir); - unsigned int i = erofs_blknr(sb, ctx->pos); unsigned int ofs = erofs_blkoff(sb, ctx->pos); int err = 0; bool initial = true; - buf.inode = dir; - while (ctx->pos < dirsize) { + buf.mapping = dir->i_mapping; + while (ctx->pos < dir->i_size) { + erofs_off_t dbstart = ctx->pos - ofs; struct erofs_dirent *de; unsigned int nameoff, maxsize; - de = erofs_bread(&buf, i, EROFS_KMAP); + de = erofs_bread(&buf, dbstart, EROFS_KMAP); if (IS_ERR(de)) { erofs_err(sb, "fail to readdir of logical block %u of nid %llu", - i, EROFS_I(dir)->nid); + erofs_blknr(sb, dbstart), EROFS_I(dir)->nid); err = PTR_ERR(de); break; } @@ -79,25 +74,19 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) break; } - maxsize = min_t(unsigned int, dirsize - ctx->pos + ofs, bsz); - + maxsize = min_t(unsigned int, dir->i_size - dbstart, bsz); /* search dirents at the arbitrary position */ if (initial) { initial = false; - ofs = roundup(ofs, sizeof(struct erofs_dirent)); - ctx->pos = erofs_pos(sb, i) + ofs; - if (ofs >= nameoff) - goto skip_this; + ctx->pos = dbstart + ofs; } err = erofs_fill_dentries(dir, ctx, de, (void *)de + ofs, nameoff, maxsize); if (err) break; -skip_this: - ctx->pos = erofs_pos(sb, i) + maxsize; - ++i; + ctx->pos = dbstart + maxsize; ofs = 0; } erofs_put_metabuf(&buf); diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index a03ec70ba6f2..c8f2ae845bd2 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -288,14 +288,18 @@ struct erofs_dirent { #define EROFS_NAME_LEN 255 -/* maximum supported size of a physical compression cluster */ +/* maximum supported encoded size of a physical compressed cluster */ #define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024) +/* maximum supported decoded size 
of a physical compressed cluster */ +#define Z_EROFS_PCLUSTER_MAX_DSIZE (12 * 1024 * 1024) + /* available compression algorithm types (for h_algorithmtype) */ enum { Z_EROFS_COMPRESSION_LZ4 = 0, Z_EROFS_COMPRESSION_LZMA = 1, Z_EROFS_COMPRESSION_DEFLATE = 2, + Z_EROFS_COMPRESSION_ZSTD = 3, Z_EROFS_COMPRESSION_MAX }; #define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1) @@ -322,6 +326,15 @@ struct z_erofs_deflate_cfgs { u8 reserved[5]; } __packed; +/* 6 bytes (+ length field = 8 bytes) */ +struct z_erofs_zstd_cfgs { + u8 format; + u8 windowlog; /* windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN(10) */ + u8 reserved[4]; +} __packed; + +#define Z_EROFS_ZSTD_MAX_DICT_SIZE Z_EROFS_PCLUSTER_MAX_SIZE + /* * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) * e.g. for 4k logical cluster size, 4B if compacted 2B is off; @@ -396,8 +409,7 @@ enum { Z_EROFS_LCLUSTER_TYPE_MAX }; -#define Z_EROFS_LI_LCLUSTER_TYPE_BITS 2 -#define Z_EROFS_LI_LCLUSTER_TYPE_BIT 0 +#define Z_EROFS_LI_LCLUSTER_TYPE_MASK (Z_EROFS_LCLUSTER_TYPE_MAX - 1) /* (noncompact only, HEAD) This pcluster refers to partial decompressed data */ #define Z_EROFS_LI_PARTIAL_REF (1 << 15) @@ -451,8 +463,6 @@ static inline void erofs_check_ondisk_layout_definitions(void) sizeof(struct z_erofs_lcluster_index)); BUILD_BUG_ON(sizeof(struct erofs_deviceslot) != 128); - BUILD_BUG_ON(BIT(Z_EROFS_LI_LCLUSTER_TYPE_BITS) < - Z_EROFS_LCLUSTER_TYPE_MAX - 1); /* exclude old compiler versions like gcc 7.5.0 */ BUILD_BUG_ON(__builtin_constant_p(fmh) ? 
fmh != cpu_to_le64(1ULL << 63) : 0); diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c new file mode 100644 index 000000000000..c865a7a61030 --- /dev/null +++ b/fs/erofs/fileio.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2024, Alibaba Cloud + */ +#include "internal.h" +#include <trace/events/erofs.h> + +struct erofs_fileio_rq { + struct bio_vec bvecs[BIO_MAX_VECS]; + struct bio bio; + struct kiocb iocb; + struct super_block *sb; +}; + +struct erofs_fileio { + struct erofs_map_blocks map; + struct erofs_map_dev dev; + struct erofs_fileio_rq *rq; +}; + +static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret) +{ + struct erofs_fileio_rq *rq = + container_of(iocb, struct erofs_fileio_rq, iocb); + struct folio_iter fi; + + if (ret > 0) { + if (ret != rq->bio.bi_iter.bi_size) { + bio_advance(&rq->bio, ret); + zero_fill_bio(&rq->bio); + } + ret = 0; + } + if (rq->bio.bi_end_io) { + if (ret < 0 && !rq->bio.bi_status) + rq->bio.bi_status = errno_to_blk_status(ret); + rq->bio.bi_end_io(&rq->bio); + } else { + bio_for_each_folio_all(fi, &rq->bio) { + DBG_BUGON(folio_test_uptodate(fi.folio)); + erofs_onlinefolio_end(fi.folio, ret, false); + } + } + bio_uninit(&rq->bio); + kfree(rq); +} + +static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) +{ + struct iov_iter iter; + int ret; + + if (!rq) + return; + rq->iocb.ki_pos = rq->bio.bi_iter.bi_sector << SECTOR_SHIFT; + rq->iocb.ki_ioprio = get_current_ioprio(); + rq->iocb.ki_complete = erofs_fileio_ki_complete; + if (test_opt(&EROFS_SB(rq->sb)->opt, DIRECT_IO) && + rq->iocb.ki_filp->f_mode & FMODE_CAN_ODIRECT) + rq->iocb.ki_flags = IOCB_DIRECT; + iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt, + rq->bio.bi_iter.bi_size); + ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter); + if (ret != -EIOCBQUEUED) + erofs_fileio_ki_complete(&rq->iocb, ret); +} + +static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev) +{ + struct 
erofs_fileio_rq *rq = kzalloc(sizeof(*rq), + GFP_KERNEL | __GFP_NOFAIL); + + bio_init(&rq->bio, NULL, rq->bvecs, BIO_MAX_VECS, REQ_OP_READ); + rq->iocb.ki_filp = mdev->m_dif->file; + rq->sb = mdev->m_sb; + return rq; +} + +struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev) +{ + return &erofs_fileio_rq_alloc(mdev)->bio; +} + +void erofs_fileio_submit_bio(struct bio *bio) +{ + return erofs_fileio_rq_submit(container_of(bio, struct erofs_fileio_rq, + bio)); +} + +static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio) +{ + struct inode *inode = folio_inode(folio); + struct erofs_map_blocks *map = &io->map; + unsigned int cur = 0, end = folio_size(folio), len, attached = 0; + loff_t pos = folio_pos(folio), ofs; + struct iov_iter iter; + struct bio_vec bv; + int err = 0; + + erofs_onlinefolio_init(folio); + while (cur < end) { + if (!in_range(pos + cur, map->m_la, map->m_llen)) { + map->m_la = pos + cur; + map->m_llen = end - cur; + err = erofs_map_blocks(inode, map); + if (err) + break; + } + + ofs = folio_pos(folio) + cur - map->m_la; + len = min_t(loff_t, map->m_llen - ofs, end - cur); + if (map->m_flags & EROFS_MAP_META) { + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + void *src; + + src = erofs_read_metabuf(&buf, inode->i_sb, + map->m_pa + ofs, EROFS_KMAP); + if (IS_ERR(src)) { + err = PTR_ERR(src); + break; + } + bvec_set_folio(&bv, folio, len, cur); + iov_iter_bvec(&iter, ITER_DEST, &bv, 1, len); + if (copy_to_iter(src, len, &iter) != len) { + erofs_put_metabuf(&buf); + err = -EIO; + break; + } + erofs_put_metabuf(&buf); + } else if (!(map->m_flags & EROFS_MAP_MAPPED)) { + folio_zero_segment(folio, cur, cur + len); + attached = 0; + } else { + if (io->rq && (map->m_pa + ofs != io->dev.m_pa || + map->m_deviceid != io->dev.m_deviceid)) { +io_retry: + erofs_fileio_rq_submit(io->rq); + io->rq = NULL; + } + + if (!io->rq) { + io->dev = (struct erofs_map_dev) { + .m_pa = io->map.m_pa + ofs, + .m_deviceid = io->map.m_deviceid, + 
}; + err = erofs_map_dev(inode->i_sb, &io->dev); + if (err) + break; + io->rq = erofs_fileio_rq_alloc(&io->dev); + io->rq->bio.bi_iter.bi_sector = io->dev.m_pa >> 9; + attached = 0; + } + if (!bio_add_folio(&io->rq->bio, folio, len, cur)) + goto io_retry; + if (!attached++) + erofs_onlinefolio_split(folio); + io->dev.m_pa += len; + } + cur += len; + } + erofs_onlinefolio_end(folio, err, false); + return err; +} + +static int erofs_fileio_read_folio(struct file *file, struct folio *folio) +{ + struct erofs_fileio io = {}; + int err; + + trace_erofs_read_folio(folio, true); + err = erofs_fileio_scan_folio(&io, folio); + erofs_fileio_rq_submit(io.rq); + return err; +} + +static void erofs_fileio_readahead(struct readahead_control *rac) +{ + struct inode *inode = rac->mapping->host; + struct erofs_fileio io = {}; + struct folio *folio; + int err; + + trace_erofs_readahead(inode, readahead_index(rac), + readahead_count(rac), true); + while ((folio = readahead_folio(rac))) { + err = erofs_fileio_scan_folio(&io, folio); + if (err && err != -EINTR) + erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu", + folio->index, EROFS_I(inode)->nid); + } + erofs_fileio_rq_submit(io.rq); +} + +const struct address_space_operations erofs_fileio_aops = { + .read_folio = erofs_fileio_read_folio, + .readahead = erofs_fileio_readahead, +}; diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index afc37c9029ce..ce3d8737df85 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -25,9 +25,15 @@ static struct file_system_type erofs_anon_fs_type = { .kill_sb = kill_anon_super, }; -struct erofs_fscache_request { - struct erofs_fscache_request *primary; - struct netfs_cache_resources cache_resources; +struct erofs_fscache_io { + struct netfs_cache_resources cres; + struct iov_iter iter; + netfs_io_terminated_t end_io; + void *private; + refcount_t ref; +}; + +struct erofs_fscache_rq { struct address_space *mapping; /* The mapping being accessed */ loff_t start; /* Start 
position */ size_t len; /* Length of the request */ @@ -36,44 +42,17 @@ struct erofs_fscache_request { refcount_t ref; }; -static struct erofs_fscache_request *erofs_fscache_req_alloc(struct address_space *mapping, - loff_t start, size_t len) +static bool erofs_fscache_io_put(struct erofs_fscache_io *io) { - struct erofs_fscache_request *req; - - req = kzalloc(sizeof(struct erofs_fscache_request), GFP_KERNEL); - if (!req) - return ERR_PTR(-ENOMEM); - - req->mapping = mapping; - req->start = start; - req->len = len; - refcount_set(&req->ref, 1); - - return req; -} - -static struct erofs_fscache_request *erofs_fscache_req_chain(struct erofs_fscache_request *primary, - size_t len) -{ - struct erofs_fscache_request *req; - - /* use primary request for the first submission */ - if (!primary->submitted) { - refcount_inc(&primary->ref); - return primary; - } - - req = erofs_fscache_req_alloc(primary->mapping, - primary->start + primary->submitted, len); - if (!IS_ERR(req)) { - req->primary = primary; - refcount_inc(&primary->ref); - } - return req; + if (!refcount_dec_and_test(&io->ref)) + return false; + if (io->cres.ops) + io->cres.ops->end_operation(&io->cres); + kfree(io); + return true; } -static void erofs_fscache_req_complete(struct erofs_fscache_request *req) +static void erofs_fscache_req_complete(struct erofs_fscache_rq *req) { struct folio *folio; bool failed = req->error; @@ -93,120 +72,196 @@ static void erofs_fscache_req_complete(struct erofs_fscache_request *req) rcu_read_unlock(); } -static void erofs_fscache_req_put(struct erofs_fscache_request *req) +static void erofs_fscache_req_put(struct erofs_fscache_rq *req) { - if (refcount_dec_and_test(&req->ref)) { - if (req->cache_resources.ops) - req->cache_resources.ops->end_operation(&req->cache_resources); - if (!req->primary) - erofs_fscache_req_complete(req); - else - erofs_fscache_req_put(req->primary); - kfree(req); - } + if (!refcount_dec_and_test(&req->ref)) + return; + erofs_fscache_req_complete(req); 
+ kfree(req); +} + +static struct erofs_fscache_rq *erofs_fscache_req_alloc(struct address_space *mapping, + loff_t start, size_t len) +{ + struct erofs_fscache_rq *req = kzalloc(sizeof(*req), GFP_KERNEL); + + if (!req) + return NULL; + req->mapping = mapping; + req->start = start; + req->len = len; + refcount_set(&req->ref, 1); + return req; +} + +static void erofs_fscache_req_io_put(struct erofs_fscache_io *io) +{ + struct erofs_fscache_rq *req = io->private; + + if (erofs_fscache_io_put(io)) + erofs_fscache_req_put(req); } -static void erofs_fscache_subreq_complete(void *priv, +static void erofs_fscache_req_end_io(void *priv, ssize_t transferred_or_error, bool was_async) { - struct erofs_fscache_request *req = priv; + struct erofs_fscache_io *io = priv; + struct erofs_fscache_rq *req = io->private; - if (IS_ERR_VALUE(transferred_or_error)) { - if (req->primary) - req->primary->error = transferred_or_error; - else - req->error = transferred_or_error; - } - erofs_fscache_req_put(req); + if (IS_ERR_VALUE(transferred_or_error)) + req->error = transferred_or_error; + erofs_fscache_req_io_put(io); +} + +static struct erofs_fscache_io *erofs_fscache_req_io_alloc(struct erofs_fscache_rq *req) +{ + struct erofs_fscache_io *io = kzalloc(sizeof(*io), GFP_KERNEL); + + if (!io) + return NULL; + io->end_io = erofs_fscache_req_end_io; + io->private = req; + refcount_inc(&req->ref); + refcount_set(&io->ref, 1); + return io; } /* - * Read data from fscache (cookie, pstart, len), and fill the read data into - * page cache described by (req->mapping, lstart, len). @pstart describeis the - * start physical address in the cache file. + * Read data from fscache described by cookie at pstart physical address + * offset, and fill the read data into buffer described by io->iter. 
*/ -static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie, - struct erofs_fscache_request *req, loff_t pstart, size_t len) +static int erofs_fscache_read_io_async(struct fscache_cookie *cookie, + loff_t pstart, struct erofs_fscache_io *io) { enum netfs_io_source source; - struct super_block *sb = req->mapping->host->i_sb; - struct netfs_cache_resources *cres = &req->cache_resources; - struct iov_iter iter; - loff_t lstart = req->start + req->submitted; - size_t done = 0; + struct netfs_cache_resources *cres = &io->cres; + struct iov_iter *iter = &io->iter; int ret; - DBG_BUGON(len > req->len - req->submitted); - ret = fscache_begin_read_operation(cres, cookie); if (ret) return ret; - while (done < len) { - loff_t sstart = pstart + done; - size_t slen = len - done; + while (iov_iter_count(iter)) { + size_t orig_count = iov_iter_count(iter), len = orig_count; unsigned long flags = 1 << NETFS_SREQ_ONDEMAND; source = cres->ops->prepare_ondemand_read(cres, - sstart, &slen, LLONG_MAX, &flags, 0); - if (WARN_ON(slen == 0)) + pstart, &len, LLONG_MAX, &flags, 0); + if (WARN_ON(len == 0)) source = NETFS_INVALID_READ; if (source != NETFS_READ_FROM_CACHE) { - erofs_err(sb, "failed to fscache prepare_read (source %d)", source); + erofs_err(NULL, "prepare_ondemand_read failed (source %d)", source); return -EIO; } - refcount_inc(&req->ref); - iov_iter_xarray(&iter, ITER_DEST, &req->mapping->i_pages, - lstart + done, slen); - - ret = fscache_read(cres, sstart, &iter, NETFS_READ_HOLE_FAIL, - erofs_fscache_subreq_complete, req); + iov_iter_truncate(iter, len); + refcount_inc(&io->ref); + ret = fscache_read(cres, pstart, iter, NETFS_READ_HOLE_FAIL, + io->end_io, io); if (ret == -EIOCBQUEUED) ret = 0; if (ret) { - erofs_err(sb, "failed to fscache_read (ret %d)", ret); + erofs_err(NULL, "fscache_read failed (ret %d)", ret); return ret; } + if (WARN_ON(iov_iter_count(iter))) + return -EIO; - done += slen; + iov_iter_reexpand(iter, orig_count - len); + pstart += len; } 
- DBG_BUGON(done != len); return 0; } -static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) +struct erofs_fscache_bio { + struct erofs_fscache_io io; + struct bio bio; /* w/o bdev to share bio_add_page/endio() */ + struct bio_vec bvecs[BIO_MAX_VECS]; +}; + +static void erofs_fscache_bio_endio(void *priv, + ssize_t transferred_or_error, bool was_async) +{ + struct erofs_fscache_bio *io = priv; + + if (IS_ERR_VALUE(transferred_or_error)) + io->bio.bi_status = errno_to_blk_status(transferred_or_error); + io->bio.bi_end_io(&io->bio); + BUILD_BUG_ON(offsetof(struct erofs_fscache_bio, io) != 0); + erofs_fscache_io_put(&io->io); +} + +struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) { + struct erofs_fscache_bio *io; + + io = kmalloc(sizeof(*io), GFP_KERNEL | __GFP_NOFAIL); + bio_init(&io->bio, NULL, io->bvecs, BIO_MAX_VECS, REQ_OP_READ); + io->io.private = mdev->m_dif->fscache->cookie; + io->io.end_io = erofs_fscache_bio_endio; + refcount_set(&io->io.ref, 1); + return &io->bio; +} + +void erofs_fscache_submit_bio(struct bio *bio) +{ + struct erofs_fscache_bio *io = container_of(bio, + struct erofs_fscache_bio, bio); int ret; - struct erofs_fscache *ctx = folio_mapping(folio)->host->i_private; - struct erofs_fscache_request *req; - req = erofs_fscache_req_alloc(folio_mapping(folio), + iov_iter_bvec(&io->io.iter, ITER_DEST, io->bvecs, bio->bi_vcnt, + bio->bi_iter.bi_size); + ret = erofs_fscache_read_io_async(io->io.private, + bio->bi_iter.bi_sector << 9, &io->io); + erofs_fscache_io_put(&io->io); + if (!ret) + return; + bio->bi_status = errno_to_blk_status(ret); + bio->bi_end_io(bio); +} + +static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) +{ + struct erofs_fscache *ctx = folio->mapping->host->i_private; + int ret = -ENOMEM; + struct erofs_fscache_rq *req; + struct erofs_fscache_io *io; + + req = erofs_fscache_req_alloc(folio->mapping, folio_pos(folio), folio_size(folio)); - if (IS_ERR(req)) { + if 
(!req) { folio_unlock(folio); - return PTR_ERR(req); + return ret; } - ret = erofs_fscache_read_folios_async(ctx->cookie, req, - folio_pos(folio), folio_size(folio)); + io = erofs_fscache_req_io_alloc(req); + if (!io) { + req->error = ret; + goto out; + } + iov_iter_xarray(&io->iter, ITER_DEST, &folio->mapping->i_pages, + folio_pos(folio), folio_size(folio)); + + ret = erofs_fscache_read_io_async(ctx->cookie, folio_pos(folio), io); if (ret) req->error = ret; + erofs_fscache_req_io_put(io); +out: erofs_fscache_req_put(req); return ret; } -static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary) +static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req) { - struct address_space *mapping = primary->mapping; + struct address_space *mapping = req->mapping; struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; - struct erofs_fscache_request *req; + struct erofs_fscache_io *io; struct erofs_map_blocks map; struct erofs_map_dev mdev; - struct iov_iter iter; - loff_t pos = primary->start + primary->submitted; + loff_t pos = req->start + req->submitted; size_t count; int ret; @@ -217,35 +272,32 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary) if (map.m_flags & EROFS_MAP_META) { struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - erofs_blk_t blknr; - size_t offset, size; + struct iov_iter iter; + size_t size = map.m_llen; void *src; - /* For tail packing layout, the offset may be non-zero. 
*/ - offset = erofs_blkoff(sb, map.m_pa); - blknr = erofs_blknr(sb, map.m_pa); - size = map.m_llen; - - src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP); + src = erofs_read_metabuf(&buf, sb, map.m_pa, EROFS_KMAP); if (IS_ERR(src)) return PTR_ERR(src); iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, PAGE_SIZE); - if (copy_to_iter(src + offset, size, &iter) != size) { + if (copy_to_iter(src, size, &iter) != size) { erofs_put_metabuf(&buf); return -EFAULT; } iov_iter_zero(PAGE_SIZE - size, &iter); erofs_put_metabuf(&buf); - primary->submitted += PAGE_SIZE; + req->submitted += PAGE_SIZE; return 0; } - count = primary->len - primary->submitted; + count = req->len - req->submitted; if (!(map.m_flags & EROFS_MAP_MAPPED)) { + struct iov_iter iter; + iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, count); iov_iter_zero(count, &iter); - primary->submitted += count; + req->submitted += count; return 0; } @@ -260,18 +312,19 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary) if (ret) return ret; - req = erofs_fscache_req_chain(primary, count); - if (IS_ERR(req)) - return PTR_ERR(req); + io = erofs_fscache_req_io_alloc(req); + if (!io) + return -ENOMEM; + iov_iter_xarray(&io->iter, ITER_DEST, &mapping->i_pages, pos, count); + ret = erofs_fscache_read_io_async(mdev.m_dif->fscache->cookie, + mdev.m_pa + (pos - map.m_la), io); + erofs_fscache_req_io_put(io); - ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - req, mdev.m_pa + (pos - map.m_la), count); - erofs_fscache_req_put(req); - primary->submitted += count; + req->submitted += count; return ret; } -static int erofs_fscache_data_read(struct erofs_fscache_request *req) +static int erofs_fscache_data_read(struct erofs_fscache_rq *req) { int ret; @@ -280,20 +333,19 @@ static int erofs_fscache_data_read(struct erofs_fscache_request *req) if (ret) req->error = ret; } while (!ret && req->submitted < req->len); - return ret; } static int 
erofs_fscache_read_folio(struct file *file, struct folio *folio) { - struct erofs_fscache_request *req; + struct erofs_fscache_rq *req; int ret; - req = erofs_fscache_req_alloc(folio_mapping(folio), + req = erofs_fscache_req_alloc(folio->mapping, folio_pos(folio), folio_size(folio)); - if (IS_ERR(req)) { + if (!req) { folio_unlock(folio); - return PTR_ERR(req); + return -ENOMEM; } ret = erofs_fscache_data_read(req); @@ -303,14 +355,14 @@ static int erofs_fscache_read_folio(struct file *file, struct folio *folio) static void erofs_fscache_readahead(struct readahead_control *rac) { - struct erofs_fscache_request *req; + struct erofs_fscache_rq *req; if (!readahead_count(rac)) return; req = erofs_fscache_req_alloc(rac->mapping, readahead_pos(rac), readahead_length(rac)); - if (IS_ERR(req)) + if (!req) return; /* The request completion will drop refs on the folios. */ @@ -473,7 +525,7 @@ static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &erofs_fscache_meta_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); inode->i_blkbits = EROFS_SB(sb)->blkszbits; inode->i_private = ctx; @@ -605,7 +657,7 @@ int erofs_fscache_register_fs(struct super_block *sb) if (IS_ERR(fscache)) return PTR_ERR(fscache); - sbi->s_fscache = fscache; + sbi->dif0.fscache = fscache; return 0; } @@ -613,14 +665,14 @@ void erofs_fscache_unregister_fs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - erofs_fscache_unregister_cookie(sbi->s_fscache); + erofs_fscache_unregister_cookie(sbi->dif0.fscache); if (sbi->domain) erofs_fscache_domain_put(sbi->domain); else fscache_relinquish_volume(sbi->volume, NULL, false); - sbi->s_fscache = NULL; + sbi->dif0.fscache = NULL; sbi->volume = NULL; sbi->domain = NULL; } diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 9e40bee3682f..db29190656eb 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ 
-5,39 +5,54 @@ * Copyright (C) 2021, Alibaba Cloud */ #include "xattr.h" - #include <trace/events/erofs.h> -static void *erofs_read_inode(struct erofs_buf *buf, - struct inode *inode, unsigned int *ofs) +static int erofs_fill_symlink(struct inode *inode, void *kaddr, + unsigned int m_pofs) +{ + struct erofs_inode *vi = EROFS_I(inode); + loff_t off; + + m_pofs += vi->xattr_isize; + /* check if it cannot be handled with fast symlink scheme */ + if (vi->datalayout != EROFS_INODE_FLAT_INLINE || + check_add_overflow(m_pofs, inode->i_size, &off) || + off > i_blocksize(inode)) + return 0; + + inode->i_link = kmemdup_nul(kaddr + m_pofs, inode->i_size, GFP_KERNEL); + return inode->i_link ? 0 : -ENOMEM; +} + +static int erofs_read_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_inode *vi = EROFS_I(inode); const erofs_off_t inode_loc = erofs_iloc(inode); - erofs_blk_t blkaddr, nblks = 0; void *kaddr; struct erofs_inode_compact *dic; struct erofs_inode_extended *die, *copied = NULL; - unsigned int ifmt; - int err; + union erofs_inode_i_u iu; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + unsigned int ifmt, ofs; + int err = 0; blkaddr = erofs_blknr(sb, inode_loc); - *ofs = erofs_blkoff(sb, inode_loc); + ofs = erofs_blkoff(sb, inode_loc); - kaddr = erofs_read_metabuf(buf, sb, blkaddr, EROFS_KMAP); + kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), EROFS_KMAP); if (IS_ERR(kaddr)) { erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld", vi->nid, PTR_ERR(kaddr)); - return kaddr; + return PTR_ERR(kaddr); } - dic = kaddr + *ofs; + dic = kaddr + ofs; ifmt = le16_to_cpu(dic->i_format); - if (ifmt & ~EROFS_I_ALL) { - erofs_err(inode->i_sb, "unsupported i_format %u of nid %llu", + erofs_err(sb, "unsupported i_format %u of nid %llu", ifmt, vi->nid); err = -EOPNOTSUPP; goto err_out; @@ -45,7 +60,7 @@ static void *erofs_read_inode(struct erofs_buf *buf, vi->datalayout = 
erofs_inode_datalayout(ifmt); if (vi->datalayout >= EROFS_INODE_DATALAYOUT_MAX) { - erofs_err(inode->i_sb, "unsupported datalayout %u of nid %llu", + erofs_err(sb, "unsupported datalayout %u of nid %llu", vi->datalayout, vi->nid); err = -EOPNOTSUPP; goto err_out; @@ -55,119 +70,105 @@ static void *erofs_read_inode(struct erofs_buf *buf, case EROFS_INODE_LAYOUT_EXTENDED: vi->inode_isize = sizeof(struct erofs_inode_extended); /* check if the extended inode acrosses block boundary */ - if (*ofs + vi->inode_isize <= sb->s_blocksize) { - *ofs += vi->inode_isize; + if (ofs + vi->inode_isize <= sb->s_blocksize) { + ofs += vi->inode_isize; die = (struct erofs_inode_extended *)dic; } else { - const unsigned int gotten = sb->s_blocksize - *ofs; + const unsigned int gotten = sb->s_blocksize - ofs; - copied = kmalloc(vi->inode_isize, GFP_NOFS); + copied = kmalloc(vi->inode_isize, GFP_KERNEL); if (!copied) { err = -ENOMEM; goto err_out; } memcpy(copied, dic, gotten); - kaddr = erofs_read_metabuf(buf, sb, blkaddr + 1, + kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr + 1), EROFS_KMAP); if (IS_ERR(kaddr)) { erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld", vi->nid, PTR_ERR(kaddr)); kfree(copied); - return kaddr; + return PTR_ERR(kaddr); } - *ofs = vi->inode_isize - gotten; - memcpy((u8 *)copied + gotten, kaddr, *ofs); + ofs = vi->inode_isize - gotten; + memcpy((u8 *)copied + gotten, kaddr, ofs); die = copied; } vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount); inode->i_mode = le16_to_cpu(die->i_mode); - switch (inode->i_mode & S_IFMT) { - case S_IFREG: - case S_IFDIR: - case S_IFLNK: - vi->raw_blkaddr = le32_to_cpu(die->i_u.raw_blkaddr); - break; - case S_IFCHR: - case S_IFBLK: - inode->i_rdev = - new_decode_dev(le32_to_cpu(die->i_u.rdev)); - break; - case S_IFIFO: - case S_IFSOCK: - inode->i_rdev = 0; - break; - default: - goto bogusimode; - } + iu = die->i_u; i_uid_write(inode, le32_to_cpu(die->i_uid)); i_gid_write(inode, 
le32_to_cpu(die->i_gid)); set_nlink(inode, le32_to_cpu(die->i_nlink)); - - /* extended inode has its own timestamp */ + /* each extended inode has its own timestamp */ inode_set_ctime(inode, le64_to_cpu(die->i_mtime), le32_to_cpu(die->i_mtime_nsec)); inode->i_size = le64_to_cpu(die->i_size); - - /* total blocks for compressed files */ - if (erofs_inode_is_data_compressed(vi->datalayout)) - nblks = le32_to_cpu(die->i_u.compressed_blocks); - else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) - /* fill chunked inode summary info */ - vi->chunkformat = le16_to_cpu(die->i_u.c.format); kfree(copied); - copied = NULL; break; case EROFS_INODE_LAYOUT_COMPACT: vi->inode_isize = sizeof(struct erofs_inode_compact); - *ofs += vi->inode_isize; + ofs += vi->inode_isize; vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount); inode->i_mode = le16_to_cpu(dic->i_mode); - switch (inode->i_mode & S_IFMT) { - case S_IFREG: - case S_IFDIR: - case S_IFLNK: - vi->raw_blkaddr = le32_to_cpu(dic->i_u.raw_blkaddr); - break; - case S_IFCHR: - case S_IFBLK: - inode->i_rdev = - new_decode_dev(le32_to_cpu(dic->i_u.rdev)); - break; - case S_IFIFO: - case S_IFSOCK: - inode->i_rdev = 0; - break; - default: - goto bogusimode; - } + iu = dic->i_u; i_uid_write(inode, le16_to_cpu(dic->i_uid)); i_gid_write(inode, le16_to_cpu(dic->i_gid)); set_nlink(inode, le16_to_cpu(dic->i_nlink)); - /* use build time for compact inodes */ inode_set_ctime(inode, sbi->build_time, sbi->build_time_nsec); inode->i_size = le32_to_cpu(dic->i_size); - if (erofs_inode_is_data_compressed(vi->datalayout)) - nblks = le32_to_cpu(dic->i_u.compressed_blocks); - else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) - vi->chunkformat = le16_to_cpu(dic->i_u.c.format); break; default: - erofs_err(inode->i_sb, - "unsupported on-disk inode version %u of nid %llu", + erofs_err(sb, "unsupported on-disk inode version %u of nid %llu", erofs_inode_version(ifmt), vi->nid); err = -EOPNOTSUPP; goto err_out; } - if (vi->datalayout == 
EROFS_INODE_CHUNK_BASED) { + if (unlikely(inode->i_size < 0)) { + erofs_err(sb, "negative i_size @ nid %llu", vi->nid); + err = -EFSCORRUPTED; + goto err_out; + } + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + vi->raw_blkaddr = le32_to_cpu(iu.raw_blkaddr); + if(S_ISLNK(inode->i_mode)) { + err = erofs_fill_symlink(inode, kaddr, ofs); + if (err) + goto err_out; + } + break; + case S_IFCHR: + case S_IFBLK: + inode->i_rdev = new_decode_dev(le32_to_cpu(iu.rdev)); + break; + case S_IFIFO: + case S_IFSOCK: + inode->i_rdev = 0; + break; + default: + erofs_err(sb, "bogus i_mode (%o) @ nid %llu", inode->i_mode, + vi->nid); + err = -EFSCORRUPTED; + goto err_out; + } + + /* total blocks for compressed files */ + if (erofs_inode_is_data_compressed(vi->datalayout)) { + nblks = le32_to_cpu(iu.compressed_blocks); + } else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) { + /* fill chunked inode summary info */ + vi->chunkformat = le16_to_cpu(iu.c.format); if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL) { - erofs_err(inode->i_sb, - "unsupported chunk format %x of nid %llu", + erofs_err(sb, "unsupported chunk format %x of nid %llu", vi->chunkformat, vi->nid); err = -EOPNOTSUPP; goto err_out; @@ -175,7 +176,8 @@ static void *erofs_read_inode(struct erofs_buf *buf, vi->chunkbits = sb->s_blocksize_bits + (vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK); } - inode->i_mtime = inode->i_atime = inode_get_ctime(inode); + inode_set_mtime_to_ts(inode, + inode_set_atime_to_ts(inode, inode_get_ctime(inode))); inode->i_flags &= ~S_DAX; if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) && @@ -188,61 +190,23 @@ static void *erofs_read_inode(struct erofs_buf *buf, inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9; else inode->i_blocks = nblks << (sb->s_blocksize_bits - 9); - return kaddr; - -bogusimode: - erofs_err(inode->i_sb, "bogus i_mode (%o) @ nid %llu", - inode->i_mode, vi->nid); - err = -EFSCORRUPTED; err_out: - DBG_BUGON(1); 
- kfree(copied); - erofs_put_metabuf(buf); - return ERR_PTR(err); -} - -static int erofs_fill_symlink(struct inode *inode, void *kaddr, - unsigned int m_pofs) -{ - struct erofs_inode *vi = EROFS_I(inode); - loff_t off; - char *lnk; - - m_pofs += vi->xattr_isize; - /* check if it cannot be handled with fast symlink scheme */ - if (vi->datalayout != EROFS_INODE_FLAT_INLINE || inode->i_size < 0 || - check_add_overflow(m_pofs, inode->i_size, &off) || - off > i_blocksize(inode)) { - inode->i_op = &erofs_symlink_iops; - return 0; - } - - lnk = kmalloc(inode->i_size + 1, GFP_KERNEL); - if (!lnk) - return -ENOMEM; - - memcpy(lnk, kaddr + m_pofs, inode->i_size); - lnk[inode->i_size] = '\0'; - - inode->i_link = lnk; - inode->i_op = &erofs_fast_symlink_iops; - return 0; + DBG_BUGON(err); + erofs_put_metabuf(&buf); + return err; } static int erofs_fill_inode(struct inode *inode) { struct erofs_inode *vi = EROFS_I(inode); - struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - void *kaddr; - unsigned int ofs; - int err = 0; + int err; trace_erofs_fill_inode(inode); /* read inode base data from disk */ - kaddr = erofs_read_inode(&buf, inode, &ofs); - if (IS_ERR(kaddr)) - return PTR_ERR(kaddr); + err = erofs_read_inode(inode); + if (err) + return err; /* setup the new inode */ switch (inode->i_mode & S_IFMT) { @@ -259,9 +223,10 @@ static int erofs_fill_inode(struct inode *inode) inode_nohighmem(inode); break; case S_IFLNK: - err = erofs_fill_symlink(inode, kaddr, ofs); - if (err) - goto out_unlock; + if (inode->i_link) + inode->i_op = &erofs_fast_symlink_iops; + else + inode->i_op = &erofs_symlink_iops; inode_nohighmem(inode); break; case S_IFCHR: @@ -270,33 +235,33 @@ static int erofs_fill_inode(struct inode *inode) case S_IFSOCK: inode->i_op = &erofs_generic_iops; init_special_inode(inode, inode->i_mode, inode->i_rdev); - goto out_unlock; + return 0; default: - err = -EFSCORRUPTED; - goto out_unlock; + return -EFSCORRUPTED; } + mapping_set_large_folios(inode->i_mapping); if 
(erofs_inode_is_data_compressed(vi->datalayout)) { #ifdef CONFIG_EROFS_FS_ZIP - if (!erofs_is_fscache_mode(inode->i_sb) && - inode->i_sb->s_blocksize_bits == PAGE_SHIFT) { - inode->i_mapping->a_ops = &z_erofs_aops; - err = 0; - goto out_unlock; - } -#endif + DO_ONCE_LITE_IF(inode->i_blkbits != PAGE_SHIFT, + erofs_info, inode->i_sb, + "EXPERIMENTAL EROFS subpage compressed block support in use. Use at your own risk!"); + inode->i_mapping->a_ops = &z_erofs_aops; +#else err = -EOPNOTSUPP; - goto out_unlock; - } - inode->i_mapping->a_ops = &erofs_raw_access_aops; - mapping_set_large_folios(inode->i_mapping); +#endif + } else { + inode->i_mapping->a_ops = &erofs_aops; #ifdef CONFIG_EROFS_FS_ONDEMAND - if (erofs_is_fscache_mode(inode->i_sb)) - inode->i_mapping->a_ops = &erofs_fscache_access_aops; + if (erofs_is_fscache_mode(inode->i_sb)) + inode->i_mapping->a_ops = &erofs_fscache_access_aops; +#endif +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (erofs_is_fileio_mode(EROFS_SB(inode->i_sb))) + inode->i_mapping->a_ops = &erofs_fileio_aops; #endif + } -out_unlock: - erofs_put_metabuf(&buf); return err; } @@ -353,14 +318,29 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, unsigned int query_flags) { struct inode *const inode = d_inode(path->dentry); + bool compressed = + erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout); - if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) + if (compressed) stat->attributes |= STATX_ATTR_COMPRESSED; - stat->attributes |= STATX_ATTR_IMMUTABLE; stat->attributes_mask |= (STATX_ATTR_COMPRESSED | STATX_ATTR_IMMUTABLE); + /* + * Return the DIO alignment restrictions if requested. + * + * In EROFS, STATX_DIOALIGN is not supported in ondemand mode and + * compressed files, so in these cases we report no DIO support. 
+ */ + if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) { + stat->result_mask |= STATX_DIOALIGN; + if (!erofs_is_fscache_mode(inode->i_sb) && !compressed) { + stat->dio_mem_align = + bdev_logical_block_size(inode->i_sb->s_bdev); + stat->dio_offset_align = stat->dio_mem_align; + } + } generic_fillattr(idmap, request_mask, inode, stat); return 0; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 787cc9ff9029..856463a702b2 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -8,8 +8,10 @@ #define __EROFS_INTERNAL_H #include <linux/fs.h> +#include <linux/dax.h> #include <linux/dcache.h> #include <linux/mm.h> +#include <linux/module.h> #include <linux/pagemap.h> #include <linux/bio.h> #include <linux/magic.h> @@ -47,7 +49,7 @@ typedef u32 erofs_blk_t; struct erofs_device_info { char *path; struct erofs_fscache *fscache; - struct bdev_handle *bdev_handle; + struct file *file; struct dax_device *dax_dev; u64 dax_part_off; @@ -62,15 +64,12 @@ enum { }; struct erofs_mount_opts { -#ifdef CONFIG_EROFS_FS_ZIP /* current strategy of how to use managed cache */ unsigned char cache_strategy; /* strategy of sync decompression (0 - auto, 1 - force on, 2 - force off) */ unsigned int sync_decompress; - /* threshold for decompression synchronously */ unsigned int max_sync_decompress_pages; -#endif unsigned int mount_opt; }; @@ -114,6 +113,7 @@ struct erofs_xattr_prefix_item { }; struct erofs_sb_info { + struct erofs_device_info dif0; struct erofs_mount_opts opt; /* options */ #ifdef CONFIG_EROFS_FS_ZIP /* list for all registered superblocks, mainly for shrinker */ @@ -133,10 +133,7 @@ struct erofs_sb_info { #endif /* CONFIG_EROFS_FS_ZIP */ struct inode *packed_inode; struct erofs_dev_context *devs; - struct dax_device *dax_dev; - u64 dax_part_off; u64 total_blocks; - u32 primarydevice_blocks; u32 meta_blkaddr; #ifdef CONFIG_EROFS_FS_XATTR @@ -172,7 +169,6 @@ struct erofs_sb_info { /* fscache support */ struct fscache_volume *volume; - struct 
erofs_fscache *s_fscache; struct erofs_domain *domain; char *fsid; char *domain_id; @@ -186,14 +182,21 @@ struct erofs_sb_info { #define EROFS_MOUNT_POSIX_ACL 0x00000020 #define EROFS_MOUNT_DAX_ALWAYS 0x00000040 #define EROFS_MOUNT_DAX_NEVER 0x00000080 +#define EROFS_MOUNT_DIRECT_IO 0x00000100 #define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option) #define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option) #define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option) +static inline bool erofs_is_fileio_mode(struct erofs_sb_info *sbi) +{ + return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->dif0.file; +} + static inline bool erofs_is_fscache_mode(struct super_block *sb) { - return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && !sb->s_bdev; + return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && + !erofs_is_fileio_mode(EROFS_SB(sb)) && !sb->s_bdev; } enum { @@ -202,28 +205,21 @@ enum { EROFS_ZIP_CACHE_READAROUND }; -/* basic unit of the workstation of a super_block */ -struct erofs_workgroup { - pgoff_t index; - struct lockref lockref; -}; - enum erofs_kmap_type { EROFS_NO_KMAP, /* don't map the buffer */ EROFS_KMAP, /* use kmap_local_page() to map the buffer */ }; struct erofs_buf { - struct inode *inode; + struct address_space *mapping; + struct file *file; struct page *page; void *base; enum erofs_kmap_type kmap_type; }; #define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL }) -#define ROOT_NID(sb) ((sb)->root_nid) - -#define erofs_blknr(sb, addr) ((addr) >> (sb)->s_blocksize_bits) +#define erofs_blknr(sb, addr) ((erofs_blk_t)((addr) >> (sb)->s_blocksize_bits)) #define erofs_blkoff(sb, addr) ((addr) & ((sb)->s_blocksize - 1)) #define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits) #define erofs_iblks(i) (round_up((i)->i_size, i_blocksize(i)) >> (i)->i_blkbits) @@ -281,13 +277,8 @@ struct erofs_inode { unsigned char z_algorithmtype[2]; unsigned char z_logical_clusterbits; unsigned long 
z_tailextent_headlcn; - union { - struct { - erofs_off_t z_idataoff; - unsigned short z_idata_size; - }; - erofs_off_t z_fragmentoff; - }; + erofs_off_t z_fragmentoff; + unsigned short z_idata_size; }; #endif /* CONFIG_EROFS_FS_ZIP */ }; @@ -315,17 +306,13 @@ static inline unsigned int erofs_inode_datalayout(unsigned int ifmt) return (ifmt >> EROFS_I_DATALAYOUT_BIT) & EROFS_I_DATALAYOUT_MASK; } -/* - * Different from grab_cache_page_nowait(), reclaiming is never triggered - * when allocating new pages. - */ -static inline -struct page *erofs_grab_cache_page_nowait(struct address_space *mapping, - pgoff_t index) +/* reclaiming is never triggered when allocating new folios. */ +static inline struct folio *erofs_grab_folio_nowait(struct address_space *as, + pgoff_t index) { - return pagecache_get_page(mapping, index, + return __filemap_get_folio(as, index, FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, - readahead_gfp_mask(mapping) & ~__GFP_RECLAIM); + readahead_gfp_mask(as) & ~__GFP_RECLAIM); } /* Has a disk mapping */ @@ -337,10 +324,12 @@ struct page *erofs_grab_cache_page_nowait(struct address_space *mapping, /* The length of extent is full */ #define EROFS_MAP_FULL_MAPPED 0x0008 /* Located in the special packed inode */ -#define EROFS_MAP_FRAGMENT 0x0010 +#define __EROFS_MAP_FRAGMENT 0x0010 /* The extent refers to partial decompressed data */ #define EROFS_MAP_PARTIAL_REF 0x0020 +#define EROFS_MAP_FRAGMENT (EROFS_MAP_MAPPED | __EROFS_MAP_FRAGMENT) + struct erofs_map_blocks { struct erofs_buf buf; @@ -369,10 +358,9 @@ enum { }; struct erofs_map_dev { - struct erofs_fscache *m_fscache; + struct super_block *m_sb; + struct erofs_device_info *m_dif; struct block_device *m_bdev; - struct dax_device *m_daxdev; - u64 m_dax_part_off; erofs_off_t m_pa; unsigned int m_deviceid; @@ -380,7 +368,8 @@ struct erofs_map_dev { extern const struct super_operations erofs_sops; -extern const struct address_space_operations erofs_raw_access_aops; +extern const struct 
address_space_operations erofs_aops; +extern const struct address_space_operations erofs_fileio_aops; extern const struct address_space_operations z_erofs_aops; extern const struct address_space_operations erofs_fscache_access_aops; @@ -402,15 +391,18 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, erofs_off_t *offset, int *lengthp); void erofs_unmap_metabuf(struct erofs_buf *buf); void erofs_put_metabuf(struct erofs_buf *buf); -void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr, +void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, enum erofs_kmap_type type); void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb); void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, - erofs_blk_t blkaddr, enum erofs_kmap_type type); + erofs_off_t offset, enum erofs_kmap_type type); int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map); +void erofs_onlinefolio_init(struct folio *folio); +void erofs_onlinefolio_split(struct folio *folio); +void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty); struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid); int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, @@ -438,7 +430,11 @@ void erofs_unregister_sysfs(struct super_block *sb); int __init erofs_init_sysfs(void); void erofs_exit_sysfs(void); -struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp); +struct page *__erofs_allocpage(struct page **pagepool, gfp_t gfp, bool tryrsv); +static inline struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp) +{ + return __erofs_allocpage(pagepool, gfp, false); +} static inline void erofs_pagepool_add(struct page **pagepool, struct page *page) { set_page_private(page, (unsigned long)*pagepool); @@ 
-447,56 +443,41 @@ static inline void erofs_pagepool_add(struct page **pagepool, struct page *page) void erofs_release_pages(struct page **pagepool); #ifdef CONFIG_EROFS_FS_ZIP -void erofs_workgroup_put(struct erofs_workgroup *grp); -struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, - pgoff_t index); -struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, - struct erofs_workgroup *grp); -void erofs_workgroup_free_rcu(struct erofs_workgroup *grp); +extern atomic_long_t erofs_global_shrink_cnt; void erofs_shrinker_register(struct super_block *sb); void erofs_shrinker_unregister(struct super_block *sb); int __init erofs_init_shrinker(void); void erofs_exit_shrinker(void); -int __init z_erofs_init_zip_subsystem(void); -void z_erofs_exit_zip_subsystem(void); -int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, - struct erofs_workgroup *egrp); +int __init z_erofs_init_subsystem(void); +void z_erofs_exit_subsystem(void); +int z_erofs_init_super(struct super_block *sb); +unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, + unsigned long nr_shrink); int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags); -void *erofs_get_pcpubuf(unsigned int requiredpages); -void erofs_put_pcpubuf(void *ptr); -int erofs_pcpubuf_growsize(unsigned int nrpages); -void __init erofs_pcpubuf_init(void); -void erofs_pcpubuf_exit(void); -int erofs_init_managed_cache(struct super_block *sb); +void *z_erofs_get_gbuf(unsigned int requiredpages); +void z_erofs_put_gbuf(void *ptr); +int z_erofs_gbuf_growsize(unsigned int nrpages); +int __init z_erofs_gbuf_init(void); +void z_erofs_gbuf_exit(void); int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb); #else static inline void erofs_shrinker_register(struct super_block *sb) {} static inline void erofs_shrinker_unregister(struct super_block *sb) {} static inline int erofs_init_shrinker(void) { return 0; } static inline void 
erofs_exit_shrinker(void) {} -static inline int z_erofs_init_zip_subsystem(void) { return 0; } -static inline void z_erofs_exit_zip_subsystem(void) {} -static inline void erofs_pcpubuf_init(void) {} -static inline void erofs_pcpubuf_exit(void) {} -static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; } +static inline int z_erofs_init_subsystem(void) { return 0; } +static inline void z_erofs_exit_subsystem(void) {} +static inline int z_erofs_init_super(struct super_block *sb) { return 0; } #endif /* !CONFIG_EROFS_FS_ZIP */ -#ifdef CONFIG_EROFS_FS_ZIP_LZMA -int __init z_erofs_lzma_init(void); -void z_erofs_lzma_exit(void); +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE +struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev); +void erofs_fileio_submit_bio(struct bio *bio); #else -static inline int z_erofs_lzma_init(void) { return 0; } -static inline int z_erofs_lzma_exit(void) { return 0; } -#endif /* !CONFIG_EROFS_FS_ZIP_LZMA */ - -#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE -int __init z_erofs_deflate_init(void); -void z_erofs_deflate_exit(void); -#else -static inline int z_erofs_deflate_init(void) { return 0; } -static inline int z_erofs_deflate_exit(void) { return 0; } -#endif /* !CONFIG_EROFS_FS_ZIP_DEFLATE */ +static inline struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev) { return NULL; } +static inline void erofs_fileio_submit_bio(struct bio *bio) {} +#endif #ifdef CONFIG_EROFS_FS_ONDEMAND int erofs_fscache_register_fs(struct super_block *sb); @@ -505,6 +486,8 @@ void erofs_fscache_unregister_fs(struct super_block *sb); struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, char *name, unsigned int flags); void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache); +struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev); +void erofs_fscache_submit_bio(struct bio *bio); #else static inline int erofs_fscache_register_fs(struct super_block *sb) { @@ -522,6 +505,8 @@ struct erofs_fscache 
*erofs_fscache_register_cookie(struct super_block *sb, static inline void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache) { } +static inline struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) { return NULL; } +static inline void erofs_fscache_submit_bio(struct bio *bio) {} #endif #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index f0110a78acb2..c94d0c1608a8 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -99,8 +99,8 @@ static void *erofs_find_target_block(struct erofs_buf *target, struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_dirent *de; - buf.inode = dir; - de = erofs_bread(&buf, mid, EROFS_KMAP); + buf.mapping = dir->i_mapping; + de = erofs_bread(&buf, erofs_pos(dir->i_sb, mid), EROFS_KMAP); if (!IS_ERR(de)) { const int nameoff = nameoff_from_disk(de->nameoff, bsz); const int ndirents = nameoff / sizeof(*de); @@ -171,7 +171,7 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, qn.name = name->name; qn.end = name->name + name->len; - buf.inode = dir; + buf.mapping = dir->i_mapping; ndirents = 0; de = erofs_find_target_block(&buf, dir, &qn, &ndirents); diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c deleted file mode 100644 index c7a4b1d77069..000000000000 --- a/fs/erofs/pcpubuf.c +++ /dev/null @@ -1,148 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) Gao Xiang <xiang@kernel.org> - * - * For low-latency decompression algorithms (e.g. lz4), reserve consecutive - * per-CPU virtual memory (in pages) in advance to store such inplace I/O - * data if inplace decompression is failed (due to unmet inplace margin for - * example). 
- */ -#include "internal.h" - -struct erofs_pcpubuf { - raw_spinlock_t lock; - void *ptr; - struct page **pages; - unsigned int nrpages; -}; - -static DEFINE_PER_CPU(struct erofs_pcpubuf, erofs_pcb); - -void *erofs_get_pcpubuf(unsigned int requiredpages) - __acquires(pcb->lock) -{ - struct erofs_pcpubuf *pcb = &get_cpu_var(erofs_pcb); - - raw_spin_lock(&pcb->lock); - /* check if the per-CPU buffer is too small */ - if (requiredpages > pcb->nrpages) { - raw_spin_unlock(&pcb->lock); - put_cpu_var(erofs_pcb); - /* (for sparse checker) pretend pcb->lock is still taken */ - __acquire(pcb->lock); - return NULL; - } - return pcb->ptr; -} - -void erofs_put_pcpubuf(void *ptr) __releases(pcb->lock) -{ - struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, smp_processor_id()); - - DBG_BUGON(pcb->ptr != ptr); - raw_spin_unlock(&pcb->lock); - put_cpu_var(erofs_pcb); -} - -/* the next step: support per-CPU page buffers hotplug */ -int erofs_pcpubuf_growsize(unsigned int nrpages) -{ - static DEFINE_MUTEX(pcb_resize_mutex); - static unsigned int pcb_nrpages; - struct page *pagepool = NULL; - int delta, cpu, ret, i; - - mutex_lock(&pcb_resize_mutex); - delta = nrpages - pcb_nrpages; - ret = 0; - /* avoid shrinking pcpubuf, since no idea how many fses rely on */ - if (delta <= 0) - goto out; - - for_each_possible_cpu(cpu) { - struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu); - struct page **pages, **oldpages; - void *ptr, *old_ptr; - - pages = kmalloc_array(nrpages, sizeof(*pages), GFP_KERNEL); - if (!pages) { - ret = -ENOMEM; - break; - } - - for (i = 0; i < nrpages; ++i) { - pages[i] = erofs_allocpage(&pagepool, GFP_KERNEL); - if (!pages[i]) { - ret = -ENOMEM; - oldpages = pages; - goto free_pagearray; - } - } - ptr = vmap(pages, nrpages, VM_MAP, PAGE_KERNEL); - if (!ptr) { - ret = -ENOMEM; - oldpages = pages; - goto free_pagearray; - } - raw_spin_lock(&pcb->lock); - old_ptr = pcb->ptr; - pcb->ptr = ptr; - oldpages = pcb->pages; - pcb->pages = pages; - i = pcb->nrpages; - 
pcb->nrpages = nrpages; - raw_spin_unlock(&pcb->lock); - - if (!oldpages) { - DBG_BUGON(old_ptr); - continue; - } - - if (old_ptr) - vunmap(old_ptr); -free_pagearray: - while (i) - erofs_pagepool_add(&pagepool, oldpages[--i]); - kfree(oldpages); - if (ret) - break; - } - pcb_nrpages = nrpages; - erofs_release_pages(&pagepool); -out: - mutex_unlock(&pcb_resize_mutex); - return ret; -} - -void __init erofs_pcpubuf_init(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu); - - raw_spin_lock_init(&pcb->lock); - } -} - -void erofs_pcpubuf_exit(void) -{ - int cpu, i; - - for_each_possible_cpu(cpu) { - struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu); - - if (pcb->ptr) { - vunmap(pcb->ptr); - pcb->ptr = NULL; - } - if (!pcb->pages) - continue; - - for (i = 0; i < pcb->nrpages; ++i) - if (pcb->pages[i]) - put_page(pcb->pages[i]); - kfree(pcb->pages); - pcb->pages = NULL; - } -} diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 113414e6f35b..5fcdab614517 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -4,15 +4,13 @@ * https://www.huawei.com/ * Copyright (C) 2021, Alibaba Cloud */ -#include <linux/module.h> #include <linux/statfs.h> -#include <linux/parser.h> #include <linux/seq_file.h> #include <linux/crc32c.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> -#include <linux/dax.h> #include <linux/exportfs.h> +#include <linux/backing-dev.h> #include "xattr.h" #define CREATE_TRACE_POINTS @@ -30,7 +28,10 @@ void _erofs_err(struct super_block *sb, const char *func, const char *fmt, ...) vaf.fmt = fmt; vaf.va = &args; - pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf); + if (sb) + pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf); + else + pr_err("%s: %pV", func, &vaf); va_end(args); } @@ -44,7 +45,10 @@ void _erofs_info(struct super_block *sb, const char *func, const char *fmt, ...) 
vaf.fmt = fmt; vaf.va = &args; - pr_info("(device %s): %pV", sb->s_id, &vaf); + if (sb) + pr_info("(device %s): %pV", sb->s_id, &vaf); + else + pr_info("%pV", &vaf); va_end(args); } @@ -105,22 +109,6 @@ static void erofs_free_inode(struct inode *inode) kmem_cache_free(erofs_inode_cachep, vi); } -static bool check_layout_compatibility(struct super_block *sb, - struct erofs_super_block *dsb) -{ - const unsigned int feature = le32_to_cpu(dsb->feature_incompat); - - EROFS_SB(sb)->feature_incompat = feature; - - /* check if current kernel meets all mandatory requirements */ - if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) { - erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel", - feature & ~EROFS_ALL_FEATURE_INCOMPAT); - return false; - } - return true; -} - /* read variable-sized metadata, offset will be aligned by 4-byte */ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, erofs_off_t *offset, int *lengthp) @@ -129,11 +117,11 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, int len, i, cnt; *offset = round_up(*offset, 4); - ptr = erofs_bread(buf, erofs_blknr(sb, *offset), EROFS_KMAP); + ptr = erofs_bread(buf, *offset, EROFS_KMAP); if (IS_ERR(ptr)) return ptr; - len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(sb, *offset)]); + len = le16_to_cpu(*(__le16 *)ptr); if (!len) len = U16_MAX + 1; buffer = kmalloc(len, GFP_KERNEL); @@ -145,12 +133,12 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, for (i = 0; i < len; i += cnt) { cnt = min_t(int, sb->s_blocksize - erofs_blkoff(sb, *offset), len - i); - ptr = erofs_bread(buf, erofs_blknr(sb, *offset), EROFS_KMAP); + ptr = erofs_bread(buf, *offset, EROFS_KMAP); if (IS_ERR(ptr)) { kfree(buffer); return ptr; } - memcpy(buffer + i, ptr + erofs_blkoff(sb, *offset), cnt); + memcpy(buffer + i, ptr, cnt); *offset += cnt; } return buffer; @@ -174,13 +162,11 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, 
struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_fscache *fscache; struct erofs_deviceslot *dis; - struct bdev_handle *bdev_handle; - void *ptr; + struct file *file; - ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *pos), EROFS_KMAP); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - dis = ptr + erofs_blkoff(sb, *pos); + dis = erofs_read_metabuf(buf, sb, *pos, EROFS_KMAP); + if (IS_ERR(dis)) + return PTR_ERR(dis); if (!sbi->devs->flatdev && !dif->path) { if (!dis->tag[0]) { @@ -198,13 +184,24 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, return PTR_ERR(fscache); dif->fscache = fscache; } else if (!sbi->devs->flatdev) { - bdev_handle = bdev_open_by_path(dif->path, BLK_OPEN_READ, - sb->s_type, NULL); - if (IS_ERR(bdev_handle)) - return PTR_ERR(bdev_handle); - dif->bdev_handle = bdev_handle; - dif->dax_dev = fs_dax_get_by_bdev(bdev_handle->bdev, - &dif->dax_part_off, NULL, NULL); + file = erofs_is_fileio_mode(sbi) ? + filp_open(dif->path, O_RDONLY | O_LARGEFILE, 0) : + bdev_file_open_by_path(dif->path, + BLK_OPEN_READ, sb->s_type, NULL); + if (IS_ERR(file)) { + if (file == ERR_PTR(-ENOTBLK)) + return -EINVAL; + return PTR_ERR(file); + } + + if (!erofs_is_fileio_mode(sbi)) { + dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file), + &dif->dax_part_off, NULL, NULL); + } else if (!S_ISREG(file_inode(file)->i_mode)) { + fput(file); + return -EINVAL; + } + dif->file = file; } dif->blocks = le32_to_cpu(dis->blocks); @@ -224,7 +221,7 @@ static int erofs_scan_devices(struct super_block *sb, struct erofs_device_info *dif; int id, err = 0; - sbi->total_blocks = sbi->primarydevice_blocks; + sbi->total_blocks = sbi->dif0.blocks; if (!erofs_sb_has_device_table(sbi)) ondisk_extradevs = 0; else @@ -278,7 +275,7 @@ static int erofs_scan_devices(struct super_block *sb, static int erofs_read_superblock(struct super_block *sb) { - struct erofs_sb_info *sbi; + struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct 
erofs_super_block *dsb; void *data; @@ -290,9 +287,7 @@ static int erofs_read_superblock(struct super_block *sb) return PTR_ERR(data); } - sbi = EROFS_SB(sb); dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET); - ret = -EINVAL; if (le32_to_cpu(dsb->magic) != EROFS_SUPER_MAGIC_V1) { erofs_err(sb, "cannot find valid erofs superblock"); @@ -317,8 +312,12 @@ static int erofs_read_superblock(struct super_block *sb) } ret = -EINVAL; - if (!check_layout_compatibility(sb, dsb)) + sbi->feature_incompat = le32_to_cpu(dsb->feature_incompat); + if (sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT) { + erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel", + sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT); goto out; + } sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE; if (sbi->sb_size > PAGE_SIZE - EROFS_SUPER_OFFSET) { @@ -326,7 +325,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->sb_size); goto out; } - sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks); + sbi->dif0.blocks = le32_to_cpu(dsb->blocks); sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); @@ -342,7 +341,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->build_time = le64_to_cpu(dsb->build_time); sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec); - memcpy(&sb->s_uuid, dsb->uuid, sizeof(dsb->uuid)); + super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid)); ret = strscpy(sbi->volume_name, dsb->volume_name, sizeof(dsb->volume_name)); @@ -361,7 +360,7 @@ static int erofs_read_superblock(struct super_block *sb) ret = erofs_scan_devices(sb, dsb); if (erofs_is_fscache_mode(sb)) - erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!"); + erofs_info(sb, "[deprecated] fscache-based on-demand read feature in use. 
Use at your own risk!"); out: erofs_put_metabuf(&buf); return ret; @@ -383,14 +382,8 @@ static void erofs_default_options(struct erofs_sb_info *sbi) } enum { - Opt_user_xattr, - Opt_acl, - Opt_cache_strategy, - Opt_dax, - Opt_dax_enum, - Opt_device, - Opt_fsid, - Opt_domain_id, + Opt_user_xattr, Opt_acl, Opt_cache_strategy, Opt_dax, Opt_dax_enum, + Opt_device, Opt_fsid, Opt_domain_id, Opt_directio, Opt_err }; @@ -417,6 +410,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = { fsparam_string("device", Opt_device), fsparam_string("fsid", Opt_fsid), fsparam_string("domain_id", Opt_domain_id), + fsparam_flag_no("directio", Opt_directio), {} }; @@ -427,7 +421,6 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) switch (mode) { case EROFS_MOUNT_DAX_ALWAYS: - warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); set_opt(&sbi->opt, DAX_ALWAYS); clear_opt(&sbi->opt, DAX_NEVER); return true; @@ -531,30 +524,68 @@ static int erofs_fc_parse_param(struct fs_context *fc, errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); break; #endif + case Opt_directio: +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (result.boolean) + set_opt(&sbi->opt, DIRECT_IO); + else + clear_opt(&sbi->opt, DIRECT_IO); +#else + errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); +#endif + break; default: return -ENOPARAM; } return 0; } -static struct inode *erofs_nfs_get_inode(struct super_block *sb, - u64 ino, u32 generation) +static int erofs_encode_fh(struct inode *inode, u32 *fh, int *max_len, + struct inode *parent) { - return erofs_iget(sb, ino); + erofs_nid_t nid = EROFS_I(inode)->nid; + int len = parent ? 
6 : 3; + + if (*max_len < len) { + *max_len = len; + return FILEID_INVALID; + } + + fh[0] = (u32)(nid >> 32); + fh[1] = (u32)(nid & 0xffffffff); + fh[2] = inode->i_generation; + + if (parent) { + nid = EROFS_I(parent)->nid; + + fh[3] = (u32)(nid >> 32); + fh[4] = (u32)(nid & 0xffffffff); + fh[5] = parent->i_generation; + } + + *max_len = len; + return parent ? FILEID_INO64_GEN_PARENT : FILEID_INO64_GEN; } static struct dentry *erofs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { - return generic_fh_to_dentry(sb, fid, fh_len, fh_type, - erofs_nfs_get_inode); + if ((fh_type != FILEID_INO64_GEN && + fh_type != FILEID_INO64_GEN_PARENT) || fh_len < 3) + return NULL; + + return d_obtain_alias(erofs_iget(sb, + ((u64)fid->raw[0] << 32) | fid->raw[1])); } static struct dentry *erofs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { - return generic_fh_to_parent(sb, fid, fh_len, fh_type, - erofs_nfs_get_inode); + if (fh_type != FILEID_INO64_GEN_PARENT || fh_len < 6) + return NULL; + + return d_obtain_alias(erofs_iget(sb, + ((u64)fid->raw[3] << 32) | fid->raw[4])); } static struct dentry *erofs_get_parent(struct dentry *child) @@ -570,11 +601,28 @@ static struct dentry *erofs_get_parent(struct dentry *child) } static const struct export_operations erofs_export_ops = { + .encode_fh = erofs_encode_fh, .fh_to_dentry = erofs_fh_to_dentry, .fh_to_parent = erofs_fh_to_parent, .get_parent = erofs_get_parent, }; +static void erofs_set_sysfs_name(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + + if (sbi->domain_id) + super_set_sysfs_name_generic(sb, "%s,%s", sbi->domain_id, + sbi->fsid); + else if (sbi->fsid) + super_set_sysfs_name_generic(sb, "%s", sbi->fsid); + else if (erofs_is_fileio_mode(sbi)) + super_set_sysfs_name_generic(sb, "%s", + bdi_dev_name(sb->s_bdi)); + else + super_set_sysfs_name_id(sb); +} + static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) { struct 
inode *inode; @@ -587,14 +635,15 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_op = &erofs_sops; sbi->blkszbits = PAGE_SHIFT; - if (erofs_is_fscache_mode(sb)) { + if (!sb->s_bdev) { sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; - err = erofs_fscache_register_fs(sb); - if (err) - return err; - + if (erofs_is_fscache_mode(sb)) { + err = erofs_fscache_register_fs(sb); + if (err) + return err; + } err = super_setup_bdi(sb); if (err) return err; @@ -604,9 +653,8 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return -EINVAL; } - sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, - &sbi->dax_part_off, - NULL, NULL); + sbi->dif0.dax_dev = fs_dax_get_by_bdev(sb->s_bdev, + &sbi->dif0.dax_part_off, NULL, NULL); } err = erofs_read_superblock(sb); @@ -618,14 +666,18 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) errorfc(fc, "unsupported blksize for fscache mode"); return -EINVAL; } - if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) { + + if (erofs_is_fileio_mode(sbi)) { + sb->s_blocksize = 1 << sbi->blkszbits; + sb->s_blocksize_bits = sbi->blkszbits; + } else if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) { errorfc(fc, "failed to set erofs blksize"); return -EINVAL; } } if (test_opt(&sbi->opt, DAX_ALWAYS)) { - if (!sbi->dax_dev) { + if (!sbi->dif0.dax_dev) { errorfc(fc, "DAX unsupported by block device. 
Turning off DAX."); clear_opt(&sbi->opt, DAX_ALWAYS); } else if (sbi->blkszbits != PAGE_SHIFT) { @@ -643,58 +695,73 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) else sb->s_flags &= ~SB_POSIXACL; -#ifdef CONFIG_EROFS_FS_ZIP - xa_init(&sbi->managed_pslots); -#endif + err = z_erofs_init_super(sb); + if (err) + return err; - inode = erofs_iget(sb, ROOT_NID(sbi)); + if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) { + inode = erofs_iget(sb, sbi->packed_nid); + if (IS_ERR(inode)) + return PTR_ERR(inode); + sbi->packed_inode = inode; + } + + inode = erofs_iget(sb, sbi->root_nid); if (IS_ERR(inode)) return PTR_ERR(inode); if (!S_ISDIR(inode->i_mode)) { erofs_err(sb, "rootino(nid %llu) is not a directory(i_mode %o)", - ROOT_NID(sbi), inode->i_mode); + sbi->root_nid, inode->i_mode); iput(inode); return -EINVAL; } - sb->s_root = d_make_root(inode); if (!sb->s_root) return -ENOMEM; erofs_shrinker_register(sb); - if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) { - sbi->packed_inode = erofs_iget(sb, sbi->packed_nid); - if (IS_ERR(sbi->packed_inode)) { - err = PTR_ERR(sbi->packed_inode); - sbi->packed_inode = NULL; - return err; - } - } - err = erofs_init_managed_cache(sb); - if (err) - return err; - err = erofs_xattr_prefixes_init(sb); if (err) return err; + erofs_set_sysfs_name(sb); err = erofs_register_sysfs(sb); if (err) return err; - erofs_info(sb, "mounted with root inode @ nid %llu.", ROOT_NID(sbi)); + erofs_info(sb, "mounted with root inode @ nid %llu.", sbi->root_nid); return 0; } static int erofs_fc_get_tree(struct fs_context *fc) { struct erofs_sb_info *sbi = fc->s_fs_info; + int ret; if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) return get_tree_nodev(fc, erofs_fc_fill_super); - return get_tree_bdev(fc, erofs_fc_fill_super); + ret = get_tree_bdev_flags(fc, erofs_fc_fill_super, + IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) ? 
+ GET_TREE_BDEV_QUIET_LOOKUP : 0); +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (ret == -ENOTBLK) { + struct file *file; + + if (!fc->source) + return invalf(fc, "No source specified"); + file = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + sbi->dif0.file = file; + + if (S_ISREG(file_inode(sbi->dif0.file)->i_mode) && + sbi->dif0.file->f_mapping->a_ops->read_folio) + return get_tree_nodev(fc, erofs_fc_fill_super); + } +#endif + return ret; } static int erofs_fc_reconfigure(struct fs_context *fc) @@ -724,8 +791,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data) struct erofs_device_info *dif = ptr; fs_put_dax(dif->dax_dev, NULL); - if (dif->bdev_handle) - bdev_release(dif->bdev_handle); + if (dif->file) + fput(dif->file); erofs_fscache_unregister_cookie(dif->fscache); dif->fscache = NULL; kfree(dif->path); @@ -742,19 +809,24 @@ static void erofs_free_dev_context(struct erofs_dev_context *devs) kfree(devs); } -static void erofs_fc_free(struct fs_context *fc) +static void erofs_sb_free(struct erofs_sb_info *sbi) { - struct erofs_sb_info *sbi = fc->s_fs_info; - - if (!sbi) - return; - erofs_free_dev_context(sbi->devs); kfree(sbi->fsid); kfree(sbi->domain_id); + if (sbi->dif0.file) + fput(sbi->dif0.file); kfree(sbi); } +static void erofs_fc_free(struct fs_context *fc) +{ + struct erofs_sb_info *sbi = fc->s_fs_info; + + if (sbi) /* free here if an error occurs before transferring to sb */ + erofs_sb_free(sbi); +} + static const struct fs_context_operations erofs_context_ops = { .parse_param = erofs_fc_parse_param, .get_tree = erofs_fc_get_tree, @@ -784,21 +856,29 @@ static int erofs_init_fs_context(struct fs_context *fc) return 0; } +static void erofs_drop_internal_inodes(struct erofs_sb_info *sbi) +{ + iput(sbi->packed_inode); + sbi->packed_inode = NULL; +#ifdef CONFIG_EROFS_FS_ZIP + iput(sbi->managed_cache); + sbi->managed_cache = NULL; +#endif +} + static void erofs_kill_sb(struct super_block *sb) { 
struct erofs_sb_info *sbi = EROFS_SB(sb); - if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) + if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) || + sbi->dif0.file) kill_anon_super(sb); else kill_block_super(sb); - - erofs_free_dev_context(sbi->devs); - fs_put_dax(sbi->dax_dev, NULL); + erofs_drop_internal_inodes(sbi); + fs_put_dax(sbi->dif0.dax_dev, NULL); erofs_fscache_unregister_fs(sb); - kfree(sbi->fsid); - kfree(sbi->domain_id); - kfree(sbi); + erofs_sb_free(sbi); sb->s_fs_info = NULL; } @@ -806,17 +886,10 @@ static void erofs_put_super(struct super_block *sb) { struct erofs_sb_info *const sbi = EROFS_SB(sb); - DBG_BUGON(!sbi); - erofs_unregister_sysfs(sb); erofs_shrinker_unregister(sb); erofs_xattr_prefixes_cleanup(sb); -#ifdef CONFIG_EROFS_FS_ZIP - iput(sbi->managed_cache); - sbi->managed_cache = NULL; -#endif - iput(sbi->packed_inode); - sbi->packed_inode = NULL; + erofs_drop_internal_inodes(sbi); erofs_free_dev_context(sbi->devs); sbi->devs = NULL; erofs_fscache_unregister_fs(sb); @@ -839,7 +912,7 @@ static int __init erofs_module_init(void) erofs_inode_cachep = kmem_cache_create("erofs_inode", sizeof(struct erofs_inode), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, erofs_inode_init_once); if (!erofs_inode_cachep) return -ENOMEM; @@ -848,16 +921,7 @@ static int __init erofs_module_init(void) if (err) goto shrinker_err; - err = z_erofs_lzma_init(); - if (err) - goto lzma_err; - - err = z_erofs_deflate_init(); - if (err) - goto deflate_err; - - erofs_pcpubuf_init(); - err = z_erofs_init_zip_subsystem(); + err = z_erofs_init_subsystem(); if (err) goto zip_err; @@ -874,12 +938,8 @@ static int __init erofs_module_init(void) fs_err: erofs_exit_sysfs(); sysfs_err: - z_erofs_exit_zip_subsystem(); + z_erofs_exit_subsystem(); zip_err: - z_erofs_deflate_exit(); -deflate_err: - z_erofs_lzma_exit(); -lzma_err: erofs_exit_shrinker(); shrinker_err: kmem_cache_destroy(erofs_inode_cachep); @@ -894,34 
+954,29 @@ static void __exit erofs_module_exit(void) rcu_barrier(); erofs_exit_sysfs(); - z_erofs_exit_zip_subsystem(); - z_erofs_deflate_exit(); - z_erofs_lzma_exit(); + z_erofs_exit_subsystem(); erofs_exit_shrinker(); kmem_cache_destroy(erofs_inode_cachep); - erofs_pcpubuf_exit(); } static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); - u64 id = 0; - - if (!erofs_is_fscache_mode(sb)) - id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = sb->s_magic; buf->f_bsize = sb->s_blocksize; buf->f_blocks = sbi->total_blocks; buf->f_bfree = buf->f_bavail = 0; - buf->f_files = ULLONG_MAX; buf->f_ffree = ULLONG_MAX - sbi->inos; - buf->f_namelen = EROFS_NAME_LEN; - buf->f_fsid = u64_to_fsid(id); + if (uuid_is_null(&sb->s_uuid)) + buf->f_fsid = u64_to_fsid(!sb->s_bdev ? 0 : + huge_encode_dev(sb->s_bdev->bd_dev)); + else + buf->f_fsid = uuid_to_fsid(sb->s_uuid.b); return 0; } @@ -930,30 +985,20 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) struct erofs_sb_info *sbi = EROFS_SB(root->d_sb); struct erofs_mount_opts *opt = &sbi->opt; -#ifdef CONFIG_EROFS_FS_XATTR - if (test_opt(opt, XATTR_USER)) - seq_puts(seq, ",user_xattr"); - else - seq_puts(seq, ",nouser_xattr"); -#endif -#ifdef CONFIG_EROFS_FS_POSIX_ACL - if (test_opt(opt, POSIX_ACL)) - seq_puts(seq, ",acl"); - else - seq_puts(seq, ",noacl"); -#endif -#ifdef CONFIG_EROFS_FS_ZIP - if (opt->cache_strategy == EROFS_ZIP_CACHE_DISABLED) - seq_puts(seq, ",cache_strategy=disabled"); - else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) - seq_puts(seq, ",cache_strategy=readahead"); - else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAROUND) - seq_puts(seq, ",cache_strategy=readaround"); -#endif + if (IS_ENABLED(CONFIG_EROFS_FS_XATTR)) + seq_puts(seq, test_opt(opt, XATTR_USER) ? 
+ ",user_xattr" : ",nouser_xattr"); + if (IS_ENABLED(CONFIG_EROFS_FS_POSIX_ACL)) + seq_puts(seq, test_opt(opt, POSIX_ACL) ? ",acl" : ",noacl"); + if (IS_ENABLED(CONFIG_EROFS_FS_ZIP)) + seq_printf(seq, ",cache_strategy=%s", + erofs_param_cache_strategy[opt->cache_strategy].name); if (test_opt(opt, DAX_ALWAYS)) seq_puts(seq, ",dax=always"); if (test_opt(opt, DAX_NEVER)) seq_puts(seq, ",dax=never"); + if (erofs_is_fileio_mode(sbi) && test_opt(opt, DIRECT_IO)) + seq_puts(seq, ",directio"); #ifdef CONFIG_EROFS_FS_ONDEMAND if (sbi->fsid) seq_printf(seq, ",fsid=%s", sbi->fsid); diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index 435e515c0792..63cffd0fd261 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -205,34 +205,16 @@ static struct kobject erofs_feat = { int erofs_register_sysfs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - char *name; - char *str = NULL; int err; - if (erofs_is_fscache_mode(sb)) { - if (sbi->domain_id) { - str = kasprintf(GFP_KERNEL, "%s,%s", sbi->domain_id, - sbi->fsid); - if (!str) - return -ENOMEM; - name = str; - } else { - name = sbi->fsid; - } - } else { - name = sb->s_id; - } sbi->s_kobj.kset = &erofs_root; init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", name); - kfree(str); - if (err) - goto put_sb_kobj; - return 0; - -put_sb_kobj: - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", + sb->s_sysfs_name); + if (err) { + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + } return err; } diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c deleted file mode 100644 index 4256a85719a1..000000000000 --- a/fs/erofs/utils.c +++ /dev/null @@ -1,282 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2018 HUAWEI, Inc. 
- * https://www.huawei.com/ - */ -#include "internal.h" - -struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp) -{ - struct page *page = *pagepool; - - if (page) { - DBG_BUGON(page_ref_count(page) != 1); - *pagepool = (struct page *)page_private(page); - } else { - page = alloc_page(gfp); - } - return page; -} - -void erofs_release_pages(struct page **pagepool) -{ - while (*pagepool) { - struct page *page = *pagepool; - - *pagepool = (struct page *)page_private(page); - put_page(page); - } -} - -#ifdef CONFIG_EROFS_FS_ZIP -/* global shrink count (for all mounted EROFS instances) */ -static atomic_long_t erofs_global_shrink_cnt; - -static bool erofs_workgroup_get(struct erofs_workgroup *grp) -{ - if (lockref_get_not_zero(&grp->lockref)) - return true; - - spin_lock(&grp->lockref.lock); - if (__lockref_is_dead(&grp->lockref)) { - spin_unlock(&grp->lockref.lock); - return false; - } - - if (!grp->lockref.count++) - atomic_long_dec(&erofs_global_shrink_cnt); - spin_unlock(&grp->lockref.lock); - return true; -} - -struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, - pgoff_t index) -{ - struct erofs_sb_info *sbi = EROFS_SB(sb); - struct erofs_workgroup *grp; - -repeat: - rcu_read_lock(); - grp = xa_load(&sbi->managed_pslots, index); - if (grp) { - if (!erofs_workgroup_get(grp)) { - /* prefer to relax rcu read side */ - rcu_read_unlock(); - goto repeat; - } - - DBG_BUGON(index != grp->index); - } - rcu_read_unlock(); - return grp; -} - -struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, - struct erofs_workgroup *grp) -{ - struct erofs_sb_info *const sbi = EROFS_SB(sb); - struct erofs_workgroup *pre; - - DBG_BUGON(grp->lockref.count < 1); -repeat: - xa_lock(&sbi->managed_pslots); - pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index, - NULL, grp, GFP_NOFS); - if (pre) { - if (xa_is_err(pre)) { - pre = ERR_PTR(xa_err(pre)); - } else if (!erofs_workgroup_get(pre)) { - /* try to legitimize the current in-tree one */ - 
xa_unlock(&sbi->managed_pslots); - cond_resched(); - goto repeat; - } - grp = pre; - } - xa_unlock(&sbi->managed_pslots); - return grp; -} - -static void __erofs_workgroup_free(struct erofs_workgroup *grp) -{ - atomic_long_dec(&erofs_global_shrink_cnt); - erofs_workgroup_free_rcu(grp); -} - -void erofs_workgroup_put(struct erofs_workgroup *grp) -{ - if (lockref_put_or_lock(&grp->lockref)) - return; - - DBG_BUGON(__lockref_is_dead(&grp->lockref)); - if (grp->lockref.count == 1) - atomic_long_inc(&erofs_global_shrink_cnt); - --grp->lockref.count; - spin_unlock(&grp->lockref.lock); -} - -static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, - struct erofs_workgroup *grp) -{ - int free = false; - - spin_lock(&grp->lockref.lock); - if (grp->lockref.count) - goto out; - - /* - * Note that all cached pages should be detached before deleted from - * the XArray. Otherwise some cached pages could be still attached to - * the orphan old workgroup when the new one is available in the tree. - */ - if (erofs_try_to_free_all_cached_pages(sbi, grp)) - goto out; - - /* - * It's impossible to fail after the workgroup is freezed, - * however in order to avoid some race conditions, add a - * DBG_BUGON to observe this in advance. 
- */ - DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp); - - lockref_mark_dead(&grp->lockref); - free = true; -out: - spin_unlock(&grp->lockref.lock); - if (free) - __erofs_workgroup_free(grp); - return free; -} - -static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, - unsigned long nr_shrink) -{ - struct erofs_workgroup *grp; - unsigned int freed = 0; - unsigned long index; - - xa_lock(&sbi->managed_pslots); - xa_for_each(&sbi->managed_pslots, index, grp) { - /* try to shrink each valid workgroup */ - if (!erofs_try_to_release_workgroup(sbi, grp)) - continue; - xa_unlock(&sbi->managed_pslots); - - ++freed; - if (!--nr_shrink) - return freed; - xa_lock(&sbi->managed_pslots); - } - xa_unlock(&sbi->managed_pslots); - return freed; -} - -/* protected by 'erofs_sb_list_lock' */ -static unsigned int shrinker_run_no; - -/* protects the mounted 'erofs_sb_list' */ -static DEFINE_SPINLOCK(erofs_sb_list_lock); -static LIST_HEAD(erofs_sb_list); - -void erofs_shrinker_register(struct super_block *sb) -{ - struct erofs_sb_info *sbi = EROFS_SB(sb); - - mutex_init(&sbi->umount_mutex); - - spin_lock(&erofs_sb_list_lock); - list_add(&sbi->list, &erofs_sb_list); - spin_unlock(&erofs_sb_list_lock); -} - -void erofs_shrinker_unregister(struct super_block *sb) -{ - struct erofs_sb_info *const sbi = EROFS_SB(sb); - - mutex_lock(&sbi->umount_mutex); - /* clean up all remaining workgroups in memory */ - erofs_shrink_workstation(sbi, ~0UL); - - spin_lock(&erofs_sb_list_lock); - list_del(&sbi->list); - spin_unlock(&erofs_sb_list_lock); - mutex_unlock(&sbi->umount_mutex); -} - -static unsigned long erofs_shrink_count(struct shrinker *shrink, - struct shrink_control *sc) -{ - return atomic_long_read(&erofs_global_shrink_cnt); -} - -static unsigned long erofs_shrink_scan(struct shrinker *shrink, - struct shrink_control *sc) -{ - struct erofs_sb_info *sbi; - struct list_head *p; - - unsigned long nr = sc->nr_to_scan; - unsigned int run_no; - unsigned long freed 
= 0; - - spin_lock(&erofs_sb_list_lock); - do { - run_no = ++shrinker_run_no; - } while (run_no == 0); - - /* Iterate over all mounted superblocks and try to shrink them */ - p = erofs_sb_list.next; - while (p != &erofs_sb_list) { - sbi = list_entry(p, struct erofs_sb_info, list); - - /* - * We move the ones we do to the end of the list, so we stop - * when we see one we have already done. - */ - if (sbi->shrinker_run_no == run_no) - break; - - if (!mutex_trylock(&sbi->umount_mutex)) { - p = p->next; - continue; - } - - spin_unlock(&erofs_sb_list_lock); - sbi->shrinker_run_no = run_no; - - freed += erofs_shrink_workstation(sbi, nr - freed); - - spin_lock(&erofs_sb_list_lock); - /* Get the next list element before we move this one */ - p = p->next; - - /* - * Move this one to the end of the list to provide some - * fairness. - */ - list_move_tail(&sbi->list, &erofs_sb_list); - mutex_unlock(&sbi->umount_mutex); - - if (freed >= nr) - break; - } - spin_unlock(&erofs_sb_list_lock); - return freed; -} - -static struct shrinker erofs_shrinker_info = { - .scan_objects = erofs_shrink_scan, - .count_objects = erofs_shrink_count, - .seeks = DEFAULT_SEEKS, -}; - -int __init erofs_init_shrinker(void) -{ - return register_shrinker(&erofs_shrinker_info, "erofs-shrinker"); -} - -void erofs_exit_shrinker(void) -{ - unregister_shrinker(&erofs_shrinker_info); -} -#endif /* !CONFIG_EROFS_FS_ZIP */ diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 09d341675e89..60d2cf26e837 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -81,13 +81,13 @@ static int erofs_init_inode_xattrs(struct inode *inode) it.pos = erofs_iloc(inode) + vi->inode_isize; /* read in shared xattr array (non-atomic, see kmalloc below) */ - it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos), EROFS_KMAP); + it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP); if (IS_ERR(it.kaddr)) { ret = PTR_ERR(it.kaddr); goto out_unlock; } - ih = it.kaddr + erofs_blkoff(sb, it.pos); + ih = it.kaddr; 
vi->xattr_name_filter = le32_to_cpu(ih->h_name_filter); vi->xattr_shared_count = ih->h_shared_count; vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count, @@ -102,16 +102,14 @@ static int erofs_init_inode_xattrs(struct inode *inode) it.pos += sizeof(struct erofs_xattr_ibody_header); for (i = 0; i < vi->xattr_shared_count; ++i) { - it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos), - EROFS_KMAP); + it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP); if (IS_ERR(it.kaddr)) { kfree(vi->xattr_shared_xattrs); vi->xattr_shared_xattrs = NULL; ret = PTR_ERR(it.kaddr); goto out_unlock; } - vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *) - (it.kaddr + erofs_blkoff(sb, it.pos))); + vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *)it.kaddr); it.pos += sizeof(__le32); } erofs_put_metabuf(&it.buf); @@ -168,7 +166,7 @@ const struct xattr_handler __maybe_unused erofs_xattr_security_handler = { }; #endif -const struct xattr_handler *erofs_xattr_handlers[] = { +const struct xattr_handler * const erofs_xattr_handlers[] = { &erofs_xattr_user_handler, &erofs_xattr_trusted_handler, #ifdef CONFIG_EROFS_FS_SECURITY @@ -185,12 +183,11 @@ static int erofs_xattr_copy_to_buffer(struct erofs_xattr_iter *it, void *src; for (processed = 0; processed < len; processed += slice) { - it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos), - EROFS_KMAP); + it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); - src = it->kaddr + erofs_blkoff(sb, it->pos); + src = it->kaddr; slice = min_t(unsigned int, sb->s_blocksize - erofs_blkoff(sb, it->pos), len - processed); memcpy(it->buffer + it->buffer_ofs, src, slice); @@ -208,8 +205,7 @@ static int erofs_listxattr_foreach(struct erofs_xattr_iter *it) int err; /* 1. 
handle xattr entry */ - entry = *(struct erofs_xattr_entry *) - (it->kaddr + erofs_blkoff(it->sb, it->pos)); + entry = *(struct erofs_xattr_entry *)it->kaddr; it->pos += sizeof(struct erofs_xattr_entry); base_index = entry.e_name_index; @@ -259,8 +255,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it) unsigned int slice, processed, value_sz; /* 1. handle xattr entry */ - entry = *(struct erofs_xattr_entry *) - (it->kaddr + erofs_blkoff(sb, it->pos)); + entry = *(struct erofs_xattr_entry *)it->kaddr; it->pos += sizeof(struct erofs_xattr_entry); value_sz = le16_to_cpu(entry.e_value_size); @@ -291,8 +286,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it) /* 2. handle xattr name */ for (processed = 0; processed < entry.e_name_len; processed += slice) { - it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos), - EROFS_KMAP); + it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); @@ -300,7 +294,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it) sb->s_blocksize - erofs_blkoff(sb, it->pos), entry.e_name_len - processed); if (memcmp(it->name.name + it->infix_len + processed, - it->kaddr + erofs_blkoff(sb, it->pos), slice)) + it->kaddr, slice)) return -ENOATTR; it->pos += slice; } @@ -336,13 +330,11 @@ static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it, it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz; while (remaining) { - it->kaddr = erofs_bread(&it->buf, erofs_blknr(it->sb, it->pos), - EROFS_KMAP); + it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); - entry_sz = erofs_xattr_entry_size(it->kaddr + - erofs_blkoff(it->sb, it->pos)); + entry_sz = erofs_xattr_entry_size(it->kaddr); /* xattr on-disk corruption: xattr entry beyond xattr_isize */ if (remaining < entry_sz) { DBG_BUGON(1); @@ -375,8 +367,7 @@ static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it, for (i = 0; i < 
vi->xattr_shared_count; ++i) { it->pos = erofs_pos(sb, sbi->xattr_blkaddr) + vi->xattr_shared_xattrs[i] * sizeof(__le32); - it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos), - EROFS_KMAP); + it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); @@ -416,7 +407,7 @@ int erofs_getxattr(struct inode *inode, int index, const char *name, } it.index = index; - it.name = (struct qstr)QSTR_INIT(name, strlen(name)); + it.name = QSTR(name); if (it.name.len > EROFS_NAME_LEN) return -ERANGE; @@ -492,7 +483,7 @@ int erofs_xattr_prefixes_init(struct super_block *sb) return -ENOMEM; if (sbi->packed_inode) - buf.inode = sbi->packed_inode; + buf.mapping = sbi->packed_inode->i_mapping; else erofs_init_metabuf(&buf, sb); diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h index f16283cb8c93..b246cd0e135e 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -23,7 +23,7 @@ static inline const char *erofs_xattr_prefix(unsigned int idx, { const struct xattr_handler *handler = NULL; - static const struct xattr_handler *xattr_handler_map[] = { + static const struct xattr_handler * const xattr_handler_map[] = { [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler, #ifdef CONFIG_EROFS_FS_POSIX_ACL [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, @@ -44,7 +44,7 @@ static inline const char *erofs_xattr_prefix(unsigned int idx, return xattr_prefix(handler); } -extern const struct xattr_handler *erofs_xattr_handlers[]; +extern const struct xattr_handler * const erofs_xattr_handlers[]; int erofs_xattr_prefixes_init(struct super_block *sb); void erofs_xattr_prefixes_cleanup(struct super_block *sb); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 1c0e6167d8e7..f35d2eb0ed11 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -12,12 +12,6 @@ #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) #define Z_EROFS_INLINE_BVECS 2 -/* - * let's leave a type here in case of introducing - * another 
tagged pointer later. - */ -typedef void *z_erofs_next_pcluster_t; - struct z_erofs_bvec { struct page *page; int offset; @@ -44,11 +38,14 @@ __Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS); * A: Field should be accessed / updated in atomic for parallelized code. */ struct z_erofs_pcluster { - struct erofs_workgroup obj; struct mutex lock; + struct lockref lockref; /* A: point to next chained pcluster or TAILs */ - z_erofs_next_pcluster_t next; + struct z_erofs_pcluster *next; + + /* I: start block address of this pcluster */ + erofs_off_t index; /* L: the maximum decompression size of this round */ unsigned int length; @@ -56,6 +53,9 @@ struct z_erofs_pcluster { /* L: total number of bvecs */ unsigned int vcnt; + /* I: pcluster size (compressed size) in bytes */ + unsigned int pclustersize; + /* I: page offset of start position of decompression */ unsigned short pageofs_out; @@ -70,22 +70,14 @@ struct z_erofs_pcluster { struct rcu_head rcu; }; - union { - /* I: physical cluster size in pages */ - unsigned short pclusterpages; - - /* I: tailpacking inline compressed size */ - unsigned short tailpacking_size; - }; - /* I: compression algorithm format */ unsigned char algorithmformat; /* L: whether partial decompression or not */ bool partial; - /* L: indicate several pageofs_outs or not */ - bool multibases; + /* L: whether extra buffer allocations are best-effort */ + bool besteffort; /* A: compressed bvecs (can be cached or inplaced pages) */ struct z_erofs_bvec compressed_bvecs[]; @@ -93,12 +85,11 @@ struct z_erofs_pcluster { /* the end of a chain of pclusters */ #define Z_EROFS_PCLUSTER_TAIL ((void *) 0x700 + POISON_POINTER_DELTA) -#define Z_EROFS_PCLUSTER_NIL (NULL) struct z_erofs_decompressqueue { struct super_block *sb; + struct z_erofs_pcluster *head; atomic_t pending_bios; - z_erofs_next_pcluster_t head; union { struct completion done; @@ -110,57 +101,18 @@ struct z_erofs_decompressqueue { static inline bool z_erofs_is_inline_pcluster(struct 
z_erofs_pcluster *pcl) { - return !pcl->obj.index; + return !pcl->index; } static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) { - if (z_erofs_is_inline_pcluster(pcl)) - return 1; - return pcl->pclusterpages; + return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT; } -/* - * bit 30: I/O error occurred on this page - * bit 0 - 29: remaining parts to complete this page - */ -#define Z_EROFS_PAGE_EIO (1 << 30) - -static inline void z_erofs_onlinepage_init(struct page *page) +#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) +static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo) { - union { - atomic_t o; - unsigned long v; - } u = { .o = ATOMIC_INIT(1) }; - - set_page_private(page, u.v); - smp_wmb(); - SetPagePrivate(page); -} - -static inline void z_erofs_onlinepage_split(struct page *page) -{ - atomic_inc((atomic_t *)&page->private); -} - -static void z_erofs_onlinepage_endio(struct page *page, int err) -{ - int orig, v; - - DBG_BUGON(!PagePrivate(page)); - - do { - orig = atomic_read((atomic_t *)&page->private); - v = (orig - 1) | (err ? 
Z_EROFS_PAGE_EIO : 0); - } while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig); - - if (!(v & ~Z_EROFS_PAGE_EIO)) { - set_page_private(page, 0); - ClearPagePrivate(page); - if (!(v & Z_EROFS_PAGE_EIO)) - SetPageUptodate(page); - unlock_page(page); - } + return fo->mapping == MNGD_MAPPING(sbi); } #define Z_EROFS_ONSTACK_PAGES 32 @@ -237,7 +189,8 @@ static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, struct page *nextpage = *candidate_bvpage; if (!nextpage) { - nextpage = erofs_allocpage(pagepool, GFP_NOFS); + nextpage = __erofs_allocpage(pagepool, GFP_KERNEL, + true); if (!nextpage) return -ENOMEM; set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); @@ -298,21 +251,21 @@ static int z_erofs_create_pcluster_pool(void) return 0; } -static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages) +static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size) { - int i; + unsigned int nrpages = PAGE_ALIGN(size) >> PAGE_SHIFT; + struct z_erofs_pcluster_slab *pcs = pcluster_pool; - for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { - struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; + for (; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { struct z_erofs_pcluster *pcl; if (nrpages > pcs->maxpages) continue; - pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS); + pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL); if (!pcl) return ERR_PTR(-ENOMEM); - pcl->pclusterpages = nrpages; + pcl->pclustersize = size; return pcl; } return ERR_PTR(-EINVAL); @@ -451,81 +404,81 @@ static inline int erofs_cpu_hotplug_init(void) { return 0; } static inline void erofs_cpu_hotplug_destroy(void) {} #endif -void z_erofs_exit_zip_subsystem(void) +void z_erofs_exit_subsystem(void) { erofs_cpu_hotplug_destroy(); erofs_destroy_percpu_workers(); destroy_workqueue(z_erofs_workqueue); z_erofs_destroy_pcluster_pool(); + z_erofs_exit_decompressor(); } -int __init z_erofs_init_zip_subsystem(void) +int __init z_erofs_init_subsystem(void) { - int err 
= z_erofs_create_pcluster_pool(); + int err = z_erofs_init_decompressor(); if (err) - goto out_error_pcluster_pool; + goto err_decompressor; + + err = z_erofs_create_pcluster_pool(); + if (err) + goto err_pcluster_pool; z_erofs_workqueue = alloc_workqueue("erofs_worker", WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus()); if (!z_erofs_workqueue) { err = -ENOMEM; - goto out_error_workqueue_init; + goto err_workqueue_init; } err = erofs_init_percpu_workers(); if (err) - goto out_error_pcpu_worker; + goto err_pcpu_worker; err = erofs_cpu_hotplug_init(); if (err < 0) - goto out_error_cpuhp_init; + goto err_cpuhp_init; return err; -out_error_cpuhp_init: +err_cpuhp_init: erofs_destroy_percpu_workers(); -out_error_pcpu_worker: +err_pcpu_worker: destroy_workqueue(z_erofs_workqueue); -out_error_workqueue_init: +err_workqueue_init: z_erofs_destroy_pcluster_pool(); -out_error_pcluster_pool: +err_pcluster_pool: + z_erofs_exit_decompressor(); +err_decompressor: return err; } enum z_erofs_pclustermode { + /* It has previously been linked into another processing chain */ Z_EROFS_PCLUSTER_INFLIGHT, /* - * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it - * could be dispatched into bypass queue later due to uptodated managed - * pages. All related online pages cannot be reused for inplace I/O (or - * bvpage) since it can be directly decoded without I/O submission. + * A weaker form of Z_EROFS_PCLUSTER_FOLLOWED; the difference is that it + * may be dispatched to the bypass queue later due to uptodated managed + * folios. All file-backed folios related to this pcluster cannot be + * reused for in-place I/O (or bvpage) since the pcluster may be decoded + * in a separate queue (and thus out of order). */ Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, /* - * The pcluster was just linked to a decompression chain by us. 
It can - * also be linked with the remaining pclusters, which means if the - * processing page is the tail page of a pcluster, this pcluster can - * safely use the whole page (since the previous pcluster is within the - * same chain) for in-place I/O, as illustrated below: - * ___________________________________________________ - * | tail (partial) page | head (partial) page | - * | (of the current pcl) | (of the previous pcl) | - * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____| - * - * [ (*) the page above can be used as inplace I/O. ] + * The pcluster has just been linked to our processing chain. + * File-backed folios (except for the head page) related to it can be + * used for in-place I/O (or bvpage). */ Z_EROFS_PCLUSTER_FOLLOWED, }; -struct z_erofs_decompress_frontend { +struct z_erofs_frontend { struct inode *const inode; struct erofs_map_blocks map; struct z_erofs_bvec_iter biter; struct page *pagepool; struct page *candidate_bvpage; - struct z_erofs_pcluster *pcl; - z_erofs_next_pcluster_t owned_head; + struct z_erofs_pcluster *pcl, *head; enum z_erofs_pclustermode mode; erofs_off_t headoffset; @@ -534,11 +487,11 @@ struct z_erofs_decompress_frontend { unsigned int icur; }; -#define DECOMPRESS_FRONTEND_INIT(__i) { \ - .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ - .mode = Z_EROFS_PCLUSTER_FOLLOWED } +#define Z_EROFS_DEFINE_FRONTEND(fe, i, ho) struct z_erofs_frontend fe = { \ + .inode = i, .head = Z_EROFS_PCLUSTER_TAIL, \ + .mode = Z_EROFS_PCLUSTER_FOLLOWED, .headoffset = ho } -static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) +static bool z_erofs_should_alloc_cache(struct z_erofs_frontend *fe) { unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy; @@ -555,10 +508,11 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) return false; } -static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) +static void z_erofs_bind_cache(struct z_erofs_frontend *fe) { 
struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); struct z_erofs_pcluster *pcl = fe->pcl; + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); bool shouldalloc = z_erofs_should_alloc_cache(fe); bool standalone = true; /* @@ -569,42 +523,40 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; unsigned int i; - if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) + if (i_blocksize(fe->inode) != PAGE_SIZE || + fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; - for (i = 0; i < pcl->pclusterpages; ++i) { - struct page *page; - void *t; /* mark pages just found for debugging */ - struct page *newpage = NULL; + for (i = 0; i < pclusterpages; ++i) { + struct page *page, *newpage; - /* the compressed page was loaded before */ + /* Inaccurate check w/o locking to avoid unneeded lookups */ if (READ_ONCE(pcl->compressed_bvecs[i].page)) continue; - page = find_get_page(mc, pcl->obj.index + i); - - if (page) { - t = (void *)((unsigned long)page | 1); - } else { + page = find_get_page(mc, pcl->index + i); + if (!page) { /* I/O is needed, no possible to decompress directly */ standalone = false; if (!shouldalloc) continue; /* - * try to use cached I/O if page allocation - * succeeds or fallback to in-place I/O instead - * to avoid any direct reclaim. + * Try cached I/O if allocation succeeds or fallback to + * in-place I/O instead to avoid any direct reclaim. */ newpage = erofs_allocpage(&fe->pagepool, gfp); if (!newpage) continue; set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); - t = (void *)((unsigned long)newpage | 1); } - - if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t)) + spin_lock(&pcl->lockref.lock); + if (!pcl->compressed_bvecs[i].page) { + pcl->compressed_bvecs[i].page = page ? 
page : newpage; + spin_unlock(&pcl->lockref.lock); continue; + } + spin_unlock(&pcl->lockref.lock); if (page) put_page(page); @@ -620,36 +572,29 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } -/* called by erofs_shrinker to get rid of all compressed_pages */ -int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, - struct erofs_workgroup *grp) +/* (erofs_shrinker) disconnect cached encoded data with pclusters */ +static int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi, + struct z_erofs_pcluster *pcl) { - struct z_erofs_pcluster *const pcl = - container_of(grp, struct z_erofs_pcluster, obj); + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + struct folio *folio; int i; DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); - /* - * refcount of workgroup is now freezed as 0, - * therefore no need to worry about available decompression users. - */ - for (i = 0; i < pcl->pclusterpages; ++i) { - struct page *page = pcl->compressed_bvecs[i].page; - - if (!page) - continue; - - /* block other users from reclaiming or migrating the page */ - if (!trylock_page(page)) - return -EBUSY; - - if (!erofs_page_is_managed(sbi, page)) - continue; + /* Each cached folio contains one page unless bs > ps is supported */ + for (i = 0; i < pclusterpages; ++i) { + if (pcl->compressed_bvecs[i].page) { + folio = page_folio(pcl->compressed_bvecs[i].page); + /* Avoid reclaiming or migrating this folio */ + if (!folio_trylock(folio)) + return -EBUSY; - /* barrier is implied in the following 'unlock_page' */ - WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); - detach_page_private(page); - unlock_page(page); + if (!erofs_folio_is_managed(sbi, folio)) + continue; + pcl->compressed_bvecs[i].page = NULL; + folio_detach_private(folio); + folio_unlock(folio); + } } return 0; } @@ -657,29 +602,27 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, static bool 
z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) { struct z_erofs_pcluster *pcl = folio_get_private(folio); + struct z_erofs_bvec *bvec = pcl->compressed_bvecs; + struct z_erofs_bvec *end = bvec + z_erofs_pclusterpages(pcl); bool ret; - int i; if (!folio_test_private(folio)) return true; ret = false; - spin_lock(&pcl->obj.lockref.lock); - if (pcl->obj.lockref.count > 0) - goto out; - - DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); - for (i = 0; i < pcl->pclusterpages; ++i) { - if (pcl->compressed_bvecs[i].page == &folio->page) { - WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); - ret = true; - break; + spin_lock(&pcl->lockref.lock); + if (pcl->lockref.count <= 0) { + DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); + for (; bvec < end; ++bvec) { + if (bvec->page && page_folio(bvec->page) == folio) { + bvec->page = NULL; + folio_detach_private(folio); + ret = true; + break; + } } } - if (ret) - folio_detach_private(folio); -out: - spin_unlock(&pcl->obj.lockref.lock); + spin_unlock(&pcl->lockref.lock); return ret; } @@ -697,7 +640,7 @@ static void z_erofs_cache_invalidate_folio(struct folio *folio, DBG_BUGON(stop > folio_size(folio) || stop < length); if (offset == 0 && stop == folio_size(folio)) - while (!z_erofs_cache_release_folio(folio, GFP_NOFS)) + while (!z_erofs_cache_release_folio(folio, 0)) cond_resched(); } @@ -706,46 +649,40 @@ static const struct address_space_operations z_erofs_cache_aops = { .invalidate_folio = z_erofs_cache_invalidate_folio, }; -int erofs_init_managed_cache(struct super_block *sb) +int z_erofs_init_super(struct super_block *sb) { struct inode *const inode = new_inode(sb); if (!inode) return -ENOMEM; - set_nlink(inode, 1); inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &z_erofs_cache_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); EROFS_SB(sb)->managed_cache = inode; + xa_init(&EROFS_SB(sb)->managed_pslots); return 0; } -static bool z_erofs_try_inplace_io(struct 
z_erofs_decompress_frontend *fe, - struct z_erofs_bvec *bvec) -{ - struct z_erofs_pcluster *const pcl = fe->pcl; - - while (fe->icur > 0) { - if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page, - NULL, bvec->page)) { - pcl->compressed_bvecs[fe->icur] = *bvec; - return true; - } - } - return false; -} - /* callers must be with pcluster lock held */ -static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, +static int z_erofs_attach_page(struct z_erofs_frontend *fe, struct z_erofs_bvec *bvec, bool exclusive) { + struct z_erofs_pcluster *pcl = fe->pcl; int ret; if (exclusive) { /* give priority for inplaceio to use file pages first */ - if (z_erofs_try_inplace_io(fe, bvec)) + spin_lock(&pcl->lockref.lock); + while (fe->icur > 0) { + if (pcl->compressed_bvecs[--fe->icur].page) + continue; + pcl->compressed_bvecs[fe->icur] = *bvec; + spin_unlock(&pcl->lockref.lock); return 0; + } + spin_unlock(&pcl->lockref.lock); + /* otherwise, check if it can be used as a bvpage */ if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && !fe->candidate_bvpage) @@ -757,52 +694,49 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, return ret; } -static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) +static bool z_erofs_get_pcluster(struct z_erofs_pcluster *pcl) { - struct z_erofs_pcluster *pcl = f->pcl; - z_erofs_next_pcluster_t *owned_head = &f->owned_head; - - /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */ - if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL, - *owned_head) == Z_EROFS_PCLUSTER_NIL) { - *owned_head = &pcl->next; - /* so we can attach this pcluster to our submission chain. 
*/ - f->mode = Z_EROFS_PCLUSTER_FOLLOWED; - return; + if (lockref_get_not_zero(&pcl->lockref)) + return true; + + spin_lock(&pcl->lockref.lock); + if (__lockref_is_dead(&pcl->lockref)) { + spin_unlock(&pcl->lockref.lock); + return false; } - /* type 2, it belongs to an ongoing chain */ - f->mode = Z_EROFS_PCLUSTER_INFLIGHT; + if (!pcl->lockref.count++) + atomic_long_dec(&erofs_global_shrink_cnt); + spin_unlock(&pcl->lockref.lock); + return true; } -static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) +static int z_erofs_register_pcluster(struct z_erofs_frontend *fe) { struct erofs_map_blocks *map = &fe->map; + struct super_block *sb = fe->inode->i_sb; + struct erofs_sb_info *sbi = EROFS_SB(sb); bool ztailpacking = map->m_flags & EROFS_MAP_META; - struct z_erofs_pcluster *pcl; - struct erofs_workgroup *grp; + struct z_erofs_pcluster *pcl, *pre; int err; if (!(map->m_flags & EROFS_MAP_ENCODED) || - (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) { + (!ztailpacking && !erofs_blknr(sb, map->m_pa))) { DBG_BUGON(1); return -EFSCORRUPTED; } /* no available pcluster, let's allocate one */ - pcl = z_erofs_alloc_pcluster(ztailpacking ? 
1 : - map->m_plen >> PAGE_SHIFT); + pcl = z_erofs_alloc_pcluster(map->m_plen); if (IS_ERR(pcl)) return PTR_ERR(pcl); - spin_lock_init(&pcl->obj.lockref.lock); - pcl->obj.lockref.count = 1; /* one ref for this request */ + spin_lock_init(&pcl->lockref.lock); + pcl->lockref.count = 1; /* one ref for this request */ pcl->algorithmformat = map->m_algorithmformat; pcl->length = 0; pcl->partial = true; - - /* new pclusters should be claimed as type 1, primary and followed */ - pcl->next = fe->owned_head; + pcl->next = fe->head; pcl->pageofs_out = map->m_la & ~PAGE_MASK; fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; @@ -814,26 +748,31 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) DBG_BUGON(!mutex_trylock(&pcl->lock)); if (ztailpacking) { - pcl->obj.index = 0; /* which indicates ztailpacking */ - pcl->tailpacking_size = map->m_plen; + pcl->index = 0; /* which indicates ztailpacking */ } else { - pcl->obj.index = map->m_pa >> PAGE_SHIFT; - - grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj); - if (IS_ERR(grp)) { - err = PTR_ERR(grp); - goto err_out; + pcl->index = erofs_blknr(sb, map->m_pa); + while (1) { + xa_lock(&sbi->managed_pslots); + pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->index, + NULL, pcl, GFP_KERNEL); + if (!pre || xa_is_err(pre) || z_erofs_get_pcluster(pre)) { + xa_unlock(&sbi->managed_pslots); + break; + } + /* try to legitimize the current in-tree one */ + xa_unlock(&sbi->managed_pslots); + cond_resched(); } - - if (grp != &pcl->obj) { - fe->pcl = container_of(grp, - struct z_erofs_pcluster, obj); + if (xa_is_err(pre)) { + err = xa_err(pre); + goto err_out; + } else if (pre) { + fe->pcl = pre; err = -EEXIST; goto err_out; } } - fe->owned_head = &pcl->next; - fe->pcl = pcl; + fe->head = fe->pcl = pcl; return 0; err_out: @@ -842,28 +781,36 @@ err_out: return err; } -static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) +static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe) { struct 
erofs_map_blocks *map = &fe->map; struct super_block *sb = fe->inode->i_sb; erofs_blk_t blknr = erofs_blknr(sb, map->m_pa); - struct erofs_workgroup *grp = NULL; + struct z_erofs_pcluster *pcl = NULL; int ret; DBG_BUGON(fe->pcl); - /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */ - DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); + DBG_BUGON(!fe->head); if (!(map->m_flags & EROFS_MAP_META)) { - grp = erofs_find_workgroup(sb, blknr); + while (1) { + rcu_read_lock(); + pcl = xa_load(&EROFS_SB(sb)->managed_pslots, blknr); + if (!pcl || z_erofs_get_pcluster(pcl)) { + DBG_BUGON(pcl && blknr != pcl->index); + rcu_read_unlock(); + break; + } + rcu_read_unlock(); + } } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { DBG_BUGON(1); return -EFSCORRUPTED; } - if (grp) { - fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); + if (pcl) { + fe->pcl = pcl; ret = -EEXIST; } else { ret = z_erofs_register_pcluster(fe); @@ -871,7 +818,14 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) if (ret == -EEXIST) { mutex_lock(&fe->pcl->lock); - z_erofs_try_to_claim_pcluster(fe); + /* check if this pcluster hasn't been linked into any chain. */ + if (!cmpxchg(&fe->pcl->next, NULL, fe->head)) { + /* .. 
so it can be attached to our submission chain */ + fe->head = fe->pcl; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; + } else { /* otherwise, it belongs to an inflight chain */ + fe->mode = Z_EROFS_PCLUSTER_INFLIGHT; + } } else if (ret) { return ret; } @@ -884,7 +838,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) } else { void *mptr; - mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP); + mptr = erofs_read_metabuf(&map->buf, sb, map->m_pa, EROFS_NO_KMAP); if (IS_ERR(mptr)) { ret = PTR_ERR(mptr); erofs_err(sb, "failed to get inline data %d", ret); @@ -900,25 +854,93 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) return 0; } -/* - * keep in mind that no referenced pclusters will be freed - * only after a RCU grace period. - */ static void z_erofs_rcu_callback(struct rcu_head *head) { - z_erofs_free_pcluster(container_of(head, - struct z_erofs_pcluster, rcu)); + z_erofs_free_pcluster(container_of(head, struct z_erofs_pcluster, rcu)); +} + +static bool __erofs_try_to_release_pcluster(struct erofs_sb_info *sbi, + struct z_erofs_pcluster *pcl) +{ + if (pcl->lockref.count) + return false; + + /* + * Note that all cached folios should be detached before deleted from + * the XArray. Otherwise some folios could be still attached to the + * orphan old pcluster when the new one is available in the tree. + */ + if (erofs_try_to_free_all_cached_folios(sbi, pcl)) + return false; + + /* + * It's impossible to fail after the pcluster is freezed, but in order + * to avoid some race conditions, add a DBG_BUGON to observe this. 
+ */ + DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->index) != pcl); + + lockref_mark_dead(&pcl->lockref); + return true; +} + +static bool erofs_try_to_release_pcluster(struct erofs_sb_info *sbi, + struct z_erofs_pcluster *pcl) +{ + bool free; + + spin_lock(&pcl->lockref.lock); + free = __erofs_try_to_release_pcluster(sbi, pcl); + spin_unlock(&pcl->lockref.lock); + if (free) { + atomic_long_dec(&erofs_global_shrink_cnt); + call_rcu(&pcl->rcu, z_erofs_rcu_callback); + } + return free; +} + +unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, unsigned long nr) +{ + struct z_erofs_pcluster *pcl; + unsigned long index, freed = 0; + + xa_lock(&sbi->managed_pslots); + xa_for_each(&sbi->managed_pslots, index, pcl) { + /* try to shrink each valid pcluster */ + if (!erofs_try_to_release_pcluster(sbi, pcl)) + continue; + xa_unlock(&sbi->managed_pslots); + + ++freed; + if (!--nr) + return freed; + xa_lock(&sbi->managed_pslots); + } + xa_unlock(&sbi->managed_pslots); + return freed; } -void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) +static void z_erofs_put_pcluster(struct erofs_sb_info *sbi, + struct z_erofs_pcluster *pcl, bool try_free) { - struct z_erofs_pcluster *const pcl = - container_of(grp, struct z_erofs_pcluster, obj); + bool free = false; - call_rcu(&pcl->rcu, z_erofs_rcu_callback); + if (lockref_put_or_lock(&pcl->lockref)) + return; + + DBG_BUGON(__lockref_is_dead(&pcl->lockref)); + if (!--pcl->lockref.count) { + if (try_free && xa_trylock(&sbi->managed_pslots)) { + free = __erofs_try_to_release_pcluster(sbi, pcl); + xa_unlock(&sbi->managed_pslots); + } + atomic_long_add(!free, &erofs_global_shrink_cnt); + } + spin_unlock(&pcl->lockref.lock); + if (free) + call_rcu(&pcl->rcu, z_erofs_rcu_callback); } -static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe) +static void z_erofs_pcluster_end(struct z_erofs_frontend *fe) { struct z_erofs_pcluster *pcl = fe->pcl; @@ -931,17 +953,13 @@ static void z_erofs_pcluster_end(struct 
z_erofs_decompress_frontend *fe) if (fe->candidate_bvpage) fe->candidate_bvpage = NULL; - /* - * if all pending pages are added, don't hold its reference - * any longer if the pcluster isn't hosted by ourselves. - */ + /* Drop refcount if it doesn't belong to our processing chain */ if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE) - erofs_workgroup_put(&pcl->obj); - + z_erofs_put_pcluster(EROFS_I_SB(fe->inode), pcl, false); fe->pcl = NULL; } -static int z_erofs_read_fragment(struct super_block *sb, struct page *page, +static int z_erofs_read_fragment(struct super_block *sb, struct folio *folio, unsigned int cur, unsigned int end, erofs_off_t pos) { struct inode *packed_inode = EROFS_SB(sb)->packed_inode; @@ -952,115 +970,110 @@ static int z_erofs_read_fragment(struct super_block *sb, struct page *page, if (!packed_inode) return -EFSCORRUPTED; - buf.inode = packed_inode; + buf.mapping = packed_inode->i_mapping; for (; cur < end; cur += cnt, pos += cnt) { - cnt = min_t(unsigned int, end - cur, - sb->s_blocksize - erofs_blkoff(sb, pos)); - src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP); + cnt = min(end - cur, sb->s_blocksize - erofs_blkoff(sb, pos)); + src = erofs_bread(&buf, pos, EROFS_KMAP); if (IS_ERR(src)) { erofs_put_metabuf(&buf); return PTR_ERR(src); } - memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt); + memcpy_to_folio(folio, cur, src, cnt); } erofs_put_metabuf(&buf); return 0; } -static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page) +static int z_erofs_scan_folio(struct z_erofs_frontend *f, + struct folio *folio, bool ra) { - struct inode *const inode = fe->inode; - struct erofs_map_blocks *const map = &fe->map; - const loff_t offset = page_offset(page); - bool tight = true, exclusive; - unsigned int cur, end, len, split; + struct inode *const inode = f->inode; + struct erofs_map_blocks *const map = &f->map; + const loff_t offset = folio_pos(folio); + const unsigned int bs = i_blocksize(inode); 
+ unsigned int end = folio_size(folio), split = 0, cur, pgs; + bool tight, excl; int err = 0; - z_erofs_onlinepage_init(page); - - split = 0; - end = PAGE_SIZE; -repeat: - if (offset + end - 1 < map->m_la || - offset + end - 1 >= map->m_la + map->m_llen) { - z_erofs_pcluster_end(fe); - map->m_la = offset + end - 1; - map->m_llen = 0; - err = z_erofs_map_blocks_iter(inode, map, 0); - if (err) - goto out; - } - - cur = offset > map->m_la ? 0 : map->m_la - offset; - /* bump split parts first to avoid several separate cases */ - ++split; - - if (!(map->m_flags & EROFS_MAP_MAPPED)) { - zero_user_segment(page, cur, end); - tight = false; - goto next_part; - } - - if (map->m_flags & EROFS_MAP_FRAGMENT) { - erofs_off_t fpos = offset + cur - map->m_la; - - len = min_t(unsigned int, map->m_llen - fpos, end - cur); - err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len, - EROFS_I(inode)->z_fragmentoff + fpos); - if (err) - goto out; - tight = false; - goto next_part; - } + tight = (bs == PAGE_SIZE); + erofs_onlinefolio_init(folio); + do { + if (offset + end - 1 < map->m_la || + offset + end - 1 >= map->m_la + map->m_llen) { + z_erofs_pcluster_end(f); + map->m_la = offset + end - 1; + map->m_llen = 0; + err = z_erofs_map_blocks_iter(inode, map, 0); + if (err) + break; + } - if (!fe->pcl) { - err = z_erofs_pcluster_begin(fe); - if (err) - goto out; - } + cur = offset > map->m_la ? 
0 : map->m_la - offset; + pgs = round_down(cur, PAGE_SIZE); + /* bump split parts first to avoid several separate cases */ + ++split; + + if (!(map->m_flags & EROFS_MAP_MAPPED)) { + folio_zero_segment(folio, cur, end); + tight = false; + } else if (map->m_flags & __EROFS_MAP_FRAGMENT) { + erofs_off_t fpos = offset + cur - map->m_la; + + err = z_erofs_read_fragment(inode->i_sb, folio, cur, + cur + min(map->m_llen - fpos, end - cur), + EROFS_I(inode)->z_fragmentoff + fpos); + if (err) + break; + tight = false; + } else { + if (!f->pcl) { + err = z_erofs_pcluster_begin(f); + if (err) + break; + f->pcl->besteffort |= !ra; + } - /* - * Ensure the current partial page belongs to this submit chain rather - * than other concurrent submit chains or the noio(bypass) chain since - * those chains are handled asynchronously thus the page cannot be used - * for inplace I/O or bvpage (should be processed in a strict order.) - */ - tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); - exclusive = (!cur && ((split <= 1) || tight)); - if (cur) - tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); - - err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { - .page = page, - .offset = offset - map->m_la, - .end = end, - }), exclusive); - if (err) - goto out; - - z_erofs_onlinepage_split(page); - if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) - fe->pcl->multibases = true; - if (fe->pcl->length < offset + end - map->m_la) { - fe->pcl->length = offset + end - map->m_la; - fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK; - } - if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && - !(map->m_flags & EROFS_MAP_PARTIAL_REF) && - fe->pcl->length == map->m_llen) - fe->pcl->partial = false; -next_part: - /* shorten the remaining extent to update progress */ - map->m_llen = offset + cur - map->m_la; - map->m_flags &= ~EROFS_MAP_FULL_MAPPED; - - end = cur; - if (end > 0) - goto repeat; + pgs = round_down(end - 1, PAGE_SIZE); + /* + * Ensure this partial page belongs to this submit chain + * 
rather than other concurrent submit chains or + * noio(bypass) chains since those chains are handled + * asynchronously thus it cannot be used for inplace I/O + * or bvpage (should be processed in the strict order.) + */ + tight &= (f->mode >= Z_EROFS_PCLUSTER_FOLLOWED); + excl = false; + if (cur <= pgs) { + excl = (split <= 1) || tight; + cur = pgs; + } -out: - z_erofs_onlinepage_endio(page, err); + err = z_erofs_attach_page(f, &((struct z_erofs_bvec) { + .page = folio_page(folio, pgs >> PAGE_SHIFT), + .offset = offset + pgs - map->m_la, + .end = end - pgs, }), excl); + if (err) + break; + + erofs_onlinefolio_split(folio); + if (f->pcl->length < offset + end - map->m_la) { + f->pcl->length = offset + end - map->m_la; + f->pcl->pageofs_out = map->m_la & ~PAGE_MASK; + } + if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && + !(map->m_flags & EROFS_MAP_PARTIAL_REF) && + f->pcl->length == map->m_llen) + f->pcl->partial = false; + } + /* shorten the remaining extent to update progress */ + map->m_llen = offset + cur - map->m_la; + map->m_flags &= ~EROFS_MAP_FULL_MAPPED; + if (cur <= pgs) { + split = cur < pgs; + tight = (bs == PAGE_SIZE); + } + } while ((end = cur) > 0); + erofs_onlinefolio_end(folio, err, false); return err; } @@ -1081,14 +1094,13 @@ static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi, static bool z_erofs_page_is_invalidated(struct page *page) { - return !page->mapping && !z_erofs_is_shortlived_page(page); + return !page_folio(page)->mapping && !z_erofs_is_shortlived_page(page); } -struct z_erofs_decompress_backend { +struct z_erofs_backend { struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES]; struct super_block *sb; struct z_erofs_pcluster *pcl; - /* pages with the longest decompressed length for deduplication */ struct page **decompressed_pages; /* pages to keep the compressed data */ @@ -1097,6 +1109,8 @@ struct z_erofs_decompress_backend { struct list_head decompressed_secondary_bvecs; struct page **pagepool; unsigned int onstack_used, 
nr_pages; + /* indicate if temporary copies should be preserved for later use */ + bool keepxcpy; }; struct z_erofs_bvec_item { @@ -1104,21 +1118,23 @@ struct z_erofs_bvec_item { struct list_head list; }; -static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, +static void z_erofs_do_decompressed_bvec(struct z_erofs_backend *be, struct z_erofs_bvec *bvec) { + int poff = bvec->offset + be->pcl->pageofs_out; struct z_erofs_bvec_item *item; - unsigned int pgnr; - - if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK) && - (bvec->end == PAGE_SIZE || - bvec->offset + bvec->end == be->pcl->length)) { - pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; - DBG_BUGON(pgnr >= be->nr_pages); - if (!be->decompressed_pages[pgnr]) { - be->decompressed_pages[pgnr] = bvec->page; + struct page **page; + + if (!(poff & ~PAGE_MASK) && (bvec->end == PAGE_SIZE || + bvec->offset + bvec->end == be->pcl->length)) { + DBG_BUGON((poff >> PAGE_SHIFT) >= be->nr_pages); + page = be->decompressed_pages + (poff >> PAGE_SHIFT); + if (!*page) { + *page = bvec->page; return; } + } else { + be->keepxcpy = true; } /* (cold path) one pcluster is requested multiple times */ @@ -1127,8 +1143,7 @@ static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, list_add(&item->list, &be->decompressed_secondary_bvecs); } -static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, - int err) +static void z_erofs_fill_other_copies(struct z_erofs_backend *be, int err) { unsigned int off0 = be->pcl->pageofs_out; struct list_head *p, *n; @@ -1163,13 +1178,13 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, cur += len; } kunmap_local(dst); - z_erofs_onlinepage_endio(bvi->bvec.page, err); + erofs_onlinefolio_end(page_folio(bvi->bvec.page), err, true); list_del(p); kfree(bvi); } } -static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) +static void z_erofs_parse_out_bvecs(struct 
z_erofs_backend *be) { struct z_erofs_pcluster *pcl = be->pcl; struct z_erofs_bvec_iter biter; @@ -1194,8 +1209,7 @@ static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); } -static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, - bool *overlapped) +static int z_erofs_parse_in_bvecs(struct z_erofs_backend *be, bool *overlapped) { struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); @@ -1206,48 +1220,41 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; struct page *page = bvec->page; - /* compressed pages ought to be present before decompressing */ - if (!page) { - DBG_BUGON(1); + /* compressed data ought to be valid when decompressing */ + if (IS_ERR(page) || !page) { + bvec->page = NULL; /* clear the failure reason */ + err = page ? PTR_ERR(page) : -EIO; continue; } be->compressed_pages[i] = page; - if (z_erofs_is_inline_pcluster(pcl)) { + if (z_erofs_is_inline_pcluster(pcl) || + erofs_folio_is_managed(EROFS_SB(be->sb), page_folio(page))) { if (!PageUptodate(page)) err = -EIO; continue; } DBG_BUGON(z_erofs_page_is_invalidated(page)); - if (!z_erofs_is_shortlived_page(page)) { - if (erofs_page_is_managed(EROFS_SB(be->sb), page)) { - if (!PageUptodate(page)) - err = -EIO; - continue; - } - z_erofs_do_decompressed_bvec(be, bvec); - *overlapped = true; - } + if (z_erofs_is_shortlived_page(page)) + continue; + z_erofs_do_decompressed_bvec(be, bvec); + *overlapped = true; } - - if (err) - return err; - return 0; + return err; } -static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, - int err) +static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err) { struct erofs_sb_info *const sbi = EROFS_SB(be->sb); struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); - const struct 
z_erofs_decompressor *decompressor = - &erofs_decompressors[pcl->algorithmformat]; - unsigned int i, inputsize; - int err2; + const struct z_erofs_decompressor *decomp = + z_erofs_decomp[pcl->algorithmformat]; + int i, j, jtop, err2; struct page *page; bool overlapped; + bool try_free = true; mutex_lock(&pcl->lock); be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT; @@ -1279,41 +1286,38 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, err2 = z_erofs_parse_in_bvecs(be, &overlapped); if (err2) err = err2; - if (err) - goto out; - - if (z_erofs_is_inline_pcluster(pcl)) - inputsize = pcl->tailpacking_size; - else - inputsize = pclusterpages * PAGE_SIZE; - - err = decompressor->decompress(&(struct z_erofs_decompress_req) { + if (!err) + err = decomp->decompress(&(struct z_erofs_decompress_req) { .sb = be->sb, .in = be->compressed_pages, .out = be->decompressed_pages, .pageofs_in = pcl->pageofs_in, .pageofs_out = pcl->pageofs_out, - .inputsize = inputsize, + .inputsize = pcl->pclustersize, .outputsize = pcl->length, .alg = pcl->algorithmformat, .inplace_io = overlapped, .partial_decoding = pcl->partial, - .fillgaps = pcl->multibases, + .fillgaps = be->keepxcpy, + .gfp = pcl->besteffort ? 
GFP_KERNEL : + GFP_NOWAIT | __GFP_NORETRY }, be->pagepool); -out: /* must handle all compressed pages before actual file pages */ if (z_erofs_is_inline_pcluster(pcl)) { page = pcl->compressed_bvecs[0].page; WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL); put_page(page); } else { + /* managed folios are still left in compressed_bvecs[] */ for (i = 0; i < pclusterpages; ++i) { - /* consider shortlived pages added when decompressing */ page = be->compressed_pages[i]; - - if (erofs_page_is_managed(sbi, page)) + if (!page) continue; + if (erofs_folio_is_managed(sbi, page_folio(page))) { + try_free = false; + continue; + } (void)z_erofs_put_shortlivedpage(be->pagepool, page); WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); } @@ -1321,59 +1325,70 @@ out: if (be->compressed_pages < be->onstack_pages || be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES) kvfree(be->compressed_pages); - z_erofs_fill_other_copies(be, err); + jtop = 0; + z_erofs_fill_other_copies(be, err); for (i = 0; i < be->nr_pages; ++i) { page = be->decompressed_pages[i]; if (!page) continue; DBG_BUGON(z_erofs_page_is_invalidated(page)); - - /* recycle all individual short-lived pages */ - if (z_erofs_put_shortlivedpage(be->pagepool, page)) + if (!z_erofs_is_shortlived_page(page)) { + erofs_onlinefolio_end(page_folio(page), err, true); continue; - z_erofs_onlinepage_endio(page, err); + } + if (pcl->algorithmformat != Z_EROFS_COMPRESSION_LZ4) { + erofs_pagepool_add(be->pagepool, page); + continue; + } + for (j = 0; j < jtop && be->decompressed_pages[j] != page; ++j) + ; + if (j >= jtop) /* this bounce page is newly detected */ + be->decompressed_pages[jtop++] = page; } - + while (jtop) + erofs_pagepool_add(be->pagepool, + be->decompressed_pages[--jtop]); if (be->decompressed_pages != be->onstack_pages) kvfree(be->decompressed_pages); pcl->length = 0; pcl->partial = true; - pcl->multibases = false; + pcl->besteffort = false; pcl->bvset.nextpage = NULL; pcl->vcnt = 0; /* pcluster lock MUST be 
taken before the following line */ - WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL); + WRITE_ONCE(pcl->next, NULL); mutex_unlock(&pcl->lock); + + if (z_erofs_is_inline_pcluster(pcl)) + z_erofs_free_pcluster(pcl); + else + z_erofs_put_pcluster(sbi, pcl, try_free); return err; } -static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, - struct page **pagepool) +static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, + struct page **pagepool) { - struct z_erofs_decompress_backend be = { + struct z_erofs_backend be = { .sb = io->sb, .pagepool = pagepool, .decompressed_secondary_bvecs = LIST_HEAD_INIT(be.decompressed_secondary_bvecs), + .pcl = io->head, }; - z_erofs_next_pcluster_t owned = io->head; + struct z_erofs_pcluster *next; + int err = io->eio ? -EIO : 0; - while (owned != Z_EROFS_PCLUSTER_TAIL) { - DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); - - be.pcl = container_of(owned, struct z_erofs_pcluster, next); - owned = READ_ONCE(be.pcl->next); - - z_erofs_decompress_pcluster(&be, io->eio ? 
-EIO : 0); - if (z_erofs_is_inline_pcluster(be.pcl)) - z_erofs_free_pcluster(be.pcl); - else - erofs_workgroup_put(&be.pcl->obj); + for (; be.pcl != Z_EROFS_PCLUSTER_TAIL; be.pcl = next) { + DBG_BUGON(!be.pcl); + next = READ_ONCE(be.pcl->next); + err = z_erofs_decompress_pcluster(&be, err) ?: err; } + return err; } static void z_erofs_decompressqueue_work(struct work_struct *work) @@ -1435,113 +1450,112 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, z_erofs_decompressqueue_work(&io->u.work); } -static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, - unsigned int nr, - struct page **pagepool, - struct address_space *mc) +static void z_erofs_fill_bio_vec(struct bio_vec *bvec, + struct z_erofs_frontend *f, + struct z_erofs_pcluster *pcl, + unsigned int nr, + struct address_space *mc) { - const pgoff_t index = pcl->obj.index; gfp_t gfp = mapping_gfp_mask(mc); bool tocache = false; - + struct z_erofs_bvec zbv; struct address_space *mapping; - struct page *oldpage, *page; - int justfound; + struct folio *folio; + struct page *page; + int bs = i_blocksize(f->inode); + /* Except for inplace folios, the entire folio can be used for I/Os */ + bvec->bv_offset = 0; + bvec->bv_len = PAGE_SIZE; repeat: - page = READ_ONCE(pcl->compressed_bvecs[nr].page); - oldpage = page; - - if (!page) - goto out_allocpage; + spin_lock(&pcl->lockref.lock); + zbv = pcl->compressed_bvecs[nr]; + spin_unlock(&pcl->lockref.lock); + if (!zbv.page) + goto out_allocfolio; - justfound = (unsigned long)page & 1UL; - page = (struct page *)((unsigned long)page & ~1UL); + bvec->bv_page = zbv.page; + DBG_BUGON(z_erofs_is_shortlived_page(bvec->bv_page)); + folio = page_folio(zbv.page); /* - * preallocated cached pages, which is used to avoid direct reclaim - * otherwise, it will go inplace I/O path instead. + * Handle preallocated cached folios. We tried to allocate such folios + * without triggering direct reclaim. 
If allocation failed, inplace + * file-backed folios will be used instead. */ - if (page->private == Z_EROFS_PREALLOCATED_PAGE) { - WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); - set_page_private(page, 0); + if (folio->private == (void *)Z_EROFS_PREALLOCATED_PAGE) { tocache = true; goto out_tocache; } - mapping = READ_ONCE(page->mapping); + mapping = READ_ONCE(folio->mapping); /* - * file-backed online pages in plcuster are all locked steady, - * therefore it is impossible for `mapping' to be NULL. + * File-backed folios for inplace I/Os are all locked steady, + * therefore it is impossible for `mapping` to be NULL. */ - if (mapping && mapping != mc) - /* ought to be unmanaged pages */ - goto out; - - /* directly return for shortlived page as well */ - if (z_erofs_is_shortlived_page(page)) - goto out; - - lock_page(page); - - /* only true if page reclaim goes wrong, should never happen */ - DBG_BUGON(justfound && PagePrivate(page)); - - /* the page is still in manage cache */ - if (page->mapping == mc) { - WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); - - if (!PagePrivate(page)) { - /* - * impossible to be !PagePrivate(page) for - * the current restriction as well if - * the page is already in compressed_bvecs[]. - */ - DBG_BUGON(!justfound); + if (mapping && mapping != mc) { + if (zbv.offset < 0) + bvec->bv_offset = round_up(-zbv.offset, bs); + bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset; + return; + } - justfound = 0; - set_page_private(page, (unsigned long)pcl); - SetPagePrivate(page); + folio_lock(folio); + if (likely(folio->mapping == mc)) { + /* + * The cached folio is still in managed cache but without + * a valid `->private` pcluster hint. Let's reconnect them. 
+ */ + if (!folio_test_private(folio)) { + folio_attach_private(folio, pcl); + /* compressed_bvecs[] already takes a ref before */ + folio_put(folio); } - - /* no need to submit io if it is already up-to-date */ - if (PageUptodate(page)) { - unlock_page(page); - page = NULL; + if (likely(folio->private == pcl)) { + /* don't submit cache I/Os again if already uptodate */ + if (folio_test_uptodate(folio)) { + folio_unlock(folio); + bvec->bv_page = NULL; + } + return; } - goto out; + /* + * Already linked with another pcluster, which only appears in + * crafted images by fuzzers for now. But handle this anyway. + */ + tocache = false; /* use temporary short-lived pages */ + } else { + DBG_BUGON(1); /* referenced managed folios can't be truncated */ + tocache = true; } - - /* - * the managed page has been truncated, it's unsafe to - * reuse this one, let's allocate a new cache-managed page. - */ - DBG_BUGON(page->mapping); - DBG_BUGON(!justfound); - - tocache = true; - unlock_page(page); - put_page(page); -out_allocpage: - page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); - if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page, - oldpage, page)) { - erofs_pagepool_add(pagepool, page); + folio_unlock(folio); + folio_put(folio); +out_allocfolio: + page = __erofs_allocpage(&f->pagepool, gfp, true); + spin_lock(&pcl->lockref.lock); + if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) { + if (page) + erofs_pagepool_add(&f->pagepool, page); + spin_unlock(&pcl->lockref.lock); cond_resched(); goto repeat; } + pcl->compressed_bvecs[nr].page = page ? 
page : ERR_PTR(-ENOMEM); + spin_unlock(&pcl->lockref.lock); + bvec->bv_page = page; + if (!page) + return; + folio = page_folio(page); out_tocache: - if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) { - /* turn into temporary page if fails (1 ref) */ - set_page_private(page, Z_EROFS_SHORTLIVED_PAGE); - goto out; + if (!tocache || bs != PAGE_SIZE || + filemap_add_folio(mc, folio, pcl->index + nr, gfp)) { + /* turn into a temporary shortlived folio (1 ref) */ + folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE; + return; } - attach_page_private(page, pcl); - /* drop a refcount added by allocpage (then we have 2 refs here) */ - put_page(page); - -out: /* the only exit (for tracing and debugging) */ - return page; + folio_attach_private(folio, pcl); + /* drop a refcount added by allocpage (then 2 refs in total here) */ + folio_put(folio); } static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb, @@ -1581,67 +1595,58 @@ enum { NR_JOBQUEUES, }; -static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, - z_erofs_next_pcluster_t qtail[], - z_erofs_next_pcluster_t owned_head) +static void z_erofs_move_to_bypass_queue(struct z_erofs_pcluster *pcl, + struct z_erofs_pcluster *next, + struct z_erofs_pcluster **qtail[]) { - z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT]; - z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS]; - WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL); - - WRITE_ONCE(*submit_qtail, owned_head); - WRITE_ONCE(*bypass_qtail, &pcl->next); - + WRITE_ONCE(*qtail[JQ_SUBMIT], next); + WRITE_ONCE(*qtail[JQ_BYPASS], pcl); qtail[JQ_BYPASS] = &pcl->next; } -static void z_erofs_decompressqueue_endio(struct bio *bio) +static void z_erofs_endio(struct bio *bio) { struct z_erofs_decompressqueue *q = bio->bi_private; blk_status_t err = bio->bi_status; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + struct folio_iter fi; - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = 
bvec->bv_page; + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; - DBG_BUGON(PageUptodate(page)); - DBG_BUGON(z_erofs_page_is_invalidated(page)); + DBG_BUGON(folio_test_uptodate(folio)); + DBG_BUGON(z_erofs_page_is_invalidated(&folio->page)); + if (!erofs_folio_is_managed(EROFS_SB(q->sb), folio)) + continue; - if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { - if (!err) - SetPageUptodate(page); - unlock_page(page); - } + if (!err) + folio_mark_uptodate(folio); + folio_unlock(folio); } if (err) q->eio = true; z_erofs_decompress_kickoff(q, -1); - bio_put(bio); + if (bio->bi_bdev) + bio_put(bio); } -static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, +static void z_erofs_submit_queue(struct z_erofs_frontend *f, struct z_erofs_decompressqueue *fgq, bool *force_fg, bool readahead) { struct super_block *sb = f->inode->i_sb; struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); - z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; + struct z_erofs_pcluster **qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; - z_erofs_next_pcluster_t owned_head = f->owned_head; + struct z_erofs_pcluster *pcl, *next; /* bio is NULL initially, so no need to initialize last_{index,bdev} */ - pgoff_t last_index; - struct block_device *last_bdev; + erofs_off_t last_pa; unsigned int nr_bios = 0; struct bio *bio = NULL; unsigned long pflags; int memstall = 0; - /* - * if managed cache is enabled, bypass jobqueue is needed, - * no need to read from device for all pclusters in this queue. - */ + /* No need to read from device for pclusters in the bypass queue. 
*/ q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg); @@ -1649,45 +1654,42 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; /* by default, all need io submission */ - q[JQ_SUBMIT]->head = owned_head; + q[JQ_SUBMIT]->head = next = f->head; do { struct erofs_map_dev mdev; - struct z_erofs_pcluster *pcl; - pgoff_t cur, end; + erofs_off_t cur, end; + struct bio_vec bvec; unsigned int i = 0; bool bypass = true; - DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL); - pcl = container_of(owned_head, struct z_erofs_pcluster, next); - owned_head = READ_ONCE(pcl->next); - + pcl = next; + next = READ_ONCE(pcl->next); if (z_erofs_is_inline_pcluster(pcl)) { - move_to_bypass_jobqueue(pcl, qtail, owned_head); + z_erofs_move_to_bypass_queue(pcl, next, qtail); continue; } /* no device id here, thus it will always succeed */ mdev = (struct erofs_map_dev) { - .m_pa = erofs_pos(sb, pcl->obj.index), + .m_pa = erofs_pos(sb, pcl->index), }; (void)erofs_map_dev(sb, &mdev); - cur = erofs_blknr(sb, mdev.m_pa); - end = cur + pcl->pclusterpages; - + cur = mdev.m_pa; + end = cur + pcl->pclustersize; do { - struct page *page; + bvec.bv_page = NULL; + if (bio && (cur != last_pa || + bio->bi_bdev != mdev.m_bdev)) { +drain_io: + if (erofs_is_fileio_mode(EROFS_SB(sb))) + erofs_fileio_submit_bio(bio); + else if (erofs_is_fscache_mode(sb)) + erofs_fscache_submit_bio(bio); + else + submit_bio(bio); - page = pickup_page_for_submission(pcl, i++, - &f->pagepool, mc); - if (!page) - continue; - - if (bio && (cur != last_index + 1 || - last_bdev != mdev.m_bdev)) { -submit_bio_retry: - submit_bio(bio); if (memstall) { psi_memstall_leave(&pflags); memstall = 0; @@ -1695,43 +1697,60 @@ submit_bio_retry: bio = NULL; } - if (unlikely(PageWorkingset(page)) && !memstall) { + if (!bvec.bv_page) { + z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc); + if (!bvec.bv_page) + continue; + if (cur + 
bvec.bv_len > end) + bvec.bv_len = end - cur; + DBG_BUGON(bvec.bv_len < sb->s_blocksize); + } + + if (unlikely(PageWorkingset(bvec.bv_page)) && + !memstall) { psi_memstall_enter(&pflags); memstall = 1; } if (!bio) { - bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, - REQ_OP_READ, GFP_NOIO); - bio->bi_end_io = z_erofs_decompressqueue_endio; - - last_bdev = mdev.m_bdev; - bio->bi_iter.bi_sector = (sector_t)cur << - (sb->s_blocksize_bits - 9); + if (erofs_is_fileio_mode(EROFS_SB(sb))) + bio = erofs_fileio_bio_alloc(&mdev); + else if (erofs_is_fscache_mode(sb)) + bio = erofs_fscache_bio_alloc(&mdev); + else + bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, + REQ_OP_READ, GFP_NOIO); + bio->bi_end_io = z_erofs_endio; + bio->bi_iter.bi_sector = cur >> 9; bio->bi_private = q[JQ_SUBMIT]; if (readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; } - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) - goto submit_bio_retry; - - last_index = cur; + if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len, + bvec.bv_offset)) + goto drain_io; + last_pa = cur + bvec.bv_len; bypass = false; - } while (++cur < end); + } while ((cur += bvec.bv_len) < end); if (!bypass) qtail[JQ_SUBMIT] = &pcl->next; else - move_to_bypass_jobqueue(pcl, qtail, owned_head); - } while (owned_head != Z_EROFS_PCLUSTER_TAIL); + z_erofs_move_to_bypass_queue(pcl, next, qtail); + } while (next != Z_EROFS_PCLUSTER_TAIL); if (bio) { - submit_bio(bio); - if (memstall) - psi_memstall_leave(&pflags); + if (erofs_is_fileio_mode(EROFS_SB(sb))) + erofs_fileio_submit_bio(bio); + else if (erofs_is_fscache_mode(sb)) + erofs_fscache_submit_bio(bio); + else + submit_bio(bio); } + if (memstall) + psi_memstall_leave(&pflags); /* * although background is preferred, no one is pending for submission. 
@@ -1744,33 +1763,34 @@ submit_bio_retry: z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios); } -static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, - bool force_fg, bool ra) +static int z_erofs_runqueue(struct z_erofs_frontend *f, unsigned int rapages) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; + struct erofs_sb_info *sbi = EROFS_I_SB(f->inode); + bool force_fg = z_erofs_is_sync_decompress(sbi, rapages); + int err; - if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) - return; - z_erofs_submit_queue(f, io, &force_fg, ra); + if (f->head == Z_EROFS_PCLUSTER_TAIL) + return 0; + z_erofs_submit_queue(f, io, &force_fg, !!rapages); /* handle bypass queue (no i/o pclusters) immediately */ - z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool); - + err = z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool); if (!force_fg) - return; + return err; /* wait until all bios are completed */ wait_for_completion_io(&io[JQ_SUBMIT].u.done); /* handle synchronous decompress queue in the caller context */ - z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool); + return z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool) ?: err; } /* * Since partial uptodate is still unimplemented for now, we have to use * approximate readmore strategies as a start. 
*/ -static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, +static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f, struct readahead_control *rac, bool backmost) { struct inode *inode = f->inode; @@ -1798,7 +1818,6 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, end = round_up(end, PAGE_SIZE); } else { end = round_up(map->m_la, PAGE_SIZE); - if (!map->m_llen) return; } @@ -1806,15 +1825,15 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, cur = map->m_la + map->m_llen - 1; while ((cur >= end) && (cur < i_size_read(inode))) { pgoff_t index = cur >> PAGE_SHIFT; - struct page *page; + struct folio *folio; - page = erofs_grab_cache_page_nowait(inode->i_mapping, index); - if (page) { - if (PageUptodate(page)) - unlock_page(page); + folio = erofs_grab_folio_nowait(inode->i_mapping, index); + if (!IS_ERR_OR_NULL(folio)) { + if (folio_test_uptodate(folio)) + folio_unlock(folio); else - (void)z_erofs_do_read_page(f, page); - put_page(page); + z_erofs_scan_folio(f, folio, !!rac); + folio_put(folio); } if (cur < PAGE_SIZE) @@ -1826,21 +1845,17 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, static int z_erofs_read_folio(struct file *file, struct folio *folio) { struct inode *const inode = folio->mapping->host; - struct erofs_sb_info *const sbi = EROFS_I_SB(inode); - struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); + Z_EROFS_DEFINE_FRONTEND(f, inode, folio_pos(folio)); int err; trace_erofs_read_folio(folio, false); - f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT; - z_erofs_pcluster_readmore(&f, NULL, true); - err = z_erofs_do_read_page(&f, &folio->page); + err = z_erofs_scan_folio(&f, folio, false); z_erofs_pcluster_readmore(&f, NULL, false); z_erofs_pcluster_end(&f); - /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false); - + /* if some 
pclusters are ready, need submit them anyway */ + err = z_erofs_runqueue(&f, 0) ?: err; if (err && err != -EINTR) erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu", err, folio->index, EROFS_I(inode)->nid); @@ -1853,18 +1868,13 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) static void z_erofs_readahead(struct readahead_control *rac) { struct inode *const inode = rac->mapping->host; - struct erofs_sb_info *const sbi = EROFS_I_SB(inode); - struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); + Z_EROFS_DEFINE_FRONTEND(f, inode, readahead_pos(rac)); + unsigned int nrpages = readahead_count(rac); struct folio *head = NULL, *folio; - unsigned int nr_folios; int err; - f.headoffset = readahead_pos(rac); - + trace_erofs_readahead(inode, readahead_index(rac), nrpages, false); z_erofs_pcluster_readmore(&f, rac, true); - nr_folios = readahead_count(rac); - trace_erofs_readpages(inode, readahead_index(rac), nr_folios, false); - while ((folio = readahead_folio(rac))) { folio->private = head; head = folio; @@ -1875,7 +1885,7 @@ static void z_erofs_readahead(struct readahead_control *rac) folio = head; head = folio_get_private(folio); - err = z_erofs_do_read_page(&f, &folio->page); + err = z_erofs_scan_folio(&f, folio, true); if (err && err != -EINTR) erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu", folio->index, EROFS_I(inode)->nid); @@ -1883,7 +1893,7 @@ static void z_erofs_readahead(struct readahead_control *rac) z_erofs_pcluster_readmore(&f, rac, false); z_erofs_pcluster_end(&f); - z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_folios), true); + (void)z_erofs_runqueue(&f, nrpages); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); } diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 76566c2cbf63..25a4b82c183c 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -4,14 +4,12 @@ * https://www.huawei.com/ */ #include "internal.h" -#include <asm/unaligned.h> +#include 
<linux/unaligned.h> #include <trace/events/erofs.h> struct z_erofs_maprecorder { struct inode *inode; struct erofs_map_blocks *map; - void *kaddr; - unsigned long lcn; /* compression extent information gathered */ u8 type, headtype; @@ -31,22 +29,17 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, vi->inode_isize + vi->xattr_isize) + lcn * sizeof(struct z_erofs_lcluster_index); struct z_erofs_lcluster_index *di; - unsigned int advise, type; - - m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - erofs_blknr(inode->i_sb, pos), EROFS_KMAP); - if (IS_ERR(m->kaddr)) - return PTR_ERR(m->kaddr); + unsigned int advise; - m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index); + di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, EROFS_KMAP); + if (IS_ERR(di)) + return PTR_ERR(di); m->lcn = lcn; - di = m->kaddr + erofs_blkoff(inode->i_sb, pos); + m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index); advise = le16_to_cpu(di->di_advise); - type = (advise >> Z_EROFS_LI_LCLUSTER_TYPE_BIT) & - ((1 << Z_EROFS_LI_LCLUSTER_TYPE_BITS) - 1); - switch (type) { - case Z_EROFS_LCLUSTER_TYPE_NONHEAD: + m->type = advise & Z_EROFS_LI_LCLUSTER_TYPE_MASK; + if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { m->clusterofs = 1 << vi->z_logical_clusterbits; m->delta[0] = le16_to_cpu(di->di_u.delta[0]); if (m->delta[0] & Z_EROFS_LI_D0_CBLKCNT) { @@ -55,29 +48,19 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, DBG_BUGON(1); return -EFSCORRUPTED; } - m->compressedblks = m->delta[0] & - ~Z_EROFS_LI_D0_CBLKCNT; + m->compressedblks = m->delta[0] & ~Z_EROFS_LI_D0_CBLKCNT; m->delta[0] = 1; } m->delta[1] = le16_to_cpu(di->di_u.delta[1]); - break; - case Z_EROFS_LCLUSTER_TYPE_PLAIN: - case Z_EROFS_LCLUSTER_TYPE_HEAD1: - case Z_EROFS_LCLUSTER_TYPE_HEAD2: - if (advise & Z_EROFS_LI_PARTIAL_REF) - m->partialref = true; + } else { + m->partialref = !!(advise & Z_EROFS_LI_PARTIAL_REF); m->clusterofs = le16_to_cpu(di->di_clusterofs); if 
(m->clusterofs >= 1 << vi->z_logical_clusterbits) { DBG_BUGON(1); return -EFSCORRUPTED; } m->pblk = le32_to_cpu(di->di_u.blkaddr); - break; - default: - DBG_BUGON(1); - return -EOPNOTSUPP; } - m->type = type; return 0; } @@ -114,17 +97,48 @@ static int get_compacted_la_distance(unsigned int lobits, return d1; } -static int unpack_compacted_index(struct z_erofs_maprecorder *m, - unsigned int amortizedshift, - erofs_off_t pos, bool lookahead) +static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, + unsigned long lcn, bool lookahead) { - struct erofs_inode *const vi = EROFS_I(m->inode); + struct inode *const inode = m->inode; + struct erofs_inode *const vi = EROFS_I(inode); + const erofs_off_t ebase = sizeof(struct z_erofs_map_header) + + ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); const unsigned int lclusterbits = vi->z_logical_clusterbits; - unsigned int vcnt, base, lo, lobits, encodebits, nblk, eofs; - int i; + const unsigned int totalidx = erofs_iblks(inode); + unsigned int compacted_4b_initial, compacted_2b, amortizedshift; + unsigned int vcnt, lo, lobits, encodebits, nblk, bytes; + bool big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; + erofs_off_t pos; u8 *in, type; - bool big_pcluster; + int i; + + if (lcn >= totalidx || lclusterbits > 14) + return -EINVAL; + + m->lcn = lcn; + /* used to align to 32-byte (compacted_2b) alignment */ + compacted_4b_initial = ((32 - ebase % 32) / 4) & 7; + compacted_2b = 0; + if ((vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) && + compacted_4b_initial < totalidx) + compacted_2b = rounddown(totalidx - compacted_4b_initial, 16); + + pos = ebase; + amortizedshift = 2; /* compact_4b */ + if (lcn >= compacted_4b_initial) { + pos += compacted_4b_initial * 4; + lcn -= compacted_4b_initial; + if (lcn < compacted_2b) { + amortizedshift = 1; + } else { + pos += compacted_2b * 2; + lcn -= compacted_2b; + } + } + pos += lcn * (1 << amortizedshift); + /* figure out the lcluster count in this 
pack */ if (1 << amortizedshift == 4 && lclusterbits <= 14) vcnt = 2; else if (1 << amortizedshift == 2 && lclusterbits <= 12) @@ -132,17 +146,18 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, else return -EOPNOTSUPP; + in = erofs_read_metabuf(&m->map->buf, m->inode->i_sb, pos, EROFS_KMAP); + if (IS_ERR(in)) + return PTR_ERR(in); + /* it doesn't equal to round_up(..) */ m->nextpackoff = round_down(pos, vcnt << amortizedshift) + (vcnt << amortizedshift); - big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; lobits = max(lclusterbits, ilog2(Z_EROFS_LI_D0_CBLKCNT) + 1U); encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; - eofs = erofs_blkoff(m->inode->i_sb, pos); - base = round_down(eofs, vcnt << amortizedshift); - in = m->kaddr + base; - - i = (eofs - base) >> amortizedshift; + bytes = pos & ((vcnt << amortizedshift) - 1); + in -= bytes; + i = bytes >> amortizedshift; lo = decode_compactedbits(lobits, in, encodebits * i, &type); m->type = type; @@ -222,57 +237,6 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, return 0; } -static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, - unsigned long lcn, bool lookahead) -{ - struct inode *const inode = m->inode; - struct erofs_inode *const vi = EROFS_I(inode); - const erofs_off_t ebase = sizeof(struct z_erofs_map_header) + - ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); - unsigned int totalidx = erofs_iblks(inode); - unsigned int compacted_4b_initial, compacted_2b; - unsigned int amortizedshift; - erofs_off_t pos; - - if (lcn >= totalidx || vi->z_logical_clusterbits > 14) - return -EINVAL; - - m->lcn = lcn; - /* used to align to 32-byte (compacted_2b) alignment */ - compacted_4b_initial = (32 - ebase % 32) / 4; - if (compacted_4b_initial == 32 / 4) - compacted_4b_initial = 0; - - if ((vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) && - compacted_4b_initial < totalidx) - compacted_2b = rounddown(totalidx - 
compacted_4b_initial, 16); - else - compacted_2b = 0; - - pos = ebase; - if (lcn < compacted_4b_initial) { - amortizedshift = 2; - goto out; - } - pos += compacted_4b_initial * 4; - lcn -= compacted_4b_initial; - - if (lcn < compacted_2b) { - amortizedshift = 1; - goto out; - } - pos += compacted_2b * 2; - lcn -= compacted_2b; - amortizedshift = 2; -out: - pos += lcn * (1 << amortizedshift); - m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - erofs_blknr(inode->i_sb, pos), EROFS_KMAP); - if (IS_ERR(m->kaddr)) - return PTR_ERR(m->kaddr); - return unpack_compacted_index(m, amortizedshift, pos, lookahead); -} - static int z_erofs_load_lcluster_from_disk(struct z_erofs_maprecorder *m, unsigned int lcn, bool lookahead) { @@ -301,26 +265,22 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, if (err) return err; - switch (m->type) { - case Z_EROFS_LCLUSTER_TYPE_NONHEAD: + if (m->type >= Z_EROFS_LCLUSTER_TYPE_MAX) { + erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu", + m->type, lcn, vi->nid); + DBG_BUGON(1); + return -EOPNOTSUPP; + } else if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { lookback_distance = m->delta[0]; if (!lookback_distance) - goto err_bogus; + break; continue; - case Z_EROFS_LCLUSTER_TYPE_PLAIN: - case Z_EROFS_LCLUSTER_TYPE_HEAD1: - case Z_EROFS_LCLUSTER_TYPE_HEAD2: + } else { m->headtype = m->type; m->map->m_la = (lcn << lclusterbits) | m->clusterofs; return 0; - default: - erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu", - m->type, lcn, vi->nid); - DBG_BUGON(1); - return -EOPNOTSUPP; } } -err_bogus: erofs_err(sb, "bogus lookback distance %u @ lcn %lu of nid %llu", lookback_distance, m->lcn, vi->nid); DBG_BUGON(1); @@ -330,27 +290,23 @@ err_bogus: static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, unsigned int initial_lcn) { - struct super_block *sb = m->inode->i_sb; - struct erofs_inode *const vi = EROFS_I(m->inode); - struct erofs_map_blocks *const map = m->map; - const unsigned int 
lclusterbits = vi->z_logical_clusterbits; - unsigned long lcn; + struct inode *inode = m->inode; + struct super_block *sb = inode->i_sb; + struct erofs_inode *vi = EROFS_I(inode); + bool bigpcl1 = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; + bool bigpcl2 = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2; + unsigned long lcn = m->lcn + 1; int err; - DBG_BUGON(m->type != Z_EROFS_LCLUSTER_TYPE_PLAIN && - m->type != Z_EROFS_LCLUSTER_TYPE_HEAD1 && - m->type != Z_EROFS_LCLUSTER_TYPE_HEAD2); + DBG_BUGON(m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD); DBG_BUGON(m->type != m->headtype); - if (m->headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN || - ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD1) && - !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) || - ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) && - !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) { - map->m_plen = 1ULL << lclusterbits; - return 0; - } - lcn = m->lcn + 1; + if ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD1 && !bigpcl1) || + ((m->headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN || + m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) && !bigpcl2) || + (lcn << vi->z_logical_clusterbits) >= inode->i_size) + m->compressedblks = 1; + if (m->compressedblks) goto out; @@ -369,35 +325,28 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, DBG_BUGON(lcn == initial_lcn && m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD); - switch (m->type) { - case Z_EROFS_LCLUSTER_TYPE_PLAIN: - case Z_EROFS_LCLUSTER_TYPE_HEAD1: - case Z_EROFS_LCLUSTER_TYPE_HEAD2: + if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { + if (m->delta[0] != 1) { + erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + if (m->compressedblks) + goto out; + } else if (m->type < Z_EROFS_LCLUSTER_TYPE_MAX) { /* * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type - * rather than CBLKCNT, it's a 1 lcluster-sized pcluster. + * rather than CBLKCNT, it's a 1 block-sized pcluster. 
*/ - m->compressedblks = 1 << (lclusterbits - sb->s_blocksize_bits); - break; - case Z_EROFS_LCLUSTER_TYPE_NONHEAD: - if (m->delta[0] != 1) - goto err_bonus_cblkcnt; - if (m->compressedblks) - break; - fallthrough; - default: - erofs_err(sb, "cannot found CBLKCNT @ lcn %lu of nid %llu", lcn, - vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; + m->compressedblks = 1; + goto out; } -out: - map->m_plen = erofs_pos(sb, m->compressedblks); - return 0; -err_bonus_cblkcnt: - erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid); + erofs_err(sb, "cannot found CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; +out: + m->map->m_plen = erofs_pos(sb, m->compressedblks); + return 0; } static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) @@ -426,9 +375,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) m->delta[1] = 1; DBG_BUGON(1); } - } else if (m->type == Z_EROFS_LCLUSTER_TYPE_PLAIN || - m->type == Z_EROFS_LCLUSTER_TYPE_HEAD1 || - m->type == Z_EROFS_LCLUSTER_TYPE_HEAD2) { + } else if (m->type < Z_EROFS_LCLUSTER_TYPE_MAX) { if (lcn != headlcn) break; /* ends at the next HEAD lcluster */ m->delta[1] = 1; @@ -447,9 +394,10 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) static int z_erofs_do_map_blocks(struct inode *inode, struct erofs_map_blocks *map, int flags) { - struct erofs_inode *const vi = EROFS_I(inode); - bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER; + struct erofs_inode *vi = EROFS_I(inode); + struct super_block *sb = inode->i_sb; bool fragment = vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; + bool ztailpacking = vi->z_idata_size; struct z_erofs_maprecorder m = { .inode = inode, .map = map, @@ -468,9 +416,8 @@ static int z_erofs_do_map_blocks(struct inode *inode, if (err) goto unmap_out; - if (ztailpacking && (flags & EROFS_GET_BLOCKS_FINDTAIL)) - vi->z_idataoff = m.nextpackoff; - + if ((flags & 
EROFS_GET_BLOCKS_FINDTAIL) && ztailpacking) + vi->z_fragmentoff = m.nextpackoff; map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED; end = (m.lcn + 1ULL) << lclusterbits; @@ -492,8 +439,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, } /* m.lcn should be >= 1 if endoff < m.clusterofs */ if (!m.lcn) { - erofs_err(inode->i_sb, - "invalid logical cluster 0 at nid %llu", + erofs_err(sb, "invalid logical cluster 0 at nid %llu", vi->nid); err = -EFSCORRUPTED; goto unmap_out; @@ -509,8 +455,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, goto unmap_out; break; default: - erofs_err(inode->i_sb, - "unknown type %u @ offset %llu of nid %llu", + erofs_err(sb, "unknown type %u @ offset %llu of nid %llu", m.type, ofs, vi->nid); err = -EOPNOTSUPP; goto unmap_out; @@ -527,12 +472,18 @@ static int z_erofs_do_map_blocks(struct inode *inode, } if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) { map->m_flags |= EROFS_MAP_META; - map->m_pa = vi->z_idataoff; + map->m_pa = vi->z_fragmentoff; map->m_plen = vi->z_idata_size; + if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) { + erofs_err(sb, "invalid tail-packing pclustersize %llu", + map->m_plen); + err = -EFSCORRUPTED; + goto unmap_out; + } } else if (fragment && m.lcn == vi->z_tailextent_headlcn) { - map->m_flags |= EROFS_MAP_FRAGMENT; + map->m_flags = EROFS_MAP_FRAGMENT; } else { - map->m_pa = erofs_pos(inode->i_sb, m.pblk); + map->m_pa = erofs_pos(sb, m.pblk); err = z_erofs_get_extent_compressedlen(&m, initial_lcn); if (err) goto unmap_out; @@ -551,7 +502,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ? 
vi->z_algorithmtype[1] : vi->z_algorithmtype[0]; if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) { - erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu", + erofs_err(sb, "inconsistent algorithmtype %u for nid %llu", afmt, vi->nid); err = -EFSCORRUPTED; goto unmap_out; @@ -562,7 +513,8 @@ static int z_erofs_do_map_blocks(struct inode *inode, if ((flags & EROFS_GET_BLOCKS_FIEMAP) || ((flags & EROFS_GET_BLOCKS_READMORE) && (map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA || - map->m_algorithmformat == Z_EROFS_COMPRESSION_DEFLATE) && + map->m_algorithmformat == Z_EROFS_COMPRESSION_DEFLATE || + map->m_algorithmformat == Z_EROFS_COMPRESSION_ZSTD) && map->m_llen >= i_blocksize(inode))) { err = z_erofs_get_extent_decompressedlen(&m); if (!err) @@ -581,7 +533,6 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) int err, headnr; erofs_off_t pos; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - void *kaddr; struct z_erofs_map_header *h; if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) { @@ -601,13 +552,12 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) goto out_unlock; pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); - kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(sb, pos), EROFS_KMAP); - if (IS_ERR(kaddr)) { - err = PTR_ERR(kaddr); + h = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP); + if (IS_ERR(h)) { + err = PTR_ERR(h); goto out_unlock; } - h = kaddr + erofs_blkoff(sb, pos); /* * if the highest bit of the 8-byte map header is set, the whole file * is stored in the packed inode. The rest bits keeps z_fragmentoff. 
@@ -621,6 +571,10 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) vi->z_advise = le16_to_cpu(h->h_advise); vi->z_algorithmtype[0] = h->h_algorithmtype & 15; vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; + if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) + vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff); + else if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) + vi->z_idata_size = le16_to_cpu(h->h_idata_size); headnr = 0; if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX || @@ -649,33 +603,12 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) goto out_put_metabuf; } - if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) { - struct erofs_map_blocks map = { - .buf = __EROFS_BUF_INITIALIZER - }; - - vi->z_idata_size = le16_to_cpu(h->h_idata_size); - err = z_erofs_do_map_blocks(inode, &map, - EROFS_GET_BLOCKS_FINDTAIL); - erofs_put_metabuf(&map.buf); - - if (!map.m_plen || - erofs_blkoff(sb, map.m_pa) + map.m_plen > sb->s_blocksize) { - erofs_err(sb, "invalid tail-packing pclustersize %llu", - map.m_plen); - err = -EFSCORRUPTED; - } - if (err < 0) - goto out_put_metabuf; - } - - if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER && - !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) { + if (vi->z_idata_size || + (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) { struct erofs_map_blocks map = { .buf = __EROFS_BUF_INITIALIZER }; - vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff); err = z_erofs_do_map_blocks(inode, &map, EROFS_GET_BLOCKS_FINDTAIL); erofs_put_metabuf(&map.buf); @@ -699,34 +632,31 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, struct erofs_inode *const vi = EROFS_I(inode); int err = 0; - trace_z_erofs_map_blocks_iter_enter(inode, map, flags); - - /* when trying to read beyond EOF, leave it unmapped */ - if (map->m_la >= inode->i_size) { + trace_erofs_map_blocks_enter(inode, map, flags); + if (map->m_la >= inode->i_size) { /* post-EOF unmapped extent */ map->m_llen = map->m_la + 1 - 
inode->i_size; map->m_la = inode->i_size; map->m_flags = 0; - goto out; - } - - err = z_erofs_fill_inode_lazy(inode); - if (err) - goto out; - - if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) && - !vi->z_tailextent_headlcn) { - map->m_la = 0; - map->m_llen = inode->i_size; - map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_FULL_MAPPED | - EROFS_MAP_FRAGMENT; - goto out; + } else { + err = z_erofs_fill_inode_lazy(inode); + if (!err) { + if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) && + !vi->z_tailextent_headlcn) { + map->m_la = 0; + map->m_llen = inode->i_size; + map->m_flags = EROFS_MAP_FRAGMENT; + } else { + err = z_erofs_do_map_blocks(inode, map, flags); + } + } + if (!err && (map->m_flags & EROFS_MAP_ENCODED) && + unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE || + map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE)) + err = -EOPNOTSUPP; + if (err) + map->m_llen = 0; } - - err = z_erofs_do_map_blocks(inode, map, flags); -out: - if (err) - map->m_llen = 0; - trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err); + trace_erofs_map_blocks_exit(inode, map, flags, err); return err; } @@ -747,7 +677,7 @@ static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset, iomap->length = map.m_llen; if (map.m_flags & EROFS_MAP_MAPPED) { iomap->type = IOMAP_MAPPED; - iomap->addr = map.m_flags & EROFS_MAP_FRAGMENT ? + iomap->addr = map.m_flags & __EROFS_MAP_FRAGMENT ? IOMAP_NULL_ADDR : map.m_pa; } else { iomap->type = IOMAP_HOLE; diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c new file mode 100644 index 000000000000..0dd65cefce33 --- /dev/null +++ b/fs/erofs/zutil.c @@ -0,0 +1,317 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2018 HUAWEI, Inc. 
+ * https://www.huawei.com/ + * Copyright (C) 2024 Alibaba Cloud + */ +#include "internal.h" + +struct z_erofs_gbuf { + spinlock_t lock; + void *ptr; + struct page **pages; + unsigned int nrpages; +}; + +static struct z_erofs_gbuf *z_erofs_gbufpool, *z_erofs_rsvbuf; +static unsigned int z_erofs_gbuf_count, z_erofs_gbuf_nrpages, + z_erofs_rsv_nrpages; + +module_param_named(global_buffers, z_erofs_gbuf_count, uint, 0444); +module_param_named(reserved_pages, z_erofs_rsv_nrpages, uint, 0444); + +atomic_long_t erofs_global_shrink_cnt; /* for all mounted instances */ + +/* protects `shrinker_run_no` and the mounted `erofs_sb_list` */ +static DEFINE_SPINLOCK(erofs_sb_list_lock); +static LIST_HEAD(erofs_sb_list); +static unsigned int shrinker_run_no; +static struct shrinker *erofs_shrinker_info; + +static unsigned int z_erofs_gbuf_id(void) +{ + return raw_smp_processor_id() % z_erofs_gbuf_count; +} + +void *z_erofs_get_gbuf(unsigned int requiredpages) + __acquires(gbuf->lock) +{ + struct z_erofs_gbuf *gbuf; + + migrate_disable(); + gbuf = &z_erofs_gbufpool[z_erofs_gbuf_id()]; + spin_lock(&gbuf->lock); + /* check if the buffer is too small */ + if (requiredpages > gbuf->nrpages) { + spin_unlock(&gbuf->lock); + migrate_enable(); + /* (for sparse checker) pretend gbuf->lock is still taken */ + __acquire(gbuf->lock); + return NULL; + } + return gbuf->ptr; +} + +void z_erofs_put_gbuf(void *ptr) __releases(gbuf->lock) +{ + struct z_erofs_gbuf *gbuf; + + gbuf = &z_erofs_gbufpool[z_erofs_gbuf_id()]; + DBG_BUGON(gbuf->ptr != ptr); + spin_unlock(&gbuf->lock); + migrate_enable(); +} + +int z_erofs_gbuf_growsize(unsigned int nrpages) +{ + static DEFINE_MUTEX(gbuf_resize_mutex); + struct page **tmp_pages = NULL; + struct z_erofs_gbuf *gbuf; + void *ptr, *old_ptr; + int last, i, j; + + mutex_lock(&gbuf_resize_mutex); + /* avoid shrinking gbufs, since no idea how many fses rely on */ + if (nrpages <= z_erofs_gbuf_nrpages) { + mutex_unlock(&gbuf_resize_mutex); + return 0; + } + + for
(i = 0; i < z_erofs_gbuf_count; ++i) { + gbuf = &z_erofs_gbufpool[i]; + tmp_pages = kcalloc(nrpages, sizeof(*tmp_pages), GFP_KERNEL); + if (!tmp_pages) + goto out; + + for (j = 0; j < gbuf->nrpages; ++j) + tmp_pages[j] = gbuf->pages[j]; + do { + last = j; + j = alloc_pages_bulk_array(GFP_KERNEL, nrpages, + tmp_pages); + if (last == j) + goto out; + } while (j != nrpages); + + ptr = vmap(tmp_pages, nrpages, VM_MAP, PAGE_KERNEL); + if (!ptr) + goto out; + + spin_lock(&gbuf->lock); + kfree(gbuf->pages); + gbuf->pages = tmp_pages; + old_ptr = gbuf->ptr; + gbuf->ptr = ptr; + gbuf->nrpages = nrpages; + spin_unlock(&gbuf->lock); + if (old_ptr) + vunmap(old_ptr); + } + z_erofs_gbuf_nrpages = nrpages; +out: + if (i < z_erofs_gbuf_count && tmp_pages) { + for (j = 0; j < nrpages; ++j) + if (tmp_pages[j] && (j >= gbuf->nrpages || + tmp_pages[j] != gbuf->pages[j])) + __free_page(tmp_pages[j]); + kfree(tmp_pages); + } + mutex_unlock(&gbuf_resize_mutex); + return i < z_erofs_gbuf_count ? -ENOMEM : 0; +} + +int __init z_erofs_gbuf_init(void) +{ + unsigned int i, total = num_possible_cpus(); + + if (z_erofs_gbuf_count) + total = min(z_erofs_gbuf_count, total); + z_erofs_gbuf_count = total; + + /* The last (special) global buffer is the reserved buffer */ + total += !!z_erofs_rsv_nrpages; + + z_erofs_gbufpool = kcalloc(total, sizeof(*z_erofs_gbufpool), + GFP_KERNEL); + if (!z_erofs_gbufpool) + return -ENOMEM; + + if (z_erofs_rsv_nrpages) { + z_erofs_rsvbuf = &z_erofs_gbufpool[total - 1]; + z_erofs_rsvbuf->pages = kcalloc(z_erofs_rsv_nrpages, + sizeof(*z_erofs_rsvbuf->pages), GFP_KERNEL); + if (!z_erofs_rsvbuf->pages) { + z_erofs_rsvbuf = NULL; + z_erofs_rsv_nrpages = 0; + } + } + for (i = 0; i < total; ++i) + spin_lock_init(&z_erofs_gbufpool[i].lock); + return 0; +} + +void z_erofs_gbuf_exit(void) +{ + int i, j; + + for (i = 0; i < z_erofs_gbuf_count + (!!z_erofs_rsvbuf); ++i) { + struct z_erofs_gbuf *gbuf = &z_erofs_gbufpool[i]; + + if (gbuf->ptr) { + vunmap(gbuf->ptr); + gbuf->ptr 
= NULL; + } + + if (!gbuf->pages) + continue; + + for (j = 0; j < gbuf->nrpages; ++j) + if (gbuf->pages[j]) + put_page(gbuf->pages[j]); + kfree(gbuf->pages); + gbuf->pages = NULL; + } + kfree(z_erofs_gbufpool); +} + +struct page *__erofs_allocpage(struct page **pagepool, gfp_t gfp, bool tryrsv) +{ + struct page *page = *pagepool; + + if (page) { + *pagepool = (struct page *)page_private(page); + } else if (tryrsv && z_erofs_rsvbuf && z_erofs_rsvbuf->nrpages) { + spin_lock(&z_erofs_rsvbuf->lock); + if (z_erofs_rsvbuf->nrpages) + page = z_erofs_rsvbuf->pages[--z_erofs_rsvbuf->nrpages]; + spin_unlock(&z_erofs_rsvbuf->lock); + } + if (!page) + page = alloc_page(gfp); + DBG_BUGON(page && page_ref_count(page) != 1); + return page; +} + +void erofs_release_pages(struct page **pagepool) +{ + while (*pagepool) { + struct page *page = *pagepool; + + *pagepool = (struct page *)page_private(page); + /* try to fill reserved global pool first */ + if (z_erofs_rsvbuf && z_erofs_rsvbuf->nrpages < + z_erofs_rsv_nrpages) { + spin_lock(&z_erofs_rsvbuf->lock); + if (z_erofs_rsvbuf->nrpages < z_erofs_rsv_nrpages) { + z_erofs_rsvbuf->pages[z_erofs_rsvbuf->nrpages++] + = page; + spin_unlock(&z_erofs_rsvbuf->lock); + continue; + } + spin_unlock(&z_erofs_rsvbuf->lock); + } + put_page(page); + } +} + +void erofs_shrinker_register(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + + mutex_init(&sbi->umount_mutex); + + spin_lock(&erofs_sb_list_lock); + list_add(&sbi->list, &erofs_sb_list); + spin_unlock(&erofs_sb_list_lock); +} + +void erofs_shrinker_unregister(struct super_block *sb) +{ + struct erofs_sb_info *const sbi = EROFS_SB(sb); + + mutex_lock(&sbi->umount_mutex); + while (!xa_empty(&sbi->managed_pslots)) { + z_erofs_shrink_scan(sbi, ~0UL); + cond_resched(); + } + spin_lock(&erofs_sb_list_lock); + list_del(&sbi->list); + spin_unlock(&erofs_sb_list_lock); + mutex_unlock(&sbi->umount_mutex); +} + +static unsigned long erofs_shrink_count(struct shrinker *shrink, + 
struct shrink_control *sc) +{ + return atomic_long_read(&erofs_global_shrink_cnt); +} + +static unsigned long erofs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct erofs_sb_info *sbi; + struct list_head *p; + + unsigned long nr = sc->nr_to_scan; + unsigned int run_no; + unsigned long freed = 0; + + spin_lock(&erofs_sb_list_lock); + do { + run_no = ++shrinker_run_no; + } while (run_no == 0); + + /* Iterate over all mounted superblocks and try to shrink them */ + p = erofs_sb_list.next; + while (p != &erofs_sb_list) { + sbi = list_entry(p, struct erofs_sb_info, list); + + /* + * We move the ones we do to the end of the list, so we stop + * when we see one we have already done. + */ + if (sbi->shrinker_run_no == run_no) + break; + + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + + spin_unlock(&erofs_sb_list_lock); + sbi->shrinker_run_no = run_no; + freed += z_erofs_shrink_scan(sbi, nr - freed); + spin_lock(&erofs_sb_list_lock); + /* Get the next list element before we move this one */ + p = p->next; + + /* + * Move this one to the end of the list to provide some + * fairness. + */ + list_move_tail(&sbi->list, &erofs_sb_list); + mutex_unlock(&sbi->umount_mutex); + + if (freed >= nr) + break; + } + spin_unlock(&erofs_sb_list_lock); + return freed; +} + +int __init erofs_init_shrinker(void) +{ + erofs_shrinker_info = shrinker_alloc(0, "erofs-shrinker"); + if (!erofs_shrinker_info) + return -ENOMEM; + + erofs_shrinker_info->count_objects = erofs_shrink_count; + erofs_shrinker_info->scan_objects = erofs_shrink_scan; + shrinker_register(erofs_shrinker_info); + return 0; +} + +void erofs_exit_shrinker(void) +{ + shrinker_free(erofs_shrinker_info); +} |