Diffstat (limited to 'fs/erofs')
-rw-r--r--  fs/erofs/Kconfig                  30
-rw-r--r--  fs/erofs/Makefile                  1
-rw-r--r--  fs/erofs/compress.h               35
-rw-r--r--  fs/erofs/data.c                  252
-rw-r--r--  fs/erofs/decompressor.c          102
-rw-r--r--  fs/erofs/decompressor_crypto.c   181
-rw-r--r--  fs/erofs/decompressor_deflate.c   28
-rw-r--r--  fs/erofs/decompressor_lzma.c       8
-rw-r--r--  fs/erofs/decompressor_zstd.c       8
-rw-r--r--  fs/erofs/dir.c                     9
-rw-r--r--  fs/erofs/erofs_fs.h              194
-rw-r--r--  fs/erofs/fileio.c                 29
-rw-r--r--  fs/erofs/fscache.c                18
-rw-r--r--  fs/erofs/inode.c                 137
-rw-r--r--  fs/erofs/internal.h              100
-rw-r--r--  fs/erofs/namei.c                   2
-rw-r--r--  fs/erofs/super.c                 296
-rw-r--r--  fs/erofs/sysfs.c                  86
-rw-r--r--  fs/erofs/xattr.c                  16
-rw-r--r--  fs/erofs/zdata.c                 632
-rw-r--r--  fs/erofs/zmap.c                  419
-rw-r--r--  fs/erofs/zutil.c                 164
22 files changed, 1463 insertions, 1284 deletions
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 6ea60661fa55..6beeb7063871 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -3,8 +3,8 @@
config EROFS_FS
tristate "EROFS filesystem support"
depends on BLOCK
+ select CRC32
select FS_IOMAP
- select LIBCRC32C
help
EROFS (Enhanced Read-Only File System) is a lightweight read-only
file system with modern designs (e.g. no buffer heads, inline
@@ -13,12 +13,12 @@ config EROFS_FS
smartphones with Android OS, LiveCDs and high-density hosts with
numerous containers;
- It also provides fixed-sized output compression support in order to
- improve storage density as well as keep relatively higher compression
- ratios and implements in-place decompression to reuse the file page
- for compressed data temporarily with proper strategies, which is
- quite useful to ensure guaranteed end-to-end runtime decompression
- performance under extremely memory pressure without extra cost.
+ It also provides transparent compression and deduplication support to
+ improve storage density and maintain relatively high compression
+ ratios, and it implements in-place decompression to temporarily reuse
+ page cache for compressed data using proper strategies, which is
+ quite useful for ensuring guaranteed end-to-end runtime decompression
+ performance under extreme memory pressure without extra cost.
See the documentation at <file:Documentation/filesystems/erofs.rst>
and the web pages at <https://erofs.docs.kernel.org> for more details.
@@ -97,7 +97,7 @@ config EROFS_FS_ZIP
select LZ4_DECOMPRESS
default y
help
- Enable fixed-sized output compression for EROFS.
+ Enable transparent compression support for EROFS file systems.
If you don't want to enable compression feature, say N.
@@ -144,6 +144,20 @@ config EROFS_FS_ZIP_ZSTD
If unsure, say N.
+config EROFS_FS_ZIP_ACCEL
+ bool "EROFS hardware decompression support"
+ depends on EROFS_FS_ZIP
+ help
+ Saying Y here includes hardware accelerator support for reading
+ EROFS file systems containing compressed data. It gives better
+ decompression speed than the software-implemented decompression, with
+ lower CPU overhead.
+
+ Hardware accelerator support is an experimental feature for now and
+ file systems are still readable without selecting this option.
+
+ If unsure, say N.
+
config EROFS_FS_ONDEMAND
bool "EROFS fscache-based on-demand read support (deprecated)"
depends on EROFS_FS
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 4331d53c7109..549abc424763 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -7,5 +7,6 @@ erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o zutil.o
erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o
erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o
+erofs-$(CONFIG_EROFS_FS_ZIP_ACCEL) += decompressor_crypto.o
erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o
erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 7bfe251680ec..510e922c5193 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -11,6 +11,7 @@
struct z_erofs_decompress_req {
struct super_block *sb;
struct page **in, **out;
+ unsigned int inpages, outpages;
unsigned short pageofs_in, pageofs_out;
unsigned int inputsize, outputsize;
@@ -29,29 +30,8 @@ struct z_erofs_decompressor {
char *name;
};
-/* some special page->private (unsigned long, see below) */
#define Z_EROFS_SHORTLIVED_PAGE (-1UL << 2)
-#define Z_EROFS_PREALLOCATED_PAGE (-2UL << 2)
-
-/*
- * For all pages in a pcluster, page->private should be one of
- * Type Last 2bits page->private
- * short-lived page 00 Z_EROFS_SHORTLIVED_PAGE
- * preallocated page (tryalloc) 00 Z_EROFS_PREALLOCATED_PAGE
- * cached/managed page 00 pointer to z_erofs_pcluster
- * online page (file-backed, 01/10/11 sub-index << 2 | count
- * some pages can be used for inplace I/O)
- *
- * page->mapping should be one of
- * Type page->mapping
- * short-lived page NULL
- * preallocated page NULL
- * cached/managed page non-NULL or NULL (invalidated/truncated page)
- * online page non-NULL
- *
- * For all managed pages, PG_private should be set with 1 extra refcount,
- * which is used for page reclaim / migration.
- */
+#define Z_EROFS_PREALLOCATED_FOLIO ((void *)(-2UL << 2))
/*
* Currently, short-lived pages are pages directly from buddy system
@@ -80,7 +60,6 @@ extern const struct z_erofs_decompressor *z_erofs_decomp[];
struct z_erofs_stream_dctx {
struct z_erofs_decompress_req *rq;
- unsigned int inpages, outpages; /* # of {en,de}coded pages */
int no, ni; /* the current {en,de}coded page # */
unsigned int avail_out; /* remaining bytes in the decoded buffer */
@@ -97,4 +76,14 @@ int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
unsigned int padbufsize);
int __init z_erofs_init_decompressor(void);
void z_erofs_exit_decompressor(void);
+int z_erofs_crypto_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl);
+int z_erofs_crypto_enable_engine(const char *name, int len);
+#ifdef CONFIG_EROFS_FS_ZIP_ACCEL
+void z_erofs_crypto_disable_all_engines(void);
+int z_erofs_crypto_show_engines(char *buf, int size, char sep);
+#else
+static inline void z_erofs_crypto_disable_all_engines(void) {}
+static inline int z_erofs_crypto_show_engines(char *buf, int size, char sep) { return 0; }
+#endif
#endif
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 61debd799cf9..6a329c329f43 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -10,10 +10,10 @@
void erofs_unmap_metabuf(struct erofs_buf *buf)
{
- if (buf->kmap_type == EROFS_KMAP)
- kunmap_local(buf->base);
+ if (!buf->base)
+ return;
+ kunmap_local(buf->base);
buf->base = NULL;
- buf->kmap_type = EROFS_NO_KMAP;
}
void erofs_put_metabuf(struct erofs_buf *buf)
@@ -25,10 +25,9 @@ void erofs_put_metabuf(struct erofs_buf *buf)
buf->page = NULL;
}
-void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,
- enum erofs_kmap_type type)
+void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap)
{
- pgoff_t index = offset >> PAGE_SHIFT;
+ pgoff_t index = (buf->off + offset) >> PAGE_SHIFT;
struct folio *folio = NULL;
if (buf->page) {
@@ -38,22 +37,15 @@ void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,
}
if (!folio || !folio_contains(folio, index)) {
erofs_put_metabuf(buf);
- folio = read_mapping_folio(buf->mapping, index, NULL);
+ folio = read_mapping_folio(buf->mapping, index, buf->file);
if (IS_ERR(folio))
return folio;
}
buf->page = folio_file_page(folio, index);
-
- if (buf->kmap_type == EROFS_NO_KMAP) {
- if (type == EROFS_KMAP)
- buf->base = kmap_local_page(buf->page);
- buf->kmap_type = type;
- } else if (buf->kmap_type != type) {
- DBG_BUGON(1);
- return ERR_PTR(-EFAULT);
- }
- if (type == EROFS_NO_KMAP)
+ if (!need_kmap)
return NULL;
+ if (!buf->base)
+ buf->base = kmap_local_page(buf->page);
return buf->base + (offset & ~PAGE_MASK);
}
@@ -61,73 +53,59 @@ void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
- if (erofs_is_fileio_mode(sbi))
- buf->mapping = file_inode(sbi->fdev)->i_mapping;
- else if (erofs_is_fscache_mode(sb))
- buf->mapping = sbi->s_fscache->inode->i_mapping;
+ buf->file = NULL;
+ buf->off = sbi->dif0.fsoff;
+ if (erofs_is_fileio_mode(sbi)) {
+ buf->file = sbi->dif0.file; /* some fs like FUSE needs it */
+ buf->mapping = buf->file->f_mapping;
+ } else if (erofs_is_fscache_mode(sb))
+ buf->mapping = sbi->dif0.fscache->inode->i_mapping;
else
buf->mapping = sb->s_bdev->bd_mapping;
}
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
- erofs_off_t offset, enum erofs_kmap_type type)
+ erofs_off_t offset, bool need_kmap)
{
erofs_init_metabuf(buf, sb);
- return erofs_bread(buf, offset, type);
-}
-
-static int erofs_map_blocks_flatmode(struct inode *inode,
- struct erofs_map_blocks *map)
-{
- struct erofs_inode *vi = EROFS_I(inode);
- struct super_block *sb = inode->i_sb;
- bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE);
- erofs_blk_t lastblk = erofs_iblks(inode) - tailendpacking;
-
- map->m_flags = EROFS_MAP_MAPPED; /* no hole in flat inodes */
- if (map->m_la < erofs_pos(sb, lastblk)) {
- map->m_pa = erofs_pos(sb, vi->raw_blkaddr) + map->m_la;
- map->m_plen = erofs_pos(sb, lastblk) - map->m_la;
- } else {
- DBG_BUGON(!tailendpacking);
- map->m_pa = erofs_iloc(inode) + vi->inode_isize +
- vi->xattr_isize + erofs_blkoff(sb, map->m_la);
- map->m_plen = inode->i_size - map->m_la;
-
- /* inline data should be located in the same meta block */
- if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) {
- erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
- map->m_flags |= EROFS_MAP_META;
- }
- return 0;
+ return erofs_bread(buf, offset, need_kmap);
}
int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
{
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct super_block *sb = inode->i_sb;
+ unsigned int unit, blksz = sb->s_blocksize;
struct erofs_inode *vi = EROFS_I(inode);
struct erofs_inode_chunk_index *idx;
- struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- u64 chunknr;
- unsigned int unit;
+ erofs_blk_t startblk, addrmask;
+ bool tailpacking;
erofs_off_t pos;
- void *kaddr;
+ u64 chunknr;
int err = 0;
trace_erofs_map_blocks_enter(inode, map, 0);
map->m_deviceid = 0;
- if (map->m_la >= inode->i_size) {
- /* leave out-of-bound access unmapped */
- map->m_flags = 0;
- map->m_plen = map->m_llen;
+ map->m_flags = 0;
+ if (map->m_la >= inode->i_size)
goto out;
- }
if (vi->datalayout != EROFS_INODE_CHUNK_BASED) {
- err = erofs_map_blocks_flatmode(inode, map);
+ tailpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE);
+ if (!tailpacking && vi->startblk == EROFS_NULL_ADDR)
+ goto out;
+ pos = erofs_pos(sb, erofs_iblks(inode) - tailpacking);
+
+ map->m_flags = EROFS_MAP_MAPPED;
+ if (map->m_la < pos) {
+ map->m_pa = erofs_pos(sb, vi->startblk) + map->m_la;
+ map->m_llen = pos - map->m_la;
+ } else {
+ map->m_pa = erofs_iloc(inode) + vi->inode_isize +
+ vi->xattr_isize + erofs_blkoff(sb, map->m_la);
+ map->m_llen = inode->i_size - map->m_la;
+ map->m_flags |= EROFS_MAP_META;
+ }
goto out;
}
@@ -140,78 +118,67 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
pos = ALIGN(erofs_iloc(inode) + vi->inode_isize +
vi->xattr_isize, unit) + unit * chunknr;
- kaddr = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP);
- if (IS_ERR(kaddr)) {
- err = PTR_ERR(kaddr);
+ idx = erofs_read_metabuf(&buf, sb, pos, true);
+ if (IS_ERR(idx)) {
+ err = PTR_ERR(idx);
goto out;
}
map->m_la = chunknr << vi->chunkbits;
- map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits,
- round_up(inode->i_size - map->m_la, sb->s_blocksize));
-
- /* handle block map */
- if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) {
- __le32 *blkaddr = kaddr;
-
- if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) {
- map->m_flags = 0;
- } else {
- map->m_pa = erofs_pos(sb, le32_to_cpu(*blkaddr));
+ map->m_llen = min_t(erofs_off_t, 1UL << vi->chunkbits,
+ round_up(inode->i_size - map->m_la, blksz));
+ if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) {
+ addrmask = (vi->chunkformat & EROFS_CHUNK_FORMAT_48BIT) ?
+ BIT_ULL(48) - 1 : BIT_ULL(32) - 1;
+ startblk = (((u64)le16_to_cpu(idx->startblk_hi) << 32) |
+ le32_to_cpu(idx->startblk_lo)) & addrmask;
+ if ((startblk ^ EROFS_NULL_ADDR) & addrmask) {
+ map->m_deviceid = le16_to_cpu(idx->device_id) &
+ EROFS_SB(sb)->device_id_mask;
+ map->m_pa = erofs_pos(sb, startblk);
+ map->m_flags = EROFS_MAP_MAPPED;
+ }
+ } else {
+ startblk = le32_to_cpu(*(__le32 *)idx);
+ if (startblk != (u32)EROFS_NULL_ADDR) {
+ map->m_pa = erofs_pos(sb, startblk);
map->m_flags = EROFS_MAP_MAPPED;
}
- goto out_unlock;
- }
- /* parse chunk indexes */
- idx = kaddr;
- switch (le32_to_cpu(idx->blkaddr)) {
- case EROFS_NULL_ADDR:
- map->m_flags = 0;
- break;
- default:
- map->m_deviceid = le16_to_cpu(idx->device_id) &
- EROFS_SB(sb)->device_id_mask;
- map->m_pa = erofs_pos(sb, le32_to_cpu(idx->blkaddr));
- map->m_flags = EROFS_MAP_MAPPED;
- break;
}
-out_unlock:
erofs_put_metabuf(&buf);
out:
- if (!err)
- map->m_llen = map->m_plen;
+ if (!err) {
+ map->m_plen = map->m_llen;
+ /* inline data should be located in the same meta block */
+ if ((map->m_flags & EROFS_MAP_META) &&
+ erofs_blkoff(sb, map->m_pa) + map->m_plen > blksz) {
+ erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ }
trace_erofs_map_blocks_exit(inode, map, 0, err);
return err;
}
static void erofs_fill_from_devinfo(struct erofs_map_dev *map,
- struct erofs_device_info *dif)
+ struct super_block *sb, struct erofs_device_info *dif)
{
+ map->m_sb = sb;
+ map->m_dif = dif;
map->m_bdev = NULL;
- map->m_fp = NULL;
- if (dif->file) {
- if (S_ISBLK(file_inode(dif->file)->i_mode))
- map->m_bdev = file_bdev(dif->file);
- else
- map->m_fp = dif->file;
- }
- map->m_daxdev = dif->dax_dev;
- map->m_dax_part_off = dif->dax_part_off;
- map->m_fscache = dif->fscache;
+ if (dif->file && S_ISBLK(file_inode(dif->file)->i_mode))
+ map->m_bdev = file_bdev(dif->file);
}
int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
{
struct erofs_dev_context *devs = EROFS_SB(sb)->devs;
struct erofs_device_info *dif;
- erofs_off_t startoff, length;
+ erofs_off_t startoff;
int id;
- map->m_bdev = sb->s_bdev;
- map->m_daxdev = EROFS_SB(sb)->dax_dev;
- map->m_dax_part_off = EROFS_SB(sb)->dax_part_off;
- map->m_fscache = EROFS_SB(sb)->s_fscache;
- map->m_fp = EROFS_SB(sb)->fdev;
-
+ erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0);
+ map->m_bdev = sb->s_bdev; /* use s_bdev for the primary device */
if (map->m_deviceid) {
down_read(&devs->rwsem);
dif = idr_find(&devs->tree, map->m_deviceid - 1);
@@ -220,24 +187,23 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
return -ENODEV;
}
if (devs->flatdev) {
- map->m_pa += erofs_pos(sb, dif->mapped_blkaddr);
+ map->m_pa += erofs_pos(sb, dif->uniaddr);
up_read(&devs->rwsem);
return 0;
}
- erofs_fill_from_devinfo(map, dif);
+ erofs_fill_from_devinfo(map, sb, dif);
up_read(&devs->rwsem);
} else if (devs->extra_devices && !devs->flatdev) {
down_read(&devs->rwsem);
idr_for_each_entry(&devs->tree, dif, id) {
- if (!dif->mapped_blkaddr)
+ if (!dif->uniaddr)
continue;
- startoff = erofs_pos(sb, dif->mapped_blkaddr);
- length = erofs_pos(sb, dif->blocks);
+ startoff = erofs_pos(sb, dif->uniaddr);
if (map->m_pa >= startoff &&
- map->m_pa < startoff + length) {
+ map->m_pa < startoff + erofs_pos(sb, dif->blocks)) {
map->m_pa -= startoff;
- erofs_fill_from_devinfo(map, dif);
+ erofs_fill_from_devinfo(map, sb, dif);
break;
}
}
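
The chunk-index decoding above packs a 48-bit starting block number into
startblk_hi/startblk_lo and keeps the all-ones pattern as the null address. A
minimal userspace sketch of just that bit manipulation (the helper name and the
already byte-swapped inputs are illustrative; only the masking logic follows
the hunk above):

#include <stdbool.h>
#include <stdint.h>

#define EROFS_NULL_ADDR		(~0ULL)	/* all-ones marks an unmapped chunk */

/* decode one chunk index; fields are assumed already converted from LE */
static bool chunk_mapped(uint16_t startblk_hi, uint32_t startblk_lo,
			 bool is_48bit, uint64_t *startblk)
{
	uint64_t addrmask = is_48bit ? (1ULL << 48) - 1 : (1ULL << 32) - 1;

	*startblk = (((uint64_t)startblk_hi << 32) | startblk_lo) & addrmask;
	/* non-zero iff some mask bit differs from the all-ones null pattern */
	return (*startblk ^ EROFS_NULL_ADDR) & addrmask;
}
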
@@ -307,7 +273,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
iomap->offset = map.m_la;
if (flags & IOMAP_DAX)
- iomap->dax_dev = mdev.m_daxdev;
+ iomap->dax_dev = mdev.m_dif->dax_dev;
else
iomap->bdev = mdev.m_bdev;
iomap->length = map.m_llen;
@@ -327,16 +293,16 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
iomap->type = IOMAP_INLINE;
- ptr = erofs_read_metabuf(&buf, sb, mdev.m_pa, EROFS_KMAP);
+ ptr = erofs_read_metabuf(&buf, sb, mdev.m_pa, true);
if (IS_ERR(ptr))
return PTR_ERR(ptr);
iomap->inline_data = ptr;
iomap->private = buf.base;
} else {
iomap->type = IOMAP_MAPPED;
- iomap->addr = mdev.m_pa;
+ iomap->addr = mdev.m_dif->fsoff + mdev.m_pa;
if (flags & IOMAP_DAX)
- iomap->addr += mdev.m_dax_part_off;
+ iomap->addr += mdev.m_dif->dax_part_off;
}
return 0;
}
@@ -350,7 +316,6 @@ static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length,
struct erofs_buf buf = {
.page = kmap_to_page(ptr),
.base = ptr,
- .kmap_type = EROFS_KMAP,
};
DBG_BUGON(iomap->type != IOMAP_INLINE);
@@ -411,22 +376,9 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (IS_DAX(inode))
return dax_iomap_rw(iocb, to, &erofs_iomap_ops);
#endif
- if (iocb->ki_flags & IOCB_DIRECT) {
- struct block_device *bdev = inode->i_sb->s_bdev;
- unsigned int blksize_mask;
-
- if (bdev)
- blksize_mask = bdev_logical_block_size(bdev) - 1;
- else
- blksize_mask = i_blocksize(inode) - 1;
-
- if ((iocb->ki_pos | iov_iter_count(to) |
- iov_iter_alignment(to)) & blksize_mask)
- return -EINVAL;
-
+ if ((iocb->ki_flags & IOCB_DIRECT) && inode->i_sb->s_bdev)
return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
NULL, 0, NULL, 0);
- }
return filemap_read(iocb, to, 0);
}
@@ -473,8 +425,32 @@ static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma)
#define erofs_file_mmap generic_file_readonly_mmap
#endif
+static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ const struct iomap_ops *ops = &erofs_iomap_ops;
+
+ if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout))
+#ifdef CONFIG_EROFS_FS_ZIP
+ ops = &z_erofs_iomap_report_ops;
+#else
+ return generic_file_llseek(file, offset, whence);
+#endif
+
+ if (whence == SEEK_HOLE)
+ offset = iomap_seek_hole(inode, offset, ops);
+ else if (whence == SEEK_DATA)
+ offset = iomap_seek_data(inode, offset, ops);
+ else
+ return generic_file_llseek(file, offset, whence);
+
+ if (offset < 0)
+ return offset;
+ return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+}
+
const struct file_operations erofs_file_fops = {
- .llseek = generic_file_llseek,
+ .llseek = erofs_file_llseek,
.read_iter = erofs_file_read_iter,
.mmap = erofs_file_mmap,
.get_unmapped_area = thp_get_unmapped_area,
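
With erofs_file_llseek() wired up above, SEEK_DATA/SEEK_HOLE now go through
iomap on uncompressed inodes (and, with EROFS_FS_ZIP, through
z_erofs_iomap_report_ops on compressed ones). A small userspace probe, assuming
nothing beyond standard lseek(2) semantics:

#define _GNU_SOURCE	/* SEEK_DATA / SEEK_HOLE */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	off_t data, hole;
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	data = lseek(fd, 0, SEEK_DATA);	/* first byte of data at/after 0 */
	hole = lseek(fd, 0, SEEK_HOLE);	/* first hole (i_size if none) */
	printf("data @ %lld, hole @ %lld\n", (long long)data, (long long)hole);
	close(fd);
	return 0;
}
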
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index eb318c7ddd80..bf62e2836b60 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -7,22 +7,7 @@
#include "compress.h"
#include <linux/lz4.h>
-#ifndef LZ4_DISTANCE_MAX /* history window size */
-#define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */
-#endif
-
#define LZ4_MAX_DISTANCE_PAGES (DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1)
-#ifndef LZ4_DECOMPRESS_INPLACE_MARGIN
-#define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32)
-#endif
-
-struct z_erofs_lz4_decompress_ctx {
- struct z_erofs_decompress_req *rq;
- /* # of encoded, decoded pages */
- unsigned int inpages, outpages;
- /* decoded block total length (used for in-place decompression) */
- unsigned int oend;
-};
static int z_erofs_load_lz4_config(struct super_block *sb,
struct erofs_super_block *dsb, void *data, int size)
@@ -62,10 +47,9 @@ static int z_erofs_load_lz4_config(struct super_block *sb,
* Fill all gaps with bounce pages if it's a sparse page list. Also check if
* all physical pages are consecutive, which can be seen for moderate CR.
*/
-static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
+static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq,
struct page **pagepool)
{
- struct z_erofs_decompress_req *rq = ctx->rq;
struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL };
unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES,
BITS_PER_LONG)] = { 0 };
@@ -75,7 +59,7 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
unsigned int i, j, top;
top = 0;
- for (i = j = 0; i < ctx->outpages; ++i, ++j) {
+ for (i = j = 0; i < rq->outpages; ++i, ++j) {
struct page *const page = rq->out[i];
struct page *victim;
@@ -121,36 +105,36 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
return kaddr ? 1 : 0;
}
-static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx,
+static void *z_erofs_lz4_handle_overlap(struct z_erofs_decompress_req *rq,
void *inpage, void *out, unsigned int *inputmargin,
int *maptype, bool may_inplace)
{
- struct z_erofs_decompress_req *rq = ctx->rq;
- unsigned int omargin, total, i;
+ unsigned int oend, omargin, total, i;
struct page **in;
void *src, *tmp;
if (rq->inplace_io) {
- omargin = PAGE_ALIGN(ctx->oend) - ctx->oend;
+ oend = rq->pageofs_out + rq->outputsize;
+ omargin = PAGE_ALIGN(oend) - oend;
if (rq->partial_decoding || !may_inplace ||
omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize))
goto docopy;
- for (i = 0; i < ctx->inpages; ++i)
- if (rq->out[ctx->outpages - ctx->inpages + i] !=
+ for (i = 0; i < rq->inpages; ++i)
+ if (rq->out[rq->outpages - rq->inpages + i] !=
rq->in[i])
goto docopy;
kunmap_local(inpage);
*maptype = 3;
- return out + ((ctx->outpages - ctx->inpages) << PAGE_SHIFT);
+ return out + ((rq->outpages - rq->inpages) << PAGE_SHIFT);
}
- if (ctx->inpages <= 1) {
+ if (rq->inpages <= 1) {
*maptype = 0;
return inpage;
}
kunmap_local(inpage);
- src = erofs_vm_map_ram(rq->in, ctx->inpages);
+ src = erofs_vm_map_ram(rq->in, rq->inpages);
if (!src)
return ERR_PTR(-ENOMEM);
*maptype = 1;
@@ -159,7 +143,7 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx,
docopy:
/* Or copy compressed data which can be overlapped to per-CPU buffer */
in = rq->in;
- src = z_erofs_get_gbuf(ctx->inpages);
+ src = z_erofs_get_gbuf(rq->inpages);
if (!src) {
DBG_BUGON(1);
kunmap_local(inpage);
@@ -204,10 +188,8 @@ int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
return 0;
}
-static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
- u8 *dst)
+static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, u8 *dst)
{
- struct z_erofs_decompress_req *rq = ctx->rq;
bool support_0padding = false, may_inplace = false;
unsigned int inputmargin;
u8 *out, *headpage, *src;
@@ -231,7 +213,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
}
inputmargin = rq->pageofs_in;
- src = z_erofs_lz4_handle_overlap(ctx, headpage, dst, &inputmargin,
+ src = z_erofs_lz4_handle_overlap(rq, headpage, dst, &inputmargin,
&maptype, may_inplace);
if (IS_ERR(src))
return PTR_ERR(src);
@@ -258,7 +240,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
if (maptype == 0) {
kunmap_local(headpage);
} else if (maptype == 1) {
- vm_unmap_ram(src, ctx->inpages);
+ vm_unmap_ram(src, rq->inpages);
} else if (maptype == 2) {
z_erofs_put_gbuf(src);
} else if (maptype != 3) {
@@ -271,54 +253,42 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
struct page **pagepool)
{
- struct z_erofs_lz4_decompress_ctx ctx;
unsigned int dst_maptype;
void *dst;
int ret;
- ctx.rq = rq;
- ctx.oend = rq->pageofs_out + rq->outputsize;
- ctx.outpages = PAGE_ALIGN(ctx.oend) >> PAGE_SHIFT;
- ctx.inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
-
/* one optimized fast path only for non bigpcluster cases yet */
- if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) {
+ if (rq->inpages == 1 && rq->outpages == 1 && !rq->inplace_io) {
DBG_BUGON(!*rq->out);
dst = kmap_local_page(*rq->out);
dst_maptype = 0;
- goto dstmap_out;
- }
-
- /* general decoding path which can be used for all cases */
- ret = z_erofs_lz4_prepare_dstpages(&ctx, pagepool);
- if (ret < 0) {
- return ret;
- } else if (ret > 0) {
- dst = page_address(*rq->out);
- dst_maptype = 1;
} else {
- dst = erofs_vm_map_ram(rq->out, ctx.outpages);
- if (!dst)
- return -ENOMEM;
- dst_maptype = 2;
+ /* general decoding path which can be used for all cases */
+ ret = z_erofs_lz4_prepare_dstpages(rq, pagepool);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ dst = page_address(*rq->out);
+ dst_maptype = 1;
+ } else {
+ dst = erofs_vm_map_ram(rq->out, rq->outpages);
+ if (!dst)
+ return -ENOMEM;
+ dst_maptype = 2;
+ }
}
-
-dstmap_out:
- ret = z_erofs_lz4_decompress_mem(&ctx, dst);
+ ret = z_erofs_lz4_decompress_mem(rq, dst);
if (!dst_maptype)
kunmap_local(dst);
else if (dst_maptype == 2)
- vm_unmap_ram(dst, ctx.outpages);
+ vm_unmap_ram(dst, rq->outpages);
return ret;
}
static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
struct page **pagepool)
{
- const unsigned int nrpages_in =
- PAGE_ALIGN(rq->pageofs_in + rq->inputsize) >> PAGE_SHIFT;
- const unsigned int nrpages_out =
- PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+ const unsigned int nrpages_in = rq->inpages, nrpages_out = rq->outpages;
const unsigned int bs = rq->sb->s_blocksize;
unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt;
u8 *kin;
@@ -343,7 +313,7 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
rq->outputsize -= cur;
}
- for (; rq->outputsize; rq->pageofs_in = 0, cur += PAGE_SIZE, ni++) {
+ for (; rq->outputsize; rq->pageofs_in = 0, cur += insz, ni++) {
insz = min(PAGE_SIZE - rq->pageofs_in, rq->outputsize);
rq->outputsize -= insz;
if (!rq->in[ni])
@@ -380,7 +350,7 @@ int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst,
unsigned int j;
if (!dctx->avail_out) {
- if (++dctx->no >= dctx->outpages || !rq->outputsize) {
+ if (++dctx->no >= rq->outpages || !rq->outputsize) {
erofs_err(sb, "insufficient space for decompressed data");
return -EFSCORRUPTED;
}
@@ -408,7 +378,7 @@ int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst,
}
if (dctx->inbuf_pos == dctx->inbuf_sz && rq->inputsize) {
- if (++dctx->ni >= dctx->inpages) {
+ if (++dctx->ni >= rq->inpages) {
erofs_err(sb, "invalid compressed data");
return -EFSCORRUPTED;
}
@@ -441,7 +411,7 @@ int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst,
dctx->bounced = true;
}
- for (j = dctx->ni + 1; j < dctx->inpages; ++j) {
+ for (j = dctx->ni + 1; j < rq->inpages; ++j) {
if (rq->out[dctx->no] != rq->in[j])
continue;
tmppage = erofs_allocpage(pgpl, rq->gfp);
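
For reference, the LZ4_DECOMPRESS_INPLACE_MARGIN fallback that this patch drops
from decompressor.c was ((srcsize) >> 8) + 32, and z_erofs_lz4_handle_overlap()
now derives oend from rq->pageofs_out + rq->outputsize on the fly. A standalone
sketch of the margin check (PAGE_SIZE fixed at 4KiB for the example; the
function name is illustrative):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define INPLACE_MARGIN(srcsize)	(((srcsize) >> 8) + 32)

static bool may_decompress_inplace(unsigned int pageofs_out,
				   unsigned int outputsize,
				   unsigned int inputsize)
{
	unsigned int oend = pageofs_out + outputsize;
	unsigned int omargin = PAGE_ALIGN(oend) - oend;	/* spare tail bytes */

	return omargin >= INPLACE_MARGIN(inputsize);
}

int main(void)
{
	/* 16KiB pcluster, 65500 decoded bytes: margin 96 > 36 spare -> copy */
	printf("%d\n", may_decompress_inplace(0, 65500, 16384));
	return 0;
}
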
diff --git a/fs/erofs/decompressor_crypto.c b/fs/erofs/decompressor_crypto.c
new file mode 100644
index 000000000000..97b77ab64432
--- /dev/null
+++ b/fs/erofs/decompressor_crypto.c
@@ -0,0 +1,181 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/scatterlist.h>
+#include <crypto/acompress.h>
+#include "compress.h"
+
+static int __z_erofs_crypto_decompress(struct z_erofs_decompress_req *rq,
+ struct crypto_acomp *tfm)
+{
+ struct sg_table st_src, st_dst;
+ struct acomp_req *req;
+ struct crypto_wait wait;
+ u8 *headpage;
+ int ret;
+
+ headpage = kmap_local_page(*rq->in);
+ ret = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in,
+ min_t(unsigned int, rq->inputsize,
+ rq->sb->s_blocksize - rq->pageofs_in));
+ kunmap_local(headpage);
+ if (ret)
+ return ret;
+
+ req = acomp_request_alloc(tfm);
+ if (!req)
+ return -ENOMEM;
+
+ ret = sg_alloc_table_from_pages_segment(&st_src, rq->in, rq->inpages,
+ rq->pageofs_in, rq->inputsize, UINT_MAX, GFP_KERNEL);
+ if (ret < 0)
+ goto failed_src_alloc;
+
+ ret = sg_alloc_table_from_pages_segment(&st_dst, rq->out, rq->outpages,
+ rq->pageofs_out, rq->outputsize, UINT_MAX, GFP_KERNEL);
+ if (ret < 0)
+ goto failed_dst_alloc;
+
+ acomp_request_set_params(req, st_src.sgl,
+ st_dst.sgl, rq->inputsize, rq->outputsize);
+
+ crypto_init_wait(&wait);
+ acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &wait);
+
+ ret = crypto_wait_req(crypto_acomp_decompress(req), &wait);
+ if (ret) {
+ erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]",
+ ret, rq->inputsize, rq->pageofs_in, rq->outputsize);
+ ret = -EIO;
+ }
+
+ sg_free_table(&st_dst);
+failed_dst_alloc:
+ sg_free_table(&st_src);
+failed_src_alloc:
+ acomp_request_free(req);
+ return ret;
+}
+
+struct z_erofs_crypto_engine {
+ char *crypto_name;
+ struct crypto_acomp *tfm;
+};
+
+struct z_erofs_crypto_engine *z_erofs_crypto[Z_EROFS_COMPRESSION_MAX] = {
+ [Z_EROFS_COMPRESSION_LZ4] = (struct z_erofs_crypto_engine[]) {
+ {},
+ },
+ [Z_EROFS_COMPRESSION_LZMA] = (struct z_erofs_crypto_engine[]) {
+ {},
+ },
+ [Z_EROFS_COMPRESSION_DEFLATE] = (struct z_erofs_crypto_engine[]) {
+ { .crypto_name = "qat_deflate", },
+ {},
+ },
+ [Z_EROFS_COMPRESSION_ZSTD] = (struct z_erofs_crypto_engine[]) {
+ {},
+ },
+};
+static DECLARE_RWSEM(z_erofs_crypto_rwsem);
+
+static struct crypto_acomp *z_erofs_crypto_get_engine(int alg)
+{
+ struct z_erofs_crypto_engine *e;
+
+ for (e = z_erofs_crypto[alg]; e->crypto_name; ++e)
+ if (e->tfm)
+ return e->tfm;
+ return NULL;
+}
+
+int z_erofs_crypto_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
+{
+ struct crypto_acomp *tfm;
+ int i, err;
+
+ down_read(&z_erofs_crypto_rwsem);
+ tfm = z_erofs_crypto_get_engine(rq->alg);
+ if (!tfm) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ for (i = 0; i < rq->outpages; i++) {
+ struct page *const page = rq->out[i];
+ struct page *victim;
+
+ if (!page) {
+ victim = __erofs_allocpage(pgpl, rq->gfp, true);
+ if (!victim) {
+ err = -ENOMEM;
+ goto out;
+ }
+ set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE);
+ rq->out[i] = victim;
+ }
+ }
+ err = __z_erofs_crypto_decompress(rq, tfm);
+out:
+ up_read(&z_erofs_crypto_rwsem);
+ return err;
+}
+
+int z_erofs_crypto_enable_engine(const char *name, int len)
+{
+ struct z_erofs_crypto_engine *e;
+ struct crypto_acomp *tfm;
+ int alg;
+
+ down_write(&z_erofs_crypto_rwsem);
+ for (alg = 0; alg < Z_EROFS_COMPRESSION_MAX; ++alg) {
+ for (e = z_erofs_crypto[alg]; e->crypto_name; ++e) {
+ if (!strncmp(name, e->crypto_name, len)) {
+ if (e->tfm)
+ break;
+ tfm = crypto_alloc_acomp(e->crypto_name, 0, 0);
+ if (IS_ERR(tfm)) {
+ up_write(&z_erofs_crypto_rwsem);
+ return -EOPNOTSUPP;
+ }
+ e->tfm = tfm;
+ break;
+ }
+ }
+ }
+ up_write(&z_erofs_crypto_rwsem);
+ return 0;
+}
+
+void z_erofs_crypto_disable_all_engines(void)
+{
+ struct z_erofs_crypto_engine *e;
+ int alg;
+
+ down_write(&z_erofs_crypto_rwsem);
+ for (alg = 0; alg < Z_EROFS_COMPRESSION_MAX; ++alg) {
+ for (e = z_erofs_crypto[alg]; e->crypto_name; ++e) {
+ if (!e->tfm)
+ continue;
+ crypto_free_acomp(e->tfm);
+ e->tfm = NULL;
+ }
+ }
+ up_write(&z_erofs_crypto_rwsem);
+}
+
+int z_erofs_crypto_show_engines(char *buf, int size, char sep)
+{
+ struct z_erofs_crypto_engine *e;
+ int alg, len = 0;
+
+ for (alg = 0; alg < Z_EROFS_COMPRESSION_MAX; ++alg) {
+ for (e = z_erofs_crypto[alg]; e->crypto_name; ++e) {
+ if (!e->tfm)
+ continue;
+ len += scnprintf(buf + len, size - len, "%s%c",
+ e->crypto_name, sep);
+ }
+ }
+ return len;
+}
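
The enable/disable/show helpers above are meant to be driven from elsewhere
(the sysfs knob added in fs/erofs/sysfs.c, whose hunks are not shown here). A
hypothetical store handler, only to show how the three calls compose:

#include <linux/string.h>
#include "compress.h"

/* hypothetical handler: empty input disables everything, otherwise the
 * named engine (e.g. "qat_deflate") is looked up and instantiated */
static int example_accel_store(const char *val)
{
	size_t len = strcspn(val, "\n");

	if (!len) {
		z_erofs_crypto_disable_all_engines();
		return 0;
	}
	return z_erofs_crypto_enable_engine(val, len);
}
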
diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c
index 5070d2fcc737..6909b2d529c7 100644
--- a/fs/erofs/decompressor_deflate.c
+++ b/fs/erofs/decompressor_deflate.c
@@ -97,17 +97,11 @@ failed:
return -ENOMEM;
}
-static int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
- struct page **pgpl)
+static int __z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
{
struct super_block *sb = rq->sb;
- struct z_erofs_stream_dctx dctx = {
- .rq = rq,
- .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT,
- .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize)
- >> PAGE_SHIFT,
- .no = -1, .ni = 0,
- };
+ struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 };
struct z_erofs_deflate *strm;
int zerr, err;
@@ -184,6 +178,22 @@ failed_zinit:
return err;
}
+static int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
+{
+#ifdef CONFIG_EROFS_FS_ZIP_ACCEL
+ int err;
+
+ if (!rq->partial_decoding) {
+ err = z_erofs_crypto_decompress(rq, pgpl);
+ if (err != -EOPNOTSUPP)
+ return err;
+ }
+#endif
+ return __z_erofs_deflate_decompress(rq, pgpl);
+}
+
const struct z_erofs_decompressor z_erofs_deflate_decomp = {
.config = z_erofs_load_deflate_config,
.decompress = z_erofs_deflate_decompress,
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index 40666815046f..832cffb83a66 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -150,13 +150,7 @@ static int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
struct page **pgpl)
{
struct super_block *sb = rq->sb;
- struct z_erofs_stream_dctx dctx = {
- .rq = rq,
- .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT,
- .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize)
- >> PAGE_SHIFT,
- .no = -1, .ni = 0,
- };
+ struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 };
struct xz_buf buf = {};
struct z_erofs_lzma *strm;
enum xz_ret xz_err;
diff --git a/fs/erofs/decompressor_zstd.c b/fs/erofs/decompressor_zstd.c
index 7e177304967e..b4bfe14229f9 100644
--- a/fs/erofs/decompressor_zstd.c
+++ b/fs/erofs/decompressor_zstd.c
@@ -139,13 +139,7 @@ static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq,
struct page **pgpl)
{
struct super_block *sb = rq->sb;
- struct z_erofs_stream_dctx dctx = {
- .rq = rq,
- .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT,
- .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize)
- >> PAGE_SHIFT,
- .no = -1, .ni = 0,
- };
+ struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 };
zstd_in_buffer in_buf = { NULL, 0, 0 };
zstd_out_buffer out_buf = { NULL, 0, 0 };
struct z_erofs_zstd *strm;
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
index c3b90abdee37..2fae209d0274 100644
--- a/fs/erofs/dir.c
+++ b/fs/erofs/dir.c
@@ -58,9 +58,9 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
struct erofs_dirent *de;
unsigned int nameoff, maxsize;
- de = erofs_bread(&buf, dbstart, EROFS_KMAP);
+ de = erofs_bread(&buf, dbstart, true);
if (IS_ERR(de)) {
- erofs_err(sb, "fail to readdir of logical block %u of nid %llu",
+ erofs_err(sb, "failed to readdir of logical block %llu of nid %llu",
erofs_blknr(sb, dbstart), EROFS_I(dir)->nid);
err = PTR_ERR(de);
break;
@@ -90,6 +90,11 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
ofs = 0;
}
erofs_put_metabuf(&buf);
+ if (EROFS_I(dir)->dot_omitted && ctx->pos == dir->i_size) {
+ if (!dir_emit_dot(f, ctx))
+ return 0;
+ ++ctx->pos;
+ }
return err < 0 ? err : 0;
}
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index c8f2ae845bd2..767fb4acdc93 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -9,6 +9,7 @@
#ifndef __EROFS_FS_H
#define __EROFS_FS_H
+/* to allow for x86 boot sectors and other oddities. */
#define EROFS_SUPER_OFFSET 1024
#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
@@ -29,42 +30,37 @@
#define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020
#define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020
#define EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES 0x00000040
+#define EROFS_FEATURE_INCOMPAT_48BIT 0x00000080
#define EROFS_ALL_FEATURE_INCOMPAT \
- (EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \
- EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
- EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \
- EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \
- EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \
- EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \
- EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \
- EROFS_FEATURE_INCOMPAT_FRAGMENTS | \
- EROFS_FEATURE_INCOMPAT_DEDUPE | \
- EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES)
+ ((EROFS_FEATURE_INCOMPAT_48BIT << 1) - 1)
#define EROFS_SB_EXTSLOT_SIZE 16
struct erofs_deviceslot {
u8 tag[64]; /* digest(sha256), etc. */
- __le32 blocks; /* total fs blocks of this device */
- __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */
- u8 reserved[56];
+ __le32 blocks_lo; /* total blocks count of this device */
+ __le32 uniaddr_lo; /* unified starting block of this device */
+ __le32 blocks_hi; /* total blocks count MSB */
+ __le16 uniaddr_hi; /* unified starting block MSB */
+ u8 reserved[50];
};
#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot)
/* erofs on-disk super block (currently 128 bytes) */
struct erofs_super_block {
__le32 magic; /* file system magic number */
- __le32 checksum; /* crc32c(super_block) */
+ __le32 checksum; /* crc32c to avoid unexpected on-disk overlap */
__le32 feature_compat;
__u8 blkszbits; /* filesystem block size in bit shift */
__u8 sb_extslots; /* superblock size = 128 + sb_extslots * 16 */
-
- __le16 root_nid; /* nid of root directory */
+ union {
+ __le16 rootnid_2b; /* nid of root directory */
+ __le16 blocks_hi; /* (48BIT on) blocks count MSB */
+ } __packed rb;
__le64 inos; /* total valid ino # (== f_files - f_favail) */
-
- __le64 build_time; /* compact inode time derivation */
- __le32 build_time_nsec; /* compact inode time derivation in ns scale */
- __le32 blocks; /* used for statfs */
+ __le64 epoch; /* base seconds used for compact inodes */
+ __le32 fixed_nsec; /* fixed nanoseconds for compact inodes */
+ __le32 blocks_lo; /* blocks count LSB */
__le32 meta_blkaddr; /* start block address of metadata area */
__le32 xattr_blkaddr; /* start block address of shared xattr area */
__u8 uuid[16]; /* 128-bit uuid for volume */
@@ -83,7 +79,10 @@ struct erofs_super_block {
__le32 xattr_prefix_start; /* start of long xattr prefixes */
__le64 packed_nid; /* nid of the special packed inode */
__u8 xattr_filter_reserved; /* reserved for xattr name filter */
- __u8 reserved2[23];
+ __u8 reserved[3];
+ __le32 build_time; /* seconds added to epoch for mkfs time */
+ __le64 rootnid_8b; /* (48BIT on) nid of root directory */
+ __u8 reserved2[8];
};
/*
@@ -114,19 +113,19 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode)
#define EROFS_I_VERSION_MASK 0x01
#define EROFS_I_DATALAYOUT_MASK 0x07
-#define EROFS_I_VERSION_BIT 0
-#define EROFS_I_DATALAYOUT_BIT 1
-#define EROFS_I_ALL_BIT 4
-
-#define EROFS_I_ALL ((1 << EROFS_I_ALL_BIT) - 1)
+#define EROFS_I_VERSION_BIT 0
+#define EROFS_I_DATALAYOUT_BIT 1
+#define EROFS_I_NLINK_1_BIT 4 /* non-directory compact inodes only */
+#define EROFS_I_DOT_OMITTED_BIT 4 /* (directories) omit the `.` dirent */
+#define EROFS_I_ALL ((1 << (EROFS_I_NLINK_1_BIT + 1)) - 1)
/* indicate chunk blkbits, thus 'chunksize = blocksize << chunk blkbits' */
#define EROFS_CHUNK_FORMAT_BLKBITS_MASK 0x001F
-/* with chunk indexes or just a 4-byte blkaddr array */
+/* with chunk indexes or just a 4-byte block array */
#define EROFS_CHUNK_FORMAT_INDEXES 0x0020
+#define EROFS_CHUNK_FORMAT_48BIT 0x0040
-#define EROFS_CHUNK_FORMAT_ALL \
- (EROFS_CHUNK_FORMAT_BLKBITS_MASK | EROFS_CHUNK_FORMAT_INDEXES)
+#define EROFS_CHUNK_FORMAT_ALL ((EROFS_CHUNK_FORMAT_48BIT << 1) - 1)
/* 32-byte on-disk inode */
#define EROFS_INODE_LAYOUT_COMPACT 0
@@ -139,45 +138,40 @@ struct erofs_inode_chunk_info {
};
union erofs_inode_i_u {
- /* total compressed blocks for compressed inodes */
- __le32 compressed_blocks;
-
- /* block address for uncompressed flat inodes */
- __le32 raw_blkaddr;
-
- /* for device files, used to indicate old/new device # */
- __le32 rdev;
-
- /* for chunk-based files, it contains the summary info */
+ __le32 blocks_lo; /* total blocks count (if compressed inodes) */
+ __le32 startblk_lo; /* starting block number (if flat inodes) */
+ __le32 rdev; /* device ID (if special inodes) */
struct erofs_inode_chunk_info c;
};
+union erofs_inode_i_nb {
+ __le16 nlink; /* if EROFS_I_NLINK_1_BIT is unset */
+ __le16 blocks_hi; /* total blocks count MSB */
+ __le16 startblk_hi; /* starting block number MSB */
+} __packed;
+
/* 32-byte reduced form of an ondisk inode */
struct erofs_inode_compact {
__le16 i_format; /* inode format hints */
-
-/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
__le16 i_xattr_icount;
__le16 i_mode;
- __le16 i_nlink;
+ union erofs_inode_i_nb i_nb;
__le32 i_size;
- __le32 i_reserved;
+ __le32 i_mtime;
union erofs_inode_i_u i_u;
__le32 i_ino; /* only used for 32-bit stat compatibility */
__le16 i_uid;
__le16 i_gid;
- __le32 i_reserved2;
+ __le32 i_reserved;
};
/* 64-byte complete form of an ondisk inode */
struct erofs_inode_extended {
__le16 i_format; /* inode format hints */
-
-/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
__le16 i_xattr_icount;
__le16 i_mode;
- __le16 i_reserved;
+ union erofs_inode_i_nb i_nb;
__le64 i_size;
union erofs_inode_i_u i_u;
@@ -247,6 +241,7 @@ static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount)
if (!i_xattr_icount)
return 0;
+ /* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
return sizeof(struct erofs_xattr_ibody_header) +
sizeof(__u32) * (le16_to_cpu(i_xattr_icount) - 1);
}
@@ -265,11 +260,11 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
/* 4-byte block address array */
#define EROFS_BLOCK_MAP_ENTRY_SIZE sizeof(__le32)
-/* 8-byte inode chunk indexes */
+/* 8-byte inode chunk index */
struct erofs_inode_chunk_index {
- __le16 advise; /* always 0, don't care for now */
+ __le16 startblk_hi; /* starting block number MSB */
__le16 device_id; /* back-end storage id (with bits masked) */
- __le32 blkaddr; /* start block address of this inode chunk */
+ __le32 startblk_lo; /* starting block number of this chunk */
};
/* dirent sorts in alphabet order, thus we can do binary search */
@@ -336,21 +331,20 @@ struct z_erofs_zstd_cfgs {
#define Z_EROFS_ZSTD_MAX_DICT_SIZE Z_EROFS_PCLUSTER_MAX_SIZE
/*
- * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
- * e.g. for 4k logical cluster size, 4B if compacted 2B is off;
- * (4B) + 2B + (4B) if compacted 2B is on.
- * bit 1 : HEAD1 big pcluster (0 - off; 1 - on)
- * bit 2 : HEAD2 big pcluster (0 - off; 1 - on)
- * bit 3 : tailpacking inline pcluster (0 - off; 1 - on)
- * bit 4 : interlaced plain pcluster (0 - off; 1 - on)
- * bit 5 : fragment pcluster (0 - off; 1 - on)
+ * Enable COMPACTED_2B for EROFS_INODE_COMPRESSED_COMPACT inodes:
+ * 4B (disabled) vs 4B+2B+4B (enabled)
*/
#define Z_EROFS_ADVISE_COMPACTED_2B 0x0001
+/* Enable extent metadata for EROFS_INODE_COMPRESSED_FULL inodes */
+#define Z_EROFS_ADVISE_EXTENTS 0x0001
#define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002
#define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004
#define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008
#define Z_EROFS_ADVISE_INTERLACED_PCLUSTER 0x0010
#define Z_EROFS_ADVISE_FRAGMENT_PCLUSTER 0x0020
+/* Indicate the record size for each extent if extent metadata is used */
+#define Z_EROFS_ADVISE_EXTRECSZ_BIT 1
+#define Z_EROFS_ADVISE_EXTRECSZ_MASK 0x3
#define Z_EROFS_FRAGMENT_INODE_BIT 7
struct z_erofs_map_header {
@@ -362,45 +356,24 @@ struct z_erofs_map_header {
/* indicates the encoded size of tailpacking data */
__le16 h_idata_size;
};
+ __le32 h_extents_lo; /* extent count LSB */
};
__le16 h_advise;
- /*
- * bit 0-3 : algorithm type of head 1 (logical cluster type 01);
- * bit 4-7 : algorithm type of head 2 (logical cluster type 11).
- */
- __u8 h_algorithmtype;
- /*
- * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096;
- * bit 3-6 : reserved;
- * bit 7 : move the whole file into packed inode or not.
- */
- __u8 h_clusterbits;
+ union {
+ struct {
+ /* algorithm type (bit 0-3: HEAD1; bit 4-7: HEAD2) */
+ __u8 h_algorithmtype;
+ /*
+ * bit 0-3 : logical cluster bits - blkszbits
+ * bit 4-6 : reserved
+ * bit 7 : pack the whole file into packed inode
+ */
+ __u8 h_clusterbits;
+ } __packed;
+ __le16 h_extents_hi; /* extent count MSB */
+ } __packed;
};
-/*
- * On-disk logical cluster type:
- * 0 - literal (uncompressed) lcluster
- * 1,3 - compressed lcluster (for HEAD lclusters)
- * 2 - compressed lcluster (for NONHEAD lclusters)
- *
- * In detail,
- * 0 - literal (uncompressed) lcluster,
- * di_advise = 0
- * di_clusterofs = the literal data offset of the lcluster
- * di_blkaddr = the blkaddr of the literal pcluster
- *
- * 1,3 - compressed lcluster (for HEAD lclusters)
- * di_advise = 1 or 3
- * di_clusterofs = the decompressed data offset of the lcluster
- * di_blkaddr = the blkaddr of the compressed pcluster
- *
- * 2 - compressed lcluster (for NONHEAD lclusters)
- * di_advise = 2
- * di_clusterofs =
- * the decompressed data offset in its own HEAD lcluster
- * di_u.delta[0] = distance to this HEAD lcluster
- * di_u.delta[1] = distance to the next HEAD lcluster
- */
enum {
Z_EROFS_LCLUSTER_TYPE_PLAIN = 0,
Z_EROFS_LCLUSTER_TYPE_HEAD1 = 1,
@@ -414,11 +387,7 @@ enum {
/* (noncompact only, HEAD) This pcluster refers to partial decompressed data */
#define Z_EROFS_LI_PARTIAL_REF (1 << 15)
-/*
- * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the
- * compressed block count of a compressed extent (in logical clusters, aka.
- * block count of a pcluster).
- */
+/* Set on 1st non-head lcluster to store compressed block count (in blocks) */
#define Z_EROFS_LI_D0_CBLKCNT (1 << 11)
struct z_erofs_lcluster_index {
@@ -427,19 +396,36 @@ struct z_erofs_lcluster_index {
__le16 di_clusterofs;
union {
- /* for the HEAD lclusters */
- __le32 blkaddr;
+ __le32 blkaddr; /* for the HEAD lclusters */
/*
- * for the NONHEAD lclusters
* [0] - distance to its HEAD lcluster
* [1] - distance to the next HEAD lcluster
*/
- __le16 delta[2];
+ __le16 delta[2]; /* for the NONHEAD lclusters */
} di_u;
};
-#define Z_EROFS_FULL_INDEX_ALIGN(end) \
- (ALIGN(end, 8) + sizeof(struct z_erofs_map_header) + 8)
+#define Z_EROFS_MAP_HEADER_END(end) \
+ (ALIGN(end, 8) + sizeof(struct z_erofs_map_header))
+#define Z_EROFS_FULL_INDEX_START(end) (Z_EROFS_MAP_HEADER_END(end) + 8)
+
+#define Z_EROFS_EXTENT_PLEN_PARTIAL BIT(27)
+#define Z_EROFS_EXTENT_PLEN_FMT_BIT 28
+#define Z_EROFS_EXTENT_PLEN_MASK ((Z_EROFS_PCLUSTER_MAX_SIZE << 1) - 1)
+struct z_erofs_extent {
+ __le32 plen; /* encoded length */
+ __le32 pstart_lo; /* physical offset */
+ __le32 pstart_hi; /* physical offset MSB */
+ __le32 lstart_lo; /* logical offset */
+ __le32 lstart_hi; /* logical offset MSB (>= 4GiB inodes) */
+ __u8 reserved[12]; /* for future use */
+};
+
+static inline int z_erofs_extent_recsize(unsigned int advise)
+{
+ return 4 << ((advise >> Z_EROFS_ADVISE_EXTRECSZ_BIT) &
+ Z_EROFS_ADVISE_EXTRECSZ_MASK);
+}
/* check the EROFS on-disk layout strictly at compile time */
static inline void erofs_check_ondisk_layout_definitions(void)
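
As a quick sanity check of z_erofs_extent_recsize() above: the two EXTRECSZ
bits in h_advise select 4 << n, i.e. 4, 8, 16 or 32 bytes per record, where 32
matches sizeof(struct z_erofs_extent). A trivial userspace rendition:

#include <stdio.h>

static int recsize(unsigned int advise)
{
	return 4 << ((advise >> 1) & 0x3);	/* EXTRECSZ_BIT = 1, MASK = 0x3 */
}

int main(void)
{
	for (unsigned int n = 0; n < 4; n++)	/* prints 4, 8, 16, 32 */
		printf("EXTRECSZ %u -> %d bytes\n", n, recsize(n << 1));
	return 0;
}
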
diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c
index 3af96b1e2c2a..df5cc63f2c01 100644
--- a/fs/erofs/fileio.c
+++ b/fs/erofs/fileio.c
@@ -6,9 +6,10 @@
#include <trace/events/erofs.h>
struct erofs_fileio_rq {
- struct bio_vec bvecs[BIO_MAX_VECS];
+ struct bio_vec bvecs[16];
struct bio bio;
struct kiocb iocb;
+ struct super_block *sb;
};
struct erofs_fileio {
@@ -31,6 +32,8 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret)
ret = 0;
}
if (rq->bio.bi_end_io) {
+ if (ret < 0 && !rq->bio.bi_status)
+ rq->bio.bi_status = errno_to_blk_status(ret);
rq->bio.bi_end_io(&rq->bio);
} else {
bio_for_each_folio_all(fi, &rq->bio) {
@@ -44,6 +47,7 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret)
static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq)
{
+ const struct cred *old_cred;
struct iov_iter iter;
int ret;
@@ -52,11 +56,14 @@ static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq)
rq->iocb.ki_pos = rq->bio.bi_iter.bi_sector << SECTOR_SHIFT;
rq->iocb.ki_ioprio = get_current_ioprio();
rq->iocb.ki_complete = erofs_fileio_ki_complete;
- rq->iocb.ki_flags = (rq->iocb.ki_filp->f_mode & FMODE_CAN_ODIRECT) ?
- IOCB_DIRECT : 0;
+ if (test_opt(&EROFS_SB(rq->sb)->opt, DIRECT_IO) &&
+ rq->iocb.ki_filp->f_mode & FMODE_CAN_ODIRECT)
+ rq->iocb.ki_flags = IOCB_DIRECT;
iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt,
rq->bio.bi_iter.bi_size);
+ old_cred = override_creds(rq->iocb.ki_filp->f_cred);
ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter);
+ revert_creds(old_cred);
if (ret != -EIOCBQUEUED)
erofs_fileio_ki_complete(&rq->iocb, ret);
}
@@ -66,8 +73,9 @@ static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev)
struct erofs_fileio_rq *rq = kzalloc(sizeof(*rq),
GFP_KERNEL | __GFP_NOFAIL);
- bio_init(&rq->bio, NULL, rq->bvecs, BIO_MAX_VECS, REQ_OP_READ);
- rq->iocb.ki_filp = mdev->m_fp;
+ bio_init(&rq->bio, NULL, rq->bvecs, ARRAY_SIZE(rq->bvecs), REQ_OP_READ);
+ rq->iocb.ki_filp = mdev->m_dif->file;
+ rq->sb = mdev->m_sb;
return rq;
}
@@ -109,7 +117,7 @@ static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio)
void *src;
src = erofs_read_metabuf(&buf, inode->i_sb,
- map->m_pa + ofs, EROFS_KMAP);
+ map->m_pa + ofs, true);
if (IS_ERR(src)) {
err = PTR_ERR(src);
break;
@@ -142,13 +150,14 @@ io_retry:
if (err)
break;
io->rq = erofs_fileio_rq_alloc(&io->dev);
- io->rq->bio.bi_iter.bi_sector = io->dev.m_pa >> 9;
+ io->rq->bio.bi_iter.bi_sector =
+ (io->dev.m_dif->fsoff + io->dev.m_pa) >> 9;
attached = 0;
}
- if (!attached++)
- erofs_onlinefolio_split(folio);
if (!bio_add_folio(&io->rq->bio, folio, len, cur))
goto io_retry;
+ if (!attached++)
+ erofs_onlinefolio_split(folio);
io->dev.m_pa += len;
}
cur += len;
@@ -175,7 +184,7 @@ static void erofs_fileio_readahead(struct readahead_control *rac)
struct folio *folio;
int err;
- trace_erofs_readpages(inode, readahead_index(rac),
+ trace_erofs_readahead(inode, readahead_index(rac),
readahead_count(rac), true);
while ((folio = readahead_folio(rac))) {
err = erofs_fileio_scan_folio(&io, folio);
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index fda16eedafb5..34517ca9df91 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -102,8 +102,7 @@ static void erofs_fscache_req_io_put(struct erofs_fscache_io *io)
erofs_fscache_req_put(req);
}
-static void erofs_fscache_req_end_io(void *priv,
- ssize_t transferred_or_error, bool was_async)
+static void erofs_fscache_req_end_io(void *priv, ssize_t transferred_or_error)
{
struct erofs_fscache_io *io = priv;
struct erofs_fscache_rq *req = io->private;
@@ -180,8 +179,7 @@ struct erofs_fscache_bio {
struct bio_vec bvecs[BIO_MAX_VECS];
};
-static void erofs_fscache_bio_endio(void *priv,
- ssize_t transferred_or_error, bool was_async)
+static void erofs_fscache_bio_endio(void *priv, ssize_t transferred_or_error)
{
struct erofs_fscache_bio *io = priv;
@@ -198,7 +196,7 @@ struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev)
io = kmalloc(sizeof(*io), GFP_KERNEL | __GFP_NOFAIL);
bio_init(&io->bio, NULL, io->bvecs, BIO_MAX_VECS, REQ_OP_READ);
- io->io.private = mdev->m_fscache->cookie;
+ io->io.private = mdev->m_dif->fscache->cookie;
io->io.end_io = erofs_fscache_bio_endio;
refcount_set(&io->io.ref, 1);
return &io->bio;
@@ -276,7 +274,7 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req)
size_t size = map.m_llen;
void *src;
- src = erofs_read_metabuf(&buf, sb, map.m_pa, EROFS_KMAP);
+ src = erofs_read_metabuf(&buf, sb, map.m_pa, true);
if (IS_ERR(src))
return PTR_ERR(src);
@@ -316,7 +314,7 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req)
if (!io)
return -ENOMEM;
iov_iter_xarray(&io->iter, ITER_DEST, &mapping->i_pages, pos, count);
- ret = erofs_fscache_read_io_async(mdev.m_fscache->cookie,
+ ret = erofs_fscache_read_io_async(mdev.m_dif->fscache->cookie,
mdev.m_pa + (pos - map.m_la), io);
erofs_fscache_req_io_put(io);
@@ -657,7 +655,7 @@ int erofs_fscache_register_fs(struct super_block *sb)
if (IS_ERR(fscache))
return PTR_ERR(fscache);
- sbi->s_fscache = fscache;
+ sbi->dif0.fscache = fscache;
return 0;
}
@@ -665,14 +663,14 @@ void erofs_fscache_unregister_fs(struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
- erofs_fscache_unregister_cookie(sbi->s_fscache);
+ erofs_fscache_unregister_cookie(sbi->dif0.fscache);
if (sbi->domain)
erofs_fscache_domain_put(sbi->domain);
else
fscache_relinquish_volume(sbi->volume, NULL, false);
- sbi->s_fscache = NULL;
+ sbi->dif0.fscache = NULL;
sbi->volume = NULL;
sbi->domain = NULL;
}
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index db29190656eb..a0ae0b4f7b01 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -27,29 +27,27 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr,
static int erofs_read_inode(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
+ erofs_blk_t blkaddr = erofs_blknr(sb, erofs_iloc(inode));
+ unsigned int ofs = erofs_blkoff(sb, erofs_iloc(inode));
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_sb_info *sbi = EROFS_SB(sb);
+ erofs_blk_t addrmask = BIT_ULL(48) - 1;
struct erofs_inode *vi = EROFS_I(inode);
- const erofs_off_t inode_loc = erofs_iloc(inode);
- erofs_blk_t blkaddr, nblks = 0;
- void *kaddr;
+ struct erofs_inode_extended *die, copied;
struct erofs_inode_compact *dic;
- struct erofs_inode_extended *die, *copied = NULL;
- union erofs_inode_i_u iu;
- struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- unsigned int ifmt, ofs;
+ unsigned int ifmt;
+ void *ptr;
int err = 0;
- blkaddr = erofs_blknr(sb, inode_loc);
- ofs = erofs_blkoff(sb, inode_loc);
-
- kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), EROFS_KMAP);
- if (IS_ERR(kaddr)) {
- erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld",
- vi->nid, PTR_ERR(kaddr));
- return PTR_ERR(kaddr);
+ ptr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), true);
+ if (IS_ERR(ptr)) {
+ err = PTR_ERR(ptr);
+ erofs_err(sb, "failed to get inode (nid: %llu) page, err %d",
+ vi->nid, err);
+ goto err_out;
}
- dic = kaddr + ofs;
+ dic = ptr + ofs;
ifmt = le16_to_cpu(dic->i_format);
if (ifmt & ~EROFS_I_ALL) {
erofs_err(sb, "unsupported i_format %u of nid %llu",
@@ -73,40 +71,34 @@ static int erofs_read_inode(struct inode *inode)
if (ofs + vi->inode_isize <= sb->s_blocksize) {
ofs += vi->inode_isize;
die = (struct erofs_inode_extended *)dic;
+ copied.i_u = die->i_u;
+ copied.i_nb = die->i_nb;
} else {
const unsigned int gotten = sb->s_blocksize - ofs;
- copied = kmalloc(vi->inode_isize, GFP_KERNEL);
- if (!copied) {
- err = -ENOMEM;
+ memcpy(&copied, dic, gotten);
+ ptr = erofs_read_metabuf(&buf, sb,
+ erofs_pos(sb, blkaddr + 1), true);
+ if (IS_ERR(ptr)) {
+ err = PTR_ERR(ptr);
+ erofs_err(sb, "failed to get inode payload block (nid: %llu), err %d",
+ vi->nid, err);
goto err_out;
}
- memcpy(copied, dic, gotten);
- kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr + 1),
- EROFS_KMAP);
- if (IS_ERR(kaddr)) {
- erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld",
- vi->nid, PTR_ERR(kaddr));
- kfree(copied);
- return PTR_ERR(kaddr);
- }
ofs = vi->inode_isize - gotten;
- memcpy((u8 *)copied + gotten, kaddr, ofs);
- die = copied;
+ memcpy((u8 *)&copied + gotten, ptr, ofs);
+ die = &copied;
}
vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount);
inode->i_mode = le16_to_cpu(die->i_mode);
- iu = die->i_u;
i_uid_write(inode, le32_to_cpu(die->i_uid));
i_gid_write(inode, le32_to_cpu(die->i_gid));
set_nlink(inode, le32_to_cpu(die->i_nlink));
- /* each extended inode has its own timestamp */
- inode_set_ctime(inode, le64_to_cpu(die->i_mtime),
+ inode_set_mtime(inode, le64_to_cpu(die->i_mtime),
le32_to_cpu(die->i_mtime_nsec));
inode->i_size = le64_to_cpu(die->i_size);
- kfree(copied);
break;
case EROFS_INODE_LAYOUT_COMPACT:
vi->inode_isize = sizeof(struct erofs_inode_compact);
@@ -114,12 +106,20 @@ static int erofs_read_inode(struct inode *inode)
vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount);
inode->i_mode = le16_to_cpu(dic->i_mode);
- iu = dic->i_u;
+ copied.i_u = dic->i_u;
i_uid_write(inode, le16_to_cpu(dic->i_uid));
i_gid_write(inode, le16_to_cpu(dic->i_gid));
- set_nlink(inode, le16_to_cpu(dic->i_nlink));
- /* use build time for compact inodes */
- inode_set_ctime(inode, sbi->build_time, sbi->build_time_nsec);
+ if (!S_ISDIR(inode->i_mode) &&
+ ((ifmt >> EROFS_I_NLINK_1_BIT) & 1)) {
+ set_nlink(inode, 1);
+ copied.i_nb = dic->i_nb;
+ } else {
+ set_nlink(inode, le16_to_cpu(dic->i_nb.nlink));
+ copied.i_nb.startblk_hi = 0;
+ addrmask = BIT_ULL(32) - 1;
+ }
+ inode_set_mtime(inode, sbi->epoch + le32_to_cpu(dic->i_mtime),
+ sbi->fixed_nsec);
inode->i_size = le32_to_cpu(dic->i_size);
break;
@@ -136,19 +136,26 @@ static int erofs_read_inode(struct inode *inode)
goto err_out;
}
switch (inode->i_mode & S_IFMT) {
- case S_IFREG:
case S_IFDIR:
+ vi->dot_omitted = (ifmt >> EROFS_I_DOT_OMITTED_BIT) & 1;
+ fallthrough;
+ case S_IFREG:
case S_IFLNK:
- vi->raw_blkaddr = le32_to_cpu(iu.raw_blkaddr);
+ vi->startblk = le32_to_cpu(copied.i_u.startblk_lo) |
+ ((u64)le16_to_cpu(copied.i_nb.startblk_hi) << 32);
+ if (vi->datalayout == EROFS_INODE_FLAT_PLAIN &&
+ !((vi->startblk ^ EROFS_NULL_ADDR) & addrmask))
+ vi->startblk = EROFS_NULL_ADDR;
+
if (S_ISLNK(inode->i_mode)) {
- err = erofs_fill_symlink(inode, kaddr, ofs);
+ err = erofs_fill_symlink(inode, ptr, ofs);
if (err)
goto err_out;
}
break;
case S_IFCHR:
case S_IFBLK:
- inode->i_rdev = new_decode_dev(le32_to_cpu(iu.rdev));
+ inode->i_rdev = new_decode_dev(le32_to_cpu(copied.i_u.rdev));
break;
case S_IFIFO:
case S_IFSOCK:
@@ -161,12 +168,15 @@ static int erofs_read_inode(struct inode *inode)
goto err_out;
}
- /* total blocks for compressed files */
- if (erofs_inode_is_data_compressed(vi->datalayout)) {
- nblks = le32_to_cpu(iu.compressed_blocks);
- } else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) {
+ if (erofs_inode_is_data_compressed(vi->datalayout))
+ inode->i_blocks = le32_to_cpu(copied.i_u.blocks_lo) <<
+ (sb->s_blocksize_bits - 9);
+ else
+ inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9;
+
+ if (vi->datalayout == EROFS_INODE_CHUNK_BASED) {
/* fill chunked inode summary info */
- vi->chunkformat = le16_to_cpu(iu.c.format);
+ vi->chunkformat = le16_to_cpu(copied.i_u.c.format);
if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL) {
erofs_err(sb, "unsupported chunk format %x of nid %llu",
vi->chunkformat, vi->nid);
@@ -176,22 +186,15 @@ static int erofs_read_inode(struct inode *inode)
vi->chunkbits = sb->s_blocksize_bits +
(vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK);
}
- inode_set_mtime_to_ts(inode,
- inode_set_atime_to_ts(inode, inode_get_ctime(inode)));
+ inode_set_atime_to_ts(inode,
+ inode_set_ctime_to_ts(inode, inode_get_mtime(inode)));
inode->i_flags &= ~S_DAX;
if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
(vi->datalayout == EROFS_INODE_FLAT_PLAIN ||
vi->datalayout == EROFS_INODE_CHUNK_BASED))
inode->i_flags |= S_DAX;
-
- if (!nblks)
- /* measure inode.i_blocks as generic filesystems */
- inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9;
- else
- inode->i_blocks = nblks << (sb->s_blocksize_bits - 9);
err_out:
- DBG_BUGON(err);
erofs_put_metabuf(&buf);
return err;
}
@@ -202,13 +205,10 @@ static int erofs_fill_inode(struct inode *inode)
int err;
trace_erofs_fill_inode(inode);
-
- /* read inode base data from disk */
err = erofs_read_inode(inode);
if (err)
return err;
- /* setup the new inode */
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_op = &erofs_generic_iops;
@@ -229,15 +229,10 @@ static int erofs_fill_inode(struct inode *inode)
inode->i_op = &erofs_symlink_iops;
inode_nohighmem(inode);
break;
- case S_IFCHR:
- case S_IFBLK:
- case S_IFIFO:
- case S_IFSOCK:
+ default:
inode->i_op = &erofs_generic_iops;
init_special_inode(inode, inode->i_mode, inode->i_rdev);
return 0;
- default:
- return -EFSCORRUPTED;
}
mapping_set_large_folios(inode->i_mapping);
@@ -318,6 +313,7 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
unsigned int query_flags)
{
struct inode *const inode = d_inode(path->dentry);
+ struct block_device *bdev = inode->i_sb->s_bdev;
bool compressed =
erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout);
@@ -330,15 +326,14 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
/*
* Return the DIO alignment restrictions if requested.
*
- * In EROFS, STATX_DIOALIGN is not supported in ondemand mode and
- * compressed files, so in these cases we report no DIO support.
+ * In EROFS, STATX_DIOALIGN is only supported in bdev-based mode
+ * and uncompressed inodes, otherwise we report no DIO support.
*/
if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
stat->result_mask |= STATX_DIOALIGN;
- if (!erofs_is_fscache_mode(inode->i_sb) && !compressed) {
- stat->dio_mem_align =
- bdev_logical_block_size(inode->i_sb->s_bdev);
- stat->dio_offset_align = stat->dio_mem_align;
+ if (bdev && !compressed) {
+ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+ stat->dio_offset_align = bdev_logical_block_size(bdev);
}
}
generic_fillattr(idmap, request_mask, inode, stat);
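
A note on the getattr hunk above: the memory-alignment bound now comes from bdev_dma_alignment(), which returns a mask (hence the +1), while the offset bound stays at the logical block size, and the fscache check is replaced by a plain bdev presence test. A minimal userspace sketch of how these values surface, assuming Linux 6.1+ UAPI headers that define STATX_DIOALIGN:

/*
 * Editorial sketch (not part of the patch): observe the values filled in
 * by erofs_getattr() above via statx(2) with STATX_DIOALIGN.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct statx stx;

	if (argc < 2 || statx(AT_FDCWD, argv[1], 0, STATX_DIOALIGN, &stx))
		return 1;
	if (!(stx.stx_mask & STATX_DIOALIGN) || !stx.stx_dio_mem_align)
		return 2;	/* filesystem reports no O_DIRECT support */
	printf("dio_mem_align=%u dio_offset_align=%u\n",
	       stx.stx_dio_mem_align, stx.stx_dio_offset_align);
	return 0;
}
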
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 4efd578d7c62..a32c03a80c70 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -20,18 +20,12 @@
#include <linux/iomap.h>
#include "erofs_fs.h"
-/* redefine pr_fmt "erofs: " */
-#undef pr_fmt
-#define pr_fmt(fmt) "erofs: " fmt
-
-__printf(3, 4) void _erofs_err(struct super_block *sb,
- const char *function, const char *fmt, ...);
+__printf(2, 3) void _erofs_printk(struct super_block *sb, const char *fmt, ...);
#define erofs_err(sb, fmt, ...) \
- _erofs_err(sb, __func__, fmt "\n", ##__VA_ARGS__)
-__printf(3, 4) void _erofs_info(struct super_block *sb,
- const char *function, const char *fmt, ...);
+ _erofs_printk(sb, KERN_ERR fmt "\n", ##__VA_ARGS__)
#define erofs_info(sb, fmt, ...) \
- _erofs_info(sb, __func__, fmt "\n", ##__VA_ARGS__)
+ _erofs_printk(sb, KERN_INFO fmt "\n", ##__VA_ARGS__)
+
#ifdef CONFIG_EROFS_FS_DEBUG
#define DBG_BUGON BUG_ON
#else
@@ -43,18 +37,17 @@ __printf(3, 4) void _erofs_info(struct super_block *sb,
typedef u64 erofs_nid_t;
typedef u64 erofs_off_t;
-/* data type for filesystem-wide blocks number */
-typedef u32 erofs_blk_t;
+typedef u64 erofs_blk_t;
struct erofs_device_info {
char *path;
struct erofs_fscache *fscache;
struct file *file;
struct dax_device *dax_dev;
- u64 dax_part_off;
+ u64 fsoff, dax_part_off;
- u32 blocks;
- u32 mapped_blkaddr;
+ erofs_blk_t blocks;
+ erofs_blk_t uniaddr;
};
enum {
@@ -113,6 +106,7 @@ struct erofs_xattr_prefix_item {
};
struct erofs_sb_info {
+ struct erofs_device_info dif0;
struct erofs_mount_opts opt; /* options */
#ifdef CONFIG_EROFS_FS_ZIP
/* list for all registered superblocks, mainly for shrinker */
@@ -130,13 +124,9 @@ struct erofs_sb_info {
struct erofs_sb_lz4_info lz4;
#endif /* CONFIG_EROFS_FS_ZIP */
- struct file *fdev;
struct inode *packed_inode;
struct erofs_dev_context *devs;
- struct dax_device *dax_dev;
- u64 dax_part_off;
u64 total_blocks;
- u32 primarydevice_blocks;
u32 meta_blkaddr;
#ifdef CONFIG_EROFS_FS_XATTR
@@ -152,8 +142,8 @@ struct erofs_sb_info {
unsigned char blkszbits; /* filesystem block size in bit shift */
u32 sb_size; /* total superblock size */
- u32 build_time_nsec;
- u64 build_time;
+ u32 fixed_nsec;
+ s64 epoch;
/* what we really care is nid, rather than ino.. */
erofs_nid_t root_nid;
@@ -161,8 +151,6 @@ struct erofs_sb_info {
/* used for statfs, f_files - f_favail */
u64 inos;
- u8 uuid[16]; /* 128-bit uuid for volume */
- u8 volume_name[16]; /* volume name */
u32 feature_compat;
u32 feature_incompat;
@@ -172,7 +160,6 @@ struct erofs_sb_info {
/* fscache support */
struct fscache_volume *volume;
- struct erofs_fscache *s_fscache;
struct erofs_domain *domain;
char *fsid;
char *domain_id;
@@ -186,6 +173,7 @@ struct erofs_sb_info {
#define EROFS_MOUNT_POSIX_ACL 0x00000020
#define EROFS_MOUNT_DAX_ALWAYS 0x00000040
#define EROFS_MOUNT_DAX_NEVER 0x00000080
+#define EROFS_MOUNT_DIRECT_IO 0x00000100
#define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option)
#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option)
@@ -193,7 +181,7 @@ struct erofs_sb_info {
static inline bool erofs_is_fileio_mode(struct erofs_sb_info *sbi)
{
- return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->fdev;
+ return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->dif0.file;
}
static inline bool erofs_is_fscache_mode(struct super_block *sb)
@@ -208,27 +196,17 @@ enum {
EROFS_ZIP_CACHE_READAROUND
};
-/* basic unit of the workstation of a super_block */
-struct erofs_workgroup {
- pgoff_t index;
- struct lockref lockref;
-};
-
-enum erofs_kmap_type {
- EROFS_NO_KMAP, /* don't map the buffer */
- EROFS_KMAP, /* use kmap_local_page() to map the buffer */
-};
-
struct erofs_buf {
struct address_space *mapping;
+ struct file *file;
+ u64 off;
struct page *page;
void *base;
- enum erofs_kmap_type kmap_type;
};
#define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL })
-#define erofs_blknr(sb, addr) ((erofs_blk_t)((addr) >> (sb)->s_blocksize_bits))
-#define erofs_blkoff(sb, addr) ((addr) & ((sb)->s_blocksize - 1))
+#define erofs_blknr(sb, pos) ((erofs_blk_t)((pos) >> (sb)->s_blocksize_bits))
+#define erofs_blkoff(sb, pos) ((pos) & ((sb)->s_blocksize - 1))
#define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits)
#define erofs_iblks(i) (round_up((i)->i_size, i_blocksize(i)) >> (i)->i_blkbits)
@@ -248,6 +226,7 @@ EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING)
EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS)
EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE)
EROFS_FEATURE_FUNCS(xattr_prefixes, incompat, INCOMPAT_XATTR_PREFIXES)
+EROFS_FEATURE_FUNCS(48bit, incompat, INCOMPAT_48BIT)
EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER)
@@ -267,6 +246,7 @@ struct erofs_inode {
unsigned char datalayout;
unsigned char inode_isize;
+ bool dot_omitted;
unsigned int xattr_isize;
unsigned int xattr_name_filter;
@@ -274,7 +254,7 @@ struct erofs_inode {
unsigned int *xattr_shared_xattrs;
union {
- erofs_blk_t raw_blkaddr;
+ erofs_blk_t startblk;
struct {
unsigned short chunkformat;
unsigned char chunkbits;
@@ -283,15 +263,13 @@ struct erofs_inode {
struct {
unsigned short z_advise;
unsigned char z_algorithmtype[2];
- unsigned char z_logical_clusterbits;
- unsigned long z_tailextent_headlcn;
+ unsigned char z_lclusterbits;
union {
- struct {
- erofs_off_t z_idataoff;
- unsigned short z_idata_size;
- };
- erofs_off_t z_fragmentoff;
+ u64 z_tailextent_headlcn;
+ u64 z_extents;
};
+ erofs_off_t z_fragmentoff;
+ unsigned short z_idata_size;
};
#endif /* CONFIG_EROFS_FS_ZIP */
};
@@ -369,11 +347,9 @@ enum {
};
struct erofs_map_dev {
- struct erofs_fscache *m_fscache;
+ struct super_block *m_sb;
+ struct erofs_device_info *m_dif;
struct block_device *m_bdev;
- struct dax_device *m_daxdev;
- struct file *m_fp;
- u64 m_dax_part_off;
erofs_off_t m_pa;
unsigned int m_deviceid;
@@ -404,11 +380,10 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
erofs_off_t *offset, int *lengthp);
void erofs_unmap_metabuf(struct erofs_buf *buf);
void erofs_put_metabuf(struct erofs_buf *buf);
-void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,
- enum erofs_kmap_type type);
+void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap);
void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb);
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
- erofs_off_t offset, enum erofs_kmap_type type);
+ erofs_off_t offset, bool need_kmap);
int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
@@ -456,20 +431,18 @@ static inline void erofs_pagepool_add(struct page **pagepool, struct page *page)
void erofs_release_pages(struct page **pagepool);
#ifdef CONFIG_EROFS_FS_ZIP
-void erofs_workgroup_put(struct erofs_workgroup *grp);
-struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
- pgoff_t index);
-struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
- struct erofs_workgroup *grp);
-void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
+#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
+
+extern atomic_long_t erofs_global_shrink_cnt;
void erofs_shrinker_register(struct super_block *sb);
void erofs_shrinker_unregister(struct super_block *sb);
int __init erofs_init_shrinker(void);
void erofs_exit_shrinker(void);
int __init z_erofs_init_subsystem(void);
void z_erofs_exit_subsystem(void);
-int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
- struct erofs_workgroup *egrp);
+int z_erofs_init_super(struct super_block *sb);
+unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi,
+ unsigned long nr_shrink);
int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
int flags);
void *z_erofs_get_gbuf(unsigned int requiredpages);
@@ -477,7 +450,6 @@ void z_erofs_put_gbuf(void *ptr);
int z_erofs_gbuf_growsize(unsigned int nrpages);
int __init z_erofs_gbuf_init(void);
void z_erofs_gbuf_exit(void);
-int erofs_init_managed_cache(struct super_block *sb);
int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb);
#else
static inline void erofs_shrinker_register(struct super_block *sb) {}
@@ -486,7 +458,7 @@ static inline int erofs_init_shrinker(void) { return 0; }
static inline void erofs_exit_shrinker(void) {}
static inline int z_erofs_init_subsystem(void) { return 0; }
static inline void z_erofs_exit_subsystem(void) {}
-static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; }
+static inline int z_erofs_init_super(struct super_block *sb) { return 0; }
#endif /* !CONFIG_EROFS_FS_ZIP */
#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
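
With enum erofs_kmap_type gone, metadata readers simply pass a need_kmap bool (true where EROFS_KMAP was used, false for EROFS_NO_KMAP). A minimal sketch of the resulting calling convention; the function name is illustrative, but the pattern mirrors the call sites later in this diff:

static int peek_first_block(struct super_block *sb)
{
	struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
	void *ptr;

	ptr = erofs_read_metabuf(&buf, sb, 0, true);	/* true: kmap it */
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);
	/* ... parse the mapped block at ptr ... */
	erofs_put_metabuf(&buf);
	return 0;
}
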
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index c94d0c1608a8..f7cf4f41af28 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -100,7 +100,7 @@ static void *erofs_find_target_block(struct erofs_buf *target,
struct erofs_dirent *de;
buf.mapping = dir->i_mapping;
- de = erofs_bread(&buf, erofs_pos(dir->i_sb, mid), EROFS_KMAP);
+ de = erofs_bread(&buf, erofs_pos(dir->i_sb, mid), true);
if (!IS_ERR(de)) {
const int nameoff = nameoff_from_disk(de->nameoff, bsz);
const int ndirents = nameoff / sizeof(*de);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 320d586c3896..e1e9f06e8342 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -18,65 +18,42 @@
static struct kmem_cache *erofs_inode_cachep __read_mostly;
-void _erofs_err(struct super_block *sb, const char *func, const char *fmt, ...)
+void _erofs_printk(struct super_block *sb, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
+ int level;
va_start(args, fmt);
- vaf.fmt = fmt;
+ level = printk_get_level(fmt);
+ vaf.fmt = printk_skip_level(fmt);
vaf.va = &args;
-
- if (sb)
- pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf);
- else
- pr_err("%s: %pV", func, &vaf);
- va_end(args);
-}
-
-void _erofs_info(struct super_block *sb, const char *func, const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
if (sb)
- pr_info("(device %s): %pV", sb->s_id, &vaf);
+ printk("%c%cerofs (device %s): %pV",
+ KERN_SOH_ASCII, level, sb->s_id, &vaf);
else
- pr_info("%pV", &vaf);
+ printk("%c%cerofs: %pV", KERN_SOH_ASCII, level, &vaf);
va_end(args);
}
static int erofs_superblock_csum_verify(struct super_block *sb, void *sbdata)
{
- size_t len = 1 << EROFS_SB(sb)->blkszbits;
- struct erofs_super_block *dsb;
- u32 expected_crc, crc;
+ struct erofs_super_block *dsb = sbdata + EROFS_SUPER_OFFSET;
+ u32 len = 1 << EROFS_SB(sb)->blkszbits, crc;
if (len > EROFS_SUPER_OFFSET)
len -= EROFS_SUPER_OFFSET;
+ len -= offsetof(struct erofs_super_block, checksum) +
+ sizeof(dsb->checksum);
- dsb = kmemdup(sbdata + EROFS_SUPER_OFFSET, len, GFP_KERNEL);
- if (!dsb)
- return -ENOMEM;
-
- expected_crc = le32_to_cpu(dsb->checksum);
- dsb->checksum = 0;
- /* to allow for x86 boot sectors and other oddities. */
- crc = crc32c(~0, dsb, len);
- kfree(dsb);
-
- if (crc != expected_crc) {
- erofs_err(sb, "invalid checksum 0x%08x, 0x%08x expected",
- crc, expected_crc);
- return -EBADMSG;
- }
- return 0;
+ /* skip .magic(pre-verified) and .checksum(0) fields */
+ crc = crc32c(0x5045B54A, (&dsb->checksum) + 1, len);
+ if (crc == le32_to_cpu(dsb->checksum))
+ return 0;
+ erofs_err(sb, "invalid checksum 0x%08x, 0x%08x expected",
+ crc, le32_to_cpu(dsb->checksum));
+ return -EBADMSG;
}
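
One clarifying note on the rewritten verifier: crc32c() is incremental, so the constant 0x5045B54A is evidently the CRC state after feeding the first eight on-disk bytes (the pre-verified magic plus a zeroed checksum field) from the old ~0 seed; hashing the remainder starting at &dsb->checksum + 1 then matches the old kmemdup-and-zero pass over the whole range, without the temporary copy. An editorial sketch of that equivalence, assuming the standard erofs_fs.h magic value:

	/* seed == crc32c(~0, { le32 magic, le32 0 }, 8), so that
	 * crc32c(seed, after_checksum, len - 8) == old whole-range CRC */
	__le32 prefix[2] = { cpu_to_le32(EROFS_SUPER_MAGIC_V1), 0 };
	u32 seed = crc32c(~0, prefix, sizeof(prefix));	/* expect 0x5045B54A */
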
static void erofs_inode_init_once(void *ptr)
@@ -117,7 +94,7 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
int len, i, cnt;
*offset = round_up(*offset, 4);
- ptr = erofs_bread(buf, *offset, EROFS_KMAP);
+ ptr = erofs_bread(buf, *offset, true);
if (IS_ERR(ptr))
return ptr;
@@ -133,7 +110,7 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
for (i = 0; i < len; i += cnt) {
cnt = min_t(int, sb->s_blocksize - erofs_blkoff(sb, *offset),
len - i);
- ptr = erofs_bread(buf, *offset, EROFS_KMAP);
+ ptr = erofs_bread(buf, *offset, true);
if (IS_ERR(ptr)) {
kfree(buffer);
return ptr;
@@ -164,7 +141,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
struct erofs_deviceslot *dis;
struct file *file;
- dis = erofs_read_metabuf(buf, sb, *pos, EROFS_KMAP);
+ dis = erofs_read_metabuf(buf, sb, *pos, true);
if (IS_ERR(dis))
return PTR_ERR(dis);
@@ -188,8 +165,11 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
filp_open(dif->path, O_RDONLY | O_LARGEFILE, 0) :
bdev_file_open_by_path(dif->path,
BLK_OPEN_READ, sb->s_type, NULL);
- if (IS_ERR(file))
+ if (IS_ERR(file)) {
+ if (file == ERR_PTR(-ENOTBLK))
+ return -EINVAL;
return PTR_ERR(file);
+ }
if (!erofs_is_fileio_mode(sbi)) {
dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file),
@@ -201,8 +181,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
dif->file = file;
}
- dif->blocks = le32_to_cpu(dis->blocks);
- dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr);
+ dif->blocks = le32_to_cpu(dis->blocks_lo);
+ dif->uniaddr = le32_to_cpu(dis->uniaddr_lo);
sbi->total_blocks += dif->blocks;
*pos += EROFS_DEVT_SLOT_SIZE;
return 0;
@@ -218,7 +198,7 @@ static int erofs_scan_devices(struct super_block *sb,
struct erofs_device_info *dif;
int id, err = 0;
- sbi->total_blocks = sbi->primarydevice_blocks;
+ sbi->total_blocks = sbi->dif0.blocks;
if (!erofs_sb_has_device_table(sbi))
ondisk_extradevs = 0;
else
@@ -278,7 +258,7 @@ static int erofs_read_superblock(struct super_block *sb)
void *data;
int ret;
- data = erofs_read_metabuf(&buf, sb, 0, EROFS_KMAP);
+ data = erofs_read_metabuf(&buf, sb, 0, true);
if (IS_ERR(data)) {
erofs_err(sb, "cannot read erofs superblock");
return PTR_ERR(data);
@@ -291,7 +271,7 @@ static int erofs_read_superblock(struct super_block *sb)
goto out;
}
- sbi->blkszbits = dsb->blkszbits;
+ sbi->blkszbits = dsb->blkszbits;
if (sbi->blkszbits < 9 || sbi->blkszbits > PAGE_SHIFT) {
erofs_err(sb, "blkszbits %u isn't supported", sbi->blkszbits);
goto out;
@@ -322,7 +302,7 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->sb_size);
goto out;
}
- sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks);
+ sbi->dif0.blocks = le32_to_cpu(dsb->blocks_lo);
sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr);
#ifdef CONFIG_EROFS_FS_XATTR
sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr);
@@ -331,23 +311,20 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->xattr_filter_reserved = dsb->xattr_filter_reserved;
#endif
sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact));
- sbi->root_nid = le16_to_cpu(dsb->root_nid);
+ if (erofs_sb_has_48bit(sbi) && dsb->rootnid_8b) {
+ sbi->root_nid = le64_to_cpu(dsb->rootnid_8b);
+ sbi->dif0.blocks = (sbi->dif0.blocks << 32) |
+ le16_to_cpu(dsb->rb.blocks_hi);
+ } else {
+ sbi->root_nid = le16_to_cpu(dsb->rb.rootnid_2b);
+ }
sbi->packed_nid = le64_to_cpu(dsb->packed_nid);
sbi->inos = le64_to_cpu(dsb->inos);
- sbi->build_time = le64_to_cpu(dsb->build_time);
- sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec);
-
+ sbi->epoch = (s64)le64_to_cpu(dsb->epoch);
+ sbi->fixed_nsec = le32_to_cpu(dsb->fixed_nsec);
super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid));
- ret = strscpy(sbi->volume_name, dsb->volume_name,
- sizeof(dsb->volume_name));
- if (ret < 0) { /* -E2BIG */
- erofs_err(sb, "bad volume name without NIL terminator");
- ret = -EFSCORRUPTED;
- goto out;
- }
-
/* parse on-disk compression configurations */
ret = z_erofs_parse_cfgs(sb, dsb);
if (ret < 0)
@@ -356,6 +333,8 @@ static int erofs_read_superblock(struct super_block *sb)
/* handle multiple devices */
ret = erofs_scan_devices(sb, dsb);
+ if (erofs_sb_has_48bit(sbi))
+ erofs_info(sb, "EXPERIMENTAL 48-bit layout support in use. Use at your own risk!");
if (erofs_is_fscache_mode(sb))
erofs_info(sb, "[deprecated] fscache-based on-demand read feature in use. Use at your own risk!");
out:
@@ -379,15 +358,8 @@ static void erofs_default_options(struct erofs_sb_info *sbi)
}
enum {
- Opt_user_xattr,
- Opt_acl,
- Opt_cache_strategy,
- Opt_dax,
- Opt_dax_enum,
- Opt_device,
- Opt_fsid,
- Opt_domain_id,
- Opt_err
+ Opt_user_xattr, Opt_acl, Opt_cache_strategy, Opt_dax, Opt_dax_enum,
+ Opt_device, Opt_fsid, Opt_domain_id, Opt_directio, Opt_fsoffset,
};
static const struct constant_table erofs_param_cache_strategy[] = {
@@ -413,6 +385,8 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
fsparam_string("device", Opt_device),
fsparam_string("fsid", Opt_fsid),
fsparam_string("domain_id", Opt_domain_id),
+ fsparam_flag_no("directio", Opt_directio),
+ fsparam_u64("fsoffset", Opt_fsoffset),
{}
};
@@ -526,30 +500,69 @@ static int erofs_fc_parse_param(struct fs_context *fc,
errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name);
break;
#endif
- default:
- return -ENOPARAM;
+ case Opt_directio:
+#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
+ if (result.boolean)
+ set_opt(&sbi->opt, DIRECT_IO);
+ else
+ clear_opt(&sbi->opt, DIRECT_IO);
+#else
+ errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name);
+#endif
+ break;
+ case Opt_fsoffset:
+ sbi->dif0.fsoff = result.uint_64;
+ break;
}
return 0;
}
-static struct inode *erofs_nfs_get_inode(struct super_block *sb,
- u64 ino, u32 generation)
+static int erofs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+ struct inode *parent)
{
- return erofs_iget(sb, ino);
+ erofs_nid_t nid = EROFS_I(inode)->nid;
+ int len = parent ? 6 : 3;
+
+ if (*max_len < len) {
+ *max_len = len;
+ return FILEID_INVALID;
+ }
+
+ fh[0] = (u32)(nid >> 32);
+ fh[1] = (u32)(nid & 0xffffffff);
+ fh[2] = inode->i_generation;
+
+ if (parent) {
+ nid = EROFS_I(parent)->nid;
+
+ fh[3] = (u32)(nid >> 32);
+ fh[4] = (u32)(nid & 0xffffffff);
+ fh[5] = parent->i_generation;
+ }
+
+ *max_len = len;
+ return parent ? FILEID_INO64_GEN_PARENT : FILEID_INO64_GEN;
}
static struct dentry *erofs_fh_to_dentry(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type)
{
- return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
- erofs_nfs_get_inode);
+ if ((fh_type != FILEID_INO64_GEN &&
+ fh_type != FILEID_INO64_GEN_PARENT) || fh_len < 3)
+ return NULL;
+
+ return d_obtain_alias(erofs_iget(sb,
+ ((u64)fid->raw[0] << 32) | fid->raw[1]));
}
static struct dentry *erofs_fh_to_parent(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type)
{
- return generic_fh_to_parent(sb, fid, fh_len, fh_type,
- erofs_nfs_get_inode);
+ if (fh_type != FILEID_INO64_GEN_PARENT || fh_len < 6)
+ return NULL;
+
+ return d_obtain_alias(erofs_iget(sb,
+ ((u64)fid->raw[3] << 32) | fid->raw[4]));
}
static struct dentry *erofs_get_parent(struct dentry *child)
@@ -565,7 +578,7 @@ static struct dentry *erofs_get_parent(struct dentry *child)
}
static const struct export_operations erofs_export_ops = {
- .encode_fh = generic_encode_ino32_fh,
+ .encode_fh = erofs_encode_fh,
.fh_to_dentry = erofs_fh_to_dentry,
.fh_to_parent = erofs_fh_to_parent,
.get_parent = erofs_get_parent,
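
For reference, the handle layout implied by erofs_encode_fh() is three u32 words per object (nid split high/low, then i_generation), with a parent triple appended for FILEID_INO64_GEN_PARENT handles. A small editorial sketch of the packing round-trip:

/* Sketch of the nid <-> fh word packing used by the handlers above. */
static inline void erofs_fh_pack(u32 *fh, u64 nid, u32 gen)
{
	fh[0] = nid >> 32;
	fh[1] = nid & 0xffffffff;
	fh[2] = gen;
}

static inline u64 erofs_fh_unpack(const u32 *fh)
{
	return ((u64)fh[0] << 32) | fh[1];
}
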
@@ -617,9 +630,8 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
return -EINVAL;
}
- sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev,
- &sbi->dax_part_off,
- NULL, NULL);
+ sbi->dif0.dax_dev = fs_dax_get_by_bdev(sb->s_bdev,
+ &sbi->dif0.dax_part_off, NULL, NULL);
}
err = erofs_read_superblock(sb);
@@ -631,14 +643,26 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
errorfc(fc, "unsupported blksize for fscache mode");
return -EINVAL;
}
- if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) {
+
+ if (erofs_is_fileio_mode(sbi)) {
+ sb->s_blocksize = 1 << sbi->blkszbits;
+ sb->s_blocksize_bits = sbi->blkszbits;
+ } else if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) {
errorfc(fc, "failed to set erofs blksize");
return -EINVAL;
}
}
+ if (sbi->dif0.fsoff) {
+ if (sbi->dif0.fsoff & (sb->s_blocksize - 1))
+ return invalfc(fc, "fsoffset %llu is not aligned to block size %lu",
+ sbi->dif0.fsoff, sb->s_blocksize);
+ if (erofs_is_fscache_mode(sb))
+ return invalfc(fc, "cannot use fsoffset in fscache mode");
+ }
+
if (test_opt(&sbi->opt, DAX_ALWAYS)) {
- if (!sbi->dax_dev) {
+ if (!sbi->dif0.dax_dev) {
errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
clear_opt(&sbi->opt, DAX_ALWAYS);
} else if (sbi->blkszbits != PAGE_SHIFT) {
@@ -656,9 +680,16 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
else
sb->s_flags &= ~SB_POSIXACL;
-#ifdef CONFIG_EROFS_FS_ZIP
- xa_init(&sbi->managed_pslots);
-#endif
+ err = z_erofs_init_super(sb);
+ if (err)
+ return err;
+
+ if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) {
+ inode = erofs_iget(sb, sbi->packed_nid);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ sbi->packed_inode = inode;
+ }
inode = erofs_iget(sb, sbi->root_nid);
if (IS_ERR(inode))
@@ -670,24 +701,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
iput(inode);
return -EINVAL;
}
-
sb->s_root = d_make_root(inode);
if (!sb->s_root)
return -ENOMEM;
erofs_shrinker_register(sb);
- if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) {
- sbi->packed_inode = erofs_iget(sb, sbi->packed_nid);
- if (IS_ERR(sbi->packed_inode)) {
- err = PTR_ERR(sbi->packed_inode);
- sbi->packed_inode = NULL;
- return err;
- }
- }
- err = erofs_init_managed_cache(sb);
- if (err)
- return err;
-
err = erofs_xattr_prefixes_init(sb);
if (err)
return err;
@@ -709,19 +727,23 @@ static int erofs_fc_get_tree(struct fs_context *fc)
if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid)
return get_tree_nodev(fc, erofs_fc_fill_super);
- ret = get_tree_bdev(fc, erofs_fc_fill_super);
+ ret = get_tree_bdev_flags(fc, erofs_fc_fill_super,
+ IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) ?
+ GET_TREE_BDEV_QUIET_LOOKUP : 0);
#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
if (ret == -ENOTBLK) {
+ struct file *file;
+
if (!fc->source)
return invalf(fc, "No source specified");
- sbi->fdev = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0);
- if (IS_ERR(sbi->fdev))
- return PTR_ERR(sbi->fdev);
+ file = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+ sbi->dif0.file = file;
- if (S_ISREG(file_inode(sbi->fdev)->i_mode) &&
- sbi->fdev->f_mapping->a_ops->read_folio)
+ if (S_ISREG(file_inode(sbi->dif0.file)->i_mode) &&
+ sbi->dif0.file->f_mapping->a_ops->read_folio)
return get_tree_nodev(fc, erofs_fc_fill_super);
- fput(sbi->fdev);
}
#endif
return ret;
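
In other words, when the source turns out not to be a block device, the file-backed path opens it as a regular file and mounts via get_tree_nodev(), so no loop device is needed. A hedged userspace sketch, assuming a kernel built with CONFIG_EROFS_FS_BACKED_BY_FILE=y (paths are hypothetical):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* a plain image file works directly as the mount source */
	if (mount("/srv/rootfs.erofs", "/mnt", "erofs", MS_RDONLY, NULL)) {
		perror("mount");
		return 1;
	}
	return 0;
}
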
@@ -772,19 +794,24 @@ static void erofs_free_dev_context(struct erofs_dev_context *devs)
kfree(devs);
}
-static void erofs_fc_free(struct fs_context *fc)
+static void erofs_sb_free(struct erofs_sb_info *sbi)
{
- struct erofs_sb_info *sbi = fc->s_fs_info;
-
- if (!sbi)
- return;
-
erofs_free_dev_context(sbi->devs);
kfree(sbi->fsid);
kfree(sbi->domain_id);
+ if (sbi->dif0.file)
+ fput(sbi->dif0.file);
kfree(sbi);
}
+static void erofs_fc_free(struct fs_context *fc)
+{
+ struct erofs_sb_info *sbi = fc->s_fs_info;
+
+ if (sbi) /* free here if an error occurs before transferring to sb */
+ erofs_sb_free(sbi);
+}
+
static const struct fs_context_operations erofs_context_ops = {
.parse_param = erofs_fc_parse_param,
.get_tree = erofs_fc_get_tree,
@@ -814,23 +841,29 @@ static int erofs_init_fs_context(struct fs_context *fc)
return 0;
}
+static void erofs_drop_internal_inodes(struct erofs_sb_info *sbi)
+{
+ iput(sbi->packed_inode);
+ sbi->packed_inode = NULL;
+#ifdef CONFIG_EROFS_FS_ZIP
+ iput(sbi->managed_cache);
+ sbi->managed_cache = NULL;
+#endif
+}
+
static void erofs_kill_sb(struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
- if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) || sbi->fdev)
+ if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) ||
+ sbi->dif0.file)
kill_anon_super(sb);
else
kill_block_super(sb);
-
- erofs_free_dev_context(sbi->devs);
- fs_put_dax(sbi->dax_dev, NULL);
+ erofs_drop_internal_inodes(sbi);
+ fs_put_dax(sbi->dif0.dax_dev, NULL);
erofs_fscache_unregister_fs(sb);
- kfree(sbi->fsid);
- kfree(sbi->domain_id);
- if (sbi->fdev)
- fput(sbi->fdev);
- kfree(sbi);
+ erofs_sb_free(sbi);
sb->s_fs_info = NULL;
}
@@ -838,17 +871,10 @@ static void erofs_put_super(struct super_block *sb)
{
struct erofs_sb_info *const sbi = EROFS_SB(sb);
- DBG_BUGON(!sbi);
-
erofs_unregister_sysfs(sb);
erofs_shrinker_unregister(sb);
erofs_xattr_prefixes_cleanup(sb);
-#ifdef CONFIG_EROFS_FS_ZIP
- iput(sbi->managed_cache);
- sbi->managed_cache = NULL;
-#endif
- iput(sbi->packed_inode);
- sbi->packed_inode = NULL;
+ erofs_drop_internal_inodes(sbi);
erofs_free_dev_context(sbi->devs);
sbi->devs = NULL;
erofs_fscache_unregister_fs(sb);
@@ -956,12 +982,16 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",dax=always");
if (test_opt(opt, DAX_NEVER))
seq_puts(seq, ",dax=never");
+ if (erofs_is_fileio_mode(sbi) && test_opt(opt, DIRECT_IO))
+ seq_puts(seq, ",directio");
#ifdef CONFIG_EROFS_FS_ONDEMAND
if (sbi->fsid)
seq_printf(seq, ",fsid=%s", sbi->fsid);
if (sbi->domain_id)
seq_printf(seq, ",domain_id=%s", sbi->domain_id);
#endif
+ if (sbi->dif0.fsoff)
+ seq_printf(seq, ",fsoffset=%llu", sbi->dif0.fsoff);
return 0;
}
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
index 63cffd0fd261..eed8797a193f 100644
--- a/fs/erofs/sysfs.c
+++ b/fs/erofs/sysfs.c
@@ -7,11 +7,14 @@
#include <linux/kobject.h>
#include "internal.h"
+#include "compress.h"
enum {
attr_feature,
+ attr_drop_caches,
attr_pointer_ui,
attr_pointer_bool,
+ attr_accel,
};
enum {
@@ -57,11 +60,24 @@ static struct erofs_attr erofs_attr_##_name = { \
#ifdef CONFIG_EROFS_FS_ZIP
EROFS_ATTR_RW_UI(sync_decompress, erofs_mount_opts);
+EROFS_ATTR_FUNC(drop_caches, 0200);
+#endif
+#ifdef CONFIG_EROFS_FS_ZIP_ACCEL
+EROFS_ATTR_FUNC(accel, 0644);
#endif
-static struct attribute *erofs_attrs[] = {
+static struct attribute *erofs_sb_attrs[] = {
#ifdef CONFIG_EROFS_FS_ZIP
ATTR_LIST(sync_decompress),
+ ATTR_LIST(drop_caches),
+#endif
+ NULL,
+};
+ATTRIBUTE_GROUPS(erofs_sb);
+
+static struct attribute *erofs_attrs[] = {
+#ifdef CONFIG_EROFS_FS_ZIP_ACCEL
+ ATTR_LIST(accel),
#endif
NULL,
};
@@ -78,6 +94,7 @@ EROFS_ATTR_FEATURE(sb_chksum);
EROFS_ATTR_FEATURE(ztailpacking);
EROFS_ATTR_FEATURE(fragments);
EROFS_ATTR_FEATURE(dedupe);
+EROFS_ATTR_FEATURE(48bit);
static struct attribute *erofs_feat_attrs[] = {
ATTR_LIST(zero_padding),
@@ -90,6 +107,7 @@ static struct attribute *erofs_feat_attrs[] = {
ATTR_LIST(ztailpacking),
ATTR_LIST(fragments),
ATTR_LIST(dedupe),
+ ATTR_LIST(48bit),
NULL,
};
ATTRIBUTE_GROUPS(erofs_feat);
@@ -123,12 +141,14 @@ static ssize_t erofs_attr_show(struct kobject *kobj,
if (!ptr)
return 0;
return sysfs_emit(buf, "%d\n", *(bool *)ptr);
+ case attr_accel:
+ return z_erofs_crypto_show_engines(buf, PAGE_SIZE, '\n');
}
return 0;
}
static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr,
- const char *buf, size_t len)
+ const char *buf, size_t len)
{
struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info,
s_kobj);
@@ -163,6 +183,33 @@ static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr,
return -EINVAL;
*(bool *)ptr = !!t;
return len;
+#ifdef CONFIG_EROFS_FS_ZIP
+ case attr_drop_caches:
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ if (t < 1 || t > 3)
+ return -EINVAL;
+
+ if (t & 2)
+ z_erofs_shrink_scan(sbi, ~0UL);
+ if (t & 1)
+ invalidate_mapping_pages(MNGD_MAPPING(sbi), 0, -1);
+ return len;
+#endif
+#ifdef CONFIG_EROFS_FS_ZIP_ACCEL
+ case attr_accel:
+ buf = skip_spaces(buf);
+ z_erofs_crypto_disable_all_engines();
+ while (*buf) {
+ t = strcspn(buf, "\n");
+ ret = z_erofs_crypto_enable_engine(buf, t);
+ if (ret < 0)
+ return ret;
+ buf += buf[t] != '\0' ? t + 1 : t;
+ }
+ return len;
+#endif
}
return 0;
}
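
Per the store handler above, drop_caches takes a bitmask: 1 invalidates cached compressed folios of the managed inode, 2 releases in-memory pclusters via z_erofs_shrink_scan(), and 3 does both. A hedged usage sketch; the sysfs path shape is assumed from the per-sb kobject naming:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int erofs_drop_caches(const char *devname)	/* e.g. "sda1" */
{
	char path[128];
	int fd, ret;

	snprintf(path, sizeof(path), "/sys/fs/erofs/%s/drop_caches", devname);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	ret = (write(fd, "3", 1) == 1) ? 0 : -1;	/* 1|2: folios + pclusters */
	close(fd);
	return ret;
}
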
@@ -180,12 +227,13 @@ static const struct sysfs_ops erofs_attr_ops = {
};
static const struct kobj_type erofs_sb_ktype = {
- .default_groups = erofs_groups,
+ .default_groups = erofs_sb_groups,
.sysfs_ops = &erofs_attr_ops,
.release = erofs_sb_release,
};
static const struct kobj_type erofs_ktype = {
+ .default_groups = erofs_groups,
.sysfs_ops = &erofs_attr_ops,
};
@@ -229,6 +277,12 @@ void erofs_unregister_sysfs(struct super_block *sb)
}
}
+void erofs_exit_sysfs(void)
+{
+ kobject_put(&erofs_feat);
+ kset_unregister(&erofs_root);
+}
+
int __init erofs_init_sysfs(void)
{
int ret;
@@ -236,24 +290,12 @@ int __init erofs_init_sysfs(void)
kobject_set_name(&erofs_root.kobj, "erofs");
erofs_root.kobj.parent = fs_kobj;
ret = kset_register(&erofs_root);
- if (ret)
- goto root_err;
-
- ret = kobject_init_and_add(&erofs_feat, &erofs_feat_ktype,
- NULL, "features");
- if (ret)
- goto feat_err;
- return ret;
-
-feat_err:
- kobject_put(&erofs_feat);
- kset_unregister(&erofs_root);
-root_err:
+ if (!ret) {
+ ret = kobject_init_and_add(&erofs_feat, &erofs_feat_ktype,
+ NULL, "features");
+ if (!ret)
+ return 0;
+ erofs_exit_sysfs();
+ }
return ret;
}
-
-void erofs_exit_sysfs(void)
-{
- kobject_put(&erofs_feat);
- kset_unregister(&erofs_root);
-}
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index a90d7d649739..9cf84717a92e 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -81,7 +81,7 @@ static int erofs_init_inode_xattrs(struct inode *inode)
it.pos = erofs_iloc(inode) + vi->inode_isize;
/* read in shared xattr array (non-atomic, see kmalloc below) */
- it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP);
+ it.kaddr = erofs_bread(&it.buf, it.pos, true);
if (IS_ERR(it.kaddr)) {
ret = PTR_ERR(it.kaddr);
goto out_unlock;
@@ -102,7 +102,7 @@ static int erofs_init_inode_xattrs(struct inode *inode)
it.pos += sizeof(struct erofs_xattr_ibody_header);
for (i = 0; i < vi->xattr_shared_count; ++i) {
- it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP);
+ it.kaddr = erofs_bread(&it.buf, it.pos, true);
if (IS_ERR(it.kaddr)) {
kfree(vi->xattr_shared_xattrs);
vi->xattr_shared_xattrs = NULL;
@@ -183,7 +183,7 @@ static int erofs_xattr_copy_to_buffer(struct erofs_xattr_iter *it,
void *src;
for (processed = 0; processed < len; processed += slice) {
- it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, true);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
@@ -286,7 +286,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it)
/* 2. handle xattr name */
for (processed = 0; processed < entry.e_name_len; processed += slice) {
- it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, true);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
@@ -330,7 +330,7 @@ static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it,
it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz;
while (remaining) {
- it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, true);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
@@ -367,7 +367,7 @@ static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it,
for (i = 0; i < vi->xattr_shared_count; ++i) {
it->pos = erofs_pos(sb, sbi->xattr_blkaddr) +
vi->xattr_shared_xattrs[i] * sizeof(__le32);
- it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, true);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
@@ -407,7 +407,7 @@ int erofs_getxattr(struct inode *inode, int index, const char *name,
}
it.index = index;
- it.name = (struct qstr)QSTR_INIT(name, strlen(name));
+ it.name = QSTR(name);
if (it.name.len > EROFS_NAME_LEN)
return -ERANGE;
@@ -478,7 +478,7 @@ int erofs_xattr_prefixes_init(struct super_block *sb)
if (!sbi->xattr_prefix_count)
return 0;
- pfs = kzalloc(sbi->xattr_prefix_count * sizeof(*pfs), GFP_KERNEL);
+ pfs = kcalloc(sbi->xattr_prefix_count, sizeof(*pfs), GFP_KERNEL);
if (!pfs)
return -ENOMEM;
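
The kzalloc(n * size) to kcalloc(n, size) switch above is a hardening fix: both zero the allocation, but kcalloc rejects a product that would overflow instead of wrapping to a too-small buffer. The guard is roughly this (editorial sketch, not the actual slab implementation):

/* What kcalloc() adds over kzalloc(n * size), approximately. */
static inline void *kcalloc_like(size_t n, size_t size, gfp_t gfp)
{
	size_t bytes;

	if (check_mul_overflow(n, size, &bytes))
		return NULL;	/* kzalloc(n * size) would silently wrap */
	return kzalloc(bytes, gfp);
}
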
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index a569ff9dfd04..fe8071844724 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -12,12 +12,6 @@
#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
#define Z_EROFS_INLINE_BVECS 2
-/*
- * let's leave a type here in case of introducing
- * another tagged pointer later.
- */
-typedef void *z_erofs_next_pcluster_t;
-
struct z_erofs_bvec {
struct page *page;
int offset;
@@ -44,11 +38,14 @@ __Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);
* A: Field should be accessed / updated in atomic for parallelized code.
*/
struct z_erofs_pcluster {
- struct erofs_workgroup obj;
struct mutex lock;
+ struct lockref lockref;
/* A: point to next chained pcluster or TAILs */
- z_erofs_next_pcluster_t next;
+ struct z_erofs_pcluster *next;
+
+ /* I: start physical position of this pcluster */
+ erofs_off_t pos;
/* L: the maximum decompression size of this round */
unsigned int length;
@@ -76,12 +73,12 @@ struct z_erofs_pcluster {
/* I: compression algorithm format */
unsigned char algorithmformat;
+ /* I: whether compressed data is in-lined or not */
+ bool from_meta;
+
/* L: whether partial decompression or not */
bool partial;
- /* L: indicate several pageofs_outs or not */
- bool multibases;
-
/* L: whether extra buffer allocations are best-effort */
bool besteffort;
@@ -91,12 +88,11 @@ struct z_erofs_pcluster {
/* the end of a chain of pclusters */
#define Z_EROFS_PCLUSTER_TAIL ((void *) 0x700 + POISON_POINTER_DELTA)
-#define Z_EROFS_PCLUSTER_NIL (NULL)
struct z_erofs_decompressqueue {
struct super_block *sb;
+ struct z_erofs_pcluster *head;
atomic_t pending_bios;
- z_erofs_next_pcluster_t head;
union {
struct completion done;
@@ -106,17 +102,11 @@ struct z_erofs_decompressqueue {
bool eio, sync;
};
-static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
-{
- return !pcl->obj.index;
-}
-
static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
{
- return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT;
+ return PAGE_ALIGN(pcl->pageofs_in + pcl->pclustersize) >> PAGE_SHIFT;
}
-#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo)
{
return fo->mapping == MNGD_MAPPING(sbi);
@@ -138,7 +128,7 @@ struct z_erofs_pcluster_slab {
static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
_PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128),
- _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
+ _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES + 1)
};
struct z_erofs_bvec_iter {
@@ -272,7 +262,6 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size)
pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL);
if (!pcl)
return ERR_PTR(-ENOMEM);
- pcl->pclustersize = size;
return pcl;
}
return ERR_PTR(-EINVAL);
@@ -299,6 +288,7 @@ static struct workqueue_struct *z_erofs_workqueue __read_mostly;
#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
static struct kthread_worker __rcu **z_erofs_pcpu_workers;
+static atomic_t erofs_percpu_workers_initialized = ATOMIC_INIT(0);
static void erofs_destroy_percpu_workers(void)
{
@@ -318,7 +308,7 @@ static void erofs_destroy_percpu_workers(void)
static struct kthread_worker *erofs_init_percpu_worker(int cpu)
{
struct kthread_worker *worker =
- kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u", cpu);
+ kthread_run_worker_on_cpu(cpu, 0, "erofs_worker/%u");
if (IS_ERR(worker))
return worker;
@@ -344,12 +334,8 @@ static int erofs_init_percpu_workers(void)
}
return 0;
}
-#else
-static inline void erofs_destroy_percpu_workers(void) {}
-static inline int erofs_init_percpu_workers(void) { return 0; }
-#endif
-#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD)
+#ifdef CONFIG_HOTPLUG_CPU
static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock);
static enum cpuhp_state erofs_cpuhp_state;
@@ -406,17 +392,56 @@ static void erofs_cpu_hotplug_destroy(void)
if (erofs_cpuhp_state)
cpuhp_remove_state_nocalls(erofs_cpuhp_state);
}
-#else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */
+#else /* !CONFIG_HOTPLUG_CPU */
static inline int erofs_cpu_hotplug_init(void) { return 0; }
static inline void erofs_cpu_hotplug_destroy(void) {}
-#endif
+#endif /* CONFIG_HOTPLUG_CPU */
+static int z_erofs_init_pcpu_workers(struct super_block *sb)
+{
+ int err;
-void z_erofs_exit_subsystem(void)
+ if (atomic_xchg(&erofs_percpu_workers_initialized, 1))
+ return 0;
+
+ err = erofs_init_percpu_workers();
+ if (err) {
+ erofs_err(sb, "per-cpu workers: failed to allocate.");
+ goto err_init_percpu_workers;
+ }
+
+ err = erofs_cpu_hotplug_init();
+ if (err < 0) {
+ erofs_err(sb, "per-cpu workers: failed CPU hotplug init.");
+ goto err_cpuhp_init;
+ }
+ erofs_info(sb, "initialized per-cpu workers successfully.");
+ return err;
+
+err_cpuhp_init:
+ erofs_destroy_percpu_workers();
+err_init_percpu_workers:
+ atomic_set(&erofs_percpu_workers_initialized, 0);
+ return err;
+}
+
+static void z_erofs_destroy_pcpu_workers(void)
{
+ if (!atomic_xchg(&erofs_percpu_workers_initialized, 0))
+ return;
erofs_cpu_hotplug_destroy();
erofs_destroy_percpu_workers();
+}
+#else /* !CONFIG_EROFS_FS_PCPU_KTHREAD */
+static inline int z_erofs_init_pcpu_workers(struct super_block *sb) { return 0; }
+static inline void z_erofs_destroy_pcpu_workers(void) {}
+#endif /* CONFIG_EROFS_FS_PCPU_KTHREAD */
+
+void z_erofs_exit_subsystem(void)
+{
+ z_erofs_destroy_pcpu_workers();
destroy_workqueue(z_erofs_workqueue);
z_erofs_destroy_pcluster_pool();
+ z_erofs_crypto_disable_all_engines();
z_erofs_exit_decompressor();
}
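
The per-cpu workers are now brought up lazily from z_erofs_init_super() rather than at module init, with atomic_xchg() on erofs_percpu_workers_initialized making both bring-up and tear-down idempotent across concurrent mounts. The once-only pattern in isolation (editorial sketch; the setup/teardown helpers are hypothetical stand-ins):

static atomic_t initialized = ATOMIC_INIT(0);

static int lazy_init(void)
{
	int err;

	if (atomic_xchg(&initialized, 1))
		return 0;	/* already (being) initialized */
	err = do_real_setup();	/* hypothetical setup step */
	if (err)
		atomic_set(&initialized, 0);
	return err;
}

static void lazy_exit(void)
{
	if (!atomic_xchg(&initialized, 0))
		return;		/* never initialized, or already torn down */
	do_real_teardown();	/* hypothetical teardown step */
}
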
@@ -438,19 +463,8 @@ int __init z_erofs_init_subsystem(void)
goto err_workqueue_init;
}
- err = erofs_init_percpu_workers();
- if (err)
- goto err_pcpu_worker;
-
- err = erofs_cpu_hotplug_init();
- if (err < 0)
- goto err_cpuhp_init;
return err;
-err_cpuhp_init:
- erofs_destroy_percpu_workers();
-err_pcpu_worker:
- destroy_workqueue(z_erofs_workqueue);
err_workqueue_init:
z_erofs_destroy_pcluster_pool();
err_pcluster_pool:
@@ -460,39 +474,32 @@ err_decompressor:
}
enum z_erofs_pclustermode {
+ /* It has previously been linked into another processing chain */
Z_EROFS_PCLUSTER_INFLIGHT,
/*
- * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it
- * could be dispatched into bypass queue later due to uptodated managed
- * pages. All related online pages cannot be reused for inplace I/O (or
- * bvpage) since it can be directly decoded without I/O submission.
+ * A weaker form of Z_EROFS_PCLUSTER_FOLLOWED; the difference is that it
+ * may be dispatched to the bypass queue later due to uptodated managed
+ * folios. All file-backed folios related to this pcluster cannot be
+ * reused for in-place I/O (or bvpage) since the pcluster may be decoded
+ * in a separate queue (and thus out of order).
*/
Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE,
/*
- * The pcluster was just linked to a decompression chain by us. It can
- * also be linked with the remaining pclusters, which means if the
- * processing page is the tail page of a pcluster, this pcluster can
- * safely use the whole page (since the previous pcluster is within the
- * same chain) for in-place I/O, as illustrated below:
- * ___________________________________________________
- * | tail (partial) page | head (partial) page |
- * | (of the current pcl) | (of the previous pcl) |
- * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____|
- *
- * [ (*) the page above can be used as inplace I/O. ]
+ * The pcluster has just been linked to our processing chain.
+ * File-backed folios (except for the head page) related to it can be
+ * used for in-place I/O (or bvpage).
*/
Z_EROFS_PCLUSTER_FOLLOWED,
};
-struct z_erofs_decompress_frontend {
+struct z_erofs_frontend {
struct inode *const inode;
struct erofs_map_blocks map;
struct z_erofs_bvec_iter biter;
struct page *pagepool;
struct page *candidate_bvpage;
- struct z_erofs_pcluster *pcl;
- z_erofs_next_pcluster_t owned_head;
+ struct z_erofs_pcluster *pcl, *head;
enum z_erofs_pclustermode mode;
erofs_off_t headoffset;
@@ -501,11 +508,11 @@ struct z_erofs_decompress_frontend {
unsigned int icur;
};
-#define DECOMPRESS_FRONTEND_INIT(__i) { \
- .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
- .mode = Z_EROFS_PCLUSTER_FOLLOWED }
+#define Z_EROFS_DEFINE_FRONTEND(fe, i, ho) struct z_erofs_frontend fe = { \
+ .inode = i, .head = Z_EROFS_PCLUSTER_TAIL, \
+ .mode = Z_EROFS_PCLUSTER_FOLLOWED, .headoffset = ho }
-static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
+static bool z_erofs_should_alloc_cache(struct z_erofs_frontend *fe)
{
unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy;
@@ -522,19 +529,18 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
return false;
}
-static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
+static void z_erofs_bind_cache(struct z_erofs_frontend *fe)
{
struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
struct z_erofs_pcluster *pcl = fe->pcl;
unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
bool shouldalloc = z_erofs_should_alloc_cache(fe);
- bool standalone = true;
- /*
- * optimistic allocation without direct reclaim since inplace I/O
- * can be used if low memory otherwise.
- */
+ pgoff_t poff = pcl->pos >> PAGE_SHIFT;
+ bool may_bypass = true;
+ /* Optimistic allocation, as in-place I/O can be used as a fallback */
gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
+ struct folio *folio, *newfolio;
unsigned int i;
if (i_blocksize(fe->inode) != PAGE_SIZE ||
@@ -542,61 +548,54 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
return;
for (i = 0; i < pclusterpages; ++i) {
- struct page *page, *newpage;
-
/* Inaccurate check w/o locking to avoid unneeded lookups */
if (READ_ONCE(pcl->compressed_bvecs[i].page))
continue;
- page = find_get_page(mc, pcl->obj.index + i);
- if (!page) {
- /* I/O is needed, no possible to decompress directly */
- standalone = false;
+ folio = filemap_get_folio(mc, poff + i);
+ if (IS_ERR(folio)) {
+ may_bypass = false;
if (!shouldalloc)
continue;
/*
- * Try cached I/O if allocation succeeds or fallback to
- * in-place I/O instead to avoid any direct reclaim.
+			 * Allocate a managed folio for cached I/O, or it may
+			 * then be filled with a file-backed folio for in-place I/O
*/
- newpage = erofs_allocpage(&fe->pagepool, gfp);
- if (!newpage)
+ newfolio = filemap_alloc_folio(gfp, 0);
+ if (!newfolio)
continue;
- set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
+ newfolio->private = Z_EROFS_PREALLOCATED_FOLIO;
+ folio = NULL;
}
- spin_lock(&pcl->obj.lockref.lock);
+ spin_lock(&pcl->lockref.lock);
if (!pcl->compressed_bvecs[i].page) {
- pcl->compressed_bvecs[i].page = page ? page : newpage;
- spin_unlock(&pcl->obj.lockref.lock);
+ pcl->compressed_bvecs[i].page =
+ folio_page(folio ?: newfolio, 0);
+ spin_unlock(&pcl->lockref.lock);
continue;
}
- spin_unlock(&pcl->obj.lockref.lock);
-
- if (page)
- put_page(page);
- else if (newpage)
- erofs_pagepool_add(&fe->pagepool, newpage);
+ spin_unlock(&pcl->lockref.lock);
+ folio_put(folio ?: newfolio);
}
/*
- * don't do inplace I/O if all compressed pages are available in
- * managed cache since it can be moved to the bypass queue instead.
+ * Don't perform in-place I/O if all compressed pages are available in
+ * the managed cache, as the pcluster can be moved to the bypass queue.
*/
- if (standalone)
+ if (may_bypass)
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
}
/* (erofs_shrinker) disconnect cached encoded data with pclusters */
-int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
- struct erofs_workgroup *grp)
+static int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
+ struct z_erofs_pcluster *pcl)
{
- struct z_erofs_pcluster *const pcl =
- container_of(grp, struct z_erofs_pcluster, obj);
unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
struct folio *folio;
int i;
- DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
+ DBG_BUGON(pcl->from_meta);
/* Each cached folio contains one page unless bs > ps is supported */
for (i = 0; i < pclusterpages; ++i) {
if (pcl->compressed_bvecs[i].page) {
@@ -626,9 +625,9 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
return true;
ret = false;
- spin_lock(&pcl->obj.lockref.lock);
- if (pcl->obj.lockref.count <= 0) {
- DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
+ spin_lock(&pcl->lockref.lock);
+ if (pcl->lockref.count <= 0) {
+ DBG_BUGON(pcl->from_meta);
for (; bvec < end; ++bvec) {
if (bvec->page && page_folio(bvec->page) == folio) {
bvec->page = NULL;
@@ -638,7 +637,7 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
}
}
}
- spin_unlock(&pcl->obj.lockref.lock);
+ spin_unlock(&pcl->lockref.lock);
return ret;
}
@@ -665,39 +664,49 @@ static const struct address_space_operations z_erofs_cache_aops = {
.invalidate_folio = z_erofs_cache_invalidate_folio,
};
-int erofs_init_managed_cache(struct super_block *sb)
+int z_erofs_init_super(struct super_block *sb)
{
- struct inode *const inode = new_inode(sb);
+ struct inode *inode;
+ int err;
+ err = z_erofs_init_pcpu_workers(sb);
+ if (err)
+ return err;
+
+ inode = new_inode(sb);
if (!inode)
return -ENOMEM;
-
set_nlink(inode, 1);
inode->i_size = OFFSET_MAX;
inode->i_mapping->a_ops = &z_erofs_cache_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
EROFS_SB(sb)->managed_cache = inode;
+ xa_init(&EROFS_SB(sb)->managed_pslots);
return 0;
}
/* callers must be with pcluster lock held */
-static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
+static int z_erofs_attach_page(struct z_erofs_frontend *fe,
struct z_erofs_bvec *bvec, bool exclusive)
{
struct z_erofs_pcluster *pcl = fe->pcl;
int ret;
if (exclusive) {
- /* give priority for inplaceio to use file pages first */
- spin_lock(&pcl->obj.lockref.lock);
- while (fe->icur > 0) {
- if (pcl->compressed_bvecs[--fe->icur].page)
- continue;
- pcl->compressed_bvecs[fe->icur] = *bvec;
- spin_unlock(&pcl->obj.lockref.lock);
- return 0;
+ /* Inplace I/O is limited to one page for uncompressed data */
+ if (pcl->algorithmformat < Z_EROFS_COMPRESSION_MAX ||
+ fe->icur <= 1) {
+ /* Try to prioritize inplace I/O here */
+ spin_lock(&pcl->lockref.lock);
+ while (fe->icur > 0) {
+ if (pcl->compressed_bvecs[--fe->icur].page)
+ continue;
+ pcl->compressed_bvecs[fe->icur] = *bvec;
+ spin_unlock(&pcl->lockref.lock);
+ return 0;
+ }
+ spin_unlock(&pcl->lockref.lock);
}
- spin_unlock(&pcl->obj.lockref.lock);
/* otherwise, check if it can be used as a bvpage */
if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
@@ -710,35 +719,47 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
return ret;
}
-static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
+static bool z_erofs_get_pcluster(struct z_erofs_pcluster *pcl)
+{
+ if (lockref_get_not_zero(&pcl->lockref))
+ return true;
+
+ spin_lock(&pcl->lockref.lock);
+ if (__lockref_is_dead(&pcl->lockref)) {
+ spin_unlock(&pcl->lockref.lock);
+ return false;
+ }
+
+ if (!pcl->lockref.count++)
+ atomic_long_dec(&erofs_global_shrink_cnt);
+ spin_unlock(&pcl->lockref.lock);
+ return true;
+}
+
+static int z_erofs_register_pcluster(struct z_erofs_frontend *fe)
{
struct erofs_map_blocks *map = &fe->map;
struct super_block *sb = fe->inode->i_sb;
- bool ztailpacking = map->m_flags & EROFS_MAP_META;
- struct z_erofs_pcluster *pcl;
- struct erofs_workgroup *grp;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct z_erofs_pcluster *pcl, *pre;
+ unsigned int pageofs_in;
int err;
- if (!(map->m_flags & EROFS_MAP_ENCODED) ||
- (!ztailpacking && !erofs_blknr(sb, map->m_pa))) {
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
-
- /* no available pcluster, let's allocate one */
- pcl = z_erofs_alloc_pcluster(map->m_plen);
+ pageofs_in = erofs_blkoff(sb, map->m_pa);
+ pcl = z_erofs_alloc_pcluster(pageofs_in + map->m_plen);
if (IS_ERR(pcl))
return PTR_ERR(pcl);
- spin_lock_init(&pcl->obj.lockref.lock);
- pcl->obj.lockref.count = 1; /* one ref for this request */
+ lockref_init(&pcl->lockref); /* one ref for this request */
pcl->algorithmformat = map->m_algorithmformat;
+ pcl->pclustersize = map->m_plen;
pcl->length = 0;
pcl->partial = true;
-
- /* new pclusters should be claimed as type 1, primary and followed */
- pcl->next = fe->owned_head;
+ pcl->next = fe->head;
+ pcl->pos = map->m_pa;
+ pcl->pageofs_in = pageofs_in;
pcl->pageofs_out = map->m_la & ~PAGE_MASK;
+ pcl->from_meta = map->m_flags & EROFS_MAP_META;
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;
/*
@@ -748,26 +769,29 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
mutex_init(&pcl->lock);
DBG_BUGON(!mutex_trylock(&pcl->lock));
- if (ztailpacking) {
- pcl->obj.index = 0; /* which indicates ztailpacking */
- } else {
- pcl->obj.index = erofs_blknr(sb, map->m_pa);
-
- grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj);
- if (IS_ERR(grp)) {
- err = PTR_ERR(grp);
- goto err_out;
+ if (!pcl->from_meta) {
+ while (1) {
+ xa_lock(&sbi->managed_pslots);
+ pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->pos,
+ NULL, pcl, GFP_KERNEL);
+ if (!pre || xa_is_err(pre) || z_erofs_get_pcluster(pre)) {
+ xa_unlock(&sbi->managed_pslots);
+ break;
+ }
+ /* try to legitimize the current in-tree one */
+ xa_unlock(&sbi->managed_pslots);
+ cond_resched();
}
-
- if (grp != &pcl->obj) {
- fe->pcl = container_of(grp,
- struct z_erofs_pcluster, obj);
+ if (xa_is_err(pre)) {
+ err = xa_err(pre);
+ goto err_out;
+ } else if (pre) {
+ fe->pcl = pre;
err = -EEXIST;
goto err_out;
}
}
- fe->owned_head = &pcl->next;
- fe->pcl = pcl;
+ fe->head = fe->pcl = pcl;
return 0;
err_out:
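
With struct erofs_workgroup gone, registration races are resolved directly on the XArray: __xa_cmpxchg() either installs the new pcluster at pcl->pos or returns the incumbent, and z_erofs_get_pcluster() only succeeds on a live incumbent; a dying one forces a retry until its dead entry is erased. A condensed editorial sketch of that loop:

	for (;;) {
		xa_lock(&sbi->managed_pslots);
		pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->pos, NULL, pcl,
				   GFP_KERNEL);
		if (!pre || xa_is_err(pre) || z_erofs_get_pcluster(pre))
			break;	/* installed, failed, or reusing a live one */
		xa_unlock(&sbi->managed_pslots);
		cond_resched();	/* incumbent is dying; wait for its erasure */
	}
	xa_unlock(&sbi->managed_pslots);
	/* xa_is_err(pre): propagate; pre != NULL: reuse; else: registered */
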
@@ -776,27 +800,35 @@ err_out:
return err;
}
-static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
+static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe)
{
struct erofs_map_blocks *map = &fe->map;
struct super_block *sb = fe->inode->i_sb;
- erofs_blk_t blknr = erofs_blknr(sb, map->m_pa);
- struct erofs_workgroup *grp = NULL;
+ struct z_erofs_pcluster *pcl = NULL;
int ret;
DBG_BUGON(fe->pcl);
/* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
- DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
+ DBG_BUGON(!fe->head);
if (!(map->m_flags & EROFS_MAP_META)) {
- grp = erofs_find_workgroup(sb, blknr);
+ while (1) {
+ rcu_read_lock();
+ pcl = xa_load(&EROFS_SB(sb)->managed_pslots, map->m_pa);
+ if (!pcl || z_erofs_get_pcluster(pcl)) {
+ DBG_BUGON(pcl && map->m_pa != pcl->pos);
+ rcu_read_unlock();
+ break;
+ }
+ rcu_read_unlock();
+ }
} else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
- if (grp) {
- fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
+ if (pcl) {
+ fe->pcl = pcl;
ret = -EEXIST;
} else {
ret = z_erofs_register_pcluster(fe);
@@ -805,10 +837,9 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
if (ret == -EEXIST) {
mutex_lock(&fe->pcl->lock);
/* check if this pcluster hasn't been linked into any chain. */
- if (cmpxchg(&fe->pcl->next, Z_EROFS_PCLUSTER_NIL,
- fe->owned_head) == Z_EROFS_PCLUSTER_NIL) {
+ if (!cmpxchg(&fe->pcl->next, NULL, fe->head)) {
/* .. so it can be attached to our submission chain */
- fe->owned_head = &fe->pcl->next;
+ fe->head = fe->pcl;
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;
} else { /* otherwise, it belongs to an inflight chain */
fe->mode = Z_EROFS_PCLUSTER_INFLIGHT;
@@ -819,13 +850,13 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset,
Z_EROFS_INLINE_BVECS, fe->pcl->vcnt);
- if (!z_erofs_is_inline_pcluster(fe->pcl)) {
+ if (!fe->pcl->from_meta) {
/* bind cache first when cached decompression is preferred */
z_erofs_bind_cache(fe);
} else {
void *mptr;
- mptr = erofs_read_metabuf(&map->buf, sb, map->m_pa, EROFS_NO_KMAP);
+ mptr = erofs_read_metabuf(&map->buf, sb, map->m_pa, false);
if (IS_ERR(mptr)) {
ret = PTR_ERR(mptr);
erofs_err(sb, "failed to get inline data %d", ret);
@@ -841,25 +872,93 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
return 0;
}
-/*
- * keep in mind that no referenced pclusters will be freed
- * only after a RCU grace period.
- */
static void z_erofs_rcu_callback(struct rcu_head *head)
{
- z_erofs_free_pcluster(container_of(head,
- struct z_erofs_pcluster, rcu));
+ z_erofs_free_pcluster(container_of(head, struct z_erofs_pcluster, rcu));
}
-void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
+static bool __erofs_try_to_release_pcluster(struct erofs_sb_info *sbi,
+ struct z_erofs_pcluster *pcl)
{
- struct z_erofs_pcluster *const pcl =
- container_of(grp, struct z_erofs_pcluster, obj);
+ if (pcl->lockref.count)
+ return false;
+
+ /*
+ * Note that all cached folios should be detached before deleted from
+ * the XArray. Otherwise some folios could be still attached to the
+ * orphan old pcluster when the new one is available in the tree.
+ */
+ if (erofs_try_to_free_all_cached_folios(sbi, pcl))
+ return false;
- call_rcu(&pcl->rcu, z_erofs_rcu_callback);
+ /*
+	 * It's impossible to fail after the pcluster is frozen, but in order
+ * to avoid some race conditions, add a DBG_BUGON to observe this.
+ */
+ DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->pos) != pcl);
+
+ lockref_mark_dead(&pcl->lockref);
+ return true;
}
-static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
+static bool erofs_try_to_release_pcluster(struct erofs_sb_info *sbi,
+ struct z_erofs_pcluster *pcl)
+{
+ bool free;
+
+ spin_lock(&pcl->lockref.lock);
+ free = __erofs_try_to_release_pcluster(sbi, pcl);
+ spin_unlock(&pcl->lockref.lock);
+ if (free) {
+ atomic_long_dec(&erofs_global_shrink_cnt);
+ call_rcu(&pcl->rcu, z_erofs_rcu_callback);
+ }
+ return free;
+}
+
+unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, unsigned long nr)
+{
+ struct z_erofs_pcluster *pcl;
+ unsigned long index, freed = 0;
+
+ xa_lock(&sbi->managed_pslots);
+ xa_for_each(&sbi->managed_pslots, index, pcl) {
+ /* try to shrink each valid pcluster */
+ if (!erofs_try_to_release_pcluster(sbi, pcl))
+ continue;
+ xa_unlock(&sbi->managed_pslots);
+
+ ++freed;
+ if (!--nr)
+ return freed;
+ xa_lock(&sbi->managed_pslots);
+ }
+ xa_unlock(&sbi->managed_pslots);
+ return freed;
+}
+
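z_erofs_shrink_scan() above iterates the XArray under xa_lock but drops the lock around each successful release, since detaching and freeing cached folios must not run with the index lock held; xa_for_each() copes with the temporary unlock because it restarts from the saved index. A rough userspace model of this drop-and-relock scan, with a pthread mutex and a plain slot array standing in for xa_lock and the XArray (all names illustrative):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NSLOTS 4

static pthread_mutex_t index_lock = PTHREAD_MUTEX_INITIALIZER;
static int *slots[NSLOTS];

static unsigned long shrink_scan(unsigned long nr)
{
    unsigned long i, freed = 0;
    int *victim;

    pthread_mutex_lock(&index_lock);
    for (i = 0; i < NSLOTS; i++) {
        victim = slots[i];
        if (!victim)            /* absent (or, in the kernel, busy) */
            continue;
        slots[i] = NULL;        /* erase while the index is locked */
        pthread_mutex_unlock(&index_lock);

        free(victim);           /* the work that must run unlocked */
        ++freed;
        if (!--nr)
            return freed;
        pthread_mutex_lock(&index_lock);
    }
    pthread_mutex_unlock(&index_lock);
    return freed;
}

int main(void)
{
    int i;

    for (i = 0; i < NSLOTS; i += 2)
        slots[i] = malloc(sizeof(int));
    printf("freed %lu\n", shrink_scan(~0UL));   /* prints: freed 2 */
    return 0;
}

(Build with -pthread; the mutex is only there to mirror the locking shape.)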
+static void z_erofs_put_pcluster(struct erofs_sb_info *sbi,
+ struct z_erofs_pcluster *pcl, bool try_free)
+{
+ bool free = false;
+
+ if (lockref_put_or_lock(&pcl->lockref))
+ return;
+
+ DBG_BUGON(__lockref_is_dead(&pcl->lockref));
+ if (!--pcl->lockref.count) {
+ if (try_free && xa_trylock(&sbi->managed_pslots)) {
+ free = __erofs_try_to_release_pcluster(sbi, pcl);
+ xa_unlock(&sbi->managed_pslots);
+ }
+ atomic_long_add(!free, &erofs_global_shrink_cnt);
+ }
+ spin_unlock(&pcl->lockref.lock);
+ if (free)
+ call_rcu(&pcl->rcu, z_erofs_rcu_callback);
+}
+
+static void z_erofs_pcluster_end(struct z_erofs_frontend *fe)
{
struct z_erofs_pcluster *pcl = fe->pcl;
@@ -872,13 +971,9 @@ static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
if (fe->candidate_bvpage)
fe->candidate_bvpage = NULL;
- /*
- * if all pending pages are added, don't hold its reference
- * any longer if the pcluster isn't hosted by ourselves.
- */
+ /* Drop refcount if it doesn't belong to our processing chain */
if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE)
- erofs_workgroup_put(&pcl->obj);
-
+ z_erofs_put_pcluster(EROFS_I_SB(fe->inode), pcl, false);
fe->pcl = NULL;
}
@@ -896,7 +991,7 @@ static int z_erofs_read_fragment(struct super_block *sb, struct folio *folio,
buf.mapping = packed_inode->i_mapping;
for (; cur < end; cur += cnt, pos += cnt) {
cnt = min(end - cur, sb->s_blocksize - erofs_blkoff(sb, pos));
- src = erofs_bread(&buf, pos, EROFS_KMAP);
+ src = erofs_bread(&buf, pos, true);
if (IS_ERR(src)) {
erofs_put_metabuf(&buf);
return PTR_ERR(src);
@@ -907,7 +1002,7 @@ static int z_erofs_read_fragment(struct super_block *sb, struct folio *folio,
return 0;
}
-static int z_erofs_scan_folio(struct z_erofs_decompress_frontend *f,
+static int z_erofs_scan_folio(struct z_erofs_frontend *f,
struct folio *folio, bool ra)
{
struct inode *const inode = f->inode;
@@ -979,8 +1074,6 @@ static int z_erofs_scan_folio(struct z_erofs_decompress_frontend *f,
break;
erofs_onlinefolio_split(folio);
- if (f->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
- f->pcl->multibases = true;
if (f->pcl->length < offset + end - map->m_la) {
f->pcl->length = offset + end - map->m_la;
f->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
@@ -1022,11 +1115,10 @@ static bool z_erofs_page_is_invalidated(struct page *page)
return !page_folio(page)->mapping && !z_erofs_is_shortlived_page(page);
}
-struct z_erofs_decompress_backend {
+struct z_erofs_backend {
struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES];
struct super_block *sb;
struct z_erofs_pcluster *pcl;
-
/* pages with the longest decompressed length for deduplication */
struct page **decompressed_pages;
/* pages to keep the compressed data */
@@ -1035,6 +1127,8 @@ struct z_erofs_decompress_backend {
struct list_head decompressed_secondary_bvecs;
struct page **pagepool;
unsigned int onstack_used, nr_pages;
+ /* indicate if temporary copies should be preserved for later use */
+ bool keepxcpy;
};
struct z_erofs_bvec_item {
@@ -1042,21 +1136,23 @@ struct z_erofs_bvec_item {
struct list_head list;
};
-static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be,
+static void z_erofs_do_decompressed_bvec(struct z_erofs_backend *be,
struct z_erofs_bvec *bvec)
{
+ int poff = bvec->offset + be->pcl->pageofs_out;
struct z_erofs_bvec_item *item;
- unsigned int pgnr;
-
- if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK) &&
- (bvec->end == PAGE_SIZE ||
- bvec->offset + bvec->end == be->pcl->length)) {
- pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT;
- DBG_BUGON(pgnr >= be->nr_pages);
- if (!be->decompressed_pages[pgnr]) {
- be->decompressed_pages[pgnr] = bvec->page;
+ struct page **page;
+
+ if (!(poff & ~PAGE_MASK) && (bvec->end == PAGE_SIZE ||
+ bvec->offset + bvec->end == be->pcl->length)) {
+ DBG_BUGON((poff >> PAGE_SHIFT) >= be->nr_pages);
+ page = be->decompressed_pages + (poff >> PAGE_SHIFT);
+ if (!*page) {
+ *page = bvec->page;
return;
}
+ } else {
+ be->keepxcpy = true;
}
/* (cold path) one pcluster is requested multiple times */
@@ -1065,8 +1161,7 @@ static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be,
list_add(&item->list, &be->decompressed_secondary_bvecs);
}
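The fast path above hands a decompressed piece its own slot in decompressed_pages[] only when it starts page-aligned and either fills a whole page or reaches the end of the pcluster; anything else sets keepxcpy so the temporary copy survives for the later fill. A standalone check of that predicate (PAGE_SIZE and the parameter names here are illustrative):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096u
#define PAGE_MASK (~(PAGE_SIZE - 1))

static bool owns_page_slot(unsigned int offset, unsigned int end,
                           unsigned int pageofs_out, unsigned int length)
{
    unsigned int poff = offset + pageofs_out;

    return !(poff & ~PAGE_MASK) &&
           (end == PAGE_SIZE || offset + end == length);
}

int main(void)
{
    /* aligned, full page: may own the slot */
    printf("%d\n", owns_page_slot(4096, 4096, 0, 16384));  /* 1 */
    /* aligned but partial, mid-pcluster: keep a temporary copy */
    printf("%d\n", owns_page_slot(4096, 2048, 0, 16384));  /* 0 */
    /* partial tail that reaches the pcluster end: may own the slot */
    printf("%d\n", owns_page_slot(12288, 1024, 0, 13312)); /* 1 */
    return 0;
}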
-static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
- int err)
+static void z_erofs_fill_other_copies(struct z_erofs_backend *be, int err)
{
unsigned int off0 = be->pcl->pageofs_out;
struct list_head *p, *n;
@@ -1107,7 +1202,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
}
}
-static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be)
+static void z_erofs_parse_out_bvecs(struct z_erofs_backend *be)
{
struct z_erofs_pcluster *pcl = be->pcl;
struct z_erofs_bvec_iter biter;
@@ -1132,8 +1227,7 @@ static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be)
z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
}
-static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be,
- bool *overlapped)
+static int z_erofs_parse_in_bvecs(struct z_erofs_backend *be, bool *overlapped)
{
struct z_erofs_pcluster *pcl = be->pcl;
unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
@@ -1152,7 +1246,7 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be,
}
be->compressed_pages[i] = page;
- if (z_erofs_is_inline_pcluster(pcl) ||
+ if (pcl->from_meta ||
erofs_folio_is_managed(EROFS_SB(be->sb), page_folio(page))) {
if (!PageUptodate(page))
err = -EIO;
@@ -1168,8 +1262,7 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be,
return err;
}
-static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
- int err)
+static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)
{
struct erofs_sb_info *const sbi = EROFS_SB(be->sb);
struct z_erofs_pcluster *pcl = be->pcl;
@@ -1179,6 +1272,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
int i, j, jtop, err2;
struct page *page;
bool overlapped;
+ bool try_free = true;
mutex_lock(&pcl->lock);
be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT;
@@ -1215,6 +1309,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
.sb = be->sb,
.in = be->compressed_pages,
.out = be->decompressed_pages,
+ .inpages = pclusterpages,
+ .outpages = be->nr_pages,
.pageofs_in = pcl->pageofs_in,
.pageofs_out = pcl->pageofs_out,
.inputsize = pcl->pclustersize,
@@ -1222,13 +1318,13 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
.alg = pcl->algorithmformat,
.inplace_io = overlapped,
.partial_decoding = pcl->partial,
- .fillgaps = pcl->multibases,
+ .fillgaps = be->keepxcpy,
.gfp = pcl->besteffort ? GFP_KERNEL :
GFP_NOWAIT | __GFP_NORETRY
}, be->pagepool);
/* must handle all compressed pages before actual file pages */
- if (z_erofs_is_inline_pcluster(pcl)) {
+ if (pcl->from_meta) {
page = pcl->compressed_bvecs[0].page;
WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL);
put_page(page);
@@ -1236,9 +1332,12 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
/* managed folios are still left in compressed_bvecs[] */
for (i = 0; i < pclusterpages; ++i) {
page = be->compressed_pages[i];
- if (!page ||
- erofs_folio_is_managed(sbi, page_folio(page)))
+ if (!page)
+ continue;
+ if (erofs_folio_is_managed(sbi, page_folio(page))) {
+ try_free = false;
continue;
+ }
(void)z_erofs_put_shortlivedpage(be->pagepool, page);
WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
}
@@ -1276,40 +1375,38 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
pcl->length = 0;
pcl->partial = true;
- pcl->multibases = false;
pcl->besteffort = false;
pcl->bvset.nextpage = NULL;
pcl->vcnt = 0;
/* pcluster lock MUST be taken before the following line */
- WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
+ WRITE_ONCE(pcl->next, NULL);
mutex_unlock(&pcl->lock);
+
+ if (pcl->from_meta)
+ z_erofs_free_pcluster(pcl);
+ else
+ z_erofs_put_pcluster(sbi, pcl, try_free);
return err;
}
static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
struct page **pagepool)
{
- struct z_erofs_decompress_backend be = {
+ struct z_erofs_backend be = {
.sb = io->sb,
.pagepool = pagepool,
.decompressed_secondary_bvecs =
LIST_HEAD_INIT(be.decompressed_secondary_bvecs),
+ .pcl = io->head,
};
- z_erofs_next_pcluster_t owned = io->head;
+ struct z_erofs_pcluster *next;
int err = io->eio ? -EIO : 0;
- while (owned != Z_EROFS_PCLUSTER_TAIL) {
- DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);
-
- be.pcl = container_of(owned, struct z_erofs_pcluster, next);
- owned = READ_ONCE(be.pcl->next);
-
+ for (; be.pcl != Z_EROFS_PCLUSTER_TAIL; be.pcl = next) {
+ DBG_BUGON(!be.pcl);
+ next = READ_ONCE(be.pcl->next);
err = z_erofs_decompress_pcluster(&be, err) ?: err;
- if (z_erofs_is_inline_pcluster(be.pcl))
- z_erofs_free_pcluster(be.pcl);
- else
- erofs_workgroup_put(&be.pcl->obj);
}
return err;
}
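The rewritten queue walk above follows pcl->next directly and stops at the Z_EROFS_PCLUSTER_TAIL sentinel; note that next is loaded before the pcluster is processed, because z_erofs_decompress_pcluster() may drop the last reference or free the object outright. A standalone sketch of that pattern, with an illustrative node type and sentinel value:

#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; int payload; };
#define TAIL ((struct node *)0x1)   /* sentinel, never dereferenced */

static void process_and_free(struct node *n)
{
    printf("payload %d\n", n->payload);
    free(n);                        /* n is invalid after this point */
}

int main(void)
{
    struct node *head = TAIL, *n, *next;
    int i;

    for (i = 0; i < 3; i++) {       /* build a three-node chain */
        n = malloc(sizeof(*n));
        n->payload = i;
        n->next = head;
        head = n;
    }
    for (n = head; n != TAIL; n = next) {
        next = n->next;             /* load before the node may vanish */
        process_and_free(n);
    }
    return 0;
}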
@@ -1374,7 +1471,7 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
}
static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
- struct z_erofs_decompress_frontend *f,
+ struct z_erofs_frontend *f,
struct z_erofs_pcluster *pcl,
unsigned int nr,
struct address_space *mc)
@@ -1391,9 +1488,9 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
bvec->bv_offset = 0;
bvec->bv_len = PAGE_SIZE;
repeat:
- spin_lock(&pcl->obj.lockref.lock);
+ spin_lock(&pcl->lockref.lock);
zbv = pcl->compressed_bvecs[nr];
- spin_unlock(&pcl->obj.lockref.lock);
+ spin_unlock(&pcl->lockref.lock);
if (!zbv.page)
goto out_allocfolio;
@@ -1401,12 +1498,8 @@ repeat:
DBG_BUGON(z_erofs_is_shortlived_page(bvec->bv_page));
folio = page_folio(zbv.page);
- /*
- * Handle preallocated cached folios. We tried to allocate such folios
- * without triggering direct reclaim. If allocation failed, inplace
- * file-backed folios will be used instead.
- */
- if (folio->private == (void *)Z_EROFS_PREALLOCATED_PAGE) {
+ /* For preallocated managed folios, add them to page cache here */
+ if (folio->private == Z_EROFS_PREALLOCATED_FOLIO) {
tocache = true;
goto out_tocache;
}
@@ -1455,23 +1548,23 @@ repeat:
folio_put(folio);
out_allocfolio:
page = __erofs_allocpage(&f->pagepool, gfp, true);
- spin_lock(&pcl->obj.lockref.lock);
+ spin_lock(&pcl->lockref.lock);
if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) {
if (page)
erofs_pagepool_add(&f->pagepool, page);
- spin_unlock(&pcl->obj.lockref.lock);
+ spin_unlock(&pcl->lockref.lock);
cond_resched();
goto repeat;
}
pcl->compressed_bvecs[nr].page = page ? page : ERR_PTR(-ENOMEM);
- spin_unlock(&pcl->obj.lockref.lock);
+ spin_unlock(&pcl->lockref.lock);
bvec->bv_page = page;
if (!page)
return;
folio = page_folio(page);
out_tocache:
if (!tocache || bs != PAGE_SIZE ||
- filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp)) {
+ filemap_add_folio(mc, folio, (pcl->pos >> PAGE_SHIFT) + nr, gfp)) {
/* turn into a temporary shortlived folio (1 ref) */
folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
return;
@@ -1518,18 +1611,13 @@ enum {
NR_JOBQUEUES,
};
-static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
- z_erofs_next_pcluster_t qtail[],
- z_erofs_next_pcluster_t owned_head)
+static void z_erofs_move_to_bypass_queue(struct z_erofs_pcluster *pcl,
+ struct z_erofs_pcluster *next,
+ struct z_erofs_pcluster **qtail[])
{
- z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
- z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];
-
WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL);
-
- WRITE_ONCE(*submit_qtail, owned_head);
- WRITE_ONCE(*bypass_qtail, &pcl->next);
-
+ WRITE_ONCE(*qtail[JQ_SUBMIT], next);
+ WRITE_ONCE(*qtail[JQ_BYPASS], pcl);
qtail[JQ_BYPASS] = &pcl->next;
}
@@ -1558,15 +1646,15 @@ static void z_erofs_endio(struct bio *bio)
bio_put(bio);
}
-static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
+static void z_erofs_submit_queue(struct z_erofs_frontend *f,
struct z_erofs_decompressqueue *fgq,
bool *force_fg, bool readahead)
{
struct super_block *sb = f->inode->i_sb;
struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb));
- z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
+ struct z_erofs_pcluster **qtail[NR_JOBQUEUES];
struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
- z_erofs_next_pcluster_t owned_head = f->owned_head;
+ struct z_erofs_pcluster *pcl, *next;
/* bio is NULL initially, so no need to initialize last_{index,bdev} */
erofs_off_t last_pa;
unsigned int nr_bios = 0;
@@ -1582,33 +1670,31 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;
/* by default, all need io submission */
- q[JQ_SUBMIT]->head = owned_head;
+ q[JQ_SUBMIT]->head = next = f->head;
do {
struct erofs_map_dev mdev;
- struct z_erofs_pcluster *pcl;
erofs_off_t cur, end;
struct bio_vec bvec;
unsigned int i = 0;
bool bypass = true;
- DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);
- pcl = container_of(owned_head, struct z_erofs_pcluster, next);
- owned_head = READ_ONCE(pcl->next);
-
- if (z_erofs_is_inline_pcluster(pcl)) {
- move_to_bypass_jobqueue(pcl, qtail, owned_head);
+ pcl = next;
+ next = READ_ONCE(pcl->next);
+ if (pcl->from_meta) {
+ z_erofs_move_to_bypass_queue(pcl, next, qtail);
continue;
}
/* no device id here, thus it will always succeed */
mdev = (struct erofs_map_dev) {
- .m_pa = erofs_pos(sb, pcl->obj.index),
+ .m_pa = round_down(pcl->pos, sb->s_blocksize),
};
(void)erofs_map_dev(sb, &mdev);
cur = mdev.m_pa;
- end = cur + pcl->pclustersize;
+ end = round_up(cur + pcl->pageofs_in + pcl->pclustersize,
+ sb->s_blocksize);
do {
bvec.bv_page = NULL;
if (bio && (cur != last_pa ||
@@ -1652,7 +1738,8 @@ drain_io:
bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
REQ_OP_READ, GFP_NOIO);
bio->bi_end_io = z_erofs_endio;
- bio->bi_iter.bi_sector = cur >> 9;
+ bio->bi_iter.bi_sector =
+ (mdev.m_dif->fsoff + cur) >> 9;
bio->bi_private = q[JQ_SUBMIT];
if (readahead)
bio->bi_opf |= REQ_RAHEAD;
@@ -1669,8 +1756,8 @@ drain_io:
if (!bypass)
qtail[JQ_SUBMIT] = &pcl->next;
else
- move_to_bypass_jobqueue(pcl, qtail, owned_head);
- } while (owned_head != Z_EROFS_PCLUSTER_TAIL);
+ z_erofs_move_to_bypass_queue(pcl, next, qtail);
+ } while (next != Z_EROFS_PCLUSTER_TAIL);
if (bio) {
if (erofs_is_fileio_mode(EROFS_SB(sb)))
@@ -1679,9 +1766,9 @@ drain_io:
erofs_fscache_submit_bio(bio);
else
submit_bio(bio);
- if (memstall)
- psi_memstall_leave(&pflags);
}
+ if (memstall)
+ psi_memstall_leave(&pflags);
/*
* although background is preferred, no one is pending for submission.
@@ -1694,17 +1781,16 @@ drain_io:
z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios);
}
-static int z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
- unsigned int ra_folios)
+static int z_erofs_runqueue(struct z_erofs_frontend *f, unsigned int rapages)
{
struct z_erofs_decompressqueue io[NR_JOBQUEUES];
struct erofs_sb_info *sbi = EROFS_I_SB(f->inode);
- bool force_fg = z_erofs_is_sync_decompress(sbi, ra_folios);
+ bool force_fg = z_erofs_is_sync_decompress(sbi, rapages);
int err;
- if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
+ if (f->head == Z_EROFS_PCLUSTER_TAIL)
return 0;
- z_erofs_submit_queue(f, io, &force_fg, !!ra_folios);
+ z_erofs_submit_queue(f, io, &force_fg, !!rapages);
/* handle bypass queue (no i/o pclusters) immediately */
err = z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool);
@@ -1722,7 +1808,7 @@ static int z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
 * Since partial uptodate is still unimplemented, we have to use
* approximate readmore strategies as a start.
*/
-static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
+static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f,
struct readahead_control *rac, bool backmost)
{
struct inode *inode = f->inode;
@@ -1777,12 +1863,10 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
static int z_erofs_read_folio(struct file *file, struct folio *folio)
{
struct inode *const inode = folio->mapping->host;
- struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
+ Z_EROFS_DEFINE_FRONTEND(f, inode, folio_pos(folio));
int err;
trace_erofs_read_folio(folio, false);
- f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
-
z_erofs_pcluster_readmore(&f, NULL, true);
err = z_erofs_scan_folio(&f, folio, false);
z_erofs_pcluster_readmore(&f, NULL, false);
@@ -1802,17 +1886,13 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
static void z_erofs_readahead(struct readahead_control *rac)
{
struct inode *const inode = rac->mapping->host;
- struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
+ Z_EROFS_DEFINE_FRONTEND(f, inode, readahead_pos(rac));
+ unsigned int nrpages = readahead_count(rac);
struct folio *head = NULL, *folio;
- unsigned int nr_folios;
int err;
- f.headoffset = readahead_pos(rac);
-
+ trace_erofs_readahead(inode, readahead_index(rac), nrpages, false);
z_erofs_pcluster_readmore(&f, rac, true);
- nr_folios = readahead_count(rac);
- trace_erofs_readpages(inode, readahead_index(rac), nr_folios, false);
-
while ((folio = readahead_folio(rac))) {
folio->private = head;
head = folio;
@@ -1831,7 +1911,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
z_erofs_pcluster_readmore(&f, rac, false);
z_erofs_pcluster_end(&f);
- (void)z_erofs_runqueue(&f, nr_folios);
+ (void)z_erofs_runqueue(&f, nrpages);
erofs_put_metabuf(&f.map.buf);
erofs_release_pages(&f.pagepool);
}
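For reference, the readahead loop above (while ((folio = readahead_folio(rac)))) threads the whole batch into a LIFO list through each folio's otherwise-idle ->private field instead of allocating a side array. A userspace model of the idiom with a hypothetical struct folio:

#include <stdio.h>

struct folio { void *private; int index; };

int main(void)
{
    struct folio batch[3] = { { .index = 0 }, { .index = 1 }, { .index = 2 } };
    struct folio *head = NULL, *f;
    int i;

    for (i = 0; i < 3; i++) {       /* link as they arrive, newest first */
        batch[i].private = head;
        head = &batch[i];
    }
    while (head) {                  /* consume in reverse order */
        f = head;
        head = f->private;
        f->private = NULL;          /* hand the field back before use */
        printf("folio %d\n", f->index);
    }
    return 0;
}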
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index a076cca1f547..0bebc6e3a4d7 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -25,13 +25,13 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
{
struct inode *const inode = m->inode;
struct erofs_inode *const vi = EROFS_I(inode);
- const erofs_off_t pos = Z_EROFS_FULL_INDEX_ALIGN(erofs_iloc(inode) +
+ const erofs_off_t pos = Z_EROFS_FULL_INDEX_START(erofs_iloc(inode) +
vi->inode_isize + vi->xattr_isize) +
lcn * sizeof(struct z_erofs_lcluster_index);
struct z_erofs_lcluster_index *di;
unsigned int advise;
- di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, EROFS_KMAP);
+ di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, true);
if (IS_ERR(di))
return PTR_ERR(di);
m->lcn = lcn;
@@ -40,7 +40,7 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
advise = le16_to_cpu(di->di_advise);
m->type = advise & Z_EROFS_LI_LCLUSTER_TYPE_MASK;
if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
- m->clusterofs = 1 << vi->z_logical_clusterbits;
+ m->clusterofs = 1 << vi->z_lclusterbits;
m->delta[0] = le16_to_cpu(di->di_u.delta[0]);
if (m->delta[0] & Z_EROFS_LI_D0_CBLKCNT) {
if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
@@ -55,7 +55,7 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
} else {
m->partialref = !!(advise & Z_EROFS_LI_PARTIAL_REF);
m->clusterofs = le16_to_cpu(di->di_clusterofs);
- if (m->clusterofs >= 1 << vi->z_logical_clusterbits) {
+ if (m->clusterofs >= 1 << vi->z_lclusterbits) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
@@ -97,17 +97,48 @@ static int get_compacted_la_distance(unsigned int lobits,
return d1;
}
-static int unpack_compacted_index(struct z_erofs_maprecorder *m,
- unsigned int amortizedshift,
- erofs_off_t pos, bool lookahead)
+static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
+ unsigned long lcn, bool lookahead)
{
- struct erofs_inode *const vi = EROFS_I(m->inode);
- const unsigned int lclusterbits = vi->z_logical_clusterbits;
+ struct inode *const inode = m->inode;
+ struct erofs_inode *const vi = EROFS_I(inode);
+ const erofs_off_t ebase = Z_EROFS_MAP_HEADER_END(erofs_iloc(inode) +
+ vi->inode_isize + vi->xattr_isize);
+ const unsigned int lclusterbits = vi->z_lclusterbits;
+ const unsigned int totalidx = erofs_iblks(inode);
+ unsigned int compacted_4b_initial, compacted_2b, amortizedshift;
unsigned int vcnt, lo, lobits, encodebits, nblk, bytes;
- bool big_pcluster;
+ bool big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1;
+ erofs_off_t pos;
u8 *in, type;
int i;
+ if (lcn >= totalidx || lclusterbits > 14)
+ return -EINVAL;
+
+ m->lcn = lcn;
+	/* initial 4-byte records used to reach 32-byte (compacted_2b) alignment */
+ compacted_4b_initial = ((32 - ebase % 32) / 4) & 7;
+ compacted_2b = 0;
+ if ((vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) &&
+ compacted_4b_initial < totalidx)
+ compacted_2b = rounddown(totalidx - compacted_4b_initial, 16);
+
+ pos = ebase;
+ amortizedshift = 2; /* compact_4b */
+ if (lcn >= compacted_4b_initial) {
+ pos += compacted_4b_initial * 4;
+ lcn -= compacted_4b_initial;
+ if (lcn < compacted_2b) {
+ amortizedshift = 1;
+ } else {
+ pos += compacted_2b * 2;
+ lcn -= compacted_2b;
+ }
+ }
+ pos += lcn * (1 << amortizedshift);
+
+ /* figure out the lcluster count in this pack */
if (1 << amortizedshift == 4 && lclusterbits <= 14)
vcnt = 2;
else if (1 << amortizedshift == 2 && lclusterbits <= 12)
@@ -115,14 +146,13 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
else
return -EOPNOTSUPP;
- in = erofs_read_metabuf(&m->map->buf, m->inode->i_sb, pos, EROFS_KMAP);
+ in = erofs_read_metabuf(&m->map->buf, m->inode->i_sb, pos, true);
if (IS_ERR(in))
return PTR_ERR(in);
	/* note that this isn't equal to round_up(..) */
m->nextpackoff = round_down(pos, vcnt << amortizedshift) +
(vcnt << amortizedshift);
- big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1;
lobits = max(lclusterbits, ilog2(Z_EROFS_LI_D0_CBLKCNT) + 1U);
encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt;
bytes = pos & ((vcnt << amortizedshift) - 1);
@@ -207,53 +237,6 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
return 0;
}
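To make the inlined compact-index addressing above concrete: up to seven 4-byte records first pad the stream out to 32-byte alignment (the '& 7' folds the already-aligned case to zero), then 2-byte records cover a 16-lcluster multiple, and 4-byte records cover whatever remains. A self-contained model of the offset computation with made-up sample values:

#include <stdio.h>

static unsigned long long compact_pos(unsigned long long ebase,
                                      unsigned long lcn,
                                      unsigned long totalidx,
                                      int has_compacted_2b)
{
    unsigned int initial = ((32 - ebase % 32) / 4) & 7;  /* 8 -> 0 */
    unsigned long c2b = 0;
    unsigned long long pos = ebase;
    int shift = 2;                       /* 4-byte records by default */

    if (has_compacted_2b && initial < totalidx)
        c2b = (totalidx - initial) / 16 * 16;            /* rounddown */

    if (lcn >= initial) {
        pos += initial * 4;
        lcn -= initial;
        if (lcn < c2b) {
            shift = 1;                   /* inside the 2-byte run */
        } else {
            pos += c2b * 2;
            lcn -= c2b;
        }
    }
    return pos + (lcn << shift);
}

int main(void)
{
    /* ebase = 100: seven 4-byte records pad the stream to offset 128 */
    printf("%llu\n", compact_pos(100, 3, 64, 1));    /* 112: initial run */
    printf("%llu\n", compact_pos(100, 20, 64, 1));   /* 154: 2-byte run */
    return 0;
}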
-static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
- unsigned long lcn, bool lookahead)
-{
- struct inode *const inode = m->inode;
- struct erofs_inode *const vi = EROFS_I(inode);
- const erofs_off_t ebase = sizeof(struct z_erofs_map_header) +
- ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8);
- unsigned int totalidx = erofs_iblks(inode);
- unsigned int compacted_4b_initial, compacted_2b;
- unsigned int amortizedshift;
- erofs_off_t pos;
-
- if (lcn >= totalidx)
- return -EINVAL;
-
- m->lcn = lcn;
- /* used to align to 32-byte (compacted_2b) alignment */
- compacted_4b_initial = (32 - ebase % 32) / 4;
- if (compacted_4b_initial == 32 / 4)
- compacted_4b_initial = 0;
-
- if ((vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) &&
- compacted_4b_initial < totalidx)
- compacted_2b = rounddown(totalidx - compacted_4b_initial, 16);
- else
- compacted_2b = 0;
-
- pos = ebase;
- if (lcn < compacted_4b_initial) {
- amortizedshift = 2;
- goto out;
- }
- pos += compacted_4b_initial * 4;
- lcn -= compacted_4b_initial;
-
- if (lcn < compacted_2b) {
- amortizedshift = 1;
- goto out;
- }
- pos += compacted_2b * 2;
- lcn -= compacted_2b;
- amortizedshift = 2;
-out:
- pos += lcn * (1 << amortizedshift);
- return unpack_compacted_index(m, amortizedshift, pos, lookahead);
-}
-
static int z_erofs_load_lcluster_from_disk(struct z_erofs_maprecorder *m,
unsigned int lcn, bool lookahead)
{
@@ -272,7 +255,7 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
{
struct super_block *sb = m->inode->i_sb;
struct erofs_inode *const vi = EROFS_I(m->inode);
- const unsigned int lclusterbits = vi->z_logical_clusterbits;
+ const unsigned int lclusterbits = vi->z_lclusterbits;
while (m->lcn >= lookback_distance) {
unsigned long lcn = m->lcn - lookback_distance;
@@ -282,26 +265,22 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
if (err)
return err;
- switch (m->type) {
- case Z_EROFS_LCLUSTER_TYPE_NONHEAD:
+ if (m->type >= Z_EROFS_LCLUSTER_TYPE_MAX) {
+ erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu",
+ m->type, lcn, vi->nid);
+ DBG_BUGON(1);
+ return -EOPNOTSUPP;
+ } else if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
lookback_distance = m->delta[0];
if (!lookback_distance)
- goto err_bogus;
+ break;
continue;
- case Z_EROFS_LCLUSTER_TYPE_PLAIN:
- case Z_EROFS_LCLUSTER_TYPE_HEAD1:
- case Z_EROFS_LCLUSTER_TYPE_HEAD2:
+ } else {
m->headtype = m->type;
m->map->m_la = (lcn << lclusterbits) | m->clusterofs;
return 0;
- default:
- erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu",
- m->type, lcn, vi->nid);
- DBG_BUGON(1);
- return -EOPNOTSUPP;
}
}
-err_bogus:
erofs_err(sb, "bogus lookback distance %u @ lcn %lu of nid %llu",
lookback_distance, m->lcn, vi->nid);
DBG_BUGON(1);
@@ -311,27 +290,23 @@ err_bogus:
static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
unsigned int initial_lcn)
{
- struct super_block *sb = m->inode->i_sb;
- struct erofs_inode *const vi = EROFS_I(m->inode);
- struct erofs_map_blocks *const map = m->map;
- const unsigned int lclusterbits = vi->z_logical_clusterbits;
- unsigned long lcn;
+ struct inode *inode = m->inode;
+ struct super_block *sb = inode->i_sb;
+ struct erofs_inode *vi = EROFS_I(inode);
+ bool bigpcl1 = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1;
+ bool bigpcl2 = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2;
+ unsigned long lcn = m->lcn + 1;
int err;
- DBG_BUGON(m->type != Z_EROFS_LCLUSTER_TYPE_PLAIN &&
- m->type != Z_EROFS_LCLUSTER_TYPE_HEAD1 &&
- m->type != Z_EROFS_LCLUSTER_TYPE_HEAD2);
+ DBG_BUGON(m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD);
DBG_BUGON(m->type != m->headtype);
- if (m->headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN ||
- ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD1) &&
- !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) ||
- ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) &&
- !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
- map->m_plen = 1ULL << lclusterbits;
- return 0;
- }
- lcn = m->lcn + 1;
+ if ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD1 && !bigpcl1) ||
+ ((m->headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN ||
+ m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) && !bigpcl2) ||
+ (lcn << vi->z_lclusterbits) >= inode->i_size)
+ m->compressedblks = 1;
+
if (m->compressedblks)
goto out;
@@ -350,35 +325,28 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
DBG_BUGON(lcn == initial_lcn &&
m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD);
- switch (m->type) {
- case Z_EROFS_LCLUSTER_TYPE_PLAIN:
- case Z_EROFS_LCLUSTER_TYPE_HEAD1:
- case Z_EROFS_LCLUSTER_TYPE_HEAD2:
+ if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
+ if (m->delta[0] != 1) {
+ erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ if (m->compressedblks)
+ goto out;
+ } else if (m->type < Z_EROFS_LCLUSTER_TYPE_MAX) {
/*
* if the 1st NONHEAD lcluster is actually PLAIN or HEAD type
- * rather than CBLKCNT, it's a 1 lcluster-sized pcluster.
+		 * rather than CBLKCNT, it's a one-block pcluster.
*/
- m->compressedblks = 1 << (lclusterbits - sb->s_blocksize_bits);
- break;
- case Z_EROFS_LCLUSTER_TYPE_NONHEAD:
- if (m->delta[0] != 1)
- goto err_bonus_cblkcnt;
- if (m->compressedblks)
- break;
- fallthrough;
- default:
- erofs_err(sb, "cannot found CBLKCNT @ lcn %lu of nid %llu", lcn,
- vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
+ m->compressedblks = 1;
+ goto out;
}
-out:
- map->m_plen = erofs_pos(sb, m->compressedblks);
- return 0;
-err_bonus_cblkcnt:
- erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid);
+	erofs_err(sb, "cannot find CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid);
DBG_BUGON(1);
return -EFSCORRUPTED;
+out:
+ m->map->m_plen = erofs_pos(sb, m->compressedblks);
+ return 0;
}
static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
@@ -386,11 +354,11 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
struct inode *inode = m->inode;
struct erofs_inode *vi = EROFS_I(inode);
struct erofs_map_blocks *map = m->map;
- unsigned int lclusterbits = vi->z_logical_clusterbits;
+ unsigned int lclusterbits = vi->z_lclusterbits;
u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits;
int err;
- do {
+ while (1) {
/* handle the last EOF pcluster (no next HEAD lcluster) */
if ((lcn << lclusterbits) >= inode->i_size) {
map->m_llen = inode->i_size - map->m_la;
@@ -402,14 +370,14 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
return err;
if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
- DBG_BUGON(!m->delta[1] &&
- m->clusterofs != 1 << lclusterbits);
- } else if (m->type == Z_EROFS_LCLUSTER_TYPE_PLAIN ||
- m->type == Z_EROFS_LCLUSTER_TYPE_HEAD1 ||
- m->type == Z_EROFS_LCLUSTER_TYPE_HEAD2) {
- /* go on until the next HEAD lcluster */
+ /* work around invalid d1 generated by pre-1.0 mkfs */
+ if (unlikely(!m->delta[1])) {
+ m->delta[1] = 1;
+ DBG_BUGON(1);
+ }
+ } else if (m->type < Z_EROFS_LCLUSTER_TYPE_MAX) {
if (lcn != headlcn)
- break;
+ break; /* ends at the next HEAD lcluster */
m->delta[1] = 1;
} else {
erofs_err(inode->i_sb, "unknown type %u @ lcn %llu of nid %llu",
@@ -418,29 +386,37 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
return -EOPNOTSUPP;
}
lcn += m->delta[1];
- } while (m->delta[1]);
-
+ }
map->m_llen = (lcn << lclusterbits) + m->clusterofs - map->m_la;
return 0;
}
-static int z_erofs_do_map_blocks(struct inode *inode,
+static int z_erofs_map_blocks_fo(struct inode *inode,
struct erofs_map_blocks *map, int flags)
{
- struct erofs_inode *const vi = EROFS_I(inode);
- bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER;
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct super_block *sb = inode->i_sb;
bool fragment = vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER;
+ bool ztailpacking = vi->z_idata_size;
+ unsigned int lclusterbits = vi->z_lclusterbits;
struct z_erofs_maprecorder m = {
.inode = inode,
.map = map,
};
int err = 0;
- unsigned int lclusterbits, endoff, afmt;
+ unsigned int endoff, afmt;
unsigned long initial_lcn;
unsigned long long ofs, end;
- lclusterbits = vi->z_logical_clusterbits;
ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? inode->i_size - 1 : map->m_la;
+ if (fragment && !(flags & EROFS_GET_BLOCKS_FINDTAIL) &&
+ !vi->z_tailextent_headlcn) {
+ map->m_la = 0;
+ map->m_llen = inode->i_size;
+ map->m_flags = EROFS_MAP_MAPPED |
+ EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT;
+ return 0;
+ }
initial_lcn = ofs >> lclusterbits;
endoff = ofs & ((1 << lclusterbits) - 1);
@@ -448,9 +424,8 @@ static int z_erofs_do_map_blocks(struct inode *inode,
if (err)
goto unmap_out;
- if (ztailpacking && (flags & EROFS_GET_BLOCKS_FINDTAIL))
- vi->z_idataoff = m.nextpackoff;
-
+ if ((flags & EROFS_GET_BLOCKS_FINDTAIL) && ztailpacking)
+ vi->z_fragmentoff = m.nextpackoff;
map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED;
end = (m.lcn + 1ULL) << lclusterbits;
@@ -472,8 +447,7 @@ static int z_erofs_do_map_blocks(struct inode *inode,
}
/* m.lcn should be >= 1 if endoff < m.clusterofs */
if (!m.lcn) {
- erofs_err(inode->i_sb,
- "invalid logical cluster 0 at nid %llu",
+ erofs_err(sb, "invalid logical cluster 0 at nid %llu",
vi->nid);
err = -EFSCORRUPTED;
goto unmap_out;
@@ -489,8 +463,7 @@ static int z_erofs_do_map_blocks(struct inode *inode,
goto unmap_out;
break;
default:
- erofs_err(inode->i_sb,
- "unknown type %u @ offset %llu of nid %llu",
+ erofs_err(sb, "unknown type %u @ offset %llu of nid %llu",
m.type, ofs, vi->nid);
err = -EOPNOTSUPP;
goto unmap_out;
@@ -507,12 +480,18 @@ static int z_erofs_do_map_blocks(struct inode *inode,
}
if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) {
map->m_flags |= EROFS_MAP_META;
- map->m_pa = vi->z_idataoff;
+ map->m_pa = vi->z_fragmentoff;
map->m_plen = vi->z_idata_size;
+ if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) {
+ erofs_err(sb, "invalid tail-packing pclustersize %llu",
+ map->m_plen);
+ err = -EFSCORRUPTED;
+ goto unmap_out;
+ }
} else if (fragment && m.lcn == vi->z_tailextent_headlcn) {
map->m_flags |= EROFS_MAP_FRAGMENT;
} else {
- map->m_pa = erofs_pos(inode->i_sb, m.pblk);
+ map->m_pa = erofs_pos(sb, m.pblk);
err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
if (err)
goto unmap_out;
@@ -531,7 +510,7 @@ static int z_erofs_do_map_blocks(struct inode *inode,
afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ?
vi->z_algorithmtype[1] : vi->z_algorithmtype[0];
if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) {
- erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu",
+ erofs_err(sb, "inconsistent algorithmtype %u for nid %llu",
afmt, vi->nid);
err = -EFSCORRUPTED;
goto unmap_out;
@@ -555,6 +534,114 @@ unmap_out:
return err;
}
+static int z_erofs_map_blocks_ext(struct inode *inode,
+ struct erofs_map_blocks *map, int flags)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct super_block *sb = inode->i_sb;
+ bool interlaced = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER;
+ unsigned int recsz = z_erofs_extent_recsize(vi->z_advise);
+ erofs_off_t pos = round_up(Z_EROFS_MAP_HEADER_END(erofs_iloc(inode) +
+ vi->inode_isize + vi->xattr_isize), recsz);
+ erofs_off_t lend = inode->i_size;
+ erofs_off_t l, r, mid, pa, la, lstart;
+ struct z_erofs_extent *ext;
+ unsigned int fmt;
+ bool last;
+
+ map->m_flags = 0;
+ if (recsz <= offsetof(struct z_erofs_extent, pstart_hi)) {
+ if (recsz <= offsetof(struct z_erofs_extent, pstart_lo)) {
+ ext = erofs_read_metabuf(&map->buf, sb, pos, true);
+ if (IS_ERR(ext))
+ return PTR_ERR(ext);
+ pa = le64_to_cpu(*(__le64 *)ext);
+ pos += sizeof(__le64);
+ lstart = 0;
+ } else {
+ lstart = round_down(map->m_la, 1 << vi->z_lclusterbits);
+ pos += (lstart >> vi->z_lclusterbits) * recsz;
+ pa = EROFS_NULL_ADDR;
+ }
+
+ for (; lstart <= map->m_la; lstart += 1 << vi->z_lclusterbits) {
+ ext = erofs_read_metabuf(&map->buf, sb, pos, true);
+ if (IS_ERR(ext))
+ return PTR_ERR(ext);
+ map->m_plen = le32_to_cpu(ext->plen);
+ if (pa != EROFS_NULL_ADDR) {
+ map->m_pa = pa;
+ pa += map->m_plen & Z_EROFS_EXTENT_PLEN_MASK;
+ } else {
+ map->m_pa = le32_to_cpu(ext->pstart_lo);
+ }
+ pos += recsz;
+ }
+ last = (lstart >= round_up(lend, 1 << vi->z_lclusterbits));
+ lend = min(lstart, lend);
+ lstart -= 1 << vi->z_lclusterbits;
+ } else {
+ lstart = lend;
+ for (l = 0, r = vi->z_extents; l < r; ) {
+ mid = l + (r - l) / 2;
+ ext = erofs_read_metabuf(&map->buf, sb,
+ pos + mid * recsz, true);
+ if (IS_ERR(ext))
+ return PTR_ERR(ext);
+
+ la = le32_to_cpu(ext->lstart_lo);
+ pa = le32_to_cpu(ext->pstart_lo) |
+ (u64)le32_to_cpu(ext->pstart_hi) << 32;
+ if (recsz > offsetof(struct z_erofs_extent, lstart_hi))
+ la |= (u64)le32_to_cpu(ext->lstart_hi) << 32;
+
+ if (la > map->m_la) {
+ r = mid;
+ if (la > lend) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ lend = la;
+ } else {
+ l = mid + 1;
+ if (map->m_la == la)
+ r = min(l + 1, r);
+ lstart = la;
+ map->m_plen = le32_to_cpu(ext->plen);
+ map->m_pa = pa;
+ }
+ }
+ last = (l >= vi->z_extents);
+ }
+
+ if (lstart < lend) {
+ map->m_la = lstart;
+ if (last && (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) {
+ map->m_flags |= EROFS_MAP_MAPPED | EROFS_MAP_FRAGMENT;
+ vi->z_fragmentoff = map->m_plen;
+ if (recsz > offsetof(struct z_erofs_extent, pstart_lo))
+ vi->z_fragmentoff |= map->m_pa << 32;
+ } else if (map->m_plen) {
+ map->m_flags |= EROFS_MAP_MAPPED |
+ EROFS_MAP_FULL_MAPPED | EROFS_MAP_ENCODED;
+ fmt = map->m_plen >> Z_EROFS_EXTENT_PLEN_FMT_BIT;
+ if (fmt)
+ map->m_algorithmformat = fmt - 1;
+ else if (interlaced && !erofs_blkoff(sb, map->m_pa))
+ map->m_algorithmformat =
+ Z_EROFS_COMPRESSION_INTERLACED;
+ else
+ map->m_algorithmformat =
+ Z_EROFS_COMPRESSION_SHIFTED;
+ if (map->m_plen & Z_EROFS_EXTENT_PLEN_PARTIAL)
+ map->m_flags |= EROFS_MAP_PARTIAL_REF;
+ map->m_plen &= Z_EROFS_EXTENT_PLEN_MASK;
+ }
+ }
+ map->m_llen = lend - map->m_la;
+ return 0;
+}
+
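For the wide-record case, the new z_erofs_map_blocks_ext() above binary-searches the lstart-sorted extent table: probes at or below m_la tighten the candidate extent, while probes above it tighten the logical end of the mapping, so even a miss yields a well-formed range. A minimal standalone model of that search over an in-memory table (record layout simplified, all names illustrative):

#include <stdio.h>

struct ext { unsigned long long lstart, pstart, plen; };

static void map_ext(const struct ext *tbl, unsigned long n,
                    unsigned long long isize, unsigned long long la)
{
    unsigned long l = 0, r = n, mid;
    unsigned long long lstart = isize, lend = isize;
    unsigned long long pa = 0, plen = 0;

    while (l < r) {
        mid = l + (r - l) / 2;
        if (tbl[mid].lstart > la) {
            r = mid;
            lend = tbl[mid].lstart;     /* tightest upper bound so far */
        } else {
            l = mid + 1;
            lstart = tbl[mid].lstart;   /* best covering candidate */
            pa = tbl[mid].pstart;
            plen = tbl[mid].plen;
        }
    }
    printf("la %llu -> [%llu, %llu) pa %llu plen %llu\n",
           la, lstart, lend, pa, plen);
}

int main(void)
{
    const struct ext tbl[] = {
        { 0, 4096, 4096 }, { 8192, 8192, 4096 }, { 16384, 12288, 2048 },
    };

    map_ext(tbl, 3, 20000, 9000);   /* hits the second extent */
    map_ext(tbl, 3, 20000, 17000);  /* last extent runs to EOF */
    return 0;
}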
static int z_erofs_fill_inode_lazy(struct inode *inode)
{
struct erofs_inode *const vi = EROFS_I(inode);
@@ -581,7 +668,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
goto out_unlock;
pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8);
- h = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP);
+ h = erofs_read_metabuf(&buf, sb, pos, true);
if (IS_ERR(h)) {
err = PTR_ERR(h);
goto out_unlock;
@@ -598,8 +685,20 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
goto done;
}
vi->z_advise = le16_to_cpu(h->h_advise);
+ vi->z_lclusterbits = sb->s_blocksize_bits + (h->h_clusterbits & 15);
+ if (vi->datalayout == EROFS_INODE_COMPRESSED_FULL &&
+ (vi->z_advise & Z_EROFS_ADVISE_EXTENTS)) {
+ vi->z_extents = le32_to_cpu(h->h_extents_lo) |
+ ((u64)le16_to_cpu(h->h_extents_hi) << 32);
+ goto done;
+ }
+
vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
+ if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)
+ vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff);
+ else if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER)
+ vi->z_idata_size = le16_to_cpu(h->h_idata_size);
headnr = 0;
if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX ||
@@ -610,7 +709,6 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
goto out_put_metabuf;
}
- vi->z_logical_clusterbits = sb->s_blocksize_bits + (h->h_clusterbits & 7);
if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
@@ -628,34 +726,13 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
goto out_put_metabuf;
}
- if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) {
+ if (vi->z_idata_size ||
+ (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) {
struct erofs_map_blocks map = {
.buf = __EROFS_BUF_INITIALIZER
};
- vi->z_idata_size = le16_to_cpu(h->h_idata_size);
- err = z_erofs_do_map_blocks(inode, &map,
- EROFS_GET_BLOCKS_FINDTAIL);
- erofs_put_metabuf(&map.buf);
-
- if (!map.m_plen ||
- erofs_blkoff(sb, map.m_pa) + map.m_plen > sb->s_blocksize) {
- erofs_err(sb, "invalid tail-packing pclustersize %llu",
- map.m_plen);
- err = -EFSCORRUPTED;
- }
- if (err < 0)
- goto out_put_metabuf;
- }
-
- if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER &&
- !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) {
- struct erofs_map_blocks map = {
- .buf = __EROFS_BUF_INITIALIZER
- };
-
- vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff);
- err = z_erofs_do_map_blocks(inode, &map,
+ err = z_erofs_map_blocks_fo(inode, &map,
EROFS_GET_BLOCKS_FINDTAIL);
erofs_put_metabuf(&map.buf);
if (err < 0)
@@ -686,15 +763,11 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
} else {
err = z_erofs_fill_inode_lazy(inode);
if (!err) {
- if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) &&
- !vi->z_tailextent_headlcn) {
- map->m_la = 0;
- map->m_llen = inode->i_size;
- map->m_flags = EROFS_MAP_MAPPED |
- EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT;
- } else {
- err = z_erofs_do_map_blocks(inode, map, flags);
- }
+ if (vi->datalayout == EROFS_INODE_COMPRESSED_FULL &&
+ (vi->z_advise & Z_EROFS_ADVISE_EXTENTS))
+ err = z_erofs_map_blocks_ext(inode, map, flags);
+ else
+ err = z_erofs_map_blocks_fo(inode, map, flags);
}
if (!err && (map->m_flags & EROFS_MAP_ENCODED) &&
unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE ||
diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c
index 37afe2024840..55ff2ab5128e 100644
--- a/fs/erofs/zutil.c
+++ b/fs/erofs/zutil.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2018 HUAWEI, Inc.
* https://www.huawei.com/
+ * Copyright (C) 2024 Alibaba Cloud
*/
#include "internal.h"
@@ -19,13 +20,12 @@ static unsigned int z_erofs_gbuf_count, z_erofs_gbuf_nrpages,
module_param_named(global_buffers, z_erofs_gbuf_count, uint, 0444);
module_param_named(reserved_pages, z_erofs_rsv_nrpages, uint, 0444);
-static atomic_long_t erofs_global_shrink_cnt; /* for all mounted instances */
-/* protected by 'erofs_sb_list_lock' */
-static unsigned int shrinker_run_no;
+atomic_long_t erofs_global_shrink_cnt; /* for all mounted instances */
-/* protects the mounted 'erofs_sb_list' */
+/* protects `shrinker_run_no` and the mounted `erofs_sb_list` */
static DEFINE_SPINLOCK(erofs_sb_list_lock);
static LIST_HEAD(erofs_sb_list);
+static unsigned int shrinker_run_no;
static struct shrinker *erofs_shrinker_info;
static unsigned int z_erofs_gbuf_id(void)
@@ -87,8 +87,8 @@ int z_erofs_gbuf_growsize(unsigned int nrpages)
tmp_pages[j] = gbuf->pages[j];
do {
last = j;
- j = alloc_pages_bulk_array(GFP_KERNEL, nrpages,
- tmp_pages);
+ j = alloc_pages_bulk(GFP_KERNEL, nrpages,
+ tmp_pages);
if (last == j)
goto out;
} while (j != nrpages);
@@ -214,145 +214,6 @@ void erofs_release_pages(struct page **pagepool)
}
}
-static bool erofs_workgroup_get(struct erofs_workgroup *grp)
-{
- if (lockref_get_not_zero(&grp->lockref))
- return true;
-
- spin_lock(&grp->lockref.lock);
- if (__lockref_is_dead(&grp->lockref)) {
- spin_unlock(&grp->lockref.lock);
- return false;
- }
-
- if (!grp->lockref.count++)
- atomic_long_dec(&erofs_global_shrink_cnt);
- spin_unlock(&grp->lockref.lock);
- return true;
-}
-
-struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
- pgoff_t index)
-{
- struct erofs_sb_info *sbi = EROFS_SB(sb);
- struct erofs_workgroup *grp;
-
-repeat:
- rcu_read_lock();
- grp = xa_load(&sbi->managed_pslots, index);
- if (grp) {
- if (!erofs_workgroup_get(grp)) {
- /* prefer to relax rcu read side */
- rcu_read_unlock();
- goto repeat;
- }
-
- DBG_BUGON(index != grp->index);
- }
- rcu_read_unlock();
- return grp;
-}
-
-struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
- struct erofs_workgroup *grp)
-{
- struct erofs_sb_info *const sbi = EROFS_SB(sb);
- struct erofs_workgroup *pre;
-
- DBG_BUGON(grp->lockref.count < 1);
-repeat:
- xa_lock(&sbi->managed_pslots);
- pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
- NULL, grp, GFP_KERNEL);
- if (pre) {
- if (xa_is_err(pre)) {
- pre = ERR_PTR(xa_err(pre));
- } else if (!erofs_workgroup_get(pre)) {
- /* try to legitimize the current in-tree one */
- xa_unlock(&sbi->managed_pslots);
- cond_resched();
- goto repeat;
- }
- grp = pre;
- }
- xa_unlock(&sbi->managed_pslots);
- return grp;
-}
-
-static void __erofs_workgroup_free(struct erofs_workgroup *grp)
-{
- atomic_long_dec(&erofs_global_shrink_cnt);
- erofs_workgroup_free_rcu(grp);
-}
-
-void erofs_workgroup_put(struct erofs_workgroup *grp)
-{
- if (lockref_put_or_lock(&grp->lockref))
- return;
-
- DBG_BUGON(__lockref_is_dead(&grp->lockref));
- if (grp->lockref.count == 1)
- atomic_long_inc(&erofs_global_shrink_cnt);
- --grp->lockref.count;
- spin_unlock(&grp->lockref.lock);
-}
-
-static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
- struct erofs_workgroup *grp)
-{
- int free = false;
-
- spin_lock(&grp->lockref.lock);
- if (grp->lockref.count)
- goto out;
-
- /*
- * Note that all cached pages should be detached before deleted from
- * the XArray. Otherwise some cached pages could be still attached to
- * the orphan old workgroup when the new one is available in the tree.
- */
- if (erofs_try_to_free_all_cached_folios(sbi, grp))
- goto out;
-
- /*
- * It's impossible to fail after the workgroup is freezed,
- * however in order to avoid some race conditions, add a
- * DBG_BUGON to observe this in advance.
- */
- DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp);
-
- lockref_mark_dead(&grp->lockref);
- free = true;
-out:
- spin_unlock(&grp->lockref.lock);
- if (free)
- __erofs_workgroup_free(grp);
- return free;
-}
-
-static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
- unsigned long nr_shrink)
-{
- struct erofs_workgroup *grp;
- unsigned int freed = 0;
- unsigned long index;
-
- xa_lock(&sbi->managed_pslots);
- xa_for_each(&sbi->managed_pslots, index, grp) {
- /* try to shrink each valid workgroup */
- if (!erofs_try_to_release_workgroup(sbi, grp))
- continue;
- xa_unlock(&sbi->managed_pslots);
-
- ++freed;
- if (!--nr_shrink)
- return freed;
- xa_lock(&sbi->managed_pslots);
- }
- xa_unlock(&sbi->managed_pslots);
- return freed;
-}
-
void erofs_shrinker_register(struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
@@ -369,9 +230,10 @@ void erofs_shrinker_unregister(struct super_block *sb)
struct erofs_sb_info *const sbi = EROFS_SB(sb);
mutex_lock(&sbi->umount_mutex);
- /* clean up all remaining workgroups in memory */
- erofs_shrink_workstation(sbi, ~0UL);
-
+ while (!xa_empty(&sbi->managed_pslots)) {
+ z_erofs_shrink_scan(sbi, ~0UL);
+ cond_resched();
+ }
spin_lock(&erofs_sb_list_lock);
list_del(&sbi->list);
spin_unlock(&erofs_sb_list_lock);
@@ -381,7 +243,7 @@ void erofs_shrinker_unregister(struct super_block *sb)
static unsigned long erofs_shrink_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- return atomic_long_read(&erofs_global_shrink_cnt);
+ return atomic_long_read(&erofs_global_shrink_cnt) ?: SHRINK_EMPTY;
}
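The '?:' above is the GNU C conditional with an omitted middle operand: a ?: b evaluates a once and yields it when nonzero, otherwise b. Returning SHRINK_EMPTY instead of 0 lets the shrinker core tell "nothing cached at all" apart from a cache that merely could not be shrunk this pass. A tiny demo of the operator (the SHRINK_EMPTY value mirrors include/linux/shrinker.h):

#include <stdio.h>

#define SHRINK_EMPTY (~0UL - 3)

static unsigned long report(long cnt)
{
    return (unsigned long)cnt ?: SHRINK_EMPTY;
}

int main(void)
{
    printf("%lu\n", report(42));    /* 42 */
    printf("%lu\n", report(0));     /* SHRINK_EMPTY */
    return 0;
}

(Compiles with gcc or clang in their default gnu* modes.)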
static unsigned long erofs_shrink_scan(struct shrinker *shrink,
@@ -418,9 +280,7 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink,
spin_unlock(&erofs_sb_list_lock);
sbi->shrinker_run_no = run_no;
-
- freed += erofs_shrink_workstation(sbi, nr - freed);
-
+ freed += z_erofs_shrink_scan(sbi, nr - freed);
spin_lock(&erofs_sb_list_lock);
/* Get the next list element before we move this one */
p = p->next;